btrfs: replace many BUG_ONs with proper error handling
fs/btrfs/ctree.c
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	return path;
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
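
/*
 * Editorial note (not part of the original source): the two helpers above
 * convert every lock held in a path between its spinning and blocking
 * variants.  Assuming the lock states used by this era of the code, the
 * transitions are simply:
 *
 *	BTRFS_READ_LOCK  <-> BTRFS_READ_LOCK_BLOCKING
 *	BTRFS_WRITE_LOCK <-> BTRFS_WRITE_LOCK_BLOCKING
 *
 * btrfs_set_path_blocking() is meant to be called before anything that may
 * schedule; btrfs_clear_path_blocking() restores spinning locks afterwards.
 */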

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
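
/*
 * Illustrative sketch (not part of the original file): the typical lifetime
 * of a path used with btrfs_search_slot().  The key values are hypothetical
 * and error handling is reduced to a minimum:
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 *	... use path->nodes[0] and path->slots[0] ...
 *	btrfs_free_path(path);     (or btrfs_release_path() to reuse it)
 */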

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = rcu_dereference(root->node);
	extent_buffer_get(eb);
	rcu_read_unlock();
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0, 1);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       buf->len, &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size, 1);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref, 1);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref, 1);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !root->force_cow)
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
		WARN_ON(1);
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
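
/*
 * Illustrative sketch (not part of the original source): callers that want
 * to modify a block first COW it through btrfs_cow_block() while holding
 * the relevant tree locks, roughly:
 *
 *	btrfs_tree_lock(buf);
 *	ret = btrfs_cow_block(trans, root, buf, parent, slot, &buf);
 *	if (ret)
 *		goto out;
 *	... modify buf, then btrfs_mark_buffer_dirty(buf) ...
 *
 * When should_cow_block() decides the block was already COWed in this
 * transaction, *cow_ret is simply the original buffer.
 */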

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
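
/*
 * Illustrative example (not part of the original source): keys compare by
 * objectid, then type, then offset.  With hypothetical values
 *
 *	k1 = { .objectid = 256, .type = BTRFS_INODE_ITEM_KEY, .offset = 0 }
 *	k2 = { .objectid = 256, .type = BTRFS_DIR_ITEM_KEY,   .offset = 0 }
 *
 * btrfs_comp_cpu_keys(&k1, &k2) is negative whenever the inode item type is
 * numerically smaller than the dir item type, which is what keeps all items
 * belonging to one inode grouped and ordered by type inside a leaf.
 */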

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN_ON(1);
	if (trans->transid != root->fs_info->generation)
		WARN_ON(1);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (cache_only) {
				free_extent_buffer(cur);
				continue;
			}
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
				if (!cur)
					return -EIO;
			} else if (!uptodate) {
				btrfs_read_buffer(cur, gen);
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}


/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	} else {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
	}
	return -1;
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
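
/*
 * Illustrative note (not part of the original source): btrfs_bin_search()
 * and bin_search() return 0 and fill in *slot when the key is present, or
 * return 1 with *slot pointing at the position where the key would be
 * inserted.  A hypothetical caller therefore looks roughly like:
 *
 *	ret = btrfs_bin_search(eb, &key, level, &slot);
 *	if (ret == 0)
 *		... key found at 'slot' ...
 *	else
 *		... not found, 'slot' is the insertion point ...
 */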

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
				   struct extent_buffer *parent, int slot)
{
	int level = btrfs_header_level(parent);
	if (slot < 0)
		return NULL;
	if (slot >= btrfs_header_nritems(parent))
		return NULL;

	BUG_ON(level == 0);

	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
		/* once for the root ptr */
		free_extent_buffer(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	btrfs_header_nritems(mid);

	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
		btrfs_header_nritems(mid);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(trans, root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1, 0);
			free_extent_buffer(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(trans, root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
		free_extent_buffer(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_root *root,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 gen;
	int direction = path->reada;
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	search = btrfs_node_blockptr(node, slot);
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
	if (eb) {
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			gen = btrfs_node_ptr_generation(node, nr);
			readahead_tree_block(root, search, blocksize, gen);
			nread += blocksize;
		}
		nscan++;
		if ((nread > 65536 || nscan > 32))
			break;
	}
}

/*
 * returns -EAGAIN if it had to drop the path, or zero if everything was in
 * cache
 */
static noinline int reada_for_balance(struct btrfs_root *root,
				      struct btrfs_path *path, int level)
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;
	int ret = 0;
	int blocksize;

	parent = path->nodes[level + 1];
	if (!parent)
		return 0;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];
	blocksize = btrfs_level_size(root, level);

	if (slot > 0) {
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
		eb = btrfs_find_tree_block(root, block1, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block1 = 0;
		free_extent_buffer(eb);
	}
	if (slot + 1 < nritems) {
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
		eb = btrfs_find_tree_block(root, block2, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block2 = 0;
		free_extent_buffer(eb);
	}
	if (block1 || block2) {
		ret = -EAGAIN;

		/* release the whole path */
		btrfs_release_path(path);

		/* read the blocks */
		if (block1)
			readahead_tree_block(root, block1, blocksize, 0);
		if (block2)
			readahead_tree_block(root, block2, blocksize, 0);

		if (block1) {
			eb = read_tree_block(root, block1, blocksize, 0);
			free_extent_buffer(eb);
		}
		if (block2) {
			eb = read_tree_block(root, block2, blocksize, 0);
			free_extent_buffer(eb);
		}
	}
	return ret;
}


/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
{
	int i;
	int skip_level = level;
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
		}
	}
}
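
/*
 * Editorial example (not part of the original source): as a rough
 * illustration of the slot-0 rule above, if path->slots[2] == 0 then the
 * level-3 node is kept locked, because inserting or deleting at slot 0 of
 * the level-2 block can force the key stored for it in its level-3 parent
 * to be updated.
 */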

/*
 * This releases any locks held in the path starting at level and
 * going all the way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few
 * corner cases, such as COW of the block at slot zero in the node.  This
 * ignores those rules, and it should only be called when there are no
 * more updates to be done higher up in the tree.
 */
noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
		path->locks[i] = 0;
	}
}

/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key)
{
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp) {
		if (btrfs_buffer_uptodate(tmp, 0)) {
			if (btrfs_buffer_uptodate(tmp, gen)) {
				/*
				 * we found an up to date block without
				 * sleeping, return
				 * right away
				 */
				*eb_ret = tmp;
				return 0;
			}
			/* the pages were up to date, but we failed
			 * the generation number check.  Do a full
			 * read for the generation number that is correct.
			 * We must do this without dropping locks so
			 * we can trust our generation number
			 */
			free_extent_buffer(tmp);
			btrfs_set_path_blocking(p);

			tmp = read_tree_block(root, blocknr, blocksize, gen);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				*eb_ret = tmp;
				return 0;
			}
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}

/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
 */
1640int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1641 *root, struct btrfs_key *key, struct btrfs_path *p, int
1642 ins_len, int cow)
be0e5c09 1643{
5f39d397 1644 struct extent_buffer *b;
be0e5c09
CM
1645 int slot;
1646 int ret;
33c66f43 1647 int err;
be0e5c09 1648 int level;
925baedd 1649 int lowest_unlock = 1;
bd681513
CM
1650 int root_lock;
1651 /* everything at write_lock_level or lower must be write locked */
1652 int write_lock_level = 0;
9f3a7427
CM
1653 u8 lowest_level = 0;
1654
6702ed49 1655 lowest_level = p->lowest_level;
323ac95b 1656 WARN_ON(lowest_level && ins_len > 0);
22b0ebda 1657 WARN_ON(p->nodes[0] != NULL);
25179201 1658
bd681513 1659 if (ins_len < 0) {
925baedd 1660 lowest_unlock = 2;
65b51a00 1661
bd681513
CM
1662 /* when we are removing items, we might have to go up to level
1663 * two as we update tree pointers Make sure we keep write
1664 * for those levels as well
1665 */
1666 write_lock_level = 2;
1667 } else if (ins_len > 0) {
1668 /*
1669 * for inserting items, make sure we have a write lock on
1670 * level 1 so we can update keys
1671 */
1672 write_lock_level = 1;
1673 }
1674
1675 if (!cow)
1676 write_lock_level = -1;
1677
1678 if (cow && (p->keep_locks || p->lowest_level))
1679 write_lock_level = BTRFS_MAX_LEVEL;
1680
bb803951 1681again:
bd681513
CM
1682 /*
1683 * we try very hard to do read locks on the root
1684 */
1685 root_lock = BTRFS_READ_LOCK;
1686 level = 0;
5d4f98a2 1687 if (p->search_commit_root) {
bd681513
CM
1688 /*
1689 * the commit roots are read only
1690 * so we always do read locks
1691 */
5d4f98a2
YZ
1692 b = root->commit_root;
1693 extent_buffer_get(b);
bd681513 1694 level = btrfs_header_level(b);
5d4f98a2 1695 if (!p->skip_locking)
bd681513 1696 btrfs_tree_read_lock(b);
5d4f98a2 1697 } else {
bd681513 1698 if (p->skip_locking) {
5d4f98a2 1699 b = btrfs_root_node(root);
bd681513
CM
1700 level = btrfs_header_level(b);
1701 } else {
1702 /* we don't know the level of the root node
1703 * until we actually have it read locked
1704 */
1705 b = btrfs_read_lock_root_node(root);
1706 level = btrfs_header_level(b);
1707 if (level <= write_lock_level) {
1708 /* whoops, must trade for write lock */
1709 btrfs_tree_read_unlock(b);
1710 free_extent_buffer(b);
1711 b = btrfs_lock_root_node(root);
1712 root_lock = BTRFS_WRITE_LOCK;
1713
1714 /* the level might have changed, check again */
1715 level = btrfs_header_level(b);
1716 }
1717 }
5d4f98a2 1718 }
bd681513
CM
1719 p->nodes[level] = b;
1720 if (!p->skip_locking)
1721 p->locks[level] = root_lock;
925baedd 1722
eb60ceac 1723 while (b) {
5f39d397 1724 level = btrfs_header_level(b);
65b51a00
CM
1725
1726 /*
1727 * setup the path here so we can release it under lock
1728 * contention with the cow code
1729 */
02217ed2 1730 if (cow) {
c8c42864
CM
1731 /*
1732 * if we don't really need to cow this block
1733 * then we don't want to set the path blocking,
1734 * so we test it here
1735 */
5d4f98a2 1736 if (!should_cow_block(trans, root, b))
65b51a00 1737 goto cow_done;
5d4f98a2 1738
b4ce94de
CM
1739 btrfs_set_path_blocking(p);
1740
bd681513
CM
1741 /*
1742 * must have write locks on this node and the
1743 * parent
1744 */
1745 if (level + 1 > write_lock_level) {
1746 write_lock_level = level + 1;
1747 btrfs_release_path(p);
1748 goto again;
1749 }
1750
33c66f43
YZ
1751 err = btrfs_cow_block(trans, root, b,
1752 p->nodes[level + 1],
1753 p->slots[level + 1], &b);
1754 if (err) {
33c66f43 1755 ret = err;
65b51a00 1756 goto done;
54aa1f4d 1757 }
02217ed2 1758 }
65b51a00 1759cow_done:
02217ed2 1760 BUG_ON(!cow && ins_len);
65b51a00 1761
eb60ceac 1762 p->nodes[level] = b;
bd681513 1763 btrfs_clear_path_blocking(p, NULL, 0);
b4ce94de
CM
1764
1765 /*
1766 * we have a lock on b and as long as we aren't changing
1767 * the tree, there is no way for the items in b to change.
1768 * It is safe to drop the lock on our parent before we
1769 * go through the expensive btree search on b.
1770 *
1771 * If cow is true, then we might be changing slot zero,
1772 * which may require changing the parent. So, we can't
1773 * drop the lock until after we know which slot we're
1774 * operating on.
1775 */
1776 if (!cow)
1777 btrfs_unlock_up_safe(p, level + 1);
1778
5f39d397 1779 ret = bin_search(b, key, level, &slot);
b4ce94de 1780
5f39d397 1781 if (level != 0) {
33c66f43
YZ
1782 int dec = 0;
1783 if (ret && slot > 0) {
1784 dec = 1;
be0e5c09 1785 slot -= 1;
33c66f43 1786 }
be0e5c09 1787 p->slots[level] = slot;
33c66f43 1788 err = setup_nodes_for_search(trans, root, p, b, level,
bd681513 1789 ins_len, &write_lock_level);
33c66f43 1790 if (err == -EAGAIN)
c8c42864 1791 goto again;
33c66f43
YZ
1792 if (err) {
1793 ret = err;
c8c42864 1794 goto done;
33c66f43 1795 }
c8c42864
CM
1796 b = p->nodes[level];
1797 slot = p->slots[level];
b4ce94de 1798
bd681513
CM
1799 /*
1800 * slot 0 is special, if we change the key
1801 * we have to update the parent pointer
1802 * which means we must have a write lock
1803 * on the parent
1804 */
1805 if (slot == 0 && cow &&
1806 write_lock_level < level + 1) {
1807 write_lock_level = level + 1;
1808 btrfs_release_path(p);
1809 goto again;
1810 }
1811
f9efa9c7
CM
1812 unlock_up(p, level, lowest_unlock);
1813
925baedd 1814 if (level == lowest_level) {
33c66f43
YZ
1815 if (dec)
1816 p->slots[level]++;
5b21f2ed 1817 goto done;
925baedd 1818 }
ca7a79ad 1819
33c66f43 1820 err = read_block_for_search(trans, root, p,
c8c42864 1821 &b, level, slot, key);
33c66f43 1822 if (err == -EAGAIN)
c8c42864 1823 goto again;
33c66f43
YZ
1824 if (err) {
1825 ret = err;
76a05b35 1826 goto done;
33c66f43 1827 }
76a05b35 1828
b4ce94de 1829 if (!p->skip_locking) {
bd681513
CM
1830 level = btrfs_header_level(b);
1831 if (level <= write_lock_level) {
1832 err = btrfs_try_tree_write_lock(b);
1833 if (!err) {
1834 btrfs_set_path_blocking(p);
1835 btrfs_tree_lock(b);
1836 btrfs_clear_path_blocking(p, b,
1837 BTRFS_WRITE_LOCK);
1838 }
1839 p->locks[level] = BTRFS_WRITE_LOCK;
1840 } else {
1841 err = btrfs_try_tree_read_lock(b);
1842 if (!err) {
1843 btrfs_set_path_blocking(p);
1844 btrfs_tree_read_lock(b);
1845 btrfs_clear_path_blocking(p, b,
1846 BTRFS_READ_LOCK);
1847 }
1848 p->locks[level] = BTRFS_READ_LOCK;
b4ce94de 1849 }
bd681513 1850 p->nodes[level] = b;
b4ce94de 1851 }
be0e5c09
CM
1852 } else {
1853 p->slots[level] = slot;
87b29b20
YZ
1854 if (ins_len > 0 &&
1855 btrfs_leaf_free_space(root, b) < ins_len) {
bd681513
CM
1856 if (write_lock_level < 1) {
1857 write_lock_level = 1;
1858 btrfs_release_path(p);
1859 goto again;
1860 }
1861
b4ce94de 1862 btrfs_set_path_blocking(p);
33c66f43
YZ
1863 err = split_leaf(trans, root, key,
1864 p, ins_len, ret == 0);
bd681513 1865 btrfs_clear_path_blocking(p, NULL, 0);
b4ce94de 1866
33c66f43
YZ
1867 BUG_ON(err > 0);
1868 if (err) {
1869 ret = err;
65b51a00
CM
1870 goto done;
1871 }
5c680ed6 1872 }
459931ec
CM
1873 if (!p->search_for_split)
1874 unlock_up(p, level, lowest_unlock);
65b51a00 1875 goto done;
be0e5c09
CM
1876 }
1877 }
65b51a00
CM
1878 ret = 1;
1879done:
b4ce94de
CM
1880 /*
1881 * we don't really know what they plan on doing with the path
1882 * from here on, so for now just mark it as blocking
1883 */
b9473439
CM
1884 if (!p->leave_spinning)
1885 btrfs_set_path_blocking(p);
76a05b35 1886 if (ret < 0)
b3b4aa74 1887 btrfs_release_path(p);
65b51a00 1888 return ret;
be0e5c09
CM
1889}
1890
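/*
 * Illustrative sketch, not part of the original file: a minimal read-only
 * caller of btrfs_search_slot().  The helper name is made up; only the
 * return conventions documented above (0 == exact match, 1 == not found,
 * < 0 == error) and helpers already used in this file are assumed.
 */
static int __maybe_unused example_lookup(struct btrfs_root *root,
					 struct btrfs_key *key,
					 u32 *item_size)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* ins_len == 0, cow == 0: plain lookup, no transaction required */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0)
		*item_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
	else if (ret > 0)
		ret = -ENOENT;	/* the slot points at the insert position */

	btrfs_free_path(path);
	return ret;
}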
74123bd7
CM
1891/*
1892 * adjust the pointers going up the tree, starting at level
1893 * making sure the right key of each node points to 'key'.
1894 * This is used after shifting pointers to the left, so it stops
1895 * fixing up pointers when a given leaf/node is not in slot 0 of the
1896 * higher levels
aa5d6bed 1897 *
74123bd7 1898 */
143bede5
JM
1899static void fixup_low_keys(struct btrfs_trans_handle *trans,
1900 struct btrfs_root *root, struct btrfs_path *path,
1901 struct btrfs_disk_key *key, int level)
be0e5c09
CM
1902{
1903 int i;
5f39d397
CM
1904 struct extent_buffer *t;
1905
234b63a0 1906 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
be0e5c09 1907 int tslot = path->slots[i];
eb60ceac 1908 if (!path->nodes[i])
be0e5c09 1909 break;
5f39d397
CM
1910 t = path->nodes[i];
1911 btrfs_set_node_key(t, key, tslot);
d6025579 1912 btrfs_mark_buffer_dirty(path->nodes[i]);
be0e5c09
CM
1913 if (tslot != 0)
1914 break;
1915 }
1916}
1917
31840ae1
ZY
1918/*
1919 * update item key.
1920 *
1921 * This function isn't completely safe. It's the caller's responsibility
1922 * that the new key won't break the order
1923 */
143bede5
JM
1924void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1925 struct btrfs_root *root, struct btrfs_path *path,
1926 struct btrfs_key *new_key)
31840ae1
ZY
1927{
1928 struct btrfs_disk_key disk_key;
1929 struct extent_buffer *eb;
1930 int slot;
1931
1932 eb = path->nodes[0];
1933 slot = path->slots[0];
1934 if (slot > 0) {
1935 btrfs_item_key(eb, &disk_key, slot - 1);
143bede5 1936 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
31840ae1
ZY
1937 }
1938 if (slot < btrfs_header_nritems(eb) - 1) {
1939 btrfs_item_key(eb, &disk_key, slot + 1);
143bede5 1940 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
31840ae1
ZY
1941 }
1942
1943 btrfs_cpu_key_to_disk(&disk_key, new_key);
1944 btrfs_set_item_key(eb, &disk_key, slot);
1945 btrfs_mark_buffer_dirty(eb);
1946 if (slot == 0)
1947 fixup_low_keys(trans, root, path, &disk_key, 1);
31840ae1
ZY
1948}
1949
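/*
 * Illustrative sketch, not part of the original file: changing the offset
 * of the item the path points to with btrfs_set_item_key_safe().  The
 * caller must already know that the new offset keeps the item between its
 * neighbours, which is exactly the ordering rule the BUG_ONs above check.
 * All names except the btrfs helpers are invented for the example.
 */
static void __maybe_unused example_bump_key_offset(
				struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;	/* objectid and type stay the same */
	btrfs_set_item_key_safe(trans, root, path, &key);
}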
74123bd7
CM
1950/*
1951 * try to push data from one node into the next node to the left in the
79f95c82 1952 * tree.
aa5d6bed
CM
1953 *
1954 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1955 * error, and > 0 if there was no room in the left hand block.
74123bd7 1956 */
98ed5174
CM
1957static int push_node_left(struct btrfs_trans_handle *trans,
1958 struct btrfs_root *root, struct extent_buffer *dst,
971a1f66 1959 struct extent_buffer *src, int empty)
be0e5c09 1960{
be0e5c09 1961 int push_items = 0;
bb803951
CM
1962 int src_nritems;
1963 int dst_nritems;
aa5d6bed 1964 int ret = 0;
be0e5c09 1965
5f39d397
CM
1966 src_nritems = btrfs_header_nritems(src);
1967 dst_nritems = btrfs_header_nritems(dst);
123abc88 1968 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
7bb86316
CM
1969 WARN_ON(btrfs_header_generation(src) != trans->transid);
1970 WARN_ON(btrfs_header_generation(dst) != trans->transid);
54aa1f4d 1971
bce4eae9 1972 if (!empty && src_nritems <= 8)
971a1f66
CM
1973 return 1;
1974
d397712b 1975 if (push_items <= 0)
be0e5c09
CM
1976 return 1;
1977
bce4eae9 1978 if (empty) {
971a1f66 1979 push_items = min(src_nritems, push_items);
bce4eae9
CM
1980 if (push_items < src_nritems) {
1981 /* leave at least 8 pointers in the node if
1982 * we aren't going to empty it
1983 */
1984 if (src_nritems - push_items < 8) {
1985 if (push_items <= 8)
1986 return 1;
1987 push_items -= 8;
1988 }
1989 }
1990 } else
1991 push_items = min(src_nritems - 8, push_items);
79f95c82 1992
5f39d397
CM
1993 copy_extent_buffer(dst, src,
1994 btrfs_node_key_ptr_offset(dst_nritems),
1995 btrfs_node_key_ptr_offset(0),
d397712b 1996 push_items * sizeof(struct btrfs_key_ptr));
5f39d397 1997
bb803951 1998 if (push_items < src_nritems) {
5f39d397
CM
1999 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2000 btrfs_node_key_ptr_offset(push_items),
2001 (src_nritems - push_items) *
2002 sizeof(struct btrfs_key_ptr));
2003 }
2004 btrfs_set_header_nritems(src, src_nritems - push_items);
2005 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2006 btrfs_mark_buffer_dirty(src);
2007 btrfs_mark_buffer_dirty(dst);
31840ae1 2008
79f95c82
CM
2009 return ret;
2010}
2011
2012/*
2013 * try to push data from one node into the next node to the right in the
2014 * tree.
2015 *
2016 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2017 * error, and > 0 if there was no room in the right hand block.
2018 *
2019 * this will only push up to 1/2 the contents of the left node over
2020 */
5f39d397
CM
2021static int balance_node_right(struct btrfs_trans_handle *trans,
2022 struct btrfs_root *root,
2023 struct extent_buffer *dst,
2024 struct extent_buffer *src)
79f95c82 2025{
79f95c82
CM
2026 int push_items = 0;
2027 int max_push;
2028 int src_nritems;
2029 int dst_nritems;
2030 int ret = 0;
79f95c82 2031
7bb86316
CM
2032 WARN_ON(btrfs_header_generation(src) != trans->transid);
2033 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2034
5f39d397
CM
2035 src_nritems = btrfs_header_nritems(src);
2036 dst_nritems = btrfs_header_nritems(dst);
123abc88 2037 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
d397712b 2038 if (push_items <= 0)
79f95c82 2039 return 1;
bce4eae9 2040
d397712b 2041 if (src_nritems < 4)
bce4eae9 2042 return 1;
79f95c82
CM
2043
2044 max_push = src_nritems / 2 + 1;
2045 /* don't try to empty the node */
d397712b 2046 if (max_push >= src_nritems)
79f95c82 2047 return 1;
252c38f0 2048
79f95c82
CM
2049 if (max_push < push_items)
2050 push_items = max_push;
2051
5f39d397
CM
2052 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2053 btrfs_node_key_ptr_offset(0),
2054 (dst_nritems) *
2055 sizeof(struct btrfs_key_ptr));
d6025579 2056
5f39d397
CM
2057 copy_extent_buffer(dst, src,
2058 btrfs_node_key_ptr_offset(0),
2059 btrfs_node_key_ptr_offset(src_nritems - push_items),
d397712b 2060 push_items * sizeof(struct btrfs_key_ptr));
79f95c82 2061
5f39d397
CM
2062 btrfs_set_header_nritems(src, src_nritems - push_items);
2063 btrfs_set_header_nritems(dst, dst_nritems + push_items);
79f95c82 2064
5f39d397
CM
2065 btrfs_mark_buffer_dirty(src);
2066 btrfs_mark_buffer_dirty(dst);
31840ae1 2067
aa5d6bed 2068 return ret;
be0e5c09
CM
2069}
2070
97571fd0
CM
2071/*
2072 * helper function to insert a new root level in the tree.
2073 * A new node is allocated, and a single item is inserted to
2074 * point to the existing root
aa5d6bed
CM
2075 *
2076 * returns zero on success or < 0 on failure.
97571fd0 2077 */
d397712b 2078static noinline int insert_new_root(struct btrfs_trans_handle *trans,
5f39d397
CM
2079 struct btrfs_root *root,
2080 struct btrfs_path *path, int level)
5c680ed6 2081{
7bb86316 2082 u64 lower_gen;
5f39d397
CM
2083 struct extent_buffer *lower;
2084 struct extent_buffer *c;
925baedd 2085 struct extent_buffer *old;
5f39d397 2086 struct btrfs_disk_key lower_key;
5c680ed6
CM
2087
2088 BUG_ON(path->nodes[level]);
2089 BUG_ON(path->nodes[level-1] != root->node);
2090
7bb86316
CM
2091 lower = path->nodes[level-1];
2092 if (level == 1)
2093 btrfs_item_key(lower, &lower_key, 0);
2094 else
2095 btrfs_node_key(lower, &lower_key, 0);
2096
31840ae1 2097 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
5d4f98a2 2098 root->root_key.objectid, &lower_key,
66d7e7f0 2099 level, root->node->start, 0, 0);
5f39d397
CM
2100 if (IS_ERR(c))
2101 return PTR_ERR(c);
925baedd 2102
f0486c68
YZ
2103 root_add_used(root, root->nodesize);
2104
5d4f98a2 2105 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
5f39d397
CM
2106 btrfs_set_header_nritems(c, 1);
2107 btrfs_set_header_level(c, level);
db94535d 2108 btrfs_set_header_bytenr(c, c->start);
5f39d397 2109 btrfs_set_header_generation(c, trans->transid);
5d4f98a2 2110 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
5f39d397 2111 btrfs_set_header_owner(c, root->root_key.objectid);
5f39d397
CM
2112
2113 write_extent_buffer(c, root->fs_info->fsid,
2114 (unsigned long)btrfs_header_fsid(c),
2115 BTRFS_FSID_SIZE);
e17cade2
CM
2116
2117 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
2118 (unsigned long)btrfs_header_chunk_tree_uuid(c),
2119 BTRFS_UUID_SIZE);
2120
5f39d397 2121 btrfs_set_node_key(c, &lower_key, 0);
db94535d 2122 btrfs_set_node_blockptr(c, 0, lower->start);
7bb86316 2123 lower_gen = btrfs_header_generation(lower);
31840ae1 2124 WARN_ON(lower_gen != trans->transid);
7bb86316
CM
2125
2126 btrfs_set_node_ptr_generation(c, 0, lower_gen);
d5719762 2127
5f39d397 2128 btrfs_mark_buffer_dirty(c);
d5719762 2129
925baedd 2130 old = root->node;
240f62c8 2131 rcu_assign_pointer(root->node, c);
925baedd
CM
2132
2133 /* the super has an extra ref to root->node */
2134 free_extent_buffer(old);
2135
0b86a832 2136 add_root_to_dirty_list(root);
5f39d397
CM
2137 extent_buffer_get(c);
2138 path->nodes[level] = c;
bd681513 2139 path->locks[level] = BTRFS_WRITE_LOCK;
5c680ed6
CM
2140 path->slots[level] = 0;
2141 return 0;
2142}
2143
74123bd7
CM
2144/*
2145 * worker function to insert a single pointer in a node.
2146 * the node should have enough room for the pointer already
97571fd0 2147 *
74123bd7
CM
2148 * slot and level indicate where you want the key to go, and
2149 * blocknr is the block the key points to.
2150 */
143bede5
JM
2151static void insert_ptr(struct btrfs_trans_handle *trans,
2152 struct btrfs_root *root, struct btrfs_path *path,
2153 struct btrfs_disk_key *key, u64 bytenr,
2154 int slot, int level)
74123bd7 2155{
5f39d397 2156 struct extent_buffer *lower;
74123bd7 2157 int nritems;
5c680ed6
CM
2158
2159 BUG_ON(!path->nodes[level]);
f0486c68 2160 btrfs_assert_tree_locked(path->nodes[level]);
5f39d397
CM
2161 lower = path->nodes[level];
2162 nritems = btrfs_header_nritems(lower);
c293498b 2163 BUG_ON(slot > nritems);
143bede5 2164 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
74123bd7 2165 if (slot != nritems) {
5f39d397
CM
2166 memmove_extent_buffer(lower,
2167 btrfs_node_key_ptr_offset(slot + 1),
2168 btrfs_node_key_ptr_offset(slot),
d6025579 2169 (nritems - slot) * sizeof(struct btrfs_key_ptr));
74123bd7 2170 }
5f39d397 2171 btrfs_set_node_key(lower, key, slot);
db94535d 2172 btrfs_set_node_blockptr(lower, slot, bytenr);
74493f7a
CM
2173 WARN_ON(trans->transid == 0);
2174 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
5f39d397
CM
2175 btrfs_set_header_nritems(lower, nritems + 1);
2176 btrfs_mark_buffer_dirty(lower);
74123bd7
CM
2177}
2178
97571fd0
CM
2179/*
2180 * split the node at the specified level in path in two.
2181 * The path is corrected to point to the appropriate node after the split
2182 *
2183 * Before splitting this tries to make some room in the node by pushing
2184 * left and right, if either one works, it returns right away.
aa5d6bed
CM
2185 *
2186 * returns 0 on success and < 0 on failure
97571fd0 2187 */
e02119d5
CM
2188static noinline int split_node(struct btrfs_trans_handle *trans,
2189 struct btrfs_root *root,
2190 struct btrfs_path *path, int level)
be0e5c09 2191{
5f39d397
CM
2192 struct extent_buffer *c;
2193 struct extent_buffer *split;
2194 struct btrfs_disk_key disk_key;
be0e5c09 2195 int mid;
5c680ed6 2196 int ret;
7518a238 2197 u32 c_nritems;
eb60ceac 2198
5f39d397 2199 c = path->nodes[level];
7bb86316 2200 WARN_ON(btrfs_header_generation(c) != trans->transid);
5f39d397 2201 if (c == root->node) {
5c680ed6 2202 /* trying to split the root, lets make a new one */
e089f05c 2203 ret = insert_new_root(trans, root, path, level + 1);
5c680ed6
CM
2204 if (ret)
2205 return ret;
b3612421 2206 } else {
e66f709b 2207 ret = push_nodes_for_insert(trans, root, path, level);
5f39d397
CM
2208 c = path->nodes[level];
2209 if (!ret && btrfs_header_nritems(c) <
c448acf0 2210 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
e66f709b 2211 return 0;
54aa1f4d
CM
2212 if (ret < 0)
2213 return ret;
be0e5c09 2214 }
e66f709b 2215
5f39d397 2216 c_nritems = btrfs_header_nritems(c);
5d4f98a2
YZ
2217 mid = (c_nritems + 1) / 2;
2218 btrfs_node_key(c, &disk_key, mid);
7bb86316 2219
5d4f98a2 2220 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
31840ae1 2221 root->root_key.objectid,
66d7e7f0 2222 &disk_key, level, c->start, 0, 0);
5f39d397
CM
2223 if (IS_ERR(split))
2224 return PTR_ERR(split);
2225
f0486c68
YZ
2226 root_add_used(root, root->nodesize);
2227
5d4f98a2 2228 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
5f39d397 2229 btrfs_set_header_level(split, btrfs_header_level(c));
db94535d 2230 btrfs_set_header_bytenr(split, split->start);
5f39d397 2231 btrfs_set_header_generation(split, trans->transid);
5d4f98a2 2232 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
5f39d397
CM
2233 btrfs_set_header_owner(split, root->root_key.objectid);
2234 write_extent_buffer(split, root->fs_info->fsid,
2235 (unsigned long)btrfs_header_fsid(split),
2236 BTRFS_FSID_SIZE);
e17cade2
CM
2237 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2238 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2239 BTRFS_UUID_SIZE);
54aa1f4d 2240
5f39d397
CM
2241
2242 copy_extent_buffer(split, c,
2243 btrfs_node_key_ptr_offset(0),
2244 btrfs_node_key_ptr_offset(mid),
2245 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2246 btrfs_set_header_nritems(split, c_nritems - mid);
2247 btrfs_set_header_nritems(c, mid);
aa5d6bed
CM
2248 ret = 0;
2249
5f39d397
CM
2250 btrfs_mark_buffer_dirty(c);
2251 btrfs_mark_buffer_dirty(split);
2252
143bede5
JM
2253 insert_ptr(trans, root, path, &disk_key, split->start,
2254 path->slots[level + 1] + 1, level + 1);
aa5d6bed 2255
5de08d7d 2256 if (path->slots[level] >= mid) {
5c680ed6 2257 path->slots[level] -= mid;
925baedd 2258 btrfs_tree_unlock(c);
5f39d397
CM
2259 free_extent_buffer(c);
2260 path->nodes[level] = split;
5c680ed6
CM
2261 path->slots[level + 1] += 1;
2262 } else {
925baedd 2263 btrfs_tree_unlock(split);
5f39d397 2264 free_extent_buffer(split);
be0e5c09 2265 }
aa5d6bed 2266 return ret;
be0e5c09
CM
2267}
2268
74123bd7
CM
2269/*
2270 * how many bytes are required to store the items in a leaf. start
2271 * and nr indicate which items in the leaf to check. This totals up the
2272 * space used both by the item structs and the item data
2273 */
5f39d397 2274static int leaf_space_used(struct extent_buffer *l, int start, int nr)
be0e5c09
CM
2275{
2276 int data_len;
5f39d397 2277 int nritems = btrfs_header_nritems(l);
d4dbff95 2278 int end = min(nritems, start + nr) - 1;
be0e5c09
CM
2279
2280 if (!nr)
2281 return 0;
5f39d397
CM
2282 data_len = btrfs_item_end_nr(l, start);
2283 data_len = data_len - btrfs_item_offset_nr(l, end);
0783fcfc 2284 data_len += sizeof(struct btrfs_item) * nr;
d4dbff95 2285 WARN_ON(data_len < 0);
be0e5c09
CM
2286 return data_len;
2287}
2288
d4dbff95
CM
2289/*
2290 * The space between the end of the leaf items and
2291 * the start of the leaf data. IOW, how much room
2292 * the leaf has left for both items and data
2293 */
d397712b 2294noinline int btrfs_leaf_free_space(struct btrfs_root *root,
e02119d5 2295 struct extent_buffer *leaf)
d4dbff95 2296{
5f39d397
CM
2297 int nritems = btrfs_header_nritems(leaf);
2298 int ret;
2299 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2300 if (ret < 0) {
d397712b
CM
2301 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2302 "used %d nritems %d\n",
ae2f5411 2303 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
5f39d397
CM
2304 leaf_space_used(leaf, 0, nritems), nritems);
2305 }
2306 return ret;
d4dbff95
CM
2307}
2308
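/*
 * Illustrative sketch, not part of the original file: how a caller would
 * typically use the leaf space accounting above before deciding to insert.
 * Every new item costs sizeof(struct btrfs_item) for the header at the
 * front of the leaf plus its payload bytes at the back; the helper names
 * used here are the ones defined in this file.
 */
static int __maybe_unused example_leaf_has_room(struct btrfs_root *root,
						struct extent_buffer *leaf,
						u32 data_size)
{
	u32 needed = data_size + sizeof(struct btrfs_item);

	return btrfs_leaf_free_space(root, leaf) >= (int)needed;
}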
99d8f83c
CM
2309/*
2310 * min slot controls the lowest index we're willing to push to the
2311 * right. We'll push up to and including min_slot, but no lower
2312 */
44871b1b
CM
2313static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2314 struct btrfs_root *root,
2315 struct btrfs_path *path,
2316 int data_size, int empty,
2317 struct extent_buffer *right,
99d8f83c
CM
2318 int free_space, u32 left_nritems,
2319 u32 min_slot)
00ec4c51 2320{
5f39d397 2321 struct extent_buffer *left = path->nodes[0];
44871b1b 2322 struct extent_buffer *upper = path->nodes[1];
5f39d397 2323 struct btrfs_disk_key disk_key;
00ec4c51 2324 int slot;
34a38218 2325 u32 i;
00ec4c51
CM
2326 int push_space = 0;
2327 int push_items = 0;
0783fcfc 2328 struct btrfs_item *item;
34a38218 2329 u32 nr;
7518a238 2330 u32 right_nritems;
5f39d397 2331 u32 data_end;
db94535d 2332 u32 this_item_size;
00ec4c51 2333
34a38218
CM
2334 if (empty)
2335 nr = 0;
2336 else
99d8f83c 2337 nr = max_t(u32, 1, min_slot);
34a38218 2338
31840ae1 2339 if (path->slots[0] >= left_nritems)
87b29b20 2340 push_space += data_size;
31840ae1 2341
44871b1b 2342 slot = path->slots[1];
34a38218
CM
2343 i = left_nritems - 1;
2344 while (i >= nr) {
5f39d397 2345 item = btrfs_item_nr(left, i);
db94535d 2346
31840ae1
ZY
2347 if (!empty && push_items > 0) {
2348 if (path->slots[0] > i)
2349 break;
2350 if (path->slots[0] == i) {
2351 int space = btrfs_leaf_free_space(root, left);
2352 if (space + push_space * 2 > free_space)
2353 break;
2354 }
2355 }
2356
00ec4c51 2357 if (path->slots[0] == i)
87b29b20 2358 push_space += data_size;
db94535d 2359
db94535d
CM
2360 this_item_size = btrfs_item_size(left, item);
2361 if (this_item_size + sizeof(*item) + push_space > free_space)
00ec4c51 2362 break;
31840ae1 2363
00ec4c51 2364 push_items++;
db94535d 2365 push_space += this_item_size + sizeof(*item);
34a38218
CM
2366 if (i == 0)
2367 break;
2368 i--;
db94535d 2369 }
5f39d397 2370
925baedd
CM
2371 if (push_items == 0)
2372 goto out_unlock;
5f39d397 2373
34a38218 2374 if (!empty && push_items == left_nritems)
a429e513 2375 WARN_ON(1);
5f39d397 2376
00ec4c51 2377 /* push left to right */
5f39d397 2378 right_nritems = btrfs_header_nritems(right);
34a38218 2379
5f39d397 2380 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
123abc88 2381 push_space -= leaf_data_end(root, left);
5f39d397 2382
00ec4c51 2383 /* make room in the right data area */
5f39d397
CM
2384 data_end = leaf_data_end(root, right);
2385 memmove_extent_buffer(right,
2386 btrfs_leaf_data(right) + data_end - push_space,
2387 btrfs_leaf_data(right) + data_end,
2388 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2389
00ec4c51 2390 /* copy from the left data area */
5f39d397 2391 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
d6025579
CM
2392 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2393 btrfs_leaf_data(left) + leaf_data_end(root, left),
2394 push_space);
5f39d397
CM
2395
2396 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2397 btrfs_item_nr_offset(0),
2398 right_nritems * sizeof(struct btrfs_item));
2399
00ec4c51 2400 /* copy the items from left to right */
5f39d397
CM
2401 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2402 btrfs_item_nr_offset(left_nritems - push_items),
2403 push_items * sizeof(struct btrfs_item));
00ec4c51
CM
2404
2405 /* update the item pointers */
7518a238 2406 right_nritems += push_items;
5f39d397 2407 btrfs_set_header_nritems(right, right_nritems);
123abc88 2408 push_space = BTRFS_LEAF_DATA_SIZE(root);
7518a238 2409 for (i = 0; i < right_nritems; i++) {
5f39d397 2410 item = btrfs_item_nr(right, i);
db94535d
CM
2411 push_space -= btrfs_item_size(right, item);
2412 btrfs_set_item_offset(right, item, push_space);
2413 }
2414
7518a238 2415 left_nritems -= push_items;
5f39d397 2416 btrfs_set_header_nritems(left, left_nritems);
00ec4c51 2417
34a38218
CM
2418 if (left_nritems)
2419 btrfs_mark_buffer_dirty(left);
f0486c68
YZ
2420 else
2421 clean_tree_block(trans, root, left);
2422
5f39d397 2423 btrfs_mark_buffer_dirty(right);
a429e513 2424
5f39d397
CM
2425 btrfs_item_key(right, &disk_key, 0);
2426 btrfs_set_node_key(upper, &disk_key, slot + 1);
d6025579 2427 btrfs_mark_buffer_dirty(upper);
02217ed2 2428
00ec4c51 2429 /* then fixup the leaf pointer in the path */
7518a238
CM
2430 if (path->slots[0] >= left_nritems) {
2431 path->slots[0] -= left_nritems;
925baedd
CM
2432 if (btrfs_header_nritems(path->nodes[0]) == 0)
2433 clean_tree_block(trans, root, path->nodes[0]);
2434 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
2435 free_extent_buffer(path->nodes[0]);
2436 path->nodes[0] = right;
00ec4c51
CM
2437 path->slots[1] += 1;
2438 } else {
925baedd 2439 btrfs_tree_unlock(right);
5f39d397 2440 free_extent_buffer(right);
00ec4c51
CM
2441 }
2442 return 0;
925baedd
CM
2443
2444out_unlock:
2445 btrfs_tree_unlock(right);
2446 free_extent_buffer(right);
2447 return 1;
00ec4c51 2448}
925baedd 2449
44871b1b
CM
2450/*
2451 * push some data in the path leaf to the right, trying to free up at
2452 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2453 *
2454 * returns 1 if the push failed because the other node didn't have enough
2455 * room, 0 if everything worked out and < 0 if there were major errors.
99d8f83c
CM
2456 *
2457 * this will push starting from min_slot to the end of the leaf. It won't
2458 * push any slot lower than min_slot
44871b1b
CM
2459 */
2460static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
2461 *root, struct btrfs_path *path,
2462 int min_data_size, int data_size,
2463 int empty, u32 min_slot)
44871b1b
CM
2464{
2465 struct extent_buffer *left = path->nodes[0];
2466 struct extent_buffer *right;
2467 struct extent_buffer *upper;
2468 int slot;
2469 int free_space;
2470 u32 left_nritems;
2471 int ret;
2472
2473 if (!path->nodes[1])
2474 return 1;
2475
2476 slot = path->slots[1];
2477 upper = path->nodes[1];
2478 if (slot >= btrfs_header_nritems(upper) - 1)
2479 return 1;
2480
2481 btrfs_assert_tree_locked(path->nodes[1]);
2482
2483 right = read_node_slot(root, upper, slot + 1);
91ca338d
TI
2484 if (right == NULL)
2485 return 1;
2486
44871b1b
CM
2487 btrfs_tree_lock(right);
2488 btrfs_set_lock_blocking(right);
2489
2490 free_space = btrfs_leaf_free_space(root, right);
2491 if (free_space < data_size)
2492 goto out_unlock;
2493
2494 /* cow and double check */
2495 ret = btrfs_cow_block(trans, root, right, upper,
2496 slot + 1, &right);
2497 if (ret)
2498 goto out_unlock;
2499
2500 free_space = btrfs_leaf_free_space(root, right);
2501 if (free_space < data_size)
2502 goto out_unlock;
2503
2504 left_nritems = btrfs_header_nritems(left);
2505 if (left_nritems == 0)
2506 goto out_unlock;
2507
99d8f83c
CM
2508 return __push_leaf_right(trans, root, path, min_data_size, empty,
2509 right, free_space, left_nritems, min_slot);
44871b1b
CM
2510out_unlock:
2511 btrfs_tree_unlock(right);
2512 free_extent_buffer(right);
2513 return 1;
2514}
2515
74123bd7
CM
2516/*
2517 * push some data in the path leaf to the left, trying to free up at
2518 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
2519 *
2520 * max_slot can put a limit on how far into the leaf we'll push items. The
2521 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
2522 * items
74123bd7 2523 */
44871b1b
CM
2524static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2525 struct btrfs_root *root,
2526 struct btrfs_path *path, int data_size,
2527 int empty, struct extent_buffer *left,
99d8f83c
CM
2528 int free_space, u32 right_nritems,
2529 u32 max_slot)
be0e5c09 2530{
5f39d397
CM
2531 struct btrfs_disk_key disk_key;
2532 struct extent_buffer *right = path->nodes[0];
be0e5c09 2533 int i;
be0e5c09
CM
2534 int push_space = 0;
2535 int push_items = 0;
0783fcfc 2536 struct btrfs_item *item;
7518a238 2537 u32 old_left_nritems;
34a38218 2538 u32 nr;
aa5d6bed 2539 int ret = 0;
db94535d
CM
2540 u32 this_item_size;
2541 u32 old_left_item_size;
be0e5c09 2542
34a38218 2543 if (empty)
99d8f83c 2544 nr = min(right_nritems, max_slot);
34a38218 2545 else
99d8f83c 2546 nr = min(right_nritems - 1, max_slot);
34a38218
CM
2547
2548 for (i = 0; i < nr; i++) {
5f39d397 2549 item = btrfs_item_nr(right, i);
db94535d 2550
31840ae1
ZY
2551 if (!empty && push_items > 0) {
2552 if (path->slots[0] < i)
2553 break;
2554 if (path->slots[0] == i) {
2555 int space = btrfs_leaf_free_space(root, right);
2556 if (space + push_space * 2 > free_space)
2557 break;
2558 }
2559 }
2560
be0e5c09 2561 if (path->slots[0] == i)
87b29b20 2562 push_space += data_size;
db94535d
CM
2563
2564 this_item_size = btrfs_item_size(right, item);
2565 if (this_item_size + sizeof(*item) + push_space > free_space)
be0e5c09 2566 break;
db94535d 2567
be0e5c09 2568 push_items++;
db94535d
CM
2569 push_space += this_item_size + sizeof(*item);
2570 }
2571
be0e5c09 2572 if (push_items == 0) {
925baedd
CM
2573 ret = 1;
2574 goto out;
be0e5c09 2575 }
34a38218 2576 if (!empty && push_items == btrfs_header_nritems(right))
a429e513 2577 WARN_ON(1);
5f39d397 2578
be0e5c09 2579 /* push data from right to left */
5f39d397
CM
2580 copy_extent_buffer(left, right,
2581 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2582 btrfs_item_nr_offset(0),
2583 push_items * sizeof(struct btrfs_item));
2584
123abc88 2585 push_space = BTRFS_LEAF_DATA_SIZE(root) -
d397712b 2586 btrfs_item_offset_nr(right, push_items - 1);
5f39d397
CM
2587
2588 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
d6025579
CM
2589 leaf_data_end(root, left) - push_space,
2590 btrfs_leaf_data(right) +
5f39d397 2591 btrfs_item_offset_nr(right, push_items - 1),
d6025579 2592 push_space);
5f39d397 2593 old_left_nritems = btrfs_header_nritems(left);
87b29b20 2594 BUG_ON(old_left_nritems <= 0);
eb60ceac 2595
db94535d 2596 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
0783fcfc 2597 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
5f39d397 2598 u32 ioff;
db94535d 2599
5f39d397 2600 item = btrfs_item_nr(left, i);
db94535d 2601
5f39d397
CM
2602 ioff = btrfs_item_offset(left, item);
2603 btrfs_set_item_offset(left, item,
db94535d 2604 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
be0e5c09 2605 }
5f39d397 2606 btrfs_set_header_nritems(left, old_left_nritems + push_items);
be0e5c09
CM
2607
2608 /* fixup right node */
34a38218 2609 if (push_items > right_nritems) {
d397712b
CM
2610 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2611 right_nritems);
34a38218
CM
2612 WARN_ON(1);
2613 }
2614
2615 if (push_items < right_nritems) {
2616 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2617 leaf_data_end(root, right);
2618 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2619 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2620 btrfs_leaf_data(right) +
2621 leaf_data_end(root, right), push_space);
2622
2623 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
5f39d397
CM
2624 btrfs_item_nr_offset(push_items),
2625 (btrfs_header_nritems(right) - push_items) *
2626 sizeof(struct btrfs_item));
34a38218 2627 }
eef1c494
Y
2628 right_nritems -= push_items;
2629 btrfs_set_header_nritems(right, right_nritems);
123abc88 2630 push_space = BTRFS_LEAF_DATA_SIZE(root);
5f39d397
CM
2631 for (i = 0; i < right_nritems; i++) {
2632 item = btrfs_item_nr(right, i);
db94535d 2633
db94535d
CM
2634 push_space = push_space - btrfs_item_size(right, item);
2635 btrfs_set_item_offset(right, item, push_space);
2636 }
eb60ceac 2637
5f39d397 2638 btrfs_mark_buffer_dirty(left);
34a38218
CM
2639 if (right_nritems)
2640 btrfs_mark_buffer_dirty(right);
f0486c68
YZ
2641 else
2642 clean_tree_block(trans, root, right);
098f59c2 2643
5f39d397 2644 btrfs_item_key(right, &disk_key, 0);
143bede5 2645 fixup_low_keys(trans, root, path, &disk_key, 1);
be0e5c09
CM
2646
2647 /* then fixup the leaf pointer in the path */
2648 if (path->slots[0] < push_items) {
2649 path->slots[0] += old_left_nritems;
925baedd 2650 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
2651 free_extent_buffer(path->nodes[0]);
2652 path->nodes[0] = left;
be0e5c09
CM
2653 path->slots[1] -= 1;
2654 } else {
925baedd 2655 btrfs_tree_unlock(left);
5f39d397 2656 free_extent_buffer(left);
be0e5c09
CM
2657 path->slots[0] -= push_items;
2658 }
eb60ceac 2659 BUG_ON(path->slots[0] < 0);
aa5d6bed 2660 return ret;
925baedd
CM
2661out:
2662 btrfs_tree_unlock(left);
2663 free_extent_buffer(left);
2664 return ret;
be0e5c09
CM
2665}
2666
44871b1b
CM
2667/*
2668 * push some data in the path leaf to the left, trying to free up at
2669 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
2670 *
2671 * max_slot can put a limit on how far into the leaf we'll push items. The
2672 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2673 * items
44871b1b
CM
2674 */
2675static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
2676 *root, struct btrfs_path *path, int min_data_size,
2677 int data_size, int empty, u32 max_slot)
44871b1b
CM
2678{
2679 struct extent_buffer *right = path->nodes[0];
2680 struct extent_buffer *left;
2681 int slot;
2682 int free_space;
2683 u32 right_nritems;
2684 int ret = 0;
2685
2686 slot = path->slots[1];
2687 if (slot == 0)
2688 return 1;
2689 if (!path->nodes[1])
2690 return 1;
2691
2692 right_nritems = btrfs_header_nritems(right);
2693 if (right_nritems == 0)
2694 return 1;
2695
2696 btrfs_assert_tree_locked(path->nodes[1]);
2697
2698 left = read_node_slot(root, path->nodes[1], slot - 1);
91ca338d
TI
2699 if (left == NULL)
2700 return 1;
2701
44871b1b
CM
2702 btrfs_tree_lock(left);
2703 btrfs_set_lock_blocking(left);
2704
2705 free_space = btrfs_leaf_free_space(root, left);
2706 if (free_space < data_size) {
2707 ret = 1;
2708 goto out;
2709 }
2710
2711 /* cow and double check */
2712 ret = btrfs_cow_block(trans, root, left,
2713 path->nodes[1], slot - 1, &left);
2714 if (ret) {
2715 /* we hit -ENOSPC, but it isn't fatal here */
79787eaa
JM
2716 if (ret == -ENOSPC)
2717 ret = 1;
44871b1b
CM
2718 goto out;
2719 }
2720
2721 free_space = btrfs_leaf_free_space(root, left);
2722 if (free_space < data_size) {
2723 ret = 1;
2724 goto out;
2725 }
2726
99d8f83c
CM
2727 return __push_leaf_left(trans, root, path, min_data_size,
2728 empty, left, free_space, right_nritems,
2729 max_slot);
44871b1b
CM
2730out:
2731 btrfs_tree_unlock(left);
2732 free_extent_buffer(left);
2733 return ret;
2734}
2735
2736/*
2737 * split the path's leaf in two, making sure there is at least data_size
2738 * available for the resulting leaf level of the path.
44871b1b 2739 */
143bede5
JM
2740static noinline void copy_for_split(struct btrfs_trans_handle *trans,
2741 struct btrfs_root *root,
2742 struct btrfs_path *path,
2743 struct extent_buffer *l,
2744 struct extent_buffer *right,
2745 int slot, int mid, int nritems)
44871b1b
CM
2746{
2747 int data_copy_size;
2748 int rt_data_off;
2749 int i;
44871b1b
CM
2750 struct btrfs_disk_key disk_key;
2751
2752 nritems = nritems - mid;
2753 btrfs_set_header_nritems(right, nritems);
2754 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2755
2756 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2757 btrfs_item_nr_offset(mid),
2758 nritems * sizeof(struct btrfs_item));
2759
2760 copy_extent_buffer(right, l,
2761 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2762 data_copy_size, btrfs_leaf_data(l) +
2763 leaf_data_end(root, l), data_copy_size);
2764
2765 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2766 btrfs_item_end_nr(l, mid);
2767
2768 for (i = 0; i < nritems; i++) {
2769 struct btrfs_item *item = btrfs_item_nr(right, i);
2770 u32 ioff;
2771
44871b1b
CM
2772 ioff = btrfs_item_offset(right, item);
2773 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2774 }
2775
44871b1b 2776 btrfs_set_header_nritems(l, mid);
44871b1b 2777 btrfs_item_key(right, &disk_key, 0);
143bede5
JM
2778 insert_ptr(trans, root, path, &disk_key, right->start,
2779 path->slots[1] + 1, 1);
44871b1b
CM
2780
2781 btrfs_mark_buffer_dirty(right);
2782 btrfs_mark_buffer_dirty(l);
2783 BUG_ON(path->slots[0] != slot);
2784
44871b1b
CM
2785 if (mid <= slot) {
2786 btrfs_tree_unlock(path->nodes[0]);
2787 free_extent_buffer(path->nodes[0]);
2788 path->nodes[0] = right;
2789 path->slots[0] -= mid;
2790 path->slots[1] += 1;
2791 } else {
2792 btrfs_tree_unlock(right);
2793 free_extent_buffer(right);
2794 }
2795
2796 BUG_ON(path->slots[0] < 0);
44871b1b
CM
2797}
2798
99d8f83c
CM
2799/*
2800 * double splits happen when we need to insert a big item in the middle
2801 * of a leaf. A double split can leave us with 3 mostly empty leaves:
2802 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2803 * A B C
2804 *
2805 * We avoid this by trying to push the items on either side of our target
2806 * into the adjacent leaves. If all goes well we can avoid the double split
2807 * completely.
2808 */
2809static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2810 struct btrfs_root *root,
2811 struct btrfs_path *path,
2812 int data_size)
2813{
2814 int ret;
2815 int progress = 0;
2816 int slot;
2817 u32 nritems;
2818
2819 slot = path->slots[0];
2820
2821 /*
2822 * try to push all the items after our slot into the
2823 * right leaf
2824 */
2825 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2826 if (ret < 0)
2827 return ret;
2828
2829 if (ret == 0)
2830 progress++;
2831
2832 nritems = btrfs_header_nritems(path->nodes[0]);
2833 /*
2834 * our goal is to get our slot at the start or end of a leaf. If
2835 * we've done so we're done
2836 */
2837 if (path->slots[0] == 0 || path->slots[0] == nritems)
2838 return 0;
2839
2840 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2841 return 0;
2842
2843 /* try to push all the items before our slot into the next leaf */
2844 slot = path->slots[0];
2845 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2846 if (ret < 0)
2847 return ret;
2848
2849 if (ret == 0)
2850 progress++;
2851
2852 if (progress)
2853 return 0;
2854 return 1;
2855}
2856
74123bd7
CM
2857/*
2858 * split the path's leaf in two, making sure there is at least data_size
2859 * available for the resulting leaf level of the path.
aa5d6bed
CM
2860 *
2861 * returns 0 if all went well and < 0 on failure.
74123bd7 2862 */
e02119d5
CM
2863static noinline int split_leaf(struct btrfs_trans_handle *trans,
2864 struct btrfs_root *root,
2865 struct btrfs_key *ins_key,
2866 struct btrfs_path *path, int data_size,
2867 int extend)
be0e5c09 2868{
5d4f98a2 2869 struct btrfs_disk_key disk_key;
5f39d397 2870 struct extent_buffer *l;
7518a238 2871 u32 nritems;
eb60ceac
CM
2872 int mid;
2873 int slot;
5f39d397 2874 struct extent_buffer *right;
d4dbff95 2875 int ret = 0;
aa5d6bed 2876 int wret;
5d4f98a2 2877 int split;
cc0c5538 2878 int num_doubles = 0;
99d8f83c 2879 int tried_avoid_double = 0;
aa5d6bed 2880
a5719521
YZ
2881 l = path->nodes[0];
2882 slot = path->slots[0];
2883 if (extend && data_size + btrfs_item_size_nr(l, slot) +
2884 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
2885 return -EOVERFLOW;
2886
40689478 2887 /* first try to make some room by pushing left and right */
99d8f83c
CM
2888 if (data_size) {
2889 wret = push_leaf_right(trans, root, path, data_size,
2890 data_size, 0, 0);
d397712b 2891 if (wret < 0)
eaee50e8 2892 return wret;
3685f791 2893 if (wret) {
99d8f83c
CM
2894 wret = push_leaf_left(trans, root, path, data_size,
2895 data_size, 0, (u32)-1);
3685f791
CM
2896 if (wret < 0)
2897 return wret;
2898 }
2899 l = path->nodes[0];
aa5d6bed 2900
3685f791 2901 /* did the pushes work? */
87b29b20 2902 if (btrfs_leaf_free_space(root, l) >= data_size)
3685f791 2903 return 0;
3326d1b0 2904 }
aa5d6bed 2905
5c680ed6 2906 if (!path->nodes[1]) {
e089f05c 2907 ret = insert_new_root(trans, root, path, 1);
5c680ed6
CM
2908 if (ret)
2909 return ret;
2910 }
cc0c5538 2911again:
5d4f98a2 2912 split = 1;
cc0c5538 2913 l = path->nodes[0];
eb60ceac 2914 slot = path->slots[0];
5f39d397 2915 nritems = btrfs_header_nritems(l);
d397712b 2916 mid = (nritems + 1) / 2;
54aa1f4d 2917
5d4f98a2
YZ
2918 if (mid <= slot) {
2919 if (nritems == 1 ||
2920 leaf_space_used(l, mid, nritems - mid) + data_size >
2921 BTRFS_LEAF_DATA_SIZE(root)) {
2922 if (slot >= nritems) {
2923 split = 0;
2924 } else {
2925 mid = slot;
2926 if (mid != nritems &&
2927 leaf_space_used(l, mid, nritems - mid) +
2928 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
99d8f83c
CM
2929 if (data_size && !tried_avoid_double)
2930 goto push_for_double;
5d4f98a2
YZ
2931 split = 2;
2932 }
2933 }
2934 }
2935 } else {
2936 if (leaf_space_used(l, 0, mid) + data_size >
2937 BTRFS_LEAF_DATA_SIZE(root)) {
2938 if (!extend && data_size && slot == 0) {
2939 split = 0;
2940 } else if ((extend || !data_size) && slot == 0) {
2941 mid = 1;
2942 } else {
2943 mid = slot;
2944 if (mid != nritems &&
2945 leaf_space_used(l, mid, nritems - mid) +
2946 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
99d8f83c
CM
2947 if (data_size && !tried_avoid_double)
2948 goto push_for_double;
5d4f98a2
YZ
2949 split = 2;
2950 }
2951 }
2952 }
2953 }
2954
2955 if (split == 0)
2956 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2957 else
2958 btrfs_item_key(l, &disk_key, mid);
2959
2960 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
31840ae1 2961 root->root_key.objectid,
66d7e7f0 2962 &disk_key, 0, l->start, 0, 0);
f0486c68 2963 if (IS_ERR(right))
5f39d397 2964 return PTR_ERR(right);
f0486c68
YZ
2965
2966 root_add_used(root, root->leafsize);
5f39d397
CM
2967
2968 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
db94535d 2969 btrfs_set_header_bytenr(right, right->start);
5f39d397 2970 btrfs_set_header_generation(right, trans->transid);
5d4f98a2 2971 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
5f39d397
CM
2972 btrfs_set_header_owner(right, root->root_key.objectid);
2973 btrfs_set_header_level(right, 0);
2974 write_extent_buffer(right, root->fs_info->fsid,
2975 (unsigned long)btrfs_header_fsid(right),
2976 BTRFS_FSID_SIZE);
e17cade2
CM
2977
2978 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2979 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2980 BTRFS_UUID_SIZE);
44871b1b 2981
5d4f98a2
YZ
2982 if (split == 0) {
2983 if (mid <= slot) {
2984 btrfs_set_header_nritems(right, 0);
143bede5
JM
2985 insert_ptr(trans, root, path, &disk_key, right->start,
2986 path->slots[1] + 1, 1);
5d4f98a2
YZ
2987 btrfs_tree_unlock(path->nodes[0]);
2988 free_extent_buffer(path->nodes[0]);
2989 path->nodes[0] = right;
2990 path->slots[0] = 0;
2991 path->slots[1] += 1;
2992 } else {
2993 btrfs_set_header_nritems(right, 0);
143bede5 2994 insert_ptr(trans, root, path, &disk_key, right->start,
5d4f98a2 2995 path->slots[1], 1);
5d4f98a2
YZ
2996 btrfs_tree_unlock(path->nodes[0]);
2997 free_extent_buffer(path->nodes[0]);
2998 path->nodes[0] = right;
2999 path->slots[0] = 0;
143bede5
JM
3000 if (path->slots[1] == 0)
3001 fixup_low_keys(trans, root, path,
3002 &disk_key, 1);
d4dbff95 3003 }
5d4f98a2
YZ
3004 btrfs_mark_buffer_dirty(right);
3005 return ret;
d4dbff95 3006 }
74123bd7 3007
143bede5 3008 copy_for_split(trans, root, path, l, right, slot, mid, nritems);
31840ae1 3009
5d4f98a2 3010 if (split == 2) {
cc0c5538
CM
3011 BUG_ON(num_doubles != 0);
3012 num_doubles++;
3013 goto again;
a429e513 3014 }
44871b1b 3015
143bede5 3016 return 0;
99d8f83c
CM
3017
3018push_for_double:
3019 push_for_double_split(trans, root, path, data_size);
3020 tried_avoid_double = 1;
3021 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3022 return 0;
3023 goto again;
be0e5c09
CM
3024}
3025
ad48fd75
YZ
3026static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3027 struct btrfs_root *root,
3028 struct btrfs_path *path, int ins_len)
459931ec 3029{
ad48fd75 3030 struct btrfs_key key;
459931ec 3031 struct extent_buffer *leaf;
ad48fd75
YZ
3032 struct btrfs_file_extent_item *fi;
3033 u64 extent_len = 0;
3034 u32 item_size;
3035 int ret;
459931ec
CM
3036
3037 leaf = path->nodes[0];
ad48fd75
YZ
3038 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3039
3040 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3041 key.type != BTRFS_EXTENT_CSUM_KEY);
3042
3043 if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3044 return 0;
459931ec
CM
3045
3046 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ad48fd75
YZ
3047 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3048 fi = btrfs_item_ptr(leaf, path->slots[0],
3049 struct btrfs_file_extent_item);
3050 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3051 }
b3b4aa74 3052 btrfs_release_path(path);
459931ec 3053
459931ec 3054 path->keep_locks = 1;
ad48fd75
YZ
3055 path->search_for_split = 1;
3056 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
459931ec 3057 path->search_for_split = 0;
ad48fd75
YZ
3058 if (ret < 0)
3059 goto err;
459931ec 3060
ad48fd75
YZ
3061 ret = -EAGAIN;
3062 leaf = path->nodes[0];
459931ec 3063 /* if our item isn't there or got smaller, return now */
ad48fd75
YZ
3064 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3065 goto err;
3066
109f6aef
CM
3067 /* the leaf has changed, it now has room. return now */
3068 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3069 goto err;
3070
ad48fd75
YZ
3071 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3072 fi = btrfs_item_ptr(leaf, path->slots[0],
3073 struct btrfs_file_extent_item);
3074 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3075 goto err;
459931ec
CM
3076 }
3077
b9473439 3078 btrfs_set_path_blocking(path);
ad48fd75 3079 ret = split_leaf(trans, root, &key, path, ins_len, 1);
f0486c68
YZ
3080 if (ret)
3081 goto err;
459931ec 3082
ad48fd75 3083 path->keep_locks = 0;
b9473439 3084 btrfs_unlock_up_safe(path, 1);
ad48fd75
YZ
3085 return 0;
3086err:
3087 path->keep_locks = 0;
3088 return ret;
3089}
3090
3091static noinline int split_item(struct btrfs_trans_handle *trans,
3092 struct btrfs_root *root,
3093 struct btrfs_path *path,
3094 struct btrfs_key *new_key,
3095 unsigned long split_offset)
3096{
3097 struct extent_buffer *leaf;
3098 struct btrfs_item *item;
3099 struct btrfs_item *new_item;
3100 int slot;
3101 char *buf;
3102 u32 nritems;
3103 u32 item_size;
3104 u32 orig_offset;
3105 struct btrfs_disk_key disk_key;
3106
b9473439
CM
3107 leaf = path->nodes[0];
3108 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3109
b4ce94de
CM
3110 btrfs_set_path_blocking(path);
3111
459931ec
CM
3112 item = btrfs_item_nr(leaf, path->slots[0]);
3113 orig_offset = btrfs_item_offset(leaf, item);
3114 item_size = btrfs_item_size(leaf, item);
3115
459931ec 3116 buf = kmalloc(item_size, GFP_NOFS);
ad48fd75
YZ
3117 if (!buf)
3118 return -ENOMEM;
3119
459931ec
CM
3120 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3121 path->slots[0]), item_size);
459931ec 3122
ad48fd75 3123 slot = path->slots[0] + 1;
459931ec 3124 nritems = btrfs_header_nritems(leaf);
459931ec
CM
3125 if (slot != nritems) {
3126 /* shift the items */
3127 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
ad48fd75
YZ
3128 btrfs_item_nr_offset(slot),
3129 (nritems - slot) * sizeof(struct btrfs_item));
459931ec
CM
3130 }
3131
3132 btrfs_cpu_key_to_disk(&disk_key, new_key);
3133 btrfs_set_item_key(leaf, &disk_key, slot);
3134
3135 new_item = btrfs_item_nr(leaf, slot);
3136
3137 btrfs_set_item_offset(leaf, new_item, orig_offset);
3138 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3139
3140 btrfs_set_item_offset(leaf, item,
3141 orig_offset + item_size - split_offset);
3142 btrfs_set_item_size(leaf, item, split_offset);
3143
3144 btrfs_set_header_nritems(leaf, nritems + 1);
3145
3146 /* write the data for the start of the original item */
3147 write_extent_buffer(leaf, buf,
3148 btrfs_item_ptr_offset(leaf, path->slots[0]),
3149 split_offset);
3150
3151 /* write the data for the new item */
3152 write_extent_buffer(leaf, buf + split_offset,
3153 btrfs_item_ptr_offset(leaf, slot),
3154 item_size - split_offset);
3155 btrfs_mark_buffer_dirty(leaf);
3156
ad48fd75 3157 BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
459931ec 3158 kfree(buf);
ad48fd75
YZ
3159 return 0;
3160}
3161
3162/*
3163 * This function splits a single item into two items,
3164 * giving 'new_key' to the new item and splitting the
3165 * old one at split_offset (from the start of the item).
3166 *
3167 * The path may be released by this operation. After
3168 * the split, the path is pointing to the old item. The
3169 * new item is going to be in the same node as the old one.
3170 *
3171 * Note, the item being split must be small enough to live alone on
3172 * a tree block with room for one extra struct btrfs_item
3173 *
3174 * This allows us to split the item in place, keeping a lock on the
3175 * leaf the entire time.
3176 */
3177int btrfs_split_item(struct btrfs_trans_handle *trans,
3178 struct btrfs_root *root,
3179 struct btrfs_path *path,
3180 struct btrfs_key *new_key,
3181 unsigned long split_offset)
3182{
3183 int ret;
3184 ret = setup_leaf_for_split(trans, root, path,
3185 sizeof(struct btrfs_item));
3186 if (ret)
3187 return ret;
3188
3189 ret = split_item(trans, root, path, new_key, split_offset);
459931ec
CM
3190 return ret;
3191}
3192
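/*
 * Illustrative sketch, not part of the original file: splitting the item
 * the path points to at a caller-chosen byte offset.  setup_leaf_for_split()
 * above only accepts EXTENT_DATA and EXTENT_CSUM keys, so this is the kind
 * of item such a split applies to; the key arithmetic is a made-up example.
 */
static int __maybe_unused example_split_at(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 new_key_offset,
					   unsigned long split_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_key_offset;	/* key for the second half */

	/* old item keeps bytes [0, split_offset), the new one gets the rest */
	return btrfs_split_item(trans, root, path, &new_key, split_offset);
}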
ad48fd75
YZ
3193/*
3194 * This function duplicates an item, giving 'new_key' to the new item.
3195 * It guarantees both items live in the same tree leaf and the new item
3196 * is contiguous with the original item.
3197 *
3198 * This allows us to split a file extent in place, keeping a lock on the
3199 * leaf the entire time.
3200 */
3201int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3202 struct btrfs_root *root,
3203 struct btrfs_path *path,
3204 struct btrfs_key *new_key)
3205{
3206 struct extent_buffer *leaf;
3207 int ret;
3208 u32 item_size;
3209
3210 leaf = path->nodes[0];
3211 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3212 ret = setup_leaf_for_split(trans, root, path,
3213 item_size + sizeof(struct btrfs_item));
3214 if (ret)
3215 return ret;
3216
3217 path->slots[0]++;
143bede5
JM
3218 setup_items_for_insert(trans, root, path, new_key, &item_size,
3219 item_size, item_size +
3220 sizeof(struct btrfs_item), 1);
ad48fd75
YZ
3221 leaf = path->nodes[0];
3222 memcpy_extent_buffer(leaf,
3223 btrfs_item_ptr_offset(leaf, path->slots[0]),
3224 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3225 item_size);
3226 return 0;
3227}
3228
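/*
 * Illustrative sketch, not part of the original file: duplicating the item
 * the path points to under a new key, the way a split of a file extent item
 * could start.  The new offset is an arbitrary example value supplied by
 * the caller.
 */
static int __maybe_unused example_dup_item(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 new_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;

	/* on success the path points at the new copy in the same leaf */
	return btrfs_duplicate_item(trans, root, path, &new_key);
}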
d352ac68
CM
3229/*
3230 * make the item pointed to by the path smaller. new_size indicates
3231 * how small to make it, and from_end tells us if we just chop bytes
3232 * off the end of the item or if we shift the item to chop bytes off
3233 * the front.
3234 */
143bede5
JM
3235void btrfs_truncate_item(struct btrfs_trans_handle *trans,
3236 struct btrfs_root *root,
3237 struct btrfs_path *path,
3238 u32 new_size, int from_end)
b18c6685 3239{
b18c6685 3240 int slot;
5f39d397
CM
3241 struct extent_buffer *leaf;
3242 struct btrfs_item *item;
b18c6685
CM
3243 u32 nritems;
3244 unsigned int data_end;
3245 unsigned int old_data_start;
3246 unsigned int old_size;
3247 unsigned int size_diff;
3248 int i;
3249
5f39d397 3250 leaf = path->nodes[0];
179e29e4
CM
3251 slot = path->slots[0];
3252
3253 old_size = btrfs_item_size_nr(leaf, slot);
3254 if (old_size == new_size)
143bede5 3255 return;
b18c6685 3256
5f39d397 3257 nritems = btrfs_header_nritems(leaf);
b18c6685
CM
3258 data_end = leaf_data_end(root, leaf);
3259
5f39d397 3260 old_data_start = btrfs_item_offset_nr(leaf, slot);
179e29e4 3261
b18c6685
CM
3262 size_diff = old_size - new_size;
3263
3264 BUG_ON(slot < 0);
3265 BUG_ON(slot >= nritems);
3266
3267 /*
3268 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3269 */
3270 /* first correct the data pointers */
3271 for (i = slot; i < nritems; i++) {
5f39d397
CM
3272 u32 ioff;
3273 item = btrfs_item_nr(leaf, i);
db94535d 3274
5f39d397
CM
3275 ioff = btrfs_item_offset(leaf, item);
3276 btrfs_set_item_offset(leaf, item, ioff + size_diff);
b18c6685 3277 }
db94535d 3278
b18c6685 3279 /* shift the data */
179e29e4
CM
3280 if (from_end) {
3281 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3282 data_end + size_diff, btrfs_leaf_data(leaf) +
3283 data_end, old_data_start + new_size - data_end);
3284 } else {
3285 struct btrfs_disk_key disk_key;
3286 u64 offset;
3287
3288 btrfs_item_key(leaf, &disk_key, slot);
3289
3290 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3291 unsigned long ptr;
3292 struct btrfs_file_extent_item *fi;
3293
3294 fi = btrfs_item_ptr(leaf, slot,
3295 struct btrfs_file_extent_item);
3296 fi = (struct btrfs_file_extent_item *)(
3297 (unsigned long)fi - size_diff);
3298
3299 if (btrfs_file_extent_type(leaf, fi) ==
3300 BTRFS_FILE_EXTENT_INLINE) {
3301 ptr = btrfs_item_ptr_offset(leaf, slot);
3302 memmove_extent_buffer(leaf, ptr,
d397712b
CM
3303 (unsigned long)fi,
3304 offsetof(struct btrfs_file_extent_item,
179e29e4
CM
3305 disk_bytenr));
3306 }
3307 }
3308
3309 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3310 data_end + size_diff, btrfs_leaf_data(leaf) +
3311 data_end, old_data_start - data_end);
3312
3313 offset = btrfs_disk_key_offset(&disk_key);
3314 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3315 btrfs_set_item_key(leaf, &disk_key, slot);
3316 if (slot == 0)
3317 fixup_low_keys(trans, root, path, &disk_key, 1);
3318 }
5f39d397
CM
3319
3320 item = btrfs_item_nr(leaf, slot);
3321 btrfs_set_item_size(leaf, item, new_size);
3322 btrfs_mark_buffer_dirty(leaf);
b18c6685 3323
5f39d397
CM
3324 if (btrfs_leaf_free_space(root, leaf) < 0) {
3325 btrfs_print_leaf(root, leaf);
b18c6685 3326 BUG();
5f39d397 3327 }
b18c6685
CM
3328}
3329
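/*
 * Illustrative sketch, not part of the original file: chopping bytes off
 * the tail of the item the path points to once the caller has decided only
 * 'new_size' bytes of payload are still needed.  Purely an example wrapper
 * around the helper above.
 */
static void __maybe_unused example_shrink_item(struct btrfs_trans_handle *trans,
					       struct btrfs_root *root,
					       struct btrfs_path *path,
					       u32 new_size)
{
	u32 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

	if (new_size >= old_size)
		return;		/* nothing to trim */

	/* from_end == 1: keep the front of the item, drop the tail */
	btrfs_truncate_item(trans, root, path, new_size, 1);
}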
d352ac68
CM
3330/*
3331 * make the item pointed to by the path bigger; data_size is the number of bytes to add.
3332 */
143bede5
JM
3333void btrfs_extend_item(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root, struct btrfs_path *path,
3335 u32 data_size)
6567e837 3336{
6567e837 3337 int slot;
5f39d397
CM
3338 struct extent_buffer *leaf;
3339 struct btrfs_item *item;
6567e837
CM
3340 u32 nritems;
3341 unsigned int data_end;
3342 unsigned int old_data;
3343 unsigned int old_size;
3344 int i;
3345
5f39d397 3346 leaf = path->nodes[0];
6567e837 3347
5f39d397 3348 nritems = btrfs_header_nritems(leaf);
6567e837
CM
3349 data_end = leaf_data_end(root, leaf);
3350
5f39d397
CM
3351 if (btrfs_leaf_free_space(root, leaf) < data_size) {
3352 btrfs_print_leaf(root, leaf);
6567e837 3353 BUG();
5f39d397 3354 }
6567e837 3355 slot = path->slots[0];
5f39d397 3356 old_data = btrfs_item_end_nr(leaf, slot);
6567e837
CM
3357
3358 BUG_ON(slot < 0);
3326d1b0
CM
3359 if (slot >= nritems) {
3360 btrfs_print_leaf(root, leaf);
d397712b
CM
3361 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3362 slot, nritems);
3326d1b0
CM
3363 BUG_ON(1);
3364 }
6567e837
CM
3365
3366 /*
3367 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3368 */
3369 /* first correct the data pointers */
3370 for (i = slot; i < nritems; i++) {
5f39d397
CM
3371 u32 ioff;
3372 item = btrfs_item_nr(leaf, i);
db94535d 3373
5f39d397
CM
3374 ioff = btrfs_item_offset(leaf, item);
3375 btrfs_set_item_offset(leaf, item, ioff - data_size);
6567e837 3376 }
5f39d397 3377
6567e837 3378 /* shift the data */
5f39d397 3379 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
6567e837
CM
3380 data_end - data_size, btrfs_leaf_data(leaf) +
3381 data_end, old_data - data_end);
5f39d397 3382
6567e837 3383 data_end = old_data;
5f39d397
CM
3384 old_size = btrfs_item_size_nr(leaf, slot);
3385 item = btrfs_item_nr(leaf, slot);
3386 btrfs_set_item_size(leaf, item, old_size + data_size);
3387 btrfs_mark_buffer_dirty(leaf);
6567e837 3388
5f39d397
CM
3389 if (btrfs_leaf_free_space(root, leaf) < 0) {
3390 btrfs_print_leaf(root, leaf);
6567e837 3391 BUG();
5f39d397 3392 }
6567e837
CM
3393}
3394
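/*
 * Illustrative sketch, not part of the original file: growing the item at
 * the path by 'extra' bytes and zeroing the new space.  The caller is
 * assumed to have searched with an ins_len large enough that the leaf has
 * room, otherwise the helper above hits its BUG(); zeroing the new bytes
 * is only an example detail.
 */
static void __maybe_unused example_grow_item(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct btrfs_path *path,
					     u32 extra)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);

	btrfs_extend_item(trans, root, path, extra);

	/* the new bytes live at the end of the (now relocated) item data */
	memset_extent_buffer(leaf, 0,
			     btrfs_item_ptr_offset(leaf, path->slots[0]) +
			     old_size, extra);
}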
f3465ca4
JB
3395/*
3396 * Given a key and some data, insert items into the tree.
3397 * This does all the path init required, making room in the tree if needed.
3398 * Returns the number of keys that were inserted.
3399 */
3400int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3401 struct btrfs_root *root,
3402 struct btrfs_path *path,
3403 struct btrfs_key *cpu_key, u32 *data_size,
3404 int nr)
3405{
3406 struct extent_buffer *leaf;
3407 struct btrfs_item *item;
3408 int ret = 0;
3409 int slot;
f3465ca4
JB
3410 int i;
3411 u32 nritems;
3412 u32 total_data = 0;
3413 u32 total_size = 0;
3414 unsigned int data_end;
3415 struct btrfs_disk_key disk_key;
3416 struct btrfs_key found_key;
3417
87b29b20
YZ
3418 for (i = 0; i < nr; i++) {
3419 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3420 BTRFS_LEAF_DATA_SIZE(root)) {
3421 nr = i;
3422 break;
3423 }
f3465ca4 3424 total_data += data_size[i];
87b29b20
YZ
3425 total_size += data_size[i] + sizeof(struct btrfs_item);
3426 }
3427 BUG_ON(nr == 0);
f3465ca4 3428
f3465ca4
JB
3429 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3430 if (ret == 0)
3431 return -EEXIST;
3432 if (ret < 0)
3433 goto out;
3434
f3465ca4
JB
3435 leaf = path->nodes[0];
3436
3437 nritems = btrfs_header_nritems(leaf);
3438 data_end = leaf_data_end(root, leaf);
3439
3440 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3441 for (i = nr - 1; i >= 0; i--) {
3442 total_data -= data_size[i];
3443 total_size -= data_size[i] + sizeof(struct btrfs_item);
3444 if (total_size < btrfs_leaf_free_space(root, leaf))
3445 break;
3446 }
3447 nr = i;
3448 }
3449
3450 slot = path->slots[0];
3451 BUG_ON(slot < 0);
3452
3453 if (slot != nritems) {
3454 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3455
3456 item = btrfs_item_nr(leaf, slot);
3457 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3458
3459 /* figure out how many keys we can insert in here */
3460 total_data = data_size[0];
3461 for (i = 1; i < nr; i++) {
5d4f98a2 3462 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
f3465ca4
JB
3463 break;
3464 total_data += data_size[i];
3465 }
3466 nr = i;
3467
3468 if (old_data < data_end) {
3469 btrfs_print_leaf(root, leaf);
d397712b 3470 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
f3465ca4
JB
3471 slot, old_data, data_end);
3472 BUG_ON(1);
3473 }
3474 /*
3475 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3476 */
3477 /* first correct the data pointers */
f3465ca4
JB
3478 for (i = slot; i < nritems; i++) {
3479 u32 ioff;
3480
3481 item = btrfs_item_nr(leaf, i);
f3465ca4
JB
3482 ioff = btrfs_item_offset(leaf, item);
3483 btrfs_set_item_offset(leaf, item, ioff - total_data);
3484 }
f3465ca4
JB
3485 /* shift the items */
3486 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3487 btrfs_item_nr_offset(slot),
3488 (nritems - slot) * sizeof(struct btrfs_item));
3489
3490 /* shift the data */
3491 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3492 data_end - total_data, btrfs_leaf_data(leaf) +
3493 data_end, old_data - data_end);
3494 data_end = old_data;
3495 } else {
3496 /*
3497 * this sucks but it has to be done, if we are inserting at
3498 * the end of the leaf only insert 1 of the items, since we
3499 * have no way of knowing what's on the next leaf and we'd have
3500 * to drop our current locks to figure it out
3501 */
3502 nr = 1;
3503 }
3504
3505 /* setup the item for the new data */
3506 for (i = 0; i < nr; i++) {
3507 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3508 btrfs_set_item_key(leaf, &disk_key, slot + i);
3509 item = btrfs_item_nr(leaf, slot + i);
3510 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3511 data_end -= data_size[i];
3512 btrfs_set_item_size(leaf, item, data_size[i]);
3513 }
3514 btrfs_set_header_nritems(leaf, nritems + nr);
3515 btrfs_mark_buffer_dirty(leaf);
3516
3517 ret = 0;
3518 if (slot == 0) {
3519 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
143bede5 3520 fixup_low_keys(trans, root, path, &disk_key, 1);
f3465ca4
JB
3521 }
3522
3523 if (btrfs_leaf_free_space(root, leaf) < 0) {
3524 btrfs_print_leaf(root, leaf);
3525 BUG();
3526 }
3527out:
3528 if (!ret)
3529 ret = nr;
3530 return ret;
3531}
3532
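Because btrfs_insert_some_items() may set up fewer items than requested (its positive return value is the count that actually fit), a caller has to loop over the remainder. A hedged sketch of such a caller; insert_batch(), bufs and the surrounding policy are illustrative.

/*
 * Illustrative caller, not from the tree: keep calling
 * btrfs_insert_some_items() until every item has been placed, filling
 * in the payloads for whatever fit on each pass.
 */
static int insert_batch(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *keys, u32 *sizes,
                        char **bufs, int nr)
{
        int done = 0;

        while (done < nr) {
                int ret;
                int i;

                ret = btrfs_insert_some_items(trans, root, path,
                                              keys + done, sizes + done,
                                              nr - done);
                if (ret < 0)    /* includes -EEXIST for a duplicate key */
                        return ret;

                /* 'ret' items were reserved starting at path->slots[0] */
                for (i = 0; i < ret; i++) {
                        unsigned long ptr;

                        ptr = btrfs_item_ptr_offset(path->nodes[0],
                                                    path->slots[0] + i);
                        write_extent_buffer(path->nodes[0], bufs[done + i],
                                            ptr, sizes[done + i]);
                }
                btrfs_mark_buffer_dirty(path->nodes[0]);
                btrfs_release_path(path);
                done += ret;
        }
        return 0;
}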
74123bd7 3533/*
44871b1b
CM
3534 * this is a helper for btrfs_insert_empty_items, the main goal here is
3535 * to save stack depth by doing the bulk of the work in a function
3536 * that doesn't call btrfs_search_slot
74123bd7 3537 */
143bede5
JM
3538void setup_items_for_insert(struct btrfs_trans_handle *trans,
3539 struct btrfs_root *root, struct btrfs_path *path,
3540 struct btrfs_key *cpu_key, u32 *data_size,
3541 u32 total_data, u32 total_size, int nr)
be0e5c09 3542{
5f39d397 3543 struct btrfs_item *item;
9c58309d 3544 int i;
7518a238 3545 u32 nritems;
be0e5c09 3546 unsigned int data_end;
e2fa7227 3547 struct btrfs_disk_key disk_key;
44871b1b
CM
3548 struct extent_buffer *leaf;
3549 int slot;
e2fa7227 3550
5f39d397 3551 leaf = path->nodes[0];
44871b1b 3552 slot = path->slots[0];
74123bd7 3553
5f39d397 3554 nritems = btrfs_header_nritems(leaf);
123abc88 3555 data_end = leaf_data_end(root, leaf);
eb60ceac 3556
f25956cc 3557 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3326d1b0 3558 btrfs_print_leaf(root, leaf);
d397712b 3559 printk(KERN_CRIT "not enough freespace need %u have %d\n",
9c58309d 3560 total_size, btrfs_leaf_free_space(root, leaf));
be0e5c09 3561 BUG();
d4dbff95 3562 }
5f39d397 3563
be0e5c09 3564 if (slot != nritems) {
5f39d397 3565 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
be0e5c09 3566
5f39d397
CM
3567 if (old_data < data_end) {
3568 btrfs_print_leaf(root, leaf);
d397712b 3569 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
5f39d397
CM
3570 slot, old_data, data_end);
3571 BUG_ON(1);
3572 }
be0e5c09
CM
3573 /*
3574 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3575 */
3576 /* first correct the data pointers */
0783fcfc 3577 for (i = slot; i < nritems; i++) {
5f39d397 3578 u32 ioff;
db94535d 3579
5f39d397
CM
3580 item = btrfs_item_nr(leaf, i);
3581 ioff = btrfs_item_offset(leaf, item);
9c58309d 3582 btrfs_set_item_offset(leaf, item, ioff - total_data);
0783fcfc 3583 }
be0e5c09 3584 /* shift the items */
9c58309d 3585 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
5f39d397 3586 btrfs_item_nr_offset(slot),
d6025579 3587 (nritems - slot) * sizeof(struct btrfs_item));
be0e5c09
CM
3588
3589 /* shift the data */
5f39d397 3590 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
9c58309d 3591 data_end - total_data, btrfs_leaf_data(leaf) +
d6025579 3592 data_end, old_data - data_end);
be0e5c09
CM
3593 data_end = old_data;
3594 }
5f39d397 3595
62e2749e 3596 /* setup the item for the new data */
9c58309d
CM
3597 for (i = 0; i < nr; i++) {
3598 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3599 btrfs_set_item_key(leaf, &disk_key, slot + i);
3600 item = btrfs_item_nr(leaf, slot + i);
3601 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3602 data_end -= data_size[i];
3603 btrfs_set_item_size(leaf, item, data_size[i]);
3604 }
44871b1b 3605
9c58309d 3606 btrfs_set_header_nritems(leaf, nritems + nr);
aa5d6bed 3607
5a01a2e3
CM
3608 if (slot == 0) {
3609 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
143bede5 3610 fixup_low_keys(trans, root, path, &disk_key, 1);
5a01a2e3 3611 }
b9473439
CM
3612 btrfs_unlock_up_safe(path, 1);
3613 btrfs_mark_buffer_dirty(leaf);
aa5d6bed 3614
5f39d397
CM
3615 if (btrfs_leaf_free_space(root, leaf) < 0) {
3616 btrfs_print_leaf(root, leaf);
be0e5c09 3617 BUG();
5f39d397 3618 }
44871b1b
CM
3619}
3620
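The pointer arithmetic above relies on the leaf layout named in the "item0..itemN ... dataN.offset..dataN.size .. data0.size" comments: fixed-size item headers grow forward from the start of the leaf data area, payloads grow backward from its end, and free space is the gap between them. A small stand-alone C illustration of that bookkeeping; the sizes are representative of a 4K leaf but purely illustrative, not read from a real filesystem.

#include <stdio.h>

#define LEAF_DATA_SIZE  3995    /* 4K leaf minus the 101 byte header */
#define ITEM_HEADER     25      /* sizeof(struct btrfs_item) on disk */

int main(void)
{
        unsigned int nritems = 3;
        /* three existing payloads of 100, 80 and 60 bytes */
        unsigned int data_end = LEAF_DATA_SIZE - (100 + 80 + 60);
        unsigned int new_size = 40;

        /* free space is the gap between the last header and lowest payload */
        unsigned int used_headers = nritems * ITEM_HEADER;
        unsigned int free_space = data_end - used_headers;

        /* a new item consumes one header plus its payload */
        if (free_space >= ITEM_HEADER + new_size) {
                unsigned int new_offset = data_end - new_size;
                printf("new payload at offset %u, %u bytes left\n",
                       new_offset, free_space - ITEM_HEADER - new_size);
        }
        return 0;
}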
3621/*
3622 * Given a key and some data, insert items into the tree.
3623 * This does all the path init required, making room in the tree if needed.
3624 */
3625int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3626 struct btrfs_root *root,
3627 struct btrfs_path *path,
3628 struct btrfs_key *cpu_key, u32 *data_size,
3629 int nr)
3630{
44871b1b
CM
3631 int ret = 0;
3632 int slot;
3633 int i;
3634 u32 total_size = 0;
3635 u32 total_data = 0;
3636
3637 for (i = 0; i < nr; i++)
3638 total_data += data_size[i];
3639
3640 total_size = total_data + (nr * sizeof(struct btrfs_item));
3641 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3642 if (ret == 0)
3643 return -EEXIST;
3644 if (ret < 0)
143bede5 3645 return ret;
44871b1b 3646
44871b1b
CM
3647 slot = path->slots[0];
3648 BUG_ON(slot < 0);
3649
143bede5 3650 setup_items_for_insert(trans, root, path, cpu_key, data_size,
44871b1b 3651 total_data, total_size, nr);
143bede5 3652 return 0;
62e2749e
CM
3653}
3654
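A hedged sketch of a typical btrfs_insert_empty_items() caller: reserve two adjacent items in one leaf operation, then copy the payloads in through the extent buffer helpers. insert_two() and its arguments are illustrative; the keys must already be sorted and must not collide with existing items.

/*
 * Illustrative caller, not from the tree: reserve two adjacent items in
 * one leaf operation and copy the payloads in afterwards. keys[0] must
 * sort before keys[1].
 */
static int insert_two(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct btrfs_path *path,
                      struct btrfs_key *keys, void *data[2], u32 sizes[2])
{
        struct extent_buffer *leaf;
        int ret;
        int i;

        ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
        if (ret)
                return ret;     /* -EEXIST if the first key already exists */

        leaf = path->nodes[0];
        for (i = 0; i < 2; i++) {
                unsigned long ptr;

                ptr = btrfs_item_ptr_offset(leaf, path->slots[0] + i);
                write_extent_buffer(leaf, data[i], ptr, sizes[i]);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
        return 0;
}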
3655/*
3656 * Given a key and some data, insert an item into the tree.
3657 * This does all the path init required, making room in the tree if needed.
3658 */
e089f05c
CM
3659int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3660 *root, struct btrfs_key *cpu_key, void *data, u32
3661 data_size)
62e2749e
CM
3662{
3663 int ret = 0;
2c90e5d6 3664 struct btrfs_path *path;
5f39d397
CM
3665 struct extent_buffer *leaf;
3666 unsigned long ptr;
62e2749e 3667
2c90e5d6 3668 path = btrfs_alloc_path();
db5b493a
TI
3669 if (!path)
3670 return -ENOMEM;
2c90e5d6 3671 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
62e2749e 3672 if (!ret) {
5f39d397
CM
3673 leaf = path->nodes[0];
3674 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3675 write_extent_buffer(leaf, data, ptr, data_size);
3676 btrfs_mark_buffer_dirty(leaf);
62e2749e 3677 }
2c90e5d6 3678 btrfs_free_path(path);
aa5d6bed 3679 return ret;
be0e5c09
CM
3680}
3681
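For a one-off insertion with a ready-made payload, btrfs_insert_item() above is the whole interface: it allocates and frees its own path. A minimal sketch with placeholder key values; insert_blob() is an illustrative name.

/*
 * Illustrative helper, not from the tree; the key fields are placeholders.
 */
static int insert_blob(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
                       u64 objectid, u8 type, u64 offset,
                       void *blob, u32 blob_len)
{
        struct btrfs_key key;

        key.objectid = objectid;
        key.type = type;
        key.offset = offset;

        /* returns -EEXIST if an item with this exact key is already there */
        return btrfs_insert_item(trans, root, &key, blob, blob_len);
}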
74123bd7 3682/*
5de08d7d 3683 * delete the pointer from a given node.
74123bd7 3684 *
d352ac68
CM
3685 * the tree should have been previously balanced so the deletion does not
3686 * empty a node.
74123bd7 3687 */
143bede5
JM
3688static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3689 struct btrfs_path *path, int level, int slot)
be0e5c09 3690{
5f39d397 3691 struct extent_buffer *parent = path->nodes[level];
7518a238 3692 u32 nritems;
be0e5c09 3693
5f39d397 3694 nritems = btrfs_header_nritems(parent);
d397712b 3695 if (slot != nritems - 1) {
5f39d397
CM
3696 memmove_extent_buffer(parent,
3697 btrfs_node_key_ptr_offset(slot),
3698 btrfs_node_key_ptr_offset(slot + 1),
d6025579
CM
3699 sizeof(struct btrfs_key_ptr) *
3700 (nritems - slot - 1));
bb803951 3701 }
7518a238 3702 nritems--;
5f39d397 3703 btrfs_set_header_nritems(parent, nritems);
7518a238 3704 if (nritems == 0 && parent == root->node) {
5f39d397 3705 BUG_ON(btrfs_header_level(root->node) != 1);
bb803951 3706 /* just turn the root into a leaf and break */
5f39d397 3707 btrfs_set_header_level(root->node, 0);
bb803951 3708 } else if (slot == 0) {
5f39d397
CM
3709 struct btrfs_disk_key disk_key;
3710
3711 btrfs_node_key(parent, &disk_key, 0);
143bede5 3712 fixup_low_keys(trans, root, path, &disk_key, level + 1);
be0e5c09 3713 }
d6025579 3714 btrfs_mark_buffer_dirty(parent);
be0e5c09
CM
3715}
3716
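The memmove above is plain array compaction over the node's key pointers. A stand-alone C illustration of the same step, using a stand-in struct rather than the on-disk struct btrfs_key_ptr layout:

#include <string.h>

/* stand-in struct, not the on-disk struct btrfs_key_ptr layout */
struct fake_key_ptr {
        unsigned long long objectid;
        unsigned long long blockptr;
        unsigned long long generation;
};

/* drop entry 'slot' and close the gap, exactly like the memmove above */
static void fake_del_ptr(struct fake_key_ptr *ptrs, unsigned int *nritems,
                         unsigned int slot)
{
        if (slot != *nritems - 1)
                memmove(ptrs + slot, ptrs + slot + 1,
                        (*nritems - slot - 1) * sizeof(*ptrs));
        (*nritems)--;
}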
323ac95b
CM
3717/*
3718 * a helper function to delete the leaf pointed to by path->slots[1] and
5d4f98a2 3719 * path->nodes[1].
323ac95b
CM
3720 *
3721 * This deletes the pointer in path->nodes[1] and frees the leaf
3722 * block extent. zero is returned if it all worked out, < 0 otherwise.
3723 *
3724 * The path must have already been setup for deleting the leaf, including
3725 * all the proper balancing. path->nodes[1] must be locked.
3726 */
143bede5
JM
3727static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3728 struct btrfs_root *root,
3729 struct btrfs_path *path,
3730 struct extent_buffer *leaf)
323ac95b 3731{
5d4f98a2 3732 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
143bede5 3733 del_ptr(trans, root, path, 1, path->slots[1]);
323ac95b 3734
4d081c41
CM
3735 /*
3736 * btrfs_free_extent is expensive, we want to make sure we
3737 * aren't holding any locks when we call it
3738 */
3739 btrfs_unlock_up_safe(path, 0);
3740
f0486c68
YZ
3741 root_sub_used(root, leaf->len);
3742
66d7e7f0 3743 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
323ac95b 3744}
74123bd7
CM
3745/*
3746 * delete the item at the leaf level in path. If that empties
3747 * the leaf, remove it from the tree
3748 */
85e21bac
CM
3749int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3750 struct btrfs_path *path, int slot, int nr)
be0e5c09 3751{
5f39d397
CM
3752 struct extent_buffer *leaf;
3753 struct btrfs_item *item;
85e21bac
CM
3754 int last_off;
3755 int dsize = 0;
aa5d6bed
CM
3756 int ret = 0;
3757 int wret;
85e21bac 3758 int i;
7518a238 3759 u32 nritems;
be0e5c09 3760
5f39d397 3761 leaf = path->nodes[0];
85e21bac
CM
3762 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3763
3764 for (i = 0; i < nr; i++)
3765 dsize += btrfs_item_size_nr(leaf, slot + i);
3766
5f39d397 3767 nritems = btrfs_header_nritems(leaf);
be0e5c09 3768
85e21bac 3769 if (slot + nr != nritems) {
123abc88 3770 int data_end = leaf_data_end(root, leaf);
5f39d397
CM
3771
3772 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
d6025579
CM
3773 data_end + dsize,
3774 btrfs_leaf_data(leaf) + data_end,
85e21bac 3775 last_off - data_end);
5f39d397 3776
85e21bac 3777 for (i = slot + nr; i < nritems; i++) {
5f39d397 3778 u32 ioff;
db94535d 3779
5f39d397
CM
3780 item = btrfs_item_nr(leaf, i);
3781 ioff = btrfs_item_offset(leaf, item);
3782 btrfs_set_item_offset(leaf, item, ioff + dsize);
0783fcfc 3783 }
db94535d 3784
5f39d397 3785 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
85e21bac 3786 btrfs_item_nr_offset(slot + nr),
d6025579 3787 sizeof(struct btrfs_item) *
85e21bac 3788 (nritems - slot - nr));
be0e5c09 3789 }
85e21bac
CM
3790 btrfs_set_header_nritems(leaf, nritems - nr);
3791 nritems -= nr;
5f39d397 3792
74123bd7 3793 /* delete the leaf if we've emptied it */
7518a238 3794 if (nritems == 0) {
5f39d397
CM
3795 if (leaf == root->node) {
3796 btrfs_set_header_level(leaf, 0);
9a8dd150 3797 } else {
f0486c68
YZ
3798 btrfs_set_path_blocking(path);
3799 clean_tree_block(trans, root, leaf);
143bede5 3800 btrfs_del_leaf(trans, root, path, leaf);
9a8dd150 3801 }
be0e5c09 3802 } else {
7518a238 3803 int used = leaf_space_used(leaf, 0, nritems);
aa5d6bed 3804 if (slot == 0) {
5f39d397
CM
3805 struct btrfs_disk_key disk_key;
3806
3807 btrfs_item_key(leaf, &disk_key, 0);
143bede5 3808 fixup_low_keys(trans, root, path, &disk_key, 1);
aa5d6bed 3809 }
aa5d6bed 3810
74123bd7 3811 /* delete the leaf if it is mostly empty */
d717aa1d 3812 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
be0e5c09
CM
3813 /* push_leaf_left fixes the path.
3814 * make sure the path still points to our leaf
3815 * for possible call to del_ptr below
3816 */
4920c9ac 3817 slot = path->slots[1];
5f39d397
CM
3818 extent_buffer_get(leaf);
3819
b9473439 3820 btrfs_set_path_blocking(path);
99d8f83c
CM
3821 wret = push_leaf_left(trans, root, path, 1, 1,
3822 1, (u32)-1);
54aa1f4d 3823 if (wret < 0 && wret != -ENOSPC)
aa5d6bed 3824 ret = wret;
5f39d397
CM
3825
3826 if (path->nodes[0] == leaf &&
3827 btrfs_header_nritems(leaf)) {
99d8f83c
CM
3828 wret = push_leaf_right(trans, root, path, 1,
3829 1, 1, 0);
54aa1f4d 3830 if (wret < 0 && wret != -ENOSPC)
aa5d6bed
CM
3831 ret = wret;
3832 }
5f39d397
CM
3833
3834 if (btrfs_header_nritems(leaf) == 0) {
323ac95b 3835 path->slots[1] = slot;
143bede5 3836 btrfs_del_leaf(trans, root, path, leaf);
5f39d397 3837 free_extent_buffer(leaf);
143bede5 3838 ret = 0;
5de08d7d 3839 } else {
925baedd
CM
3840 /* if we're still in the path, make sure
3841 * we're dirty. Otherwise, one of the
3842 * push_leaf functions must have already
3843 * dirtied this buffer
3844 */
3845 if (path->nodes[0] == leaf)
3846 btrfs_mark_buffer_dirty(leaf);
5f39d397 3847 free_extent_buffer(leaf);
be0e5c09 3848 }
d5719762 3849 } else {
5f39d397 3850 btrfs_mark_buffer_dirty(leaf);
be0e5c09
CM
3851 }
3852 }
aa5d6bed 3853 return ret;
be0e5c09
CM
3854}
3855
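A hedged sketch of how callers batch adjacent deletions into one btrfs_del_items() call, which keeps the item shifting and the leaf rebalancing above to a single pass. del_matching_run() is an illustrative name and assumes the path came from a search with cow=1 so the leaf is writable.

/*
 * Illustrative caller, not from the tree: delete every item in the
 * current leaf, starting at path->slots[0], whose objectid matches.
 * Assumes the path came from a search with cow=1.
 */
static int del_matching_run(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_path *path, u64 objectid)
{
        struct extent_buffer *leaf = path->nodes[0];
        u32 nritems = btrfs_header_nritems(leaf);
        int slot = path->slots[0];
        int nr = 0;

        while (slot + nr < nritems) {
                struct btrfs_key key;

                btrfs_item_key_to_cpu(leaf, &key, slot + nr);
                if (key.objectid != objectid)
                        break;
                nr++;
        }
        if (!nr)
                return 0;
        return btrfs_del_items(trans, root, path, slot, nr);
}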
7bb86316 3856/*
925baedd 3857 * search the tree again to find a leaf with lesser keys
7bb86316
CM
3858 * returns 0 if it found something or 1 if there are no lesser leaves.
3859 * returns < 0 on io errors.
d352ac68
CM
3860 *
3861 * This may release the path, and so you may lose any locks held at the
3862 * time you call it.
7bb86316
CM
3863 */
3864int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3865{
925baedd
CM
3866 struct btrfs_key key;
3867 struct btrfs_disk_key found_key;
3868 int ret;
7bb86316 3869
925baedd 3870 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
7bb86316 3871
925baedd
CM
3872 if (key.offset > 0)
3873 key.offset--;
3874 else if (key.type > 0)
3875 key.type--;
3876 else if (key.objectid > 0)
3877 key.objectid--;
3878 else
3879 return 1;
7bb86316 3880
b3b4aa74 3881 btrfs_release_path(path);
925baedd
CM
3882 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3883 if (ret < 0)
3884 return ret;
3885 btrfs_item_key(path->nodes[0], &found_key, 0);
3886 ret = comp_keys(&found_key, &key);
3887 if (ret < 0)
3888 return 0;
3889 return 1;
7bb86316
CM
3890}
3891
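A hedged sketch of walking backwards one item at a time, falling back to btrfs_prev_leaf() when the current leaf is exhausted; this is roughly what btrfs_previous_item() at the end of this file does, without the type filter. prev_item() is an illustrative name.

/*
 * Illustrative helper, not from the tree: step the path back by one item,
 * crossing leaf boundaries as needed.
 */
static int prev_item(struct btrfs_root *root, struct btrfs_path *path)
{
        u32 nritems;
        int ret;

        if (path->slots[0] > 0) {
                path->slots[0]--;
                return 0;
        }

        ret = btrfs_prev_leaf(root, path);
        if (ret)
                return ret;     /* 1: nothing smaller, < 0: error */

        nritems = btrfs_header_nritems(path->nodes[0]);
        if (nritems == 0)
                return 1;
        /* the search inside btrfs_prev_leaf can leave the slot past the end */
        if (path->slots[0] == nritems)
                path->slots[0]--;
        return 0;
}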
3f157a2f
CM
3892/*
3893 * A helper function to walk down the tree starting at min_key, and looking
3894 * for nodes or leaves that are either in cache or have a minimum
d352ac68 3895 * transaction id. This is used by the btree defrag code, and tree logging
3f157a2f
CM
3896 *
3897 * This does not cow, but it does stuff the starting key it finds back
3898 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3899 * key and get a writable path.
3900 *
3901 * This does lock as it descends, and path->keep_locks should be set
3902 * to 1 by the caller.
3903 *
3904 * This honors path->lowest_level to prevent descent past a given level
3905 * of the tree.
3906 *
d352ac68
CM
3907 * min_trans indicates the oldest transaction that you are interested
3908 * in walking through. Any nodes or leaves older than min_trans are
3909 * skipped over (without reading them).
3910 *
3f157a2f
CM
3911 * returns zero if something useful was found, < 0 on error and 1 if there
3912 * was nothing in the tree that matched the search criteria.
3913 */
3914int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
e02119d5 3915 struct btrfs_key *max_key,
3f157a2f
CM
3916 struct btrfs_path *path, int cache_only,
3917 u64 min_trans)
3918{
3919 struct extent_buffer *cur;
3920 struct btrfs_key found_key;
3921 int slot;
9652480b 3922 int sret;
3f157a2f
CM
3923 u32 nritems;
3924 int level;
3925 int ret = 1;
3926
934d375b 3927 WARN_ON(!path->keep_locks);
3f157a2f 3928again:
bd681513 3929 cur = btrfs_read_lock_root_node(root);
3f157a2f 3930 level = btrfs_header_level(cur);
e02119d5 3931 WARN_ON(path->nodes[level]);
3f157a2f 3932 path->nodes[level] = cur;
bd681513 3933 path->locks[level] = BTRFS_READ_LOCK;
3f157a2f
CM
3934
3935 if (btrfs_header_generation(cur) < min_trans) {
3936 ret = 1;
3937 goto out;
3938 }
d397712b 3939 while (1) {
3f157a2f
CM
3940 nritems = btrfs_header_nritems(cur);
3941 level = btrfs_header_level(cur);
9652480b 3942 sret = bin_search(cur, min_key, level, &slot);
3f157a2f 3943
323ac95b
CM
3944 /* at the lowest level, we're done, setup the path and exit */
3945 if (level == path->lowest_level) {
e02119d5
CM
3946 if (slot >= nritems)
3947 goto find_next_key;
3f157a2f
CM
3948 ret = 0;
3949 path->slots[level] = slot;
3950 btrfs_item_key_to_cpu(cur, &found_key, slot);
3951 goto out;
3952 }
9652480b
Y
3953 if (sret && slot > 0)
3954 slot--;
3f157a2f
CM
3955 /*
3956 * check this node pointer against the cache_only and
3957 * min_trans parameters. If it isn't in cache or is too
3958 * old, skip to the next one.
3959 */
d397712b 3960 while (slot < nritems) {
3f157a2f
CM
3961 u64 blockptr;
3962 u64 gen;
3963 struct extent_buffer *tmp;
e02119d5
CM
3964 struct btrfs_disk_key disk_key;
3965
3f157a2f
CM
3966 blockptr = btrfs_node_blockptr(cur, slot);
3967 gen = btrfs_node_ptr_generation(cur, slot);
3968 if (gen < min_trans) {
3969 slot++;
3970 continue;
3971 }
3972 if (!cache_only)
3973 break;
3974
e02119d5
CM
3975 if (max_key) {
3976 btrfs_node_key(cur, &disk_key, slot);
3977 if (comp_keys(&disk_key, max_key) >= 0) {
3978 ret = 1;
3979 goto out;
3980 }
3981 }
3982
3f157a2f
CM
3983 tmp = btrfs_find_tree_block(root, blockptr,
3984 btrfs_level_size(root, level - 1));
3985
3986 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3987 free_extent_buffer(tmp);
3988 break;
3989 }
3990 if (tmp)
3991 free_extent_buffer(tmp);
3992 slot++;
3993 }
e02119d5 3994find_next_key:
3f157a2f
CM
3995 /*
3996 * we didn't find a candidate key in this node, walk forward
3997 * and find another one
3998 */
3999 if (slot >= nritems) {
e02119d5 4000 path->slots[level] = slot;
b4ce94de 4001 btrfs_set_path_blocking(path);
e02119d5 4002 sret = btrfs_find_next_key(root, path, min_key, level,
3f157a2f 4003 cache_only, min_trans);
e02119d5 4004 if (sret == 0) {
b3b4aa74 4005 btrfs_release_path(path);
3f157a2f
CM
4006 goto again;
4007 } else {
4008 goto out;
4009 }
4010 }
4011 /* save our key for returning back */
4012 btrfs_node_key_to_cpu(cur, &found_key, slot);
4013 path->slots[level] = slot;
4014 if (level == path->lowest_level) {
4015 ret = 0;
4016 unlock_up(path, level, 1);
4017 goto out;
4018 }
b4ce94de 4019 btrfs_set_path_blocking(path);
3f157a2f 4020 cur = read_node_slot(root, cur, slot);
79787eaa 4021 BUG_ON(!cur); /* -ENOMEM */
3f157a2f 4022
bd681513 4023 btrfs_tree_read_lock(cur);
b4ce94de 4024
bd681513 4025 path->locks[level - 1] = BTRFS_READ_LOCK;
3f157a2f
CM
4026 path->nodes[level - 1] = cur;
4027 unlock_up(path, level, 1);
bd681513 4028 btrfs_clear_path_blocking(path, NULL, 0);
3f157a2f
CM
4029 }
4030out:
4031 if (ret == 0)
4032 memcpy(min_key, &found_key, sizeof(found_key));
b4ce94de 4033 btrfs_set_path_blocking(path);
3f157a2f
CM
4034 return ret;
4035}
4036
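A hedged sketch of the way defrag and tree-log style callers drive btrfs_search_forward(): keep_locks set, a full-range max_key, and the returned min_key bumped to its successor before the next pass. scan_newer_than() and the empty processing step are illustrative.

/*
 * Illustrative caller, not from the tree: visit everything newer than
 * 'min_trans', restarting the search after each hit.
 */
static int scan_newer_than(struct btrfs_root *root, u64 min_trans)
{
        struct btrfs_path *path;
        struct btrfs_key min_key;
        struct btrfs_key max_key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        min_key.objectid = 0;
        min_key.type = 0;
        min_key.offset = 0;
        max_key.objectid = (u64)-1;
        max_key.type = (u8)-1;
        max_key.offset = (u64)-1;

        path->keep_locks = 1;   /* required by btrfs_search_forward() */
        while (1) {
                ret = btrfs_search_forward(root, &min_key, &max_key, path,
                                           0 /* cache_only */, min_trans);
                if (ret)
                        break;  /* 1: nothing left, < 0: error */

                /* min_key now holds the key that was found; process it here */

                btrfs_release_path(path);

                /* advance to the successor of the key we just handled */
                if (min_key.offset < (u64)-1) {
                        min_key.offset++;
                } else if (min_key.type < (u8)-1) {
                        min_key.offset = 0;
                        min_key.type++;
                } else if (min_key.objectid < (u64)-1) {
                        min_key.offset = 0;
                        min_key.type = 0;
                        min_key.objectid++;
                } else {
                        break;
                }
        }
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}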
4037/*
4038 * this is similar to btrfs_next_leaf, but does not try to preserve
4039 * and fixup the path. It looks for and returns the next key in the
4040 * tree based on the current path and the cache_only and min_trans
4041 * parameters.
4042 *
4043 * 0 is returned if another key is found, < 0 if there are any errors
4044 * and 1 is returned if there are no higher keys in the tree
4045 *
4046 * path->keep_locks should be set to 1 on the search made before
4047 * calling this function.
4048 */
e7a84565 4049int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
33c66f43 4050 struct btrfs_key *key, int level,
3f157a2f 4051 int cache_only, u64 min_trans)
e7a84565 4052{
e7a84565
CM
4053 int slot;
4054 struct extent_buffer *c;
4055
934d375b 4056 WARN_ON(!path->keep_locks);
d397712b 4057 while (level < BTRFS_MAX_LEVEL) {
e7a84565
CM
4058 if (!path->nodes[level])
4059 return 1;
4060
4061 slot = path->slots[level] + 1;
4062 c = path->nodes[level];
3f157a2f 4063next:
e7a84565 4064 if (slot >= btrfs_header_nritems(c)) {
33c66f43
YZ
4065 int ret;
4066 int orig_lowest;
4067 struct btrfs_key cur_key;
4068 if (level + 1 >= BTRFS_MAX_LEVEL ||
4069 !path->nodes[level + 1])
e7a84565 4070 return 1;
33c66f43
YZ
4071
4072 if (path->locks[level + 1]) {
4073 level++;
4074 continue;
4075 }
4076
4077 slot = btrfs_header_nritems(c) - 1;
4078 if (level == 0)
4079 btrfs_item_key_to_cpu(c, &cur_key, slot);
4080 else
4081 btrfs_node_key_to_cpu(c, &cur_key, slot);
4082
4083 orig_lowest = path->lowest_level;
b3b4aa74 4084 btrfs_release_path(path);
33c66f43
YZ
4085 path->lowest_level = level;
4086 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4087 0, 0);
4088 path->lowest_level = orig_lowest;
4089 if (ret < 0)
4090 return ret;
4091
4092 c = path->nodes[level];
4093 slot = path->slots[level];
4094 if (ret == 0)
4095 slot++;
4096 goto next;
e7a84565 4097 }
33c66f43 4098
e7a84565
CM
4099 if (level == 0)
4100 btrfs_item_key_to_cpu(c, key, slot);
3f157a2f
CM
4101 else {
4102 u64 blockptr = btrfs_node_blockptr(c, slot);
4103 u64 gen = btrfs_node_ptr_generation(c, slot);
4104
4105 if (cache_only) {
4106 struct extent_buffer *cur;
4107 cur = btrfs_find_tree_block(root, blockptr,
4108 btrfs_level_size(root, level - 1));
4109 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
4110 slot++;
4111 if (cur)
4112 free_extent_buffer(cur);
4113 goto next;
4114 }
4115 free_extent_buffer(cur);
4116 }
4117 if (gen < min_trans) {
4118 slot++;
4119 goto next;
4120 }
e7a84565 4121 btrfs_node_key_to_cpu(c, key, slot);
3f157a2f 4122 }
e7a84565
CM
4123 return 0;
4124 }
4125 return 1;
4126}
4127
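A minimal sketch of peeking at the key that follows the current search position without walking the path forward. lookup_and_peek() is an illustrative name; level 0, cache_only 0 and min_trans 0 simply disable the filters.

/*
 * Illustrative helper, not from the tree: look up 'key' and report the
 * key that follows the search position, without moving the path.
 */
static int lookup_and_peek(struct btrfs_root *root, struct btrfs_key *key,
                           struct btrfs_key *next)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->keep_locks = 1;   /* btrfs_find_next_key() expects this */
        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret < 0)
                goto out;

        /* level 0, no cache_only filter, no minimum transid */
        ret = btrfs_find_next_key(root, path, next, 0, 0, 0);
out:
        btrfs_free_path(path);
        return ret;     /* 0: 'next' is valid, 1: no higher key, < 0: error */
}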
97571fd0 4128/*
925baedd 4129 * search the tree again to find a leaf with greater keys
0f70abe2
CM
4130 * returns 0 if it found something or 1 if there are no greater leaves.
4131 * returns < 0 on io errors.
97571fd0 4132 */
234b63a0 4133int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
d97e63b6
CM
4134{
4135 int slot;
8e73f275 4136 int level;
5f39d397 4137 struct extent_buffer *c;
8e73f275 4138 struct extent_buffer *next;
925baedd
CM
4139 struct btrfs_key key;
4140 u32 nritems;
4141 int ret;
8e73f275 4142 int old_spinning = path->leave_spinning;
bd681513 4143 int next_rw_lock = 0;
925baedd
CM
4144
4145 nritems = btrfs_header_nritems(path->nodes[0]);
d397712b 4146 if (nritems == 0)
925baedd 4147 return 1;
925baedd 4148
8e73f275
CM
4149 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4150again:
4151 level = 1;
4152 next = NULL;
bd681513 4153 next_rw_lock = 0;
b3b4aa74 4154 btrfs_release_path(path);
8e73f275 4155
a2135011 4156 path->keep_locks = 1;
31533fb2 4157 path->leave_spinning = 1;
8e73f275 4158
925baedd
CM
4159 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4160 path->keep_locks = 0;
4161
4162 if (ret < 0)
4163 return ret;
4164
a2135011 4165 nritems = btrfs_header_nritems(path->nodes[0]);
168fd7d2
CM
4166 /*
4167 * by releasing the path above we dropped all our locks. A balance
4168 * could have added more items next to the key that used to be
4169 * at the very end of the block. So, check again here and
4170 * advance the path if there are now more items available.
4171 */
a2135011 4172 if (nritems > 0 && path->slots[0] < nritems - 1) {
e457afec
YZ
4173 if (ret == 0)
4174 path->slots[0]++;
8e73f275 4175 ret = 0;
925baedd
CM
4176 goto done;
4177 }
d97e63b6 4178
d397712b 4179 while (level < BTRFS_MAX_LEVEL) {
8e73f275
CM
4180 if (!path->nodes[level]) {
4181 ret = 1;
4182 goto done;
4183 }
5f39d397 4184
d97e63b6
CM
4185 slot = path->slots[level] + 1;
4186 c = path->nodes[level];
5f39d397 4187 if (slot >= btrfs_header_nritems(c)) {
d97e63b6 4188 level++;
8e73f275
CM
4189 if (level == BTRFS_MAX_LEVEL) {
4190 ret = 1;
4191 goto done;
4192 }
d97e63b6
CM
4193 continue;
4194 }
5f39d397 4195
925baedd 4196 if (next) {
bd681513 4197 btrfs_tree_unlock_rw(next, next_rw_lock);
5f39d397 4198 free_extent_buffer(next);
925baedd 4199 }
5f39d397 4200
8e73f275 4201 next = c;
bd681513 4202 next_rw_lock = path->locks[level];
8e73f275
CM
4203 ret = read_block_for_search(NULL, root, path, &next, level,
4204 slot, &key);
4205 if (ret == -EAGAIN)
4206 goto again;
5f39d397 4207
76a05b35 4208 if (ret < 0) {
b3b4aa74 4209 btrfs_release_path(path);
76a05b35
CM
4210 goto done;
4211 }
4212
5cd57b2c 4213 if (!path->skip_locking) {
bd681513 4214 ret = btrfs_try_tree_read_lock(next);
8e73f275
CM
4215 if (!ret) {
4216 btrfs_set_path_blocking(path);
bd681513 4217 btrfs_tree_read_lock(next);
31533fb2 4218 btrfs_clear_path_blocking(path, next,
bd681513 4219 BTRFS_READ_LOCK);
8e73f275 4220 }
31533fb2 4221 next_rw_lock = BTRFS_READ_LOCK;
5cd57b2c 4222 }
d97e63b6
CM
4223 break;
4224 }
4225 path->slots[level] = slot;
d397712b 4226 while (1) {
d97e63b6
CM
4227 level--;
4228 c = path->nodes[level];
925baedd 4229 if (path->locks[level])
bd681513 4230 btrfs_tree_unlock_rw(c, path->locks[level]);
8e73f275 4231
5f39d397 4232 free_extent_buffer(c);
d97e63b6
CM
4233 path->nodes[level] = next;
4234 path->slots[level] = 0;
a74a4b97 4235 if (!path->skip_locking)
bd681513 4236 path->locks[level] = next_rw_lock;
d97e63b6
CM
4237 if (!level)
4238 break;
b4ce94de 4239
8e73f275
CM
4240 ret = read_block_for_search(NULL, root, path, &next, level,
4241 0, &key);
4242 if (ret == -EAGAIN)
4243 goto again;
4244
76a05b35 4245 if (ret < 0) {
b3b4aa74 4246 btrfs_release_path(path);
76a05b35
CM
4247 goto done;
4248 }
4249
5cd57b2c 4250 if (!path->skip_locking) {
bd681513 4251 ret = btrfs_try_tree_read_lock(next);
8e73f275
CM
4252 if (!ret) {
4253 btrfs_set_path_blocking(path);
bd681513 4254 btrfs_tree_read_lock(next);
31533fb2 4255 btrfs_clear_path_blocking(path, next,
bd681513
CM
4256 BTRFS_READ_LOCK);
4257 }
31533fb2 4258 next_rw_lock = BTRFS_READ_LOCK;
5cd57b2c 4259 }
d97e63b6 4260 }
8e73f275 4261 ret = 0;
925baedd
CM
4262done:
4263 unlock_up(path, 0, 1);
8e73f275
CM
4264 path->leave_spinning = old_spinning;
4265 if (!old_spinning)
4266 btrfs_set_path_blocking(path);
4267
4268 return ret;
d97e63b6 4269}
0b86a832 4270
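btrfs_next_leaf() is the backbone of the standard forward iteration loop used throughout the tree code: search once, then walk slots and hop leaves until the key range of interest ends. A hedged sketch; walk_objectid() and the empty per-item step are illustrative.

/*
 * Illustrative caller, not from the tree: visit every item that has
 * 'objectid', hopping to the next leaf when the current one runs out.
 */
static int walk_objectid(struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = 0;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        while (1) {
                struct extent_buffer *leaf = path->nodes[0];
                int slot = path->slots[0];

                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto out;
                        if (ret > 0)
                                break;          /* no more leaves */
                        continue;
                }

                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid != objectid)
                        break;

                /* ... examine the item at (leaf, slot) here ... */

                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}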
3f157a2f
CM
4271/*
4272 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4273 * searching until it gets past min_objectid or finds an item of 'type'
4274 *
4275 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4276 */
0b86a832
CM
4277int btrfs_previous_item(struct btrfs_root *root,
4278 struct btrfs_path *path, u64 min_objectid,
4279 int type)
4280{
4281 struct btrfs_key found_key;
4282 struct extent_buffer *leaf;
e02119d5 4283 u32 nritems;
0b86a832
CM
4284 int ret;
4285
d397712b 4286 while (1) {
0b86a832 4287 if (path->slots[0] == 0) {
b4ce94de 4288 btrfs_set_path_blocking(path);
0b86a832
CM
4289 ret = btrfs_prev_leaf(root, path);
4290 if (ret != 0)
4291 return ret;
4292 } else {
4293 path->slots[0]--;
4294 }
4295 leaf = path->nodes[0];
e02119d5
CM
4296 nritems = btrfs_header_nritems(leaf);
4297 if (nritems == 0)
4298 return 1;
4299 if (path->slots[0] == nritems)
4300 path->slots[0]--;
4301
0b86a832 4302 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
e02119d5
CM
4303 if (found_key.objectid < min_objectid)
4304 break;
0a4eefbb
YZ
4305 if (found_key.type == type)
4306 return 0;
e02119d5
CM
4307 if (found_key.objectid == min_objectid &&
4308 found_key.type < type)
4309 break;
0b86a832
CM
4310 }
4311 return 1;
4312}
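A hedged sketch of the common btrfs_previous_item() pattern: search just past the largest possible key with the wanted prefix, then step backwards to land on the highest-offset item with that objectid and type. find_last_of_type() is an illustrative name.

/*
 * Illustrative helper, not from the tree: leave the path on the
 * highest-offset item with the given objectid and type, or return > 0
 * if no such item exists.
 */
static int find_last_of_type(struct btrfs_root *root,
                             struct btrfs_path *path,
                             u64 objectid, int type)
{
        struct btrfs_key key;
        int ret;

        key.objectid = objectid;
        key.type = type;
        key.offset = (u64)-1;   /* aim just past every key with this prefix */

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        /* walk back to the nearest item that matches the type filter */
        return btrfs_previous_item(root, path, objectid, type);
}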