// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add(spinlock_t *lock,
					struct list_head *new,
					struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(lock, flags);
}

static inline void btrfs_leak_debug_del(spinlock_t *lock,
					struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(lock, flags);
}

void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	unsigned long flags;

	/*
	 * If we didn't get into open_ctree our allocated_ebs will not be
	 * initialized, so just skip this.
	 */
	if (!fs_info->allocated_ebs.next)
		return;

	WARN_ON(!list_empty(&fs_info->allocated_ebs));
	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	while (!list_empty(&fs_info->allocated_ebs)) {
		eb = list_first_entry(&fs_info->allocated_ebs,
				      struct extent_buffer, leak_list);
		pr_err(
	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
		       btrfs_header_owner(eb));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(lock, new, head)	do {} while (0)
#define btrfs_leak_debug_del(lock, entry)	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct btrfs_bio_ctrl bio_ctrl;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, u32 bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

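/*
 * Hand the bio off to the filesystem: bios for data inodes go through the
 * data submission hook, all others through the metadata submission hook.
 */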
int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	/* Caller should ensure the bio has at least some range added */
	ASSERT(bio->bi_iter.bi_size);
	if (is_data_inode(tree->private_data))
		ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
					    bio_flags);
	else
		ret = btrfs_submit_metadata_bio(tree->private_data, bio,
						mirror_num, bio_flags);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		bio->bi_status = errno_to_blk_status(ret);
		bio_endio(bio);
		epd->bio_ctrl.bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		ret = submit_one_bio(bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio_ctrl.bio = NULL;
	}
	return ret;
}

int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because for our io_tree we
 * hold the tree lock and get the inode lock when setting delalloc. These two
 * things are unrelated, so make a class for the file_extent_tree so we don't
 * get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}

void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

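/*
 * Insert 'node' into the rb-tree keyed by end offset.  If the caller already
 * did the search (p_in/parent_in), that position is reused, otherwise the
 * tree is walked from 'search_start' (or the root).  Returns the existing
 * node if one already covers 'offset', NULL on successful insertion.
 */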
static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @next_ret:   pointer to the first entry whose range ends after @offset
 * @prev_ret:   pointer to the first entry whose range begins before @offset
 * @p_ret:      pointer where new node should be anchored (used when inserting
 *              an entry in the tree)
 * @parent_ret: points to entry which would have been the parent of the entry,
 *              containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next= NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, u32 *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			u32 *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

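/* Return the extent state that follows 'state' in the tree, or NULL. */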
static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

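/*
 * Ensure we have a preallocated extent state; if the caller passed none,
 * allocate one atomically (may still return NULL on allocation failure).
 */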
static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

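/* A tree modification failed while the tree lock was held, abort hard. */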
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
	"locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

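/*
 * Sleep until the given extent state is woken up, dropping the tree lock
 * around the schedule() and re-taking it afterwards.
 */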
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

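/*
 * Set the given bits on an extent state, updating the dirty byte accounting,
 * the delalloc hook for data inodes and the optional changeset.
 */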
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 *bits, struct extent_changeset *changeset)
{
	u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

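/*
 * Cache 'state' in *cached_ptr (taking a reference) if nothing is cached yet
 * and the state has one of the given flags set (or no flags were requested).
 */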
static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
		   u32 exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask,
		   struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL);
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * 			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
			      changeset);
}

int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
			      GFP_NOWAIT, NULL);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     u32 bits, int wake, int delete,
		     struct extent_state **cached)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, GFP_NOFS, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}

/*
 * either insert or lock state struct between start and end; use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	int err;
	u64 failed_start;

	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, GFP_NOFS, NULL);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

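/*
 * Lock the range without waiting.  Returns 1 on success, otherwise unlocks
 * any partially locked prefix and returns 0.
 */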
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, GFP_NOFS, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL);
		return 0;
	}
	return 1;
}

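/* Clear the dirty flag for writeback on every page in the [start, end] range. */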
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

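/* Mark every folio in the byte range [start, end] dirty again for writeback. */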
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	struct address_space *mapping = inode->i_mapping;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct folio *folio;

	while (index <= end_index) {
		folio = filemap_get_folio(mapping, index);
		filemap_dirty_folio(mapping, folio);
		folio_account_redirty(folio);
		index += folio_nr_pages(folio);
		folio_put(folio);
	}
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return 0 if we find something, and update @start_ret and @end_ret.
 * Return 1 if we found nothing.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, u32 bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

41a2ee75 1601/**
3bed2da1
NB
1602 * Find a contiguous area of bits
1603 *
1604 * @tree: io tree to check
1605 * @start: offset to start the search from
1606 * @start_ret: the first offset we found with the bits set
1607 * @end_ret: the final contiguous range of the bits that were set
1608 * @bits: bits to look for
41a2ee75
JB
1609 *
1610 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1611 * to set bits appropriately, and then merge them again. During this time it
1612 * will drop the tree->lock, so use this helper if you want to find the actual
1613 * contiguous area for given bits. We will search to the first bit we find, and
1614 * then walk down the tree until we find a non-contiguous area. The area
1615 * returned will be the full contiguous area with the bits set.
1616 */
1617int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
f97e27e9 1618 u64 *start_ret, u64 *end_ret, u32 bits)
41a2ee75
JB
1619{
1620 struct extent_state *state;
1621 int ret = 1;
1622
1623 spin_lock(&tree->lock);
1624 state = find_first_extent_bit_state(tree, start, bits);
1625 if (state) {
1626 *start_ret = state->start;
1627 *end_ret = state->end;
1628 while ((state = next_state(state)) != NULL) {
1629 if (state->start > (*end_ret + 1))
1630 break;
1631 *end_ret = state->end;
1632 }
1633 ret = 0;
1634 }
1635 spin_unlock(&tree->lock);
1636 return ret;
1637}
1638
45bfcfc1 1639/**
3bed2da1
NB
1640 * Find the first range that has @bits not set. This range could start before
1641 * @start.
45bfcfc1 1642 *
3bed2da1
NB
1643 * @tree: the tree to search
1644 * @start: offset at/after which the found extent should start
1645 * @start_ret: records the beginning of the range
1646 * @end_ret: records the end of the range (inclusive)
1647 * @bits: the set of bits which must be unset
45bfcfc1
NB
1648 *
1649 * Since unallocated range is also considered one which doesn't have the bits
1650 * set it's possible that @end_ret contains -1, this happens in case the range
1651 * spans (last_range_end, end of device]. In this case it's up to the caller to
1652 * trim @end_ret to the appropriate size.
1653 */
1654void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
f97e27e9 1655 u64 *start_ret, u64 *end_ret, u32 bits)
45bfcfc1
NB
1656{
1657 struct extent_state *state;
1658 struct rb_node *node, *prev = NULL, *next;
1659
1660 spin_lock(&tree->lock);
1661
1662 /* Find first extent with bits cleared */
1663 while (1) {
1664 node = __etree_search(tree, start, &next, &prev, NULL, NULL);
5750c375
NB
1665 if (!node && !next && !prev) {
1666 /*
1667 * Tree is completely empty, send full range and let
1668 * caller deal with it
1669 */
1670 *start_ret = 0;
1671 *end_ret = -1;
1672 goto out;
1673 } else if (!node && !next) {
1674 /*
1675 * We are past the last allocated chunk, set start at
1676 * the end of the last extent.
1677 */
1678 state = rb_entry(prev, struct extent_state, rb_node);
1679 *start_ret = state->end + 1;
1680 *end_ret = -1;
1681 goto out;
1682 } else if (!node) {
45bfcfc1 1683 node = next;
45bfcfc1 1684 }
1eaebb34
NB
1685 /*
1686 * At this point 'node' either contains 'start' or start is
1687 * before 'node'
1688 */
45bfcfc1 1689 state = rb_entry(node, struct extent_state, rb_node);
1eaebb34
NB
1690
1691 if (in_range(start, state->start, state->end - state->start + 1)) {
1692 if (state->state & bits) {
1693 /*
1694 * |--range with bits sets--|
1695 * |
1696 * start
1697 */
1698 start = state->end + 1;
1699 } else {
1700 /*
1701 * 'start' falls within a range that doesn't
1702 * have the bits set, so take its start as
1703 * the beginning of the desired range
1704 *
1705 * |--range with bits cleared----|
1706 * |
1707 * start
1708 */
1709 *start_ret = state->start;
1710 break;
1711 }
45bfcfc1 1712 } else {
1eaebb34
NB
1713 /*
1714 * |---prev range---|---hole/unset---|---node range---|
1715 * |
1716 * start
1717 *
1718 * or
1719 *
1720 * |---hole/unset--||--first node--|
1721 * 0 |
1722 * start
1723 */
1724 if (prev) {
1725 state = rb_entry(prev, struct extent_state,
1726 rb_node);
1727 *start_ret = state->end + 1;
1728 } else {
1729 *start_ret = 0;
1730 }
45bfcfc1
NB
1731 break;
1732 }
1733 }
1734
1735 /*
1736 * Find the longest stretch from start until an entry which has the
1737 * bits set
1738 */
1739 while (1) {
1740 state = rb_entry(node, struct extent_state, rb_node);
1741 if (state->end >= start && !(state->state & bits)) {
1742 *end_ret = state->end;
1743 } else {
1744 *end_ret = state->start - 1;
1745 break;
1746 }
1747
1748 node = rb_next(node);
1749 if (!node)
1750 break;
1751 }
1752out:
1753 spin_unlock(&tree->lock);
1754}
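/*
 * Illustrative sketch of how a caller might consume the result of
 * find_first_clear_extent_bit(); 'offset', 'limit' and the CHUNK_ALLOCATED
 * bit are assumptions for the example:
 *
 *	u64 free_start, free_end;
 *
 *	find_first_clear_extent_bit(tree, offset, &free_start, &free_end,
 *				    CHUNK_ALLOCATED);
 *	// @end_ret may be (u64)-1 when the found range runs past the last
 *	// entry in the tree, so trim it to the caller's own limit
 *	free_end = min(free_end, limit);
 */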
1755
d352ac68
CM
1756/*
1757 * find a contiguous range of bytes in the file marked as delalloc, not
1758 * more than 'max_bytes'. start and end are used to return the range.
1759 *
3522e903 1760 * true is returned if we find something, false if nothing was in the tree
d352ac68 1761 */
083e75e7
JB
1762bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1763 u64 *end, u64 max_bytes,
1764 struct extent_state **cached_state)
d1310b2e
CM
1765{
1766 struct rb_node *node;
1767 struct extent_state *state;
1768 u64 cur_start = *start;
3522e903 1769 bool found = false;
d1310b2e
CM
1770 u64 total_bytes = 0;
1771
cad321ad 1772 spin_lock(&tree->lock);
c8b97818 1773
d1310b2e
CM
1774 /*
1775 * this search will find all the extents that end after
1776 * our range starts.
1777 */
80ea96b1 1778 node = tree_search(tree, cur_start);
2b114d1d 1779 if (!node) {
3522e903 1780 *end = (u64)-1;
d1310b2e
CM
1781 goto out;
1782 }
1783
d397712b 1784 while (1) {
d1310b2e 1785 state = rb_entry(node, struct extent_state, rb_node);
5b21f2ed
ZY
1786 if (found && (state->start != cur_start ||
1787 (state->state & EXTENT_BOUNDARY))) {
d1310b2e
CM
1788 goto out;
1789 }
1790 if (!(state->state & EXTENT_DELALLOC)) {
1791 if (!found)
1792 *end = state->end;
1793 goto out;
1794 }
c2a128d2 1795 if (!found) {
d1310b2e 1796 *start = state->start;
c2a128d2 1797 *cached_state = state;
b7ac31b7 1798 refcount_inc(&state->refs);
c2a128d2 1799 }
3522e903 1800 found = true;
d1310b2e
CM
1801 *end = state->end;
1802 cur_start = state->end + 1;
1803 node = rb_next(node);
d1310b2e 1804 total_bytes += state->end - state->start + 1;
7bf811a5 1805 if (total_bytes >= max_bytes)
573aecaf 1806 break;
573aecaf 1807 if (!node)
d1310b2e
CM
1808 break;
1809 }
1810out:
cad321ad 1811 spin_unlock(&tree->lock);
d1310b2e
CM
1812 return found;
1813}
1814
ed8f13bf
QW
1815/*
1816 * Process one page for __process_pages_contig().
1817 *
1818 * Return >0 if we hit @page == @locked_page.
1819 * Return 0 if we updated the page status.
1820 * Return -EAGAIN if we need to try again.
1821 * (For the PAGE_LOCK case, when the page is not dirty or no longer belongs to the mapping)
1822 */
e38992be
QW
1823static int process_one_page(struct btrfs_fs_info *fs_info,
1824 struct address_space *mapping,
ed8f13bf 1825 struct page *page, struct page *locked_page,
e38992be 1826 unsigned long page_ops, u64 start, u64 end)
ed8f13bf 1827{
e38992be
QW
1828 u32 len;
1829
1830 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
1831 len = end + 1 - start;
1832
ed8f13bf 1833 if (page_ops & PAGE_SET_ORDERED)
b945a463 1834 btrfs_page_clamp_set_ordered(fs_info, page, start, len);
ed8f13bf 1835 if (page_ops & PAGE_SET_ERROR)
e38992be 1836 btrfs_page_clamp_set_error(fs_info, page, start, len);
ed8f13bf 1837 if (page_ops & PAGE_START_WRITEBACK) {
e38992be
QW
1838 btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
1839 btrfs_page_clamp_set_writeback(fs_info, page, start, len);
ed8f13bf
QW
1840 }
1841 if (page_ops & PAGE_END_WRITEBACK)
e38992be 1842 btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
a33a8e9a
QW
1843
1844 if (page == locked_page)
1845 return 1;
1846
ed8f13bf 1847 if (page_ops & PAGE_LOCK) {
1e1de387
QW
1848 int ret;
1849
1850 ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
1851 if (ret)
1852 return ret;
ed8f13bf 1853 if (!PageDirty(page) || page->mapping != mapping) {
1e1de387 1854 btrfs_page_end_writer_lock(fs_info, page, start, len);
ed8f13bf
QW
1855 return -EAGAIN;
1856 }
1857 }
1858 if (page_ops & PAGE_UNLOCK)
1e1de387 1859 btrfs_page_end_writer_lock(fs_info, page, start, len);
ed8f13bf
QW
1860 return 0;
1861}
1862
da2c7009
LB
1863static int __process_pages_contig(struct address_space *mapping,
1864 struct page *locked_page,
98af9ab1 1865 u64 start, u64 end, unsigned long page_ops,
ed8f13bf
QW
1866 u64 *processed_end)
1867{
e38992be 1868 struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
ed8f13bf
QW
1869 pgoff_t start_index = start >> PAGE_SHIFT;
1870 pgoff_t end_index = end >> PAGE_SHIFT;
1871 pgoff_t index = start_index;
1872 unsigned long nr_pages = end_index - start_index + 1;
1873 unsigned long pages_processed = 0;
1874 struct page *pages[16];
1875 int err = 0;
1876 int i;
1877
1878 if (page_ops & PAGE_LOCK) {
1879 ASSERT(page_ops == PAGE_LOCK);
1880 ASSERT(processed_end && *processed_end == start);
1881 }
1882
1883 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1884 mapping_set_error(mapping, -EIO);
1885
1886 while (nr_pages > 0) {
1887 int found_pages;
1888
1889 found_pages = find_get_pages_contig(mapping, index,
1890 min_t(unsigned long,
1891 nr_pages, ARRAY_SIZE(pages)), pages);
1892 if (found_pages == 0) {
1893 /*
1894 * Only if we're going to lock these pages can we find
1895 * nothing at @index.
1896 */
1897 ASSERT(page_ops & PAGE_LOCK);
1898 err = -EAGAIN;
1899 goto out;
1900 }
1901
1902 for (i = 0; i < found_pages; i++) {
1903 int process_ret;
1904
e38992be
QW
1905 process_ret = process_one_page(fs_info, mapping,
1906 pages[i], locked_page, page_ops,
1907 start, end);
ed8f13bf
QW
1908 if (process_ret < 0) {
1909 for (; i < found_pages; i++)
1910 put_page(pages[i]);
1911 err = -EAGAIN;
1912 goto out;
1913 }
1914 put_page(pages[i]);
1915 pages_processed++;
1916 }
1917 nr_pages -= found_pages;
1918 index += found_pages;
1919 cond_resched();
1920 }
1921out:
1922 if (err && processed_end) {
1923 /*
1924 * Update @processed_end. I know this is awful since it has
1925 * two different return value patterns (inclusive vs exclusive).
1926 *
1927 * But the exclusive pattern is necessary if @start is 0, otherwise we
1928 * underflow and the check against processed_end won't work as
1929 * expected.
1930 */
1931 if (pages_processed)
1932 *processed_end = min(end,
1933 ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
1934 else
1935 *processed_end = start;
1936 }
1937 return err;
1938}
da2c7009 1939
143bede5
JM
1940static noinline void __unlock_for_delalloc(struct inode *inode,
1941 struct page *locked_page,
1942 u64 start, u64 end)
c8b97818 1943{
09cbfeaf
KS
1944 unsigned long index = start >> PAGE_SHIFT;
1945 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 1946
76c0021d 1947 ASSERT(locked_page);
c8b97818 1948 if (index == locked_page->index && end_index == index)
143bede5 1949 return;
c8b97818 1950
98af9ab1 1951 __process_pages_contig(inode->i_mapping, locked_page, start, end,
76c0021d 1952 PAGE_UNLOCK, NULL);
c8b97818
CM
1953}
1954
1955static noinline int lock_delalloc_pages(struct inode *inode,
1956 struct page *locked_page,
1957 u64 delalloc_start,
1958 u64 delalloc_end)
1959{
09cbfeaf 1960 unsigned long index = delalloc_start >> PAGE_SHIFT;
09cbfeaf 1961 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
98af9ab1 1962 u64 processed_end = delalloc_start;
c8b97818 1963 int ret;
c8b97818 1964
76c0021d 1965 ASSERT(locked_page);
c8b97818
CM
1966 if (index == locked_page->index && index == end_index)
1967 return 0;
1968
98af9ab1
QW
1969 ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
1970 delalloc_end, PAGE_LOCK, &processed_end);
1971 if (ret == -EAGAIN && processed_end > delalloc_start)
76c0021d 1972 __unlock_for_delalloc(inode, locked_page, delalloc_start,
98af9ab1 1973 processed_end);
c8b97818
CM
1974 return ret;
1975}
1976
1977/*
3522e903 1978 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
2749f7ef 1979 * more than @max_bytes.
c8b97818 1980 *
2749f7ef
QW
1981 * @start: The original start bytenr to search.
1982 * Will store the extent range start bytenr.
1983 * @end: The original end bytenr of the search range
1984 * Will store the extent range end bytenr.
1985 *
1986 * Return true if we find a delalloc range which starts inside the original
1987 * range, and @start/@end will store the delalloc range start/end.
1988 *
1989 * Return false if we can't find any delalloc range which starts inside the
1990 * original range, and @start/@end will be the non-delalloc range start/end.
c8b97818 1991 */
ce9f967f 1992EXPORT_FOR_TESTS
3522e903 1993noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 1994 struct page *locked_page, u64 *start,
917aacec 1995 u64 *end)
c8b97818 1996{
9978059b 1997 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2749f7ef
QW
1998 const u64 orig_start = *start;
1999 const u64 orig_end = *end;
917aacec 2000 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
2001 u64 delalloc_start;
2002 u64 delalloc_end;
3522e903 2003 bool found;
9655d298 2004 struct extent_state *cached_state = NULL;
c8b97818
CM
2005 int ret;
2006 int loops = 0;
2007
2749f7ef
QW
2008 /* Caller should pass a valid @end to indicate the search range end */
2009 ASSERT(orig_end > orig_start);
2010
2011 /* The range should at least cover part of the page */
2012 ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
2013 orig_end <= page_offset(locked_page)));
c8b97818
CM
2014again:
2015 /* step one, find a bunch of delalloc bytes starting at start */
2016 delalloc_start = *start;
2017 delalloc_end = 0;
083e75e7
JB
2018 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
2019 max_bytes, &cached_state);
2749f7ef 2020 if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
c8b97818 2021 *start = delalloc_start;
2749f7ef
QW
2022
2023 /* @delalloc_end can be -1, never go beyond @orig_end */
2024 *end = min(delalloc_end, orig_end);
c2a128d2 2025 free_extent_state(cached_state);
3522e903 2026 return false;
c8b97818
CM
2027 }
2028
70b99e69
CM
2029 /*
2030 * start comes from the offset of locked_page. We have to lock
2031 * pages in order, so we can't process delalloc bytes before
2032 * locked_page
2033 */
d397712b 2034 if (delalloc_start < *start)
70b99e69 2035 delalloc_start = *start;
70b99e69 2036
c8b97818
CM
2037 /*
2038 * make sure to limit the number of pages we try to lock down
c8b97818 2039 */
7bf811a5
JB
2040 if (delalloc_end + 1 - delalloc_start > max_bytes)
2041 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 2042
c8b97818
CM
2043 /* step two, lock all the pages after the page that has start */
2044 ret = lock_delalloc_pages(inode, locked_page,
2045 delalloc_start, delalloc_end);
9bfd61d9 2046 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
2047 if (ret == -EAGAIN) {
2048 /* some of the pages are gone, let's avoid looping by
2049 * shortening the size of the delalloc range we're searching
2050 */
9655d298 2051 free_extent_state(cached_state);
7d788742 2052 cached_state = NULL;
c8b97818 2053 if (!loops) {
09cbfeaf 2054 max_bytes = PAGE_SIZE;
c8b97818
CM
2055 loops = 1;
2056 goto again;
2057 } else {
3522e903 2058 found = false;
c8b97818
CM
2059 goto out_failed;
2060 }
2061 }
c8b97818
CM
2062
2063 /* step three, lock the state bits for the whole range */
ff13db41 2064 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
2065
2066 /* then test to make sure it is all still delalloc */
2067 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 2068 EXTENT_DELALLOC, 1, cached_state);
c8b97818 2069 if (!ret) {
9655d298 2070 unlock_extent_cached(tree, delalloc_start, delalloc_end,
e43bbe5e 2071 &cached_state);
c8b97818
CM
2072 __unlock_for_delalloc(inode, locked_page,
2073 delalloc_start, delalloc_end);
2074 cond_resched();
2075 goto again;
2076 }
9655d298 2077 free_extent_state(cached_state);
c8b97818
CM
2078 *start = delalloc_start;
2079 *end = delalloc_end;
2080out_failed:
2081 return found;
2082}
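/*
 * Illustrative sketch of the calling convention above; the writepage-style
 * context ('page', 'page_start', 'page_end') is assumed for the example:
 *
 *	u64 delalloc_start = page_start;
 *	u64 delalloc_end = page_end;
 *
 *	if (find_lock_delalloc_range(inode, page, &delalloc_start,
 *				     &delalloc_end)) {
 *		// Pages and extent state in [delalloc_start, delalloc_end]
 *		// are locked and known to be delalloc; process them, then
 *		// unlock the range again.
 *	} else {
 *		// No delalloc range starts inside [page_start, page_end];
 *		// delalloc_start/delalloc_end describe the non-delalloc gap.
 *	}
 */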
2083
ad7ff17b 2084void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
74e9194a 2085 struct page *locked_page,
f97e27e9 2086 u32 clear_bits, unsigned long page_ops)
873695b3 2087{
ad7ff17b 2088 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
873695b3 2089
ad7ff17b 2090 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
98af9ab1 2091 start, end, page_ops, NULL);
873695b3
LB
2092}
2093
d352ac68
CM
2094/*
2095 * count the number of bytes in the tree that have a given bit(s)
2096 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2097 * cached. The total number found is returned.
2098 */
d1310b2e
CM
2099u64 count_range_bits(struct extent_io_tree *tree,
2100 u64 *start, u64 search_end, u64 max_bytes,
f97e27e9 2101 u32 bits, int contig)
d1310b2e
CM
2102{
2103 struct rb_node *node;
2104 struct extent_state *state;
2105 u64 cur_start = *start;
2106 u64 total_bytes = 0;
ec29ed5b 2107 u64 last = 0;
d1310b2e
CM
2108 int found = 0;
2109
fae7f21c 2110 if (WARN_ON(search_end <= cur_start))
d1310b2e 2111 return 0;
d1310b2e 2112
cad321ad 2113 spin_lock(&tree->lock);
d1310b2e
CM
2114 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2115 total_bytes = tree->dirty_bytes;
2116 goto out;
2117 }
2118 /*
2119 * this search will find all the extents that end after
2120 * our range starts.
2121 */
80ea96b1 2122 node = tree_search(tree, cur_start);
d397712b 2123 if (!node)
d1310b2e 2124 goto out;
d1310b2e 2125
d397712b 2126 while (1) {
d1310b2e
CM
2127 state = rb_entry(node, struct extent_state, rb_node);
2128 if (state->start > search_end)
2129 break;
ec29ed5b
CM
2130 if (contig && found && state->start > last + 1)
2131 break;
2132 if (state->end >= cur_start && (state->state & bits) == bits) {
d1310b2e
CM
2133 total_bytes += min(search_end, state->end) + 1 -
2134 max(cur_start, state->start);
2135 if (total_bytes >= max_bytes)
2136 break;
2137 if (!found) {
af60bed2 2138 *start = max(cur_start, state->start);
d1310b2e
CM
2139 found = 1;
2140 }
ec29ed5b
CM
2141 last = state->end;
2142 } else if (contig && found) {
2143 break;
d1310b2e
CM
2144 }
2145 node = rb_next(node);
2146 if (!node)
2147 break;
2148 }
2149out:
cad321ad 2150 spin_unlock(&tree->lock);
d1310b2e
CM
2151 return total_bytes;
2152}
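/*
 * Illustrative sketch: counting up to 'limit' bytes with EXTENT_DIRTY set
 * starting at 'cursor'; both names are assumptions for the example:
 *
 *	u64 found_offset = cursor;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(tree, &found_offset, (u64)-1, limit,
 *				 EXTENT_DIRTY, 0);
 *	// 'bytes' is the number of matching bytes that were counted and
 *	// 'found_offset' now points at the first counted byte
 */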
b2950863 2153
d352ac68
CM
2154/*
2155 * set the failrec field for a given byte offset in the tree. If there isn't
2156 * an extent_state there already, -ENOENT is returned.
2157 */
b3f167aa
JB
2158int set_state_failrec(struct extent_io_tree *tree, u64 start,
2159 struct io_failure_record *failrec)
d1310b2e
CM
2160{
2161 struct rb_node *node;
2162 struct extent_state *state;
2163 int ret = 0;
2164
cad321ad 2165 spin_lock(&tree->lock);
d1310b2e
CM
2166 /*
2167 * this search will find all the extents that end after
2168 * our range starts.
2169 */
80ea96b1 2170 node = tree_search(tree, start);
2b114d1d 2171 if (!node) {
d1310b2e
CM
2172 ret = -ENOENT;
2173 goto out;
2174 }
2175 state = rb_entry(node, struct extent_state, rb_node);
2176 if (state->start != start) {
2177 ret = -ENOENT;
2178 goto out;
2179 }
47dc196a 2180 state->failrec = failrec;
d1310b2e 2181out:
cad321ad 2182 spin_unlock(&tree->lock);
d1310b2e
CM
2183 return ret;
2184}
2185
2279a270 2186struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
d1310b2e
CM
2187{
2188 struct rb_node *node;
2189 struct extent_state *state;
2279a270 2190 struct io_failure_record *failrec;
d1310b2e 2191
cad321ad 2192 spin_lock(&tree->lock);
d1310b2e
CM
2193 /*
2194 * this search will find all the extents that end after
2195 * our range starts.
2196 */
80ea96b1 2197 node = tree_search(tree, start);
2b114d1d 2198 if (!node) {
2279a270 2199 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2200 goto out;
2201 }
2202 state = rb_entry(node, struct extent_state, rb_node);
2203 if (state->start != start) {
2279a270 2204 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2205 goto out;
2206 }
2279a270
NB
2207
2208 failrec = state->failrec;
d1310b2e 2209out:
cad321ad 2210 spin_unlock(&tree->lock);
2279a270 2211 return failrec;
d1310b2e
CM
2212}
2213
2214/*
2215 * searches a range in the state tree for a given mask.
70dec807 2216 * If 'filled' == 1, this returns 1 only if every extent in the range
d1310b2e
CM
2217 * has the bits set. Otherwise, 1 is returned if any bit in the
2218 * range is found set.
2219 */
2220int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 2221 u32 bits, int filled, struct extent_state *cached)
d1310b2e
CM
2222{
2223 struct extent_state *state = NULL;
2224 struct rb_node *node;
2225 int bitset = 0;
d1310b2e 2226
cad321ad 2227 spin_lock(&tree->lock);
27a3507d 2228 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
df98b6e2 2229 cached->end > start)
9655d298
CM
2230 node = &cached->rb_node;
2231 else
2232 node = tree_search(tree, start);
d1310b2e
CM
2233 while (node && start <= end) {
2234 state = rb_entry(node, struct extent_state, rb_node);
2235
2236 if (filled && state->start > start) {
2237 bitset = 0;
2238 break;
2239 }
2240
2241 if (state->start > end)
2242 break;
2243
2244 if (state->state & bits) {
2245 bitset = 1;
2246 if (!filled)
2247 break;
2248 } else if (filled) {
2249 bitset = 0;
2250 break;
2251 }
46562cec
CM
2252
2253 if (state->end == (u64)-1)
2254 break;
2255
d1310b2e
CM
2256 start = state->end + 1;
2257 if (start > end)
2258 break;
2259 node = rb_next(node);
2260 if (!node) {
2261 if (filled)
2262 bitset = 0;
2263 break;
2264 }
2265 }
cad321ad 2266 spin_unlock(&tree->lock);
d1310b2e
CM
2267 return bitset;
2268}
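/*
 * Illustrative sketch of the two query modes of test_range_bit(); the
 * range and bit choices are assumptions for the example:
 *
 *	// filled == 1: only true if the whole [start, end] range has the bit
 *	if (test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, cached))
 *		...;
 *
 *	// filled == 0: true as soon as any part of the range has the bit
 *	if (test_range_bit(tree, start, end, EXTENT_DAMAGED, 0, NULL))
 *		...;
 */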
d1310b2e 2269
7870d082
JB
2270int free_io_failure(struct extent_io_tree *failure_tree,
2271 struct extent_io_tree *io_tree,
2272 struct io_failure_record *rec)
4a54c8c1
JS
2273{
2274 int ret;
2275 int err = 0;
4a54c8c1 2276
47dc196a 2277 set_state_failrec(failure_tree, rec->start, NULL);
4a54c8c1
JS
2278 ret = clear_extent_bits(failure_tree, rec->start,
2279 rec->start + rec->len - 1,
91166212 2280 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1
JS
2281 if (ret)
2282 err = ret;
2283
7870d082 2284 ret = clear_extent_bits(io_tree, rec->start,
53b381b3 2285 rec->start + rec->len - 1,
91166212 2286 EXTENT_DAMAGED);
53b381b3
DW
2287 if (ret && !err)
2288 err = ret;
4a54c8c1
JS
2289
2290 kfree(rec);
2291 return err;
2292}
2293
4a54c8c1
JS
2294/*
2295 * this bypasses the standard btrfs submit functions deliberately, as
2296 * the standard behavior is to write all copies in a raid setup. here we only
2297 * want to write the one bad copy. so we do the mapping for ourselves and issue
2298 * submit_bio directly.
3ec706c8 2299 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
2300 * actually prevents the read that triggered the error from finishing.
2301 * currently, there can be no more than two copies of every data bit. thus,
2302 * exactly one rewrite is required.
2303 */
38d5e541
QW
2304static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2305 u64 length, u64 logical, struct page *page,
2306 unsigned int pg_offset, int mirror_num)
4a54c8c1 2307{
4a54c8c1 2308 struct btrfs_device *dev;
e9458bfe
CH
2309 struct bio_vec bvec;
2310 struct bio bio;
4a54c8c1
JS
2311 u64 map_length = 0;
2312 u64 sector;
4c664611 2313 struct btrfs_io_context *bioc = NULL;
e9458bfe 2314 int ret = 0;
4a54c8c1 2315
1751e8a6 2316 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
2317 BUG_ON(!mirror_num);
2318
554aed7d
JT
2319 if (btrfs_repair_one_zone(fs_info, logical))
2320 return 0;
f7ef5287 2321
4a54c8c1
JS
2322 map_length = length;
2323
b5de8d0d 2324 /*
4c664611 2325 * Avoid races with device replace and make sure our bioc has devices
b5de8d0d
FM
2326 * associated to its stripes that don't go away while we are doing the
2327 * read repair operation.
2328 */
2329 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 2330 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
2331 /*
2332 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2333 * to update all raid stripes, but here we just want to correct
2334 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2335 * stripe's dev and sector.
2336 */
2337 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
4c664611 2338 &map_length, &bioc, 0);
e9458bfe
CH
2339 if (ret)
2340 goto out_counter_dec;
4c664611 2341 ASSERT(bioc->mirror_num == 1);
c725328c
LB
2342 } else {
2343 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
4c664611 2344 &map_length, &bioc, mirror_num);
e9458bfe
CH
2345 if (ret)
2346 goto out_counter_dec;
4c664611 2347 BUG_ON(mirror_num != bioc->mirror_num);
4a54c8c1 2348 }
c725328c 2349
4c664611 2350 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
4c664611
QW
2351 dev = bioc->stripes[bioc->mirror_num - 1].dev;
2352 btrfs_put_bioc(bioc);
e9458bfe 2353
ebbede42
AJ
2354 if (!dev || !dev->bdev ||
2355 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
e9458bfe
CH
2356 ret = -EIO;
2357 goto out_counter_dec;
4a54c8c1 2358 }
4a54c8c1 2359
e9458bfe
CH
2360 bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
2361 bio.bi_iter.bi_sector = sector;
2362 __bio_add_page(&bio, page, length, pg_offset);
2363
2364 btrfsic_check_bio(&bio);
2365 ret = submit_bio_wait(&bio);
2366 if (ret) {
4a54c8c1 2367 /* try to remap that extent elsewhere? */
442a4f63 2368 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
e9458bfe 2369 goto out_bio_uninit;
4a54c8c1
JS
2370 }
2371
b14af3b4
DS
2372 btrfs_info_rl_in_rcu(fs_info,
2373 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 2374 ino, start,
1203b681 2375 rcu_str_deref(dev->name), sector);
e9458bfe
CH
2376 ret = 0;
2377
2378out_bio_uninit:
2379 bio_uninit(&bio);
2380out_counter_dec:
b5de8d0d 2381 btrfs_bio_counter_dec(fs_info);
e9458bfe 2382 return ret;
4a54c8c1
JS
2383}
2384
2b48966a 2385int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
ea466794 2386{
20a1fbf9 2387 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 2388 u64 start = eb->start;
cc5e31a4 2389 int i, num_pages = num_extent_pages(eb);
d95603b2 2390 int ret = 0;
ea466794 2391
bc98a42c 2392 if (sb_rdonly(fs_info->sb))
908960c6
ID
2393 return -EROFS;
2394
ea466794 2395 for (i = 0; i < num_pages; i++) {
fb85fc9a 2396 struct page *p = eb->pages[i];
1203b681 2397
6ec656bc 2398 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 2399 start - page_offset(p), mirror_num);
ea466794
JB
2400 if (ret)
2401 break;
09cbfeaf 2402 start += PAGE_SIZE;
ea466794
JB
2403 }
2404
2405 return ret;
2406}
2407
4a54c8c1
JS
2408/*
2409 * each time an IO finishes, we do a fast check in the IO failure tree
2410 * to see if we need to process or clean up an io_failure_record
2411 */
7870d082
JB
2412int clean_io_failure(struct btrfs_fs_info *fs_info,
2413 struct extent_io_tree *failure_tree,
2414 struct extent_io_tree *io_tree, u64 start,
2415 struct page *page, u64 ino, unsigned int pg_offset)
4a54c8c1
JS
2416{
2417 u64 private;
4a54c8c1 2418 struct io_failure_record *failrec;
4a54c8c1
JS
2419 struct extent_state *state;
2420 int num_copies;
4a54c8c1 2421 int ret;
4a54c8c1
JS
2422
2423 private = 0;
7870d082
JB
2424 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2425 EXTENT_DIRTY, 0);
4a54c8c1
JS
2426 if (!ret)
2427 return 0;
2428
2279a270
NB
2429 failrec = get_state_failrec(failure_tree, start);
2430 if (IS_ERR(failrec))
4a54c8c1
JS
2431 return 0;
2432
4a54c8c1
JS
2433 BUG_ON(!failrec->this_mirror);
2434
bc98a42c 2435 if (sb_rdonly(fs_info->sb))
908960c6 2436 goto out;
4a54c8c1 2437
7870d082
JB
2438 spin_lock(&io_tree->lock);
2439 state = find_first_extent_bit_state(io_tree,
4a54c8c1
JS
2440 failrec->start,
2441 EXTENT_LOCKED);
7870d082 2442 spin_unlock(&io_tree->lock);
4a54c8c1 2443
883d0de4
MX
2444 if (state && state->start <= failrec->start &&
2445 state->end >= failrec->start + failrec->len - 1) {
3ec706c8
SB
2446 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2447 failrec->len);
4a54c8c1 2448 if (num_copies > 1) {
7870d082
JB
2449 repair_io_failure(fs_info, ino, start, failrec->len,
2450 failrec->logical, page, pg_offset,
2451 failrec->failed_mirror);
4a54c8c1
JS
2452 }
2453 }
2454
2455out:
7870d082 2456 free_io_failure(failure_tree, io_tree, failrec);
4a54c8c1 2457
454ff3de 2458 return 0;
4a54c8c1
JS
2459}
2460
f612496b
MX
2461/*
2462 * Can be called when
2463 * - hold extent lock
2464 * - under ordered extent
2465 * - the inode is freeing
2466 */
7ab7956e 2467void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 2468{
7ab7956e 2469 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
f612496b
MX
2470 struct io_failure_record *failrec;
2471 struct extent_state *state, *next;
2472
2473 if (RB_EMPTY_ROOT(&failure_tree->state))
2474 return;
2475
2476 spin_lock(&failure_tree->lock);
2477 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2478 while (state) {
2479 if (state->start > end)
2480 break;
2481
2482 ASSERT(state->end <= end);
2483
2484 next = next_state(state);
2485
47dc196a 2486 failrec = state->failrec;
f612496b
MX
2487 free_extent_state(state);
2488 kfree(failrec);
2489
2490 state = next;
2491 }
2492 spin_unlock(&failure_tree->lock);
2493}
2494
3526302f 2495static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
150e4b05 2496 u64 start)
4a54c8c1 2497{
ab8d0fc4 2498 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2499 struct io_failure_record *failrec;
4a54c8c1 2500 struct extent_map *em;
4a54c8c1
JS
2501 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2502 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2503 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
150e4b05 2504 const u32 sectorsize = fs_info->sectorsize;
4a54c8c1 2505 int ret;
4a54c8c1
JS
2506 u64 logical;
2507
2279a270 2508 failrec = get_state_failrec(failure_tree, start);
3526302f 2509 if (!IS_ERR(failrec)) {
ab8d0fc4 2510 btrfs_debug(fs_info,
1245835d
QW
2511 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2512 failrec->logical, failrec->start, failrec->len);
4a54c8c1
JS
2513 /*
2514 * when data can be on disk more than twice, add to failrec here
2515 * (e.g. with a list for failed_mirror) to make
2516 * clean_io_failure() clean all those errors at once.
2517 */
3526302f
NB
2518
2519 return failrec;
4a54c8c1 2520 }
2fe6303e 2521
3526302f
NB
2522 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2523 if (!failrec)
2524 return ERR_PTR(-ENOMEM);
2fe6303e 2525
3526302f 2526 failrec->start = start;
150e4b05 2527 failrec->len = sectorsize;
3526302f
NB
2528 failrec->this_mirror = 0;
2529 failrec->bio_flags = 0;
3526302f
NB
2530
2531 read_lock(&em_tree->lock);
2532 em = lookup_extent_mapping(em_tree, start, failrec->len);
2533 if (!em) {
2534 read_unlock(&em_tree->lock);
2535 kfree(failrec);
2536 return ERR_PTR(-EIO);
2537 }
2538
2539 if (em->start > start || em->start + em->len <= start) {
2540 free_extent_map(em);
2541 em = NULL;
2542 }
2543 read_unlock(&em_tree->lock);
2544 if (!em) {
2545 kfree(failrec);
2546 return ERR_PTR(-EIO);
2547 }
2548
2549 logical = start - em->start;
2550 logical = em->block_start + logical;
2551 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2552 logical = em->block_start;
2553 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2554 extent_set_compress_type(&failrec->bio_flags, em->compress_type);
2555 }
2556
2557 btrfs_debug(fs_info,
2558 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2559 logical, start, failrec->len);
2560
2561 failrec->logical = logical;
2562 free_extent_map(em);
2563
2564 /* Set the bits in the private failure tree */
150e4b05 2565 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
3526302f
NB
2566 EXTENT_LOCKED | EXTENT_DIRTY);
2567 if (ret >= 0) {
2568 ret = set_state_failrec(failure_tree, start, failrec);
2569 /* Set the bits in the inode's tree */
150e4b05
QW
2570 ret = set_extent_bits(tree, start, start + sectorsize - 1,
2571 EXTENT_DAMAGED);
3526302f
NB
2572 } else if (ret < 0) {
2573 kfree(failrec);
2574 return ERR_PTR(ret);
2575 }
2576
2577 return failrec;
2fe6303e
MX
2578}
2579
1245835d 2580static bool btrfs_check_repairable(struct inode *inode,
ce06d3ec
OS
2581 struct io_failure_record *failrec,
2582 int failed_mirror)
2fe6303e 2583{
ab8d0fc4 2584 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2585 int num_copies;
2586
ab8d0fc4 2587 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
4a54c8c1
JS
2588 if (num_copies == 1) {
2589 /*
2590 * we only have a single copy of the data, so don't bother with
2591 * all the retry and error correction code that follows. no
2592 * matter what the error is, it is very likely to persist.
2593 */
ab8d0fc4
JM
2594 btrfs_debug(fs_info,
2595 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2596 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2597 return false;
4a54c8c1
JS
2598 }
2599
1245835d
QW
2600 /* The failure record should only contain one sector */
2601 ASSERT(failrec->len == fs_info->sectorsize);
2602
4a54c8c1 2603 /*
1245835d
QW
2604 * There are two premises:
2605 * a) deliver good data to the caller
2606 * b) correct the bad sectors on disk
2607 *
2608 * Since we're only doing repair for one sector, we only need to get
2609 * a good copy of the failed sector and if we succeed, we have setup
2610 * everything for repair_io_failure to do the rest for us.
4a54c8c1 2611 */
510671d2 2612 ASSERT(failed_mirror);
1245835d
QW
2613 failrec->failed_mirror = failed_mirror;
2614 failrec->this_mirror++;
2615 if (failrec->this_mirror == failed_mirror)
4a54c8c1 2616 failrec->this_mirror++;
4a54c8c1 2617
facc8a22 2618 if (failrec->this_mirror > num_copies) {
ab8d0fc4
JM
2619 btrfs_debug(fs_info,
2620 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2621 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2622 return false;
4a54c8c1
JS
2623 }
2624
c3cfb656 2625 return true;
2fe6303e
MX
2626}
2627
150e4b05
QW
2628int btrfs_repair_one_sector(struct inode *inode,
2629 struct bio *failed_bio, u32 bio_offset,
2630 struct page *page, unsigned int pgoff,
2631 u64 start, int failed_mirror,
2632 submit_bio_hook_t *submit_bio_hook)
2fe6303e
MX
2633{
2634 struct io_failure_record *failrec;
77d5d689 2635 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2636 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
7870d082 2637 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
c3a3b19b 2638 struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
7ffd27e3 2639 const int icsum = bio_offset >> fs_info->sectorsize_bits;
77d5d689 2640 struct bio *repair_bio;
c3a3b19b 2641 struct btrfs_bio *repair_bbio;
2fe6303e 2642
77d5d689
OS
2643 btrfs_debug(fs_info,
2644 "repair read error: read error at %llu", start);
2fe6303e 2645
1f7ad75b 2646 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e 2647
150e4b05 2648 failrec = btrfs_get_io_failure_record(inode, start);
3526302f 2649 if (IS_ERR(failrec))
150e4b05 2650 return PTR_ERR(failrec);
2fe6303e 2651
1245835d
QW
2652
2653 if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
7870d082 2654 free_io_failure(failure_tree, tree, failrec);
150e4b05 2655 return -EIO;
2fe6303e
MX
2656 }
2657
c3a3b19b
QW
2658 repair_bio = btrfs_bio_alloc(1);
2659 repair_bbio = btrfs_bio(repair_bio);
00d82525 2660 repair_bbio->file_offset = start;
77d5d689 2661 repair_bio->bi_opf = REQ_OP_READ;
77d5d689
OS
2662 repair_bio->bi_end_io = failed_bio->bi_end_io;
2663 repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2664 repair_bio->bi_private = failed_bio->bi_private;
2fe6303e 2665
c3a3b19b 2666 if (failed_bbio->csum) {
223486c2 2667 const u32 csum_size = fs_info->csum_size;
77d5d689 2668
c3a3b19b
QW
2669 repair_bbio->csum = repair_bbio->csum_inline;
2670 memcpy(repair_bbio->csum,
2671 failed_bbio->csum + csum_size * icsum, csum_size);
77d5d689 2672 }
2fe6303e 2673
77d5d689 2674 bio_add_page(repair_bio, page, failrec->len, pgoff);
c3a3b19b 2675 repair_bbio->iter = repair_bio->bi_iter;
4a54c8c1 2676
ab8d0fc4 2677 btrfs_debug(btrfs_sb(inode->i_sb),
1245835d
QW
2678 "repair read error: submitting new read to mirror %d",
2679 failrec->this_mirror);
4a54c8c1 2680
8cbc3001
JB
2681 /*
2682 * At this point we have a bio, so any errors from submit_bio_hook()
2683 * will be handled by the endio on the repair_bio, so we can't return an
2684 * error here.
2685 */
2686 submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags);
2687 return BLK_STS_OK;
150e4b05
QW
2688}
2689
2690static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2691{
2692 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2693
2694 ASSERT(page_offset(page) <= start &&
2695 start + len <= page_offset(page) + PAGE_SIZE);
2696
150e4b05 2697 if (uptodate) {
14605409
BB
2698 if (fsverity_active(page->mapping->host) &&
2699 !PageError(page) &&
2700 !PageUptodate(page) &&
2701 start < i_size_read(page->mapping->host) &&
2702 !fsverity_verify_page(page)) {
2703 btrfs_page_set_error(fs_info, page, start, len);
2704 } else {
2705 btrfs_page_set_uptodate(fs_info, page, start, len);
2706 }
150e4b05
QW
2707 } else {
2708 btrfs_page_clear_uptodate(fs_info, page, start, len);
2709 btrfs_page_set_error(fs_info, page, start, len);
2710 }
2711
fbca46eb 2712 if (!btrfs_is_subpage(fs_info, page))
150e4b05 2713 unlock_page(page);
3d078efa 2714 else
150e4b05
QW
2715 btrfs_subpage_end_reader(fs_info, page, start, len);
2716}
2717
2718static blk_status_t submit_read_repair(struct inode *inode,
2719 struct bio *failed_bio, u32 bio_offset,
2720 struct page *page, unsigned int pgoff,
2721 u64 start, u64 end, int failed_mirror,
2722 unsigned int error_bitmap,
2723 submit_bio_hook_t *submit_bio_hook)
2724{
2725 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2726 const u32 sectorsize = fs_info->sectorsize;
2727 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2728 int error = 0;
2729 int i;
2730
2731 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2732
2733 /* We're here because we had some read errors or csum mismatch */
2734 ASSERT(error_bitmap);
2735
2736 /*
2737 * We only get called on buffered IO, thus page must be mapped and bio
2738 * must not be cloned.
2739 */
2740 ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
2741
2742 /* Iterate through all the sectors in the range */
2743 for (i = 0; i < nr_bits; i++) {
2744 const unsigned int offset = i * sectorsize;
2745 struct extent_state *cached = NULL;
2746 bool uptodate = false;
2747 int ret;
2748
2749 if (!(error_bitmap & (1U << i))) {
2750 /*
2751 * This sector has no error, just end the page read
2752 * and unlock the range.
2753 */
2754 uptodate = true;
2755 goto next;
2756 }
2757
2758 ret = btrfs_repair_one_sector(inode, failed_bio,
2759 bio_offset + offset,
2760 page, pgoff + offset, start + offset,
2761 failed_mirror, submit_bio_hook);
2762 if (!ret) {
2763 /*
2764 * We have submitted the read repair, the page release
2765 * will be handled by the endio function of the
2766 * submitted repair bio.
2767 * Thus we don't need to do any thing here.
2768 */
2769 continue;
2770 }
2771 /*
2772 * Repair failed, just record the error but still continue.
2773 * Or the remaining sectors will not be properly unlocked.
2774 */
2775 if (!error)
2776 error = ret;
2777next:
2778 end_page_read(page, uptodate, start + offset, sectorsize);
2779 if (uptodate)
2780 set_extent_uptodate(&BTRFS_I(inode)->io_tree,
2781 start + offset,
2782 start + offset + sectorsize - 1,
2783 &cached, GFP_ATOMIC);
2784 unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
2785 start + offset,
2786 start + offset + sectorsize - 1,
2787 &cached);
2788 }
2789 return errno_to_blk_status(error);
4a54c8c1
JS
2790}
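/*
 * Worked example for the error_bitmap handling above, assuming a 4K
 * sectorsize and a 16K range (nr_bits == 4): error_bitmap == 0b0101 means
 * sectors 0 and 2 of [start, end] failed and each gets its own
 * btrfs_repair_one_sector() call, while sectors 1 and 3 are simply marked
 * uptodate and finished through end_page_read().
 */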
2791
d1310b2e
CM
2792/* lots and lots of room for performance fixes in the end_bio funcs */
2793
b5227c07 2794void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0 2795{
38a39ac7 2796 struct btrfs_inode *inode;
25c1252a 2797 const bool uptodate = (err == 0);
3e2426bd 2798 int ret = 0;
87826df0 2799
38a39ac7
QW
2800 ASSERT(page && page->mapping);
2801 inode = BTRFS_I(page->mapping->host);
2802 btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
87826df0 2803
87826df0 2804 if (!uptodate) {
963e4db8
QW
2805 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
2806 u32 len;
2807
2808 ASSERT(end + 1 - start <= U32_MAX);
2809 len = end + 1 - start;
2810
2811 btrfs_page_clear_uptodate(fs_info, page, start, len);
2812 btrfs_page_set_error(fs_info, page, start, len);
bff5baf8 2813 ret = err < 0 ? err : -EIO;
5dca6eea 2814 mapping_set_error(page->mapping, ret);
87826df0 2815 }
87826df0
JM
2816}
2817
d1310b2e
CM
2818/*
2819 * after a writepage IO is done, we need to:
2820 * clear the uptodate bits on error
2821 * clear the writeback bits in the extent tree for this IO
2822 * end_page_writeback if the page has no more pending IO
2823 *
2824 * Scheduling is not allowed, so the extent state tree is expected
2825 * to have one and only one object corresponding to this IO.
2826 */
4246a0b6 2827static void end_bio_extent_writepage(struct bio *bio)
d1310b2e 2828{
4e4cbee9 2829 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 2830 struct bio_vec *bvec;
d1310b2e
CM
2831 u64 start;
2832 u64 end;
6dc4f100 2833 struct bvec_iter_all iter_all;
d8e3fb10 2834 bool first_bvec = true;
d1310b2e 2835
c09abff8 2836 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2837 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2838 struct page *page = bvec->bv_page;
0b246afa
JM
2839 struct inode *inode = page->mapping->host;
2840 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
321a02db
QW
2841 const u32 sectorsize = fs_info->sectorsize;
2842
2843 /* Our read/write should always be sector aligned. */
2844 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2845 btrfs_err(fs_info,
2846 "partial page write in btrfs with offset %u and length %u",
2847 bvec->bv_offset, bvec->bv_len);
2848 else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
2849 btrfs_info(fs_info,
2850 "incomplete page write with offset %u and length %u",
2851 bvec->bv_offset, bvec->bv_len);
2852
2853 start = page_offset(page) + bvec->bv_offset;
2854 end = start + bvec->bv_len - 1;
d1310b2e 2855
d8e3fb10
NA
2856 if (first_bvec) {
2857 btrfs_record_physical_zoned(inode, start, bio);
2858 first_bvec = false;
2859 }
2860
4e4cbee9 2861 end_extent_writepage(page, error, start, end);
9047e317
QW
2862
2863 btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
2c30c71b 2864 }
2b1f55b0 2865
d1310b2e 2866 bio_put(bio);
d1310b2e
CM
2867}
2868
94e8c95c
QW
2869/*
2870 * Record previously processed extent range
2871 *
2872 * For endio_readpage_release_extent() to handle a full extent range, reducing
2873 * the extent io operations.
2874 */
2875struct processed_extent {
2876 struct btrfs_inode *inode;
2877 /* Start of the range in @inode */
2878 u64 start;
2e626e56 2879 /* End of the range in @inode */
94e8c95c
QW
2880 u64 end;
2881 bool uptodate;
2882};
2883
2884/*
2885 * Try to release processed extent range
2886 *
2887 * May not release the extent range right now if the current range is
2888 * contiguous to processed extent.
2889 *
2890 * Will release the processed extent when @inode or @uptodate changes, or when
2891 * the range is no longer contiguous to the processed range.
2892 *
2893 * Passing @inode == NULL will force processed extent to be released.
2894 */
2895static void endio_readpage_release_extent(struct processed_extent *processed,
2896 struct btrfs_inode *inode, u64 start, u64 end,
2897 bool uptodate)
883d0de4
MX
2898{
2899 struct extent_state *cached = NULL;
94e8c95c
QW
2900 struct extent_io_tree *tree;
2901
2902 /* The first extent, initialize @processed */
2903 if (!processed->inode)
2904 goto update;
883d0de4 2905
94e8c95c
QW
2906 /*
2907 * Contiguous to processed extent, just update the end.
2908 *
2909 * Several things to notice:
2910 *
2911 * - bio can be merged as long as on-disk bytenr is contiguous
2912 * This means we can have pages belonging to other inodes, thus we need to
2913 * check if the inode still matches.
2914 * - bvec can contain range beyond current page for multi-page bvec
2915 * Thus we need to do processed->end + 1 >= start check
2916 */
2917 if (processed->inode == inode && processed->uptodate == uptodate &&
2918 processed->end + 1 >= start && end >= processed->end) {
2919 processed->end = end;
2920 return;
2921 }
2922
2923 tree = &processed->inode->io_tree;
2924 /*
2925 * Now we don't have range contiguous to the processed range, release
2926 * the processed range now.
2927 */
2928 if (processed->uptodate && tree->track_uptodate)
2929 set_extent_uptodate(tree, processed->start, processed->end,
2930 &cached, GFP_ATOMIC);
2931 unlock_extent_cached_atomic(tree, processed->start, processed->end,
2932 &cached);
2933
2934update:
2935 /* Update processed to current range */
2936 processed->inode = inode;
2937 processed->start = start;
2938 processed->end = end;
2939 processed->uptodate = uptodate;
883d0de4
MX
2940}
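/*
 * Worked example for the merge rule above: with an uptodate processed
 * range [0, 64K - 1], a new uptodate range starting at 64K from the same
 * inode only bumps processed->end; a range from another inode, a
 * non-uptodate range, or one that leaves a gap forces the accumulated
 * range to be released first.
 */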
2941
92082d40
QW
2942static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2943{
2944 ASSERT(PageLocked(page));
fbca46eb 2945 if (!btrfs_is_subpage(fs_info, page))
92082d40
QW
2946 return;
2947
2948 ASSERT(PagePrivate(page));
2949 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2950}
2951
d9bb77d5
QW
2952/*
2953 * Find extent buffer for a given bytenr.
2954 *
2955 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2956 * in endio context.
2957 */
2958static struct extent_buffer *find_extent_buffer_readpage(
2959 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2960{
2961 struct extent_buffer *eb;
2962
2963 /*
2964 * For regular sectorsize, we can use page->private to grab extent
2965 * buffer
2966 */
fbca46eb 2967 if (fs_info->nodesize >= PAGE_SIZE) {
d9bb77d5
QW
2968 ASSERT(PagePrivate(page) && page->private);
2969 return (struct extent_buffer *)page->private;
2970 }
2971
2972 /* For subpage case, we need to lookup buffer radix tree */
2973 rcu_read_lock();
2974 eb = radix_tree_lookup(&fs_info->buffer_radix,
2975 bytenr >> fs_info->sectorsize_bits);
2976 rcu_read_unlock();
2977 ASSERT(eb);
2978 return eb;
2979}
2980
d1310b2e
CM
2981/*
2982 * after a readpage IO is done, we need to:
2983 * clear the uptodate bits on error
2984 * set the uptodate bits if things worked
2985 * set the page up to date if all extents in the tree are uptodate
2986 * clear the lock bit in the extent tree
2987 * unlock the page if there are no other extents locked for it
2988 *
2989 * Scheduling is not allowed, so the extent state tree is expected
2990 * to have one and only one object corresponding to this IO.
2991 */
4246a0b6 2992static void end_bio_extent_readpage(struct bio *bio)
d1310b2e 2993{
2c30c71b 2994 struct bio_vec *bvec;
c3a3b19b 2995 struct btrfs_bio *bbio = btrfs_bio(bio);
7870d082 2996 struct extent_io_tree *tree, *failure_tree;
94e8c95c 2997 struct processed_extent processed = { 0 };
7ffd27e3
QW
2998 /*
2999 * The offset to the beginning of a bio, since one bio can never be
3000 * larger than UINT_MAX, u32 here is enough.
3001 */
3002 u32 bio_offset = 0;
5cf1ab56 3003 int mirror;
d1310b2e 3004 int ret;
6dc4f100 3005 struct bvec_iter_all iter_all;
d1310b2e 3006
c09abff8 3007 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 3008 bio_for_each_segment_all(bvec, bio, iter_all) {
150e4b05 3009 bool uptodate = !bio->bi_status;
d1310b2e 3010 struct page *page = bvec->bv_page;
a71754fc 3011 struct inode *inode = page->mapping->host;
ab8d0fc4 3012 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7ffd27e3 3013 const u32 sectorsize = fs_info->sectorsize;
150e4b05 3014 unsigned int error_bitmap = (unsigned int)-1;
7ffd27e3
QW
3015 u64 start;
3016 u64 end;
3017 u32 len;
507903b8 3018
ab8d0fc4
JM
3019 btrfs_debug(fs_info,
3020 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
1201b58b 3021 bio->bi_iter.bi_sector, bio->bi_status,
c3a3b19b 3022 bbio->mirror_num);
a71754fc 3023 tree = &BTRFS_I(inode)->io_tree;
7870d082 3024 failure_tree = &BTRFS_I(inode)->io_failure_tree;
902b22f3 3025
8b8bbd46
QW
3026 /*
3027 * We always issue full-sector reads, but if some block in a
3028 * page fails to read, blk_update_request() will advance
3029 * bv_offset and adjust bv_len to compensate. Print a warning
3030 * for unaligned offsets, and an error if they don't add up to
3031 * a full sector.
3032 */
3033 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
3034 btrfs_err(fs_info,
3035 "partial page read in btrfs with offset %u and length %u",
3036 bvec->bv_offset, bvec->bv_len);
3037 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
3038 sectorsize))
3039 btrfs_info(fs_info,
3040 "incomplete page read with offset %u and length %u",
3041 bvec->bv_offset, bvec->bv_len);
3042
3043 start = page_offset(page) + bvec->bv_offset;
3044 end = start + bvec->bv_len - 1;
facc8a22 3045 len = bvec->bv_len;
d1310b2e 3046
c3a3b19b 3047 mirror = bbio->mirror_num;
78e62c02 3048 if (likely(uptodate)) {
150e4b05 3049 if (is_data_inode(inode)) {
c3a3b19b 3050 error_bitmap = btrfs_verify_data_csum(bbio,
5e295768 3051 bio_offset, page, start, end);
150e4b05
QW
3052 ret = error_bitmap;
3053 } else {
c3a3b19b 3054 ret = btrfs_validate_metadata_buffer(bbio,
8e1dc982 3055 page, start, end, mirror);
150e4b05 3056 }
5ee0844d 3057 if (ret)
150e4b05 3058 uptodate = false;
5ee0844d 3059 else
7870d082
JB
3060 clean_io_failure(BTRFS_I(inode)->root->fs_info,
3061 failure_tree, tree, start,
3062 page,
3063 btrfs_ino(BTRFS_I(inode)), 0);
d1310b2e 3064 }
ea466794 3065
f2a09da9
MX
3066 if (likely(uptodate))
3067 goto readpage_ok;
3068
be17b3af 3069 if (is_data_inode(inode)) {
510671d2
JB
3070 /*
3071 * If we failed to submit the IO at all we'll have a
3072 * mirror_num == 0, in which case we need to just mark
3073 * the page with an error and unlock it and carry on.
3074 */
3075 if (mirror == 0)
3076 goto readpage_ok;
3077
f4a8e656 3078 /*
150e4b05
QW
3079 * btrfs_submit_read_repair() will handle all the good
3080 * and bad sectors, we just continue to the next bvec.
f4a8e656 3081 */
150e4b05
QW
3082 submit_read_repair(inode, bio, bio_offset, page,
3083 start - page_offset(page), start,
3084 end, mirror, error_bitmap,
3085 btrfs_submit_data_bio);
3086
3087 ASSERT(bio_offset + len > bio_offset);
3088 bio_offset += len;
3089 continue;
78e62c02
NB
3090 } else {
3091 struct extent_buffer *eb;
3092
d9bb77d5 3093 eb = find_extent_buffer_readpage(fs_info, page, start);
78e62c02
NB
3094 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3095 eb->read_mirror = mirror;
3096 atomic_dec(&eb->io_pages);
7e38326f 3097 }
f2a09da9 3098readpage_ok:
883d0de4 3099 if (likely(uptodate)) {
a71754fc 3100 loff_t i_size = i_size_read(inode);
09cbfeaf 3101 pgoff_t end_index = i_size >> PAGE_SHIFT;
a71754fc 3102
c28ea613
QW
3103 /*
3104 * Zero out the remaining part if this range straddles
3105 * i_size.
3106 *
3107 * Here we should only zero the range inside the bvec,
3108 * not touch anything else.
3109 *
3110 * NOTE: i_size is exclusive while end is inclusive.
3111 */
3112 if (page->index == end_index && i_size <= end) {
3113 u32 zero_start = max(offset_in_page(i_size),
d2dcc8ed 3114 offset_in_page(start));
c28ea613
QW
3115
3116 zero_user_segment(page, zero_start,
3117 offset_in_page(end) + 1);
3118 }
70dec807 3119 }
7ffd27e3
QW
3120 ASSERT(bio_offset + len > bio_offset);
3121 bio_offset += len;
883d0de4 3122
e09caaf9 3123 /* Update page status and unlock */
92082d40 3124 end_page_read(page, uptodate, start, len);
94e8c95c 3125 endio_readpage_release_extent(&processed, BTRFS_I(inode),
14605409 3126 start, end, PageUptodate(page));
2c30c71b 3127 }
94e8c95c
QW
3128 /* Release the last extent */
3129 endio_readpage_release_extent(&processed, NULL, 0, 0, false);
c3a3b19b 3130 btrfs_bio_free_csum(bbio);
d1310b2e 3131 bio_put(bio);
d1310b2e
CM
3132}
3133
dd137dd1
STD
3134/**
3135 * Populate every free slot in a provided array with pages.
3136 *
3137 * @nr_pages: number of pages to allocate
3138 * @page_array: the array to fill with pages; any existing non-null entries in
3139 * the array will be skipped
3140 *
3141 * Return: 0 if all pages were able to be allocated;
3142 * -ENOMEM otherwise, and the caller is responsible for freeing all
3143 * non-null page pointers in the array.
3144 */
3145int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
3146{
91d6ac1d 3147 unsigned int allocated;
dd137dd1 3148
91d6ac1d
STD
3149 for (allocated = 0; allocated < nr_pages;) {
3150 unsigned int last = allocated;
dd137dd1 3151
91d6ac1d
STD
3152 allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
3153
395cb57e
STD
3154 if (allocated == nr_pages)
3155 return 0;
3156
91d6ac1d
STD
3157 /*
3158 * During this iteration, no page could be allocated, even
3159 * though alloc_pages_bulk_array() falls back to alloc_page()
3160 * if it could not bulk-allocate. So we must be out of memory.
3161 */
3162 if (allocated == last)
dd137dd1 3163 return -ENOMEM;
395cb57e
STD
3164
3165 memalloc_retry_wait(GFP_NOFS);
dd137dd1
STD
3166 }
3167 return 0;
3168}
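/*
 * Illustrative usage sketch for btrfs_alloc_page_array(); the array size
 * of 16 is an assumption for the example:
 *
 *	struct page *pages[16] = { NULL };
 *
 *	if (btrfs_alloc_page_array(ARRAY_SIZE(pages), pages)) {
 *		// -ENOMEM: the caller must free whatever was allocated
 *		for (int i = 0; i < ARRAY_SIZE(pages); i++)
 *			if (pages[i])
 *				__free_page(pages[i]);
 *	}
 */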
3169
9be3395b 3170/*
184f999e
DS
3171 * Initialize the members up to but not including 'bio'. Use after allocating a
3172 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
3173 * 'bio' because use of __GFP_ZERO is not supported.
9be3395b 3174 */
c3a3b19b 3175static inline void btrfs_bio_init(struct btrfs_bio *bbio)
d1310b2e 3176{
c3a3b19b 3177 memset(bbio, 0, offsetof(struct btrfs_bio, bio));
184f999e 3178}
d1310b2e 3179
9be3395b 3180/*
cd8e0cca
QW
3181 * Allocate a btrfs_bio, with @nr_iovecs as maximum number of iovecs.
3182 *
3183 * The bio allocation is backed by bioset and does not fail.
9be3395b 3184 */
c3a3b19b 3185struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
d1310b2e
CM
3186{
3187 struct bio *bio;
d1310b2e 3188
cd8e0cca 3189 ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
609be106 3190 bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
c3a3b19b 3191 btrfs_bio_init(btrfs_bio(bio));
d1310b2e
CM
3192 return bio;
3193}
3194
110ac0e5 3195struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
9be3395b 3196{
c3a3b19b 3197 struct btrfs_bio *bbio;
23ea8e5a 3198 struct bio *new;
9be3395b 3199
6e707bcd 3200 /* Bio allocation backed by a bioset does not fail */
110ac0e5 3201 new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
c3a3b19b
QW
3202 bbio = btrfs_bio(new);
3203 btrfs_bio_init(bbio);
3204 bbio->iter = bio->bi_iter;
23ea8e5a
MX
3205 return new;
3206}
9be3395b 3207
21dda654 3208struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
2f8e9140
LB
3209{
3210 struct bio *bio;
c3a3b19b 3211 struct btrfs_bio *bbio;
2f8e9140 3212
21dda654
CK
3213 ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
3214
2f8e9140 3215 /* this will never fail when it's backed by a bioset */
abfc426d 3216 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
2f8e9140
LB
3217 ASSERT(bio);
3218
c3a3b19b
QW
3219 bbio = btrfs_bio(bio);
3220 btrfs_bio_init(bbio);
2f8e9140
LB
3221
3222 bio_trim(bio, offset >> 9, size >> 9);
c3a3b19b 3223 bbio->iter = bio->bi_iter;
2f8e9140
LB
3224 return bio;
3225}
9be3395b 3226
953651eb
NA
3227/**
3228 * Attempt to add a page to bio
3229 *
be8d1a2a 3230 * @bio_ctrl: record both the bio, and its bio_flags
953651eb
NA
3231 * @page: page to add to the bio
3232 * @disk_bytenr: offset of the new bio or to check whether we are adding
3233 * a contiguous page to the previous one
953651eb 3234 * @size: portion of page that we want to write
be8d1a2a 3235 * @pg_offset: starting offset in the page
953651eb 3236 * @bio_flags: flags of the current bio to see if we can merge them
953651eb
NA
3237 *
3238 * Attempt to add a page to bio considering stripe alignment etc.
3239 *
e0eefe07
QW
3240 * Return >= 0 for the number of bytes added to the bio.
3241 * Can return 0 if the current bio is already at stripe/zone boundary.
3242 * Return <0 for error.
953651eb 3243 */
e0eefe07
QW
3244static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3245 struct page *page,
3246 u64 disk_bytenr, unsigned int size,
3247 unsigned int pg_offset,
3248 unsigned long bio_flags)
953651eb 3249{
390ed29b
QW
3250 struct bio *bio = bio_ctrl->bio;
3251 u32 bio_size = bio->bi_iter.bi_size;
e0eefe07 3252 u32 real_size;
953651eb
NA
3253 const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3254 bool contig;
e1326f03 3255 int ret;
953651eb 3256
390ed29b
QW
3257 ASSERT(bio);
3258 /* The limit should be calculated when bio_ctrl->bio is allocated */
3259 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3260 if (bio_ctrl->bio_flags != bio_flags)
e0eefe07 3261 return 0;
953651eb 3262
390ed29b 3263 if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
953651eb
NA
3264 contig = bio->bi_iter.bi_sector == sector;
3265 else
3266 contig = bio_end_sector(bio) == sector;
3267 if (!contig)
e0eefe07 3268 return 0;
953651eb 3269
e0eefe07
QW
3270 real_size = min(bio_ctrl->len_to_oe_boundary,
3271 bio_ctrl->len_to_stripe_boundary) - bio_size;
3272 real_size = min(real_size, size);
3273
3274 /*
3275 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
3276 * bio will still execute its endio function on the page!
3277 */
3278 if (real_size == 0)
3279 return 0;
953651eb 3280
390ed29b 3281 if (bio_op(bio) == REQ_OP_ZONE_APPEND)
e0eefe07 3282 ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
390ed29b 3283 else
e0eefe07 3284 ret = bio_add_page(bio, page, real_size, pg_offset);
e1326f03 3285
e0eefe07 3286 return ret;
953651eb
NA
3287}
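/*
 * Illustrative sketch of how the return value above is meant to be
 * consumed (a simplified version of what submit_extent_page() does):
 *
 *	added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, len,
 *				   pg_offset, bio_flags);
 *	if (added == 0) {
 *		// Stripe/ordered-extent boundary hit or range not
 *		// contiguous: submit bio_ctrl->bio and retry with a new bio.
 *	} else {
 *		// 'added' bytes of the page now belong to bio_ctrl->bio.
 *	}
 */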
3288
390ed29b 3289static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
939c7feb 3290 struct btrfs_inode *inode, u64 file_offset)
390ed29b
QW
3291{
3292 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3293 struct btrfs_io_geometry geom;
3294 struct btrfs_ordered_extent *ordered;
3295 struct extent_map *em;
3296 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3297 int ret;
3298
3299 /*
3300 * Pages for compressed extent are never submitted to disk directly,
3301 * thus it has no real boundary, just set them to U32_MAX.
3302 *
3303 * The split happens for real compressed bio, which happens in
3304 * btrfs_submit_compressed_read/write().
3305 */
3306 if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
3307 bio_ctrl->len_to_oe_boundary = U32_MAX;
3308 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3309 return 0;
3310 }
3311 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3312 if (IS_ERR(em))
3313 return PTR_ERR(em);
3314 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3315 logical, &geom);
3316 free_extent_map(em);
3317 if (ret < 0) {
3318 return ret;
3319 }
3320 if (geom.len > U32_MAX)
3321 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3322 else
3323 bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3324
73672710 3325 if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
390ed29b
QW
3326 bio_ctrl->len_to_oe_boundary = U32_MAX;
3327 return 0;
3328 }
3329
390ed29b 3330 /* Ordered extent not yet created, so we're good */
939c7feb 3331 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
390ed29b
QW
3332 if (!ordered) {
3333 bio_ctrl->len_to_oe_boundary = U32_MAX;
3334 return 0;
3335 }
3336
3337 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3338 ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3339 btrfs_put_ordered_extent(ordered);
3340 return 0;
3341}
3342
e0eefe07
QW
3343static int alloc_new_bio(struct btrfs_inode *inode,
3344 struct btrfs_bio_ctrl *bio_ctrl,
3345 struct writeback_control *wbc,
3346 unsigned int opf,
3347 bio_end_io_t end_io_func,
939c7feb 3348 u64 disk_bytenr, u32 offset, u64 file_offset,
e0eefe07
QW
3349 unsigned long bio_flags)
3350{
3351 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3352 struct bio *bio;
3353 int ret;
3354
c3a3b19b 3355 bio = btrfs_bio_alloc(BIO_MAX_VECS);
e0eefe07
QW
3356 /*
3357 * For compressed page range, its disk_bytenr is always @disk_bytenr
3358 * passed in, no matter if we have added any range into previous bio.
3359 */
3360 if (bio_flags & EXTENT_BIO_COMPRESSED)
cd8e0cca 3361 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
e0eefe07 3362 else
cd8e0cca 3363 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
e0eefe07
QW
3364 bio_ctrl->bio = bio;
3365 bio_ctrl->bio_flags = bio_flags;
e0eefe07
QW
3366 bio->bi_end_io = end_io_func;
3367 bio->bi_private = &inode->io_tree;
e0eefe07 3368 bio->bi_opf = opf;
939c7feb
NA
3369 ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
3370 if (ret < 0)
3371 goto error;
e0eefe07 3372
50f1cff3
CH
3373 if (wbc) {
3374 /*
3375 * For Zone append we need the correct block_device that we are
3376 * going to write to set in the bio to be able to respect the
3377 * hardware limitation. Look it up here:
3378 */
3379 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3380 struct btrfs_device *dev;
3381
3382 dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
3383 fs_info->sectorsize);
3384 if (IS_ERR(dev)) {
3385 ret = PTR_ERR(dev);
3386 goto error;
3387 }
e0eefe07 3388
50f1cff3
CH
3389 bio_set_dev(bio, dev->bdev);
3390 } else {
3391 /*
3392 * Otherwise pick the last added device to support
3393 * cgroup writeback. For multi-device file systems this
3394 * means blk-cgroup policies have to always be set on the
3395 * last added/replaced device. This is a bit odd but has
3396 * been like that for a long time.
3397 */
3398 bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
e0eefe07 3399 }
50f1cff3
CH
3400 wbc_init_bio(wbc, bio);
3401 } else {
3402 ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
e0eefe07
QW
3403 }
3404 return 0;
3405error:
3406 bio_ctrl->bio = NULL;
3407 bio->bi_status = errno_to_blk_status(ret);
3408 bio_endio(bio);
3409 return ret;
3410}
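/*
 * On the error path above the partially built bio is not leaked: bio_ctrl->bio
 * is cleared so the caller will not reuse it, the errno is converted with
 * errno_to_blk_status() and the bio is completed via bio_endio().
 */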
3411
4b81ba48
DS
3412/*
3413 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625
DS
3414 * @wbc: optional writeback control for io accounting
3415 * @page: page to add to the bio
0c64c33c
QW
3416 * @disk_bytenr: logical bytenr where the write will be
3417 * @size: portion of page that we want to write to
b8b3d625
DS
3418 * @pg_offset: offset of the new bio or to check whether we are adding
3419 * a contiguous page to the previous one
5c2b1fd7 3420  * @bio_ctrl:	 holds the bio under construction and its stripe/ordered extent boundary state
b8b3d625
DS
3421 * @end_io_func: end_io callback for new bio
3422 * @mirror_num: desired mirror to read/write
 3423  * @bio_flags:	 flags of the current bio to see if we can merge them
 3424  * @force_bio_submit: submit the bio held in @bio_ctrl before adding this page range
4b81ba48 3425 */
0ceb34bf 3426static int submit_extent_page(unsigned int opf,
da2f0f74 3427 struct writeback_control *wbc,
390ed29b 3428 struct btrfs_bio_ctrl *bio_ctrl,
0c64c33c 3429 struct page *page, u64 disk_bytenr,
6c5a4e2c 3430 size_t size, unsigned long pg_offset,
f188591e 3431 bio_end_io_t end_io_func,
c8b97818 3432 int mirror_num,
005efedf
FM
3433 unsigned long bio_flags,
3434 bool force_bio_submit)
d1310b2e
CM
3435{
3436 int ret = 0;
e1326f03 3437 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
e0eefe07 3438 unsigned int cur = pg_offset;
d1310b2e 3439
390ed29b 3440 ASSERT(bio_ctrl);
5c2b1fd7 3441
390ed29b
QW
3442 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3443 pg_offset + size <= PAGE_SIZE);
e0eefe07
QW
3444 if (force_bio_submit && bio_ctrl->bio) {
3445 ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
3446 bio_ctrl->bio = NULL;
3447 if (ret < 0)
3448 return ret;
3449 }
3450
3451 while (cur < pg_offset + size) {
3452 u32 offset = cur - pg_offset;
3453 int added;
3454
3455 /* Allocate new bio if needed */
3456 if (!bio_ctrl->bio) {
3457 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3458 end_io_func, disk_bytenr, offset,
939c7feb 3459 page_offset(page) + cur,
e0eefe07
QW
3460 bio_flags);
3461 if (ret < 0)
3462 return ret;
3463 }
3464 /*
3465 * We must go through btrfs_bio_add_page() to ensure each
3466 * page range won't cross various boundaries.
3467 */
3468 if (bio_flags & EXTENT_BIO_COMPRESSED)
3469 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
3470 size - offset, pg_offset + offset,
3471 bio_flags);
3472 else
3473 added = btrfs_bio_add_page(bio_ctrl, page,
3474 disk_bytenr + offset, size - offset,
3475 pg_offset + offset, bio_flags);
3476
3477 /* Metadata page range should never be split */
3478 if (!is_data_inode(&inode->vfs_inode))
3479 ASSERT(added == 0 || added == size - offset);
3480
3481 /* At least we added some page, update the account */
3482 if (wbc && added)
3483 wbc_account_cgroup_owner(wbc, page, added);
3484
3485 /* We have reached boundary, submit right now */
3486 if (added < size - offset) {
3487 /* The bio should contain some page(s) */
3488 ASSERT(bio_ctrl->bio->bi_iter.bi_size);
3489 ret = submit_one_bio(bio_ctrl->bio, mirror_num,
3490 bio_ctrl->bio_flags);
390ed29b
QW
3491 bio_ctrl->bio = NULL;
3492 if (ret < 0)
79787eaa 3493 return ret;
d1310b2e 3494 }
e0eefe07 3495 cur += added;
d1310b2e 3496 }
e0eefe07 3497 return 0;
d1310b2e
CM
3498}
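/*
 * A minimal usage sketch (hypothetical caller, modelled on btrfs_do_readpage()
 * below) for reading one sector:
 *
 *	struct btrfs_bio_ctrl bio_ctrl = { 0 };
 *	int ret;
 *
 *	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page,
 *				 disk_bytenr, fs_info->sectorsize, 0,
 *				 end_bio_extent_readpage, 0, 0, false);
 *	if (!ret && bio_ctrl.bio)
 *		ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
 *
 * The trailing submit_one_bio() flushes whatever is still held in bio_ctrl,
 * since submit_extent_page() only submits on a boundary hit or when
 * force_bio_submit is set.
 */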
3499
760f991f
QW
3500static int attach_extent_buffer_page(struct extent_buffer *eb,
3501 struct page *page,
3502 struct btrfs_subpage *prealloc)
d1310b2e 3503{
760f991f
QW
3504 struct btrfs_fs_info *fs_info = eb->fs_info;
3505 int ret = 0;
3506
0d01e247
QW
3507 /*
3508 * If the page is mapped to btree inode, we should hold the private
3509 * lock to prevent race.
3510 * For cloned or dummy extent buffers, their pages are not mapped and
3511 * will not race with any other ebs.
3512 */
3513 if (page->mapping)
3514 lockdep_assert_held(&page->mapping->private_lock);
3515
fbca46eb 3516 if (fs_info->nodesize >= PAGE_SIZE) {
760f991f
QW
3517 if (!PagePrivate(page))
3518 attach_page_private(page, eb);
3519 else
3520 WARN_ON(page->private != (unsigned long)eb);
3521 return 0;
3522 }
3523
3524 /* Already mapped, just free prealloc */
3525 if (PagePrivate(page)) {
3526 btrfs_free_subpage(prealloc);
3527 return 0;
3528 }
3529
3530 if (prealloc)
3531 /* Has preallocated memory for subpage */
3532 attach_page_private(page, prealloc);
d1b89bc0 3533 else
760f991f
QW
3534 /* Do new allocation to attach subpage */
3535 ret = btrfs_attach_subpage(fs_info, page,
3536 BTRFS_SUBPAGE_METADATA);
3537 return ret;
d1310b2e
CM
3538}
3539
32443de3 3540int set_page_extent_mapped(struct page *page)
d1310b2e 3541{
32443de3
QW
3542 struct btrfs_fs_info *fs_info;
3543
3544 ASSERT(page->mapping);
3545
3546 if (PagePrivate(page))
3547 return 0;
3548
3549 fs_info = btrfs_sb(page->mapping->host->i_sb);
3550
fbca46eb 3551 if (btrfs_is_subpage(fs_info, page))
32443de3
QW
3552 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3553
3554 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3555 return 0;
3556}
3557
3558void clear_page_extent_mapped(struct page *page)
3559{
3560 struct btrfs_fs_info *fs_info;
3561
3562 ASSERT(page->mapping);
3563
d1b89bc0 3564 if (!PagePrivate(page))
32443de3
QW
3565 return;
3566
3567 fs_info = btrfs_sb(page->mapping->host->i_sb);
fbca46eb 3568 if (btrfs_is_subpage(fs_info, page))
32443de3
QW
3569 return btrfs_detach_subpage(fs_info, page);
3570
3571 detach_page_private(page);
d1310b2e
CM
3572}
3573
125bac01
MX
3574static struct extent_map *
3575__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
1a5ee1e6 3576 u64 start, u64 len, struct extent_map **em_cached)
125bac01
MX
3577{
3578 struct extent_map *em;
3579
3580 if (em_cached && *em_cached) {
3581 em = *em_cached;
cbc0e928 3582 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 3583 start < extent_map_end(em)) {
490b54d6 3584 refcount_inc(&em->refs);
125bac01
MX
3585 return em;
3586 }
3587
3588 free_extent_map(em);
3589 *em_cached = NULL;
3590 }
3591
1a5ee1e6 3592 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
c0347550 3593 if (em_cached && !IS_ERR(em)) {
125bac01 3594 BUG_ON(*em_cached);
490b54d6 3595 refcount_inc(&em->refs);
125bac01
MX
3596 *em_cached = em;
3597 }
3598 return em;
3599}
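/*
 * The @em_cached pointer lets consecutive calls for adjacent file offsets
 * reuse one extent_map instead of repeating the lookup; contiguous_readpages()
 * below shares a single cached entry across a whole batch of readahead pages.
 */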
d1310b2e
CM
3600/*
 3601 * Basic readpage implementation. Locked extent state structs are inserted
 3602 * into the tree and removed again when the IO is done (by the end_io
 3603 * handlers).
79787eaa 3604 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 3605 * return 0 on success, otherwise return error
d1310b2e 3606 */
0f208812 3607int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
390ed29b 3608 struct btrfs_bio_ctrl *bio_ctrl,
0f208812 3609 unsigned int read_flags, u64 *prev_em_start)
d1310b2e
CM
3610{
3611 struct inode *inode = page->mapping->host;
92082d40 3612 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4eee4fa4 3613 u64 start = page_offset(page);
8eec8296 3614 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3615 u64 cur = start;
3616 u64 extent_offset;
3617 u64 last_byte = i_size_read(inode);
3618 u64 block_start;
3619 u64 cur_end;
d1310b2e 3620 struct extent_map *em;
baf863b9 3621 int ret = 0;
306e16ce 3622 size_t pg_offset = 0;
d1310b2e
CM
3623 size_t iosize;
3624 size_t blocksize = inode->i_sb->s_blocksize;
f657a31c 3625 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ae6957eb 3626
32443de3
QW
3627 ret = set_page_extent_mapped(page);
3628 if (ret < 0) {
3629 unlock_extent(tree, start, end);
92082d40
QW
3630 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3631 unlock_page(page);
32443de3
QW
3632 goto out;
3633 }
d1310b2e 3634
09cbfeaf 3635 if (page->index == last_byte >> PAGE_SHIFT) {
7073017a 3636 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
3637
3638 if (zero_offset) {
09cbfeaf 3639 iosize = PAGE_SIZE - zero_offset;
d048b9c2 3640 memzero_page(page, zero_offset, iosize);
c8b97818 3641 flush_dcache_page(page);
c8b97818
CM
3642 }
3643 }
92082d40 3644 begin_page_read(fs_info, page);
d1310b2e 3645 while (cur <= end) {
4c37a793 3646 unsigned long this_bio_flag = 0;
005efedf 3647 bool force_bio_submit = false;
0c64c33c 3648 u64 disk_bytenr;
c8f2f24b 3649
6a404910 3650 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
d1310b2e 3651 if (cur >= last_byte) {
507903b8
AJ
3652 struct extent_state *cached = NULL;
3653
09cbfeaf 3654 iosize = PAGE_SIZE - pg_offset;
d048b9c2 3655 memzero_page(page, pg_offset, iosize);
d1310b2e 3656 flush_dcache_page(page);
d1310b2e 3657 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3658 &cached, GFP_NOFS);
7f042a83 3659 unlock_extent_cached(tree, cur,
e43bbe5e 3660 cur + iosize - 1, &cached);
92082d40 3661 end_page_read(page, true, cur, iosize);
d1310b2e
CM
3662 break;
3663 }
125bac01 3664 em = __get_extent_map(inode, page, pg_offset, cur,
1a5ee1e6 3665 end - cur + 1, em_cached);
c0347550 3666 if (IS_ERR(em)) {
7f042a83 3667 unlock_extent(tree, cur, end);
92082d40 3668 end_page_read(page, false, cur, end + 1 - cur);
bbf0ea7e 3669 ret = PTR_ERR(em);
d1310b2e
CM
3670 break;
3671 }
d1310b2e
CM
3672 extent_offset = cur - em->start;
3673 BUG_ON(extent_map_end(em) <= cur);
3674 BUG_ON(end < cur);
3675
261507a0 3676 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4b384318 3677 this_bio_flag |= EXTENT_BIO_COMPRESSED;
261507a0
LZ
3678 extent_set_compress_type(&this_bio_flag,
3679 em->compress_type);
3680 }
c8b97818 3681
d1310b2e
CM
3682 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3683 cur_end = min(extent_map_end(em) - 1, end);
fda2832f 3684 iosize = ALIGN(iosize, blocksize);
949b3273 3685 if (this_bio_flag & EXTENT_BIO_COMPRESSED)
0c64c33c 3686 disk_bytenr = em->block_start;
949b3273 3687 else
0c64c33c 3688 disk_bytenr = em->block_start + extent_offset;
d1310b2e 3689 block_start = em->block_start;
d899e052
YZ
3690 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3691 block_start = EXTENT_MAP_HOLE;
005efedf
FM
3692
3693 /*
3694 * If we have a file range that points to a compressed extent
260db43c 3695 * and it's followed by a consecutive file range that points
005efedf
FM
3696 * to the same compressed extent (possibly with a different
3697 * offset and/or length, so it either points to the whole extent
3698 * or only part of it), we must make sure we do not submit a
3699 * single bio to populate the pages for the 2 ranges because
3700 * this makes the compressed extent read zero out the pages
3701 * belonging to the 2nd range. Imagine the following scenario:
3702 *
3703 * File layout
3704 * [0 - 8K] [8K - 24K]
3705 * | |
3706 * | |
3707 * points to extent X, points to extent X,
3708 * offset 4K, length of 8K offset 0, length 16K
3709 *
3710 * [extent X, compressed length = 4K uncompressed length = 16K]
3711 *
3712 * If the bio to read the compressed extent covers both ranges,
3713 * it will decompress extent X into the pages belonging to the
3714 * first range and then it will stop, zeroing out the remaining
3715 * pages that belong to the other range that points to extent X.
3716 * So here we make sure we submit 2 bios, one for the first
 3717 		 * range and another one for the second range. Both will target
3718 * the same physical extent from disk, but we can't currently
3719 * make the compressed bio endio callback populate the pages
3720 * for both ranges because each compressed bio is tightly
3721 * coupled with a single extent map, and each range can have
3722 * an extent map with a different offset value relative to the
3723 * uncompressed data of our extent and different lengths. This
3724 * is a corner case so we prioritize correctness over
3725 * non-optimal behavior (submitting 2 bios for the same extent).
3726 */
3727 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3728 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 3729 *prev_em_start != em->start)
005efedf
FM
3730 force_bio_submit = true;
3731
3732 if (prev_em_start)
8e928218 3733 *prev_em_start = em->start;
005efedf 3734
d1310b2e
CM
3735 free_extent_map(em);
3736 em = NULL;
3737
3738 /* we've found a hole, just zero and go on */
3739 if (block_start == EXTENT_MAP_HOLE) {
507903b8
AJ
3740 struct extent_state *cached = NULL;
3741
d048b9c2 3742 memzero_page(page, pg_offset, iosize);
d1310b2e 3743 flush_dcache_page(page);
d1310b2e
CM
3744
3745 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3746 &cached, GFP_NOFS);
7f042a83 3747 unlock_extent_cached(tree, cur,
e43bbe5e 3748 cur + iosize - 1, &cached);
92082d40 3749 end_page_read(page, true, cur, iosize);
d1310b2e 3750 cur = cur + iosize;
306e16ce 3751 pg_offset += iosize;
d1310b2e
CM
3752 continue;
3753 }
3754 /* the get_extent function already copied into the page */
9655d298
CM
3755 if (test_range_bit(tree, cur, cur_end,
3756 EXTENT_UPTODATE, 1, NULL)) {
7f042a83 3757 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3758 end_page_read(page, true, cur, iosize);
d1310b2e 3759 cur = cur + iosize;
306e16ce 3760 pg_offset += iosize;
d1310b2e
CM
3761 continue;
3762 }
70dec807
CM
3763 /* we have an inline extent but it didn't get marked up
3764 * to date. Error out
3765 */
3766 if (block_start == EXTENT_MAP_INLINE) {
7f042a83 3767 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3768 end_page_read(page, false, cur, iosize);
70dec807 3769 cur = cur + iosize;
306e16ce 3770 pg_offset += iosize;
70dec807
CM
3771 continue;
3772 }
d1310b2e 3773
0ceb34bf 3774 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
390ed29b
QW
3775 bio_ctrl, page, disk_bytenr, iosize,
3776 pg_offset,
fd513000 3777 end_bio_extent_readpage, 0,
005efedf
FM
3778 this_bio_flag,
3779 force_bio_submit);
ad3fc794 3780 if (ret) {
7f042a83 3781 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3782 end_page_read(page, false, cur, iosize);
baf863b9 3783 goto out;
edd33c99 3784 }
d1310b2e 3785 cur = cur + iosize;
306e16ce 3786 pg_offset += iosize;
d1310b2e 3787 }
90a887c9 3788out:
baf863b9 3789 return ret;
d1310b2e
CM
3790}
3791
b6660e80 3792static inline void contiguous_readpages(struct page *pages[], int nr_pages,
390ed29b
QW
3793 u64 start, u64 end,
3794 struct extent_map **em_cached,
3795 struct btrfs_bio_ctrl *bio_ctrl,
3796 u64 *prev_em_start)
9974090b 3797{
23d31bd4 3798 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
9974090b
MX
3799 int index;
3800
b272ae22 3801 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
3802
3803 for (index = 0; index < nr_pages; index++) {
390ed29b 3804 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
0f208812 3805 REQ_RAHEAD, prev_em_start);
09cbfeaf 3806 put_page(pages[index]);
9974090b
MX
3807 }
3808}
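/*
 * All pages of the batch share one @em_cached and one @bio_ctrl, so a
 * physically contiguous readahead window ends up merged into as few bios as
 * possible.
 */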
3809
3d4b9496 3810static void update_nr_written(struct writeback_control *wbc,
a9132667 3811 unsigned long nr_written)
11c8349b
CM
3812{
3813 wbc->nr_to_write -= nr_written;
11c8349b
CM
3814}
3815
d1310b2e 3816/*
40f76580
CM
3817 * helper for __extent_writepage, doing all of the delayed allocation setup.
3818 *
5eaad97a 3819 * This returns 1 if the btrfs_run_delalloc_range() function did all the work required
40f76580
CM
3820 * to write the page (copy into inline extent). In this case the IO has
3821 * been started and the page is already unlocked.
3822 *
3823 * This returns 0 if all went well (page still locked)
3824 * This returns < 0 if there were errors (page still locked)
d1310b2e 3825 */
cd4c0bf9 3826static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
83f1b680 3827 struct page *page, struct writeback_control *wbc)
40f76580 3828{
2749f7ef 3829 const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
cf3075fb 3830 u64 delalloc_start = page_offset(page);
40f76580 3831 u64 delalloc_to_write = 0;
83f1b680
QW
3832 /* How many pages are started by btrfs_run_delalloc_range() */
3833 unsigned long nr_written = 0;
40f76580
CM
3834 int ret;
3835 int page_started = 0;
3836
2749f7ef
QW
3837 while (delalloc_start < page_end) {
3838 u64 delalloc_end = page_end;
3839 bool found;
40f76580 3840
cd4c0bf9 3841 found = find_lock_delalloc_range(&inode->vfs_inode, page,
40f76580 3842 &delalloc_start,
917aacec 3843 &delalloc_end);
3522e903 3844 if (!found) {
40f76580
CM
3845 delalloc_start = delalloc_end + 1;
3846 continue;
3847 }
cd4c0bf9 3848 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
83f1b680 3849 delalloc_end, &page_started, &nr_written, wbc);
40f76580 3850 if (ret) {
963e4db8
QW
3851 btrfs_page_set_error(inode->root->fs_info, page,
3852 page_offset(page), PAGE_SIZE);
7361b4ae 3853 return ret;
40f76580
CM
3854 }
3855 /*
ea1754a0
KS
3856 * delalloc_end is already one less than the total length, so
3857 * we don't subtract one from PAGE_SIZE
40f76580
CM
3858 */
3859 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 3860 PAGE_SIZE) >> PAGE_SHIFT;
40f76580
CM
3861 delalloc_start = delalloc_end + 1;
3862 }
3863 if (wbc->nr_to_write < delalloc_to_write) {
3864 int thresh = 8192;
3865
3866 if (delalloc_to_write < thresh * 2)
3867 thresh = delalloc_to_write;
3868 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3869 thresh);
3870 }
3871
83f1b680 3872 	/* Did btrfs_run_delalloc_range() already unlock and start the IO? */
40f76580
CM
3873 if (page_started) {
3874 /*
83f1b680
QW
3875 * We've unlocked the page, so we can't update the mapping's
3876 * writeback index, just update nr_to_write.
40f76580 3877 */
83f1b680 3878 wbc->nr_to_write -= nr_written;
40f76580
CM
3879 return 1;
3880 }
3881
b69d1ee9 3882 return 0;
40f76580
CM
3883}
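/*
 * delalloc_to_write above counts the pages covered by the delalloc ranges we
 * found. When wbc->nr_to_write is smaller than that, it is raised to the full
 * count, capped at 8192 pages once the work exceeds twice that cap, so one
 * writepage call still makes bounded forward progress.
 */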
3884
c5ef5c6c
QW
3885/*
3886 * Find the first byte we need to write.
3887 *
3888 * For subpage, one page can contain several sectors, and
3889 * __extent_writepage_io() will just grab all extent maps in the page
3890 * range and try to submit all non-inline/non-compressed extents.
3891 *
 3892 * This is a big problem for subpage; we must not re-submit already written
 3893 * data at all.
 3894 * This function looks up the subpage dirty bitmap to find which range we
 3895 * really need to submit.
3896 *
3897 * Return the next dirty range in [@start, @end).
3898 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
3899 */
3900static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
3901 struct page *page, u64 *start, u64 *end)
3902{
3903 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
72a69cd0 3904 struct btrfs_subpage_info *spi = fs_info->subpage_info;
c5ef5c6c
QW
3905 u64 orig_start = *start;
3906 /* Declare as unsigned long so we can use bitmap ops */
c5ef5c6c 3907 unsigned long flags;
72a69cd0 3908 int range_start_bit;
c5ef5c6c
QW
3909 int range_end_bit;
3910
3911 /*
3912 * For regular sector size == page size case, since one page only
3913 * contains one sector, we return the page offset directly.
3914 */
fbca46eb 3915 if (!btrfs_is_subpage(fs_info, page)) {
c5ef5c6c
QW
3916 *start = page_offset(page);
3917 *end = page_offset(page) + PAGE_SIZE;
3918 return;
3919 }
3920
72a69cd0
QW
3921 range_start_bit = spi->dirty_offset +
3922 (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
3923
c5ef5c6c
QW
3924 /* We should have the page locked, but just in case */
3925 spin_lock_irqsave(&subpage->lock, flags);
72a69cd0
QW
3926 bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
3927 spi->dirty_offset + spi->bitmap_nr_bits);
c5ef5c6c
QW
3928 spin_unlock_irqrestore(&subpage->lock, flags);
3929
72a69cd0
QW
3930 range_start_bit -= spi->dirty_offset;
3931 range_end_bit -= spi->dirty_offset;
3932
c5ef5c6c
QW
3933 *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
3934 *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
3935}
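/*
 * Example with a hypothetical 4K sectorsize on a 64K page: if only the subpage
 * dirty bits of sectors 2 and 3 are set, this returns
 * *start = page_offset(page) + 8K and *end = page_offset(page) + 16K, the end
 * being exclusive just like bitmap_next_set_region()'s region end.
 */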
3936
40f76580
CM
3937/*
3938 * helper for __extent_writepage. This calls the writepage start hooks,
3939 * and does the loop to map the page into extents and bios.
3940 *
3941 * We return 1 if the IO is started and the page is unlocked,
3942 * 0 if all went well (page still locked)
3943 * < 0 if there were errors (page still locked)
3944 */
d4580fe2 3945static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
40f76580
CM
3946 struct page *page,
3947 struct writeback_control *wbc,
3948 struct extent_page_data *epd,
3949 loff_t i_size,
57e5ffeb 3950 int *nr_ret)
d1310b2e 3951{
6bc5636a 3952 struct btrfs_fs_info *fs_info = inode->root->fs_info;
a129ffb8
QW
3953 u64 cur = page_offset(page);
3954 u64 end = cur + PAGE_SIZE - 1;
d1310b2e 3955 u64 extent_offset;
d1310b2e 3956 u64 block_start;
d1310b2e 3957 struct extent_map *em;
40f76580
CM
3958 int ret = 0;
3959 int nr = 0;
d8e3fb10 3960 u32 opf = REQ_OP_WRITE;
57e5ffeb 3961 const unsigned int write_flags = wbc_to_write_flags(wbc);
40f76580 3962 bool compressed;
c8b97818 3963
a129ffb8 3964 ret = btrfs_writepage_cow_fixup(page);
d75855b4
NB
3965 if (ret) {
3966 /* Fixup worker will requeue */
5ab58055 3967 redirty_page_for_writepage(wbc, page);
d75855b4
NB
3968 unlock_page(page);
3969 return 1;
247e743c
CM
3970 }
3971
11c8349b
CM
3972 /*
3973 * we don't want to touch the inode after unlocking the page,
3974 * so we update the mapping writeback index now
3975 */
83f1b680 3976 update_nr_written(wbc, 1);
771ed689 3977
d1310b2e 3978 while (cur <= end) {
0c64c33c 3979 u64 disk_bytenr;
40f76580 3980 u64 em_end;
c5ef5c6c
QW
3981 u64 dirty_range_start = cur;
3982 u64 dirty_range_end;
6bc5636a 3983 u32 iosize;
58409edd 3984
40f76580 3985 if (cur >= i_size) {
38a39ac7 3986 btrfs_writepage_endio_finish_ordered(inode, page, cur,
25c1252a 3987 end, true);
cc1d0d93
QW
3988 /*
3989 * This range is beyond i_size, thus we don't need to
3990 * bother writing back.
3991 * But we still need to clear the dirty subpage bit, or
3992 * the next time the page gets dirtied, we will try to
3993 * writeback the sectors with subpage dirty bits,
3994 * causing writeback without ordered extent.
3995 */
3996 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
d1310b2e
CM
3997 break;
3998 }
c5ef5c6c
QW
3999
4000 find_next_dirty_byte(fs_info, page, &dirty_range_start,
4001 &dirty_range_end);
4002 if (cur < dirty_range_start) {
4003 cur = dirty_range_start;
4004 continue;
4005 }
4006
d4580fe2 4007 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
c0347550 4008 if (IS_ERR(em)) {
c5ef5c6c 4009 btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
61391d56 4010 ret = PTR_ERR_OR_ZERO(em);
d1310b2e
CM
4011 break;
4012 }
4013
4014 extent_offset = cur - em->start;
40f76580 4015 em_end = extent_map_end(em);
6bc5636a
QW
4016 ASSERT(cur <= em_end);
4017 ASSERT(cur < end);
4018 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
4019 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
d1310b2e 4020 block_start = em->block_start;
c8b97818 4021 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6bc5636a
QW
4022 disk_bytenr = em->block_start + extent_offset;
4023
c5ef5c6c
QW
4024 /*
4025 * Note that em_end from extent_map_end() and dirty_range_end from
4026 * find_next_dirty_byte() are all exclusive
4027 */
4028 iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
d8e3fb10 4029
e380adfc 4030 if (btrfs_use_zone_append(inode, em->block_start))
d8e3fb10
NA
4031 opf = REQ_OP_ZONE_APPEND;
4032
d1310b2e
CM
4033 free_extent_map(em);
4034 em = NULL;
4035
c8b97818
CM
4036 /*
4037 * compressed and inline extents are written through other
4038 * paths in the FS
4039 */
4040 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 4041 block_start == EXTENT_MAP_INLINE) {
c8b04030 4042 if (compressed)
c8b97818 4043 nr++;
c8b04030 4044 else
38a39ac7 4045 btrfs_writepage_endio_finish_ordered(inode,
25c1252a 4046 page, cur, cur + iosize - 1, true);
cc1d0d93 4047 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
c8b97818 4048 cur += iosize;
d1310b2e
CM
4049 continue;
4050 }
c8b97818 4051
d2a91064 4052 btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
58409edd 4053 if (!PageWriteback(page)) {
d4580fe2 4054 btrfs_err(inode->root->fs_info,
58409edd
DS
4055 "page %lu not writeback, cur %llu end %llu",
4056 page->index, cur, end);
d1310b2e 4057 }
7f3c74fb 4058
c5ef5c6c
QW
4059 /*
4060 * Although the PageDirty bit is cleared before entering this
4061 * function, subpage dirty bit is not cleared.
4062 * So clear subpage dirty bit here so next time we won't submit
4063 * page for range already written to disk.
4064 */
4065 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
4066
390ed29b
QW
4067 ret = submit_extent_page(opf | write_flags, wbc,
4068 &epd->bio_ctrl, page,
d8e3fb10 4069 disk_bytenr, iosize,
390ed29b 4070 cur - page_offset(page),
58409edd 4071 end_bio_extent_writepage,
390ed29b 4072 0, 0, false);
fe01aa65 4073 if (ret) {
c5ef5c6c 4074 btrfs_page_set_error(fs_info, page, cur, iosize);
fe01aa65 4075 if (PageWriteback(page))
c5ef5c6c
QW
4076 btrfs_page_clear_writeback(fs_info, page, cur,
4077 iosize);
fe01aa65 4078 }
d1310b2e 4079
6bc5636a 4080 cur += iosize;
d1310b2e
CM
4081 nr++;
4082 }
cc1d0d93
QW
4083 /*
4084 * If we finish without problem, we should not only clear page dirty,
4085 * but also empty subpage dirty bits
4086 */
4087 if (!ret)
4088 btrfs_page_assert_not_dirty(fs_info, page);
40f76580 4089 *nr_ret = nr;
40f76580
CM
4090 return ret;
4091}
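/*
 * Each loop iteration above writes at most one extent map worth of sectors,
 * clipped to the page and to the next subpage dirty range, so a single page
 * can be spread across several bios when its sectors map to different
 * extents.
 */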
4092
4093/*
4094 * the writepage semantics are similar to regular writepage. extent
4095 * records are inserted to lock ranges in the tree, and as dirty areas
4096 * are found, they are marked writeback. Then the lock bits are removed
4097 * and the end_io handler clears the writeback ranges
3065976b
QW
4098 *
4099 * Return 0 if everything goes well.
4100 * Return <0 for error.
40f76580
CM
4101 */
4102static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 4103 struct extent_page_data *epd)
40f76580 4104{
8e1dec8e 4105 struct folio *folio = page_folio(page);
40f76580 4106 struct inode *inode = page->mapping->host;
e55a0de1 4107 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
cf3075fb
QW
4108 const u64 page_start = page_offset(page);
4109 const u64 page_end = page_start + PAGE_SIZE - 1;
40f76580
CM
4110 int ret;
4111 int nr = 0;
eb70d222 4112 size_t pg_offset;
40f76580 4113 loff_t i_size = i_size_read(inode);
09cbfeaf 4114 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580 4115
40f76580
CM
4116 trace___extent_writepage(page, inode, wbc);
4117
4118 WARN_ON(!PageLocked(page));
4119
963e4db8
QW
4120 btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
4121 page_offset(page), PAGE_SIZE);
40f76580 4122
7073017a 4123 pg_offset = offset_in_page(i_size);
40f76580
CM
4124 if (page->index > end_index ||
4125 (page->index == end_index && !pg_offset)) {
8e1dec8e
MWO
4126 folio_invalidate(folio, 0, folio_size(folio));
4127 folio_unlock(folio);
40f76580
CM
4128 return 0;
4129 }
4130
4131 if (page->index == end_index) {
d048b9c2 4132 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
40f76580
CM
4133 flush_dcache_page(page);
4134 }
4135
32443de3
QW
4136 ret = set_page_extent_mapped(page);
4137 if (ret < 0) {
4138 SetPageError(page);
4139 goto done;
4140 }
40f76580 4141
7789a55a 4142 if (!epd->extent_locked) {
83f1b680 4143 ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
7789a55a 4144 if (ret == 1)
169d2c87 4145 return 0;
7789a55a
NB
4146 if (ret)
4147 goto done;
4148 }
40f76580 4149
d4580fe2 4150 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
83f1b680 4151 &nr);
40f76580 4152 if (ret == 1)
169d2c87 4153 return 0;
40f76580 4154
d1310b2e
CM
4155done:
4156 if (nr == 0) {
4157 /* make sure the mapping tag for page dirty gets cleared */
4158 set_page_writeback(page);
4159 end_page_writeback(page);
4160 }
963e4db8
QW
4161 /*
4162 * Here we used to have a check for PageError() and then set @ret and
4163 * call end_extent_writepage().
4164 *
4165 * But in fact setting @ret here will cause different error paths
4166 * between subpage and regular sectorsize.
4167 *
4168 * For regular page size, we never submit current page, but only add
4169 * current page to current bio.
4170 * The bio submission can only happen in next page.
4171 * Thus if we hit the PageError() branch, @ret is already set to
4172 * non-zero value and will not get updated for regular sectorsize.
4173 *
4174 * But for subpage case, it's possible we submit part of current page,
4175 * thus can get PageError() set by submitted bio of the same page,
4176 * while our @ret is still 0.
4177 *
4178 * So here we unify the behavior and don't set @ret.
 4179 	 * Errors can still be properly passed to the higher layer, as the page
 4180 	 * will have its error flag set; here we just don't handle the IO failure.
4181 *
4182 * NOTE: This is just a hotfix for subpage.
4183 * The root fix will be properly ending ordered extent when we hit
4184 * an error during writeback.
4185 *
4186 * But that needs a bigger refactoring, as we not only need to grab the
4187 * submitted OE, but also need to know exactly at which bytenr we hit
4188 * the error.
4189 * Currently the full page based __extent_writepage_io() is not
4190 * capable of that.
4191 */
4192 if (PageError(page))
cf3075fb 4193 end_extent_writepage(page, ret, page_start, page_end);
e55a0de1
QW
4194 if (epd->extent_locked) {
4195 /*
4196 * If epd->extent_locked, it's from extent_write_locked_range(),
4197 * the page can either be locked by lock_page() or
4198 * process_one_page().
4199 * Let btrfs_page_unlock_writer() handle both cases.
4200 */
4201 ASSERT(wbc);
4202 btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
4203 wbc->range_end + 1 - wbc->range_start);
4204 } else {
4205 unlock_page(page);
4206 }
3065976b 4207 ASSERT(ret <= 0);
40f76580 4208 return ret;
d1310b2e
CM
4209}
4210
fd8b2b61 4211void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 4212{
74316201
N
4213 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
4214 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
4215}
4216
18dfa711
FM
4217static void end_extent_buffer_writeback(struct extent_buffer *eb)
4218{
be1a1d7a
NA
4219 if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
4220 btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
4221
18dfa711
FM
4222 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4223 smp_mb__after_atomic();
4224 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
4225}
4226
2e3c2513 4227/*
a3efb2f0 4228 * Lock extent buffer status and pages for writeback.
2e3c2513 4229 *
a3efb2f0
QW
4230 * May try to flush write bio if we can't get the lock.
4231 *
4232 * Return 0 if the extent buffer doesn't need to be submitted.
4233 * (E.g. the extent buffer is not dirty)
 4234 * Return >0 if the extent buffer is submitted to bio.
4235 * Return <0 if something went wrong, no page is locked.
2e3c2513 4236 */
9df76fb5 4237static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 4238 struct extent_page_data *epd)
0b32f4bb 4239{
9df76fb5 4240 struct btrfs_fs_info *fs_info = eb->fs_info;
2e3c2513 4241 int i, num_pages, failed_page_nr;
0b32f4bb
JB
4242 int flush = 0;
4243 int ret = 0;
4244
4245 if (!btrfs_try_tree_write_lock(eb)) {
f4340622 4246 ret = flush_write_bio(epd);
2e3c2513
QW
4247 if (ret < 0)
4248 return ret;
4249 flush = 1;
0b32f4bb
JB
4250 btrfs_tree_lock(eb);
4251 }
4252
4253 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
4254 btrfs_tree_unlock(eb);
4255 if (!epd->sync_io)
4256 return 0;
4257 if (!flush) {
f4340622 4258 ret = flush_write_bio(epd);
2e3c2513
QW
4259 if (ret < 0)
4260 return ret;
0b32f4bb
JB
4261 flush = 1;
4262 }
a098d8e8
CM
4263 while (1) {
4264 wait_on_extent_buffer_writeback(eb);
4265 btrfs_tree_lock(eb);
4266 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
4267 break;
0b32f4bb 4268 btrfs_tree_unlock(eb);
0b32f4bb
JB
4269 }
4270 }
4271
51561ffe
JB
4272 /*
 4273 	 * We need to do this to prevent races with anyone who checks if the eb is
4274 * under IO since we can end up having no IO bits set for a short period
4275 * of time.
4276 */
4277 spin_lock(&eb->refs_lock);
0b32f4bb
JB
4278 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4279 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 4280 spin_unlock(&eb->refs_lock);
0b32f4bb 4281 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
4282 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4283 -eb->len,
4284 fs_info->dirty_metadata_batch);
0b32f4bb 4285 ret = 1;
51561ffe
JB
4286 } else {
4287 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
4288 }
4289
4290 btrfs_tree_unlock(eb);
4291
f3156df9
QW
4292 /*
4293 * Either we don't need to submit any tree block, or we're submitting
4294 * subpage eb.
4295 * Subpage metadata doesn't use page locking at all, so we can skip
4296 * the page locking.
4297 */
fbca46eb 4298 if (!ret || fs_info->nodesize < PAGE_SIZE)
0b32f4bb
JB
4299 return ret;
4300
65ad0104 4301 num_pages = num_extent_pages(eb);
0b32f4bb 4302 for (i = 0; i < num_pages; i++) {
fb85fc9a 4303 struct page *p = eb->pages[i];
0b32f4bb
JB
4304
4305 if (!trylock_page(p)) {
4306 if (!flush) {
18dfa711
FM
4307 int err;
4308
4309 err = flush_write_bio(epd);
4310 if (err < 0) {
4311 ret = err;
2e3c2513
QW
4312 failed_page_nr = i;
4313 goto err_unlock;
4314 }
0b32f4bb
JB
4315 flush = 1;
4316 }
4317 lock_page(p);
4318 }
4319 }
4320
4321 return ret;
2e3c2513
QW
4322err_unlock:
4323 /* Unlock already locked pages */
4324 for (i = 0; i < failed_page_nr; i++)
4325 unlock_page(eb->pages[i]);
18dfa711
FM
4326 /*
4327 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
4328 * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
4329 * be made and undo everything done before.
4330 */
4331 btrfs_tree_lock(eb);
4332 spin_lock(&eb->refs_lock);
4333 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4334 end_extent_buffer_writeback(eb);
4335 spin_unlock(&eb->refs_lock);
4336 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
4337 fs_info->dirty_metadata_batch);
4338 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4339 btrfs_tree_unlock(eb);
2e3c2513 4340 return ret;
0b32f4bb
JB
4341}
4342
5a2c6075 4343static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
656f30db 4344{
5a2c6075 4345 struct btrfs_fs_info *fs_info = eb->fs_info;
656f30db 4346
5a2c6075 4347 btrfs_page_set_error(fs_info, page, eb->start, eb->len);
656f30db
FM
4348 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4349 return;
4350
c2e39305
JB
4351 /*
4352 * A read may stumble upon this buffer later, make sure that it gets an
4353 * error and knows there was an error.
4354 */
4355 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4356
68b85589
JB
4357 /*
4358 * We need to set the mapping with the io error as well because a write
4359 * error will flip the file system readonly, and then syncfs() will
4360 * return a 0 because we are readonly if we don't modify the err seq for
4361 * the superblock.
4362 */
4363 mapping_set_error(page->mapping, -EIO);
4364
eb5b64f1
DZ
4365 /*
4366 * If we error out, we should add back the dirty_metadata_bytes
4367 * to make it consistent.
4368 */
eb5b64f1
DZ
4369 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4370 eb->len, fs_info->dirty_metadata_batch);
4371
656f30db
FM
4372 /*
4373 * If writeback for a btree extent that doesn't belong to a log tree
4374 * failed, increment the counter transaction->eb_write_errors.
4375 * We do this because while the transaction is running and before it's
4376 * committing (when we call filemap_fdata[write|wait]_range against
4377 * the btree inode), we might have
4378 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4379 * returns an error or an error happens during writeback, when we're
4380 * committing the transaction we wouldn't know about it, since the pages
4381 * can be no longer dirty nor marked anymore for writeback (if a
4382 * subsequent modification to the extent buffer didn't happen before the
4383 * transaction commit), which makes filemap_fdata[write|wait]_range not
4384 * able to find the pages tagged with SetPageError at transaction
4385 * commit time. So if this happens we must abort the transaction,
4386 * otherwise we commit a super block with btree roots that point to
4387 * btree nodes/leafs whose content on disk is invalid - either garbage
4388 * or the content of some node/leaf from a past generation that got
4389 * cowed or deleted and is no longer valid.
4390 *
4391 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4392 * not be enough - we need to distinguish between log tree extents vs
4393 * non-log tree extents, and the next filemap_fdatawait_range() call
4394 * will catch and clear such errors in the mapping - and that call might
4395 * be from a log sync and not from a transaction commit. Also, checking
4396 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4397 * not done and would not be reliable - the eb might have been released
4398 * from memory and reading it back again means that flag would not be
4399 * set (since it's a runtime flag, not persisted on disk).
4400 *
4401 * Using the flags below in the btree inode also makes us achieve the
4402 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4403 * writeback for all dirty pages and before filemap_fdatawait_range()
4404 * is called, the writeback for all dirty pages had already finished
4405 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4406 * filemap_fdatawait_range() would return success, as it could not know
4407 * that writeback errors happened (the pages were no longer tagged for
4408 * writeback).
4409 */
4410 switch (eb->log_index) {
4411 case -1:
5a2c6075 4412 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
656f30db
FM
4413 break;
4414 case 0:
5a2c6075 4415 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
656f30db
FM
4416 break;
4417 case 1:
5a2c6075 4418 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
656f30db
FM
4419 break;
4420 default:
4421 BUG(); /* unexpected, logic error */
4422 }
4423}
4424
2f3186d8
QW
4425/*
4426 * The endio specific version which won't touch any unsafe spinlock in endio
4427 * context.
4428 */
4429static struct extent_buffer *find_extent_buffer_nolock(
4430 struct btrfs_fs_info *fs_info, u64 start)
4431{
4432 struct extent_buffer *eb;
4433
4434 rcu_read_lock();
4435 eb = radix_tree_lookup(&fs_info->buffer_radix,
4436 start >> fs_info->sectorsize_bits);
4437 if (eb && atomic_inc_not_zero(&eb->refs)) {
4438 rcu_read_unlock();
4439 return eb;
4440 }
4441 rcu_read_unlock();
4442 return NULL;
4443}
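/*
 * Taking only rcu_read_lock() and atomic_inc_not_zero() keeps this usable in
 * contexts that must not touch eb->refs_lock, such as the subpage writeback
 * endio handler below and submit_eb_subpage(), which calls it while holding
 * subpage->lock.
 */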
4444
4445/*
4446 * The endio function for subpage extent buffer write.
4447 *
4448 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
 4449 * after all extent buffers in the page have finished their writeback.
4450 */
fa04c165 4451static void end_bio_subpage_eb_writepage(struct bio *bio)
2f3186d8 4452{
fa04c165 4453 struct btrfs_fs_info *fs_info;
2f3186d8
QW
4454 struct bio_vec *bvec;
4455 struct bvec_iter_all iter_all;
4456
fa04c165 4457 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
fbca46eb 4458 ASSERT(fs_info->nodesize < PAGE_SIZE);
fa04c165 4459
2f3186d8
QW
4460 ASSERT(!bio_flagged(bio, BIO_CLONED));
4461 bio_for_each_segment_all(bvec, bio, iter_all) {
4462 struct page *page = bvec->bv_page;
4463 u64 bvec_start = page_offset(page) + bvec->bv_offset;
4464 u64 bvec_end = bvec_start + bvec->bv_len - 1;
4465 u64 cur_bytenr = bvec_start;
4466
4467 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4468
4469 /* Iterate through all extent buffers in the range */
4470 while (cur_bytenr <= bvec_end) {
4471 struct extent_buffer *eb;
4472 int done;
4473
4474 /*
4475 * Here we can't use find_extent_buffer(), as it may
4476 * try to lock eb->refs_lock, which is not safe in endio
4477 * context.
4478 */
4479 eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4480 ASSERT(eb);
4481
4482 cur_bytenr = eb->start + eb->len;
4483
4484 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4485 done = atomic_dec_and_test(&eb->io_pages);
4486 ASSERT(done);
4487
4488 if (bio->bi_status ||
4489 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4490 ClearPageUptodate(page);
4491 set_btree_ioerr(page, eb);
4492 }
4493
4494 btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4495 eb->len);
4496 end_extent_buffer_writeback(eb);
4497 /*
4498 * free_extent_buffer() will grab spinlock which is not
4499 * safe in endio context. Thus here we manually dec
4500 * the ref.
4501 */
4502 atomic_dec(&eb->refs);
4503 }
4504 }
4505 bio_put(bio);
4506}
4507
4246a0b6 4508static void end_bio_extent_buffer_writepage(struct bio *bio)
0b32f4bb 4509{
2c30c71b 4510 struct bio_vec *bvec;
0b32f4bb 4511 struct extent_buffer *eb;
2b070cfe 4512 int done;
6dc4f100 4513 struct bvec_iter_all iter_all;
0b32f4bb 4514
c09abff8 4515 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 4516 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
4517 struct page *page = bvec->bv_page;
4518
0b32f4bb
JB
4519 eb = (struct extent_buffer *)page->private;
4520 BUG_ON(!eb);
4521 done = atomic_dec_and_test(&eb->io_pages);
4522
4e4cbee9 4523 if (bio->bi_status ||
4246a0b6 4524 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 4525 ClearPageUptodate(page);
5a2c6075 4526 set_btree_ioerr(page, eb);
0b32f4bb
JB
4527 }
4528
4529 end_page_writeback(page);
4530
4531 if (!done)
4532 continue;
4533
4534 end_extent_buffer_writeback(eb);
2c30c71b 4535 }
0b32f4bb
JB
4536
4537 bio_put(bio);
0b32f4bb
JB
4538}
4539
fa04c165
QW
4540static void prepare_eb_write(struct extent_buffer *eb)
4541{
4542 u32 nritems;
4543 unsigned long start;
4544 unsigned long end;
4545
4546 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4547 atomic_set(&eb->io_pages, num_extent_pages(eb));
4548
4549 /* Set btree blocks beyond nritems with 0 to avoid stale content */
4550 nritems = btrfs_header_nritems(eb);
4551 if (btrfs_header_level(eb) > 0) {
4552 end = btrfs_node_key_ptr_offset(nritems);
4553 memzero_extent_buffer(eb, end, eb->len - end);
4554 } else {
4555 /*
4556 * Leaf:
4557 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4558 */
4559 start = btrfs_item_nr_offset(nritems);
4560 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4561 memzero_extent_buffer(eb, start, end - start);
4562 }
4563}
4564
35b6ddfa
QW
4565/*
4566 * Unlike the work in write_one_eb(), we rely completely on extent locking.
4567 * Page locking is only utilized at minimum to keep the VMM code happy.
35b6ddfa
QW
4568 */
4569static int write_one_subpage_eb(struct extent_buffer *eb,
4570 struct writeback_control *wbc,
4571 struct extent_page_data *epd)
4572{
4573 struct btrfs_fs_info *fs_info = eb->fs_info;
4574 struct page *page = eb->pages[0];
4575 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
4576 bool no_dirty_ebs = false;
4577 int ret;
4578
fa04c165
QW
4579 prepare_eb_write(eb);
4580
35b6ddfa
QW
4581 /* clear_page_dirty_for_io() in subpage helper needs page locked */
4582 lock_page(page);
4583 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4584
4585 /* Check if this is the last dirty bit to update nr_written */
4586 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4587 eb->start, eb->len);
4588 if (no_dirty_ebs)
4589 clear_page_dirty_for_io(page);
4590
390ed29b
QW
4591 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4592 &epd->bio_ctrl, page, eb->start, eb->len,
4593 eb->start - page_offset(page),
fa04c165 4594 end_bio_subpage_eb_writepage, 0, 0, false);
35b6ddfa
QW
4595 if (ret) {
4596 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4597 set_btree_ioerr(page, eb);
4598 unlock_page(page);
4599
4600 if (atomic_dec_and_test(&eb->io_pages))
4601 end_extent_buffer_writeback(eb);
4602 return -EIO;
4603 }
4604 unlock_page(page);
4605 /*
4606 * Submission finished without problem, if no range of the page is
4607 * dirty anymore, we have submitted a page. Update nr_written in wbc.
4608 */
4609 if (no_dirty_ebs)
4610 update_nr_written(wbc, 1);
4611 return ret;
4612}
4613
0e378df1 4614static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
4615 struct writeback_control *wbc,
4616 struct extent_page_data *epd)
4617{
0c64c33c 4618 u64 disk_bytenr = eb->start;
cc5e31a4 4619 int i, num_pages;
ff40adf7 4620 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
d7dbe9e7 4621 int ret = 0;
0b32f4bb 4622
fa04c165 4623 prepare_eb_write(eb);
35b6ddfa 4624
fa04c165 4625 num_pages = num_extent_pages(eb);
0b32f4bb 4626 for (i = 0; i < num_pages; i++) {
fb85fc9a 4627 struct page *p = eb->pages[i];
0b32f4bb
JB
4628
4629 clear_page_dirty_for_io(p);
4630 set_page_writeback(p);
0ceb34bf 4631 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
390ed29b
QW
4632 &epd->bio_ctrl, p, disk_bytenr,
4633 PAGE_SIZE, 0,
1f7ad75b 4634 end_bio_extent_buffer_writepage,
390ed29b 4635 0, 0, false);
0b32f4bb 4636 if (ret) {
5a2c6075 4637 set_btree_ioerr(p, eb);
fe01aa65
TK
4638 if (PageWriteback(p))
4639 end_page_writeback(p);
0b32f4bb
JB
4640 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4641 end_extent_buffer_writeback(eb);
4642 ret = -EIO;
4643 break;
4644 }
0c64c33c 4645 disk_bytenr += PAGE_SIZE;
3d4b9496 4646 update_nr_written(wbc, 1);
0b32f4bb
JB
4647 unlock_page(p);
4648 }
4649
4650 if (unlikely(ret)) {
4651 for (; i < num_pages; i++) {
bbf65cf0 4652 struct page *p = eb->pages[i];
81465028 4653 clear_page_dirty_for_io(p);
0b32f4bb
JB
4654 unlock_page(p);
4655 }
4656 }
4657
4658 return ret;
4659}
4660
c4aec299
QW
4661/*
4662 * Submit one subpage btree page.
4663 *
4664 * The main difference to submit_eb_page() is:
4665 * - Page locking
4666 * For subpage, we don't rely on page locking at all.
4667 *
4668 * - Flush write bio
4669 * We only flush bio if we may be unable to fit current extent buffers into
4670 * current bio.
4671 *
4672 * Return >=0 for the number of submitted extent buffers.
4673 * Return <0 for fatal error.
4674 */
4675static int submit_eb_subpage(struct page *page,
4676 struct writeback_control *wbc,
4677 struct extent_page_data *epd)
4678{
4679 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4680 int submitted = 0;
4681 u64 page_start = page_offset(page);
4682 int bit_start = 0;
c4aec299
QW
4683 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4684 int ret;
4685
4686 /* Lock and write each dirty extent buffers in the range */
72a69cd0 4687 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
c4aec299
QW
4688 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4689 struct extent_buffer *eb;
4690 unsigned long flags;
4691 u64 start;
4692
4693 /*
4694 * Take private lock to ensure the subpage won't be detached
4695 * in the meantime.
4696 */
4697 spin_lock(&page->mapping->private_lock);
4698 if (!PagePrivate(page)) {
4699 spin_unlock(&page->mapping->private_lock);
4700 break;
4701 }
4702 spin_lock_irqsave(&subpage->lock, flags);
72a69cd0
QW
4703 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
4704 subpage->bitmaps)) {
c4aec299
QW
4705 spin_unlock_irqrestore(&subpage->lock, flags);
4706 spin_unlock(&page->mapping->private_lock);
4707 bit_start++;
4708 continue;
4709 }
4710
4711 start = page_start + bit_start * fs_info->sectorsize;
4712 bit_start += sectors_per_node;
4713
4714 /*
4715 * Here we just want to grab the eb without touching extra
4716 * spin locks, so call find_extent_buffer_nolock().
4717 */
4718 eb = find_extent_buffer_nolock(fs_info, start);
4719 spin_unlock_irqrestore(&subpage->lock, flags);
4720 spin_unlock(&page->mapping->private_lock);
4721
4722 /*
4723 * The eb has already reached 0 refs thus find_extent_buffer()
4724 * doesn't return it. We don't need to write back such eb
4725 * anyway.
4726 */
4727 if (!eb)
4728 continue;
4729
4730 ret = lock_extent_buffer_for_io(eb, epd);
4731 if (ret == 0) {
4732 free_extent_buffer(eb);
4733 continue;
4734 }
4735 if (ret < 0) {
4736 free_extent_buffer(eb);
4737 goto cleanup;
4738 }
fa04c165 4739 ret = write_one_subpage_eb(eb, wbc, epd);
c4aec299
QW
4740 free_extent_buffer(eb);
4741 if (ret < 0)
4742 goto cleanup;
4743 submitted++;
4744 }
4745 return submitted;
4746
4747cleanup:
4748 /* We hit error, end bio for the submitted extent buffers */
4749 end_write_bio(epd, ret);
4750 return ret;
4751}
4752
f91e0d0c
QW
4753/*
4754 * Submit all page(s) of one extent buffer.
4755 *
4756 * @page: the page of one extent buffer
4757 * @eb_context: to determine if we need to submit this page, if current page
4758 * belongs to this eb, we don't need to submit
4759 *
4760 * The caller should pass each page in their bytenr order, and here we use
4761 * @eb_context to determine if we have submitted pages of one extent buffer.
4762 *
4763 * If we have, we just skip until we hit a new page that doesn't belong to
4764 * current @eb_context.
4765 *
4766 * If not, we submit all the page(s) of the extent buffer.
4767 *
4768 * Return >0 if we have submitted the extent buffer successfully.
4769 * Return 0 if we don't need to submit the page, as it's already submitted by
4770 * previous call.
4771 * Return <0 for fatal error.
4772 */
4773static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4774 struct extent_page_data *epd,
4775 struct extent_buffer **eb_context)
4776{
4777 struct address_space *mapping = page->mapping;
0bc09ca1 4778 struct btrfs_block_group *cache = NULL;
f91e0d0c
QW
4779 struct extent_buffer *eb;
4780 int ret;
4781
4782 if (!PagePrivate(page))
4783 return 0;
4784
fbca46eb 4785 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
c4aec299
QW
4786 return submit_eb_subpage(page, wbc, epd);
4787
f91e0d0c
QW
4788 spin_lock(&mapping->private_lock);
4789 if (!PagePrivate(page)) {
4790 spin_unlock(&mapping->private_lock);
4791 return 0;
4792 }
4793
4794 eb = (struct extent_buffer *)page->private;
4795
4796 /*
4797 * Shouldn't happen and normally this would be a BUG_ON but no point
4798 * crashing the machine for something we can survive anyway.
4799 */
4800 if (WARN_ON(!eb)) {
4801 spin_unlock(&mapping->private_lock);
4802 return 0;
4803 }
4804
4805 if (eb == *eb_context) {
4806 spin_unlock(&mapping->private_lock);
4807 return 0;
4808 }
4809 ret = atomic_inc_not_zero(&eb->refs);
4810 spin_unlock(&mapping->private_lock);
4811 if (!ret)
4812 return 0;
4813
0bc09ca1
NA
4814 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4815 /*
 4816 		 * If for_sync, this hole will be filled by a
 4817 		 * transaction commit.
4818 */
4819 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4820 ret = -EAGAIN;
4821 else
4822 ret = 0;
4823 free_extent_buffer(eb);
4824 return ret;
4825 }
4826
f91e0d0c
QW
4827 *eb_context = eb;
4828
4829 ret = lock_extent_buffer_for_io(eb, epd);
4830 if (ret <= 0) {
0bc09ca1
NA
4831 btrfs_revert_meta_write_pointer(cache, eb);
4832 if (cache)
4833 btrfs_put_block_group(cache);
f91e0d0c
QW
4834 free_extent_buffer(eb);
4835 return ret;
4836 }
be1a1d7a 4837 if (cache) {
d3e29967
NB
4838 /*
4839 * Implies write in zoned mode. Mark the last eb in a block group.
4840 */
be1a1d7a
NA
4841 if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
4842 set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
d3e29967 4843 btrfs_put_block_group(cache);
be1a1d7a 4844 }
f91e0d0c
QW
4845 ret = write_one_eb(eb, wbc, epd);
4846 free_extent_buffer(eb);
4847 if (ret < 0)
4848 return ret;
4849 return 1;
4850}
4851
0b32f4bb
JB
4852int btree_write_cache_pages(struct address_space *mapping,
4853 struct writeback_control *wbc)
4854{
f91e0d0c 4855 struct extent_buffer *eb_context = NULL;
0b32f4bb 4856 struct extent_page_data epd = {
390ed29b 4857 .bio_ctrl = { 0 },
0b32f4bb
JB
4858 .extent_locked = 0,
4859 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4860 };
b3ff8f1d 4861 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
0b32f4bb
JB
4862 int ret = 0;
4863 int done = 0;
4864 int nr_to_write_done = 0;
4865 struct pagevec pvec;
4866 int nr_pages;
4867 pgoff_t index;
4868 pgoff_t end; /* Inclusive */
4869 int scanned = 0;
10bbd235 4870 xa_mark_t tag;
0b32f4bb 4871
86679820 4872 pagevec_init(&pvec);
0b32f4bb
JB
4873 if (wbc->range_cyclic) {
4874 index = mapping->writeback_index; /* Start from prev offset */
4875 end = -1;
556755a8
JB
4876 /*
4877 * Start from the beginning does not need to cycle over the
4878 * range, mark it as scanned.
4879 */
4880 scanned = (index == 0);
0b32f4bb 4881 } else {
09cbfeaf
KS
4882 index = wbc->range_start >> PAGE_SHIFT;
4883 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
4884 scanned = 1;
4885 }
4886 if (wbc->sync_mode == WB_SYNC_ALL)
4887 tag = PAGECACHE_TAG_TOWRITE;
4888 else
4889 tag = PAGECACHE_TAG_DIRTY;
0bc09ca1 4890 btrfs_zoned_meta_io_lock(fs_info);
0b32f4bb
JB
4891retry:
4892 if (wbc->sync_mode == WB_SYNC_ALL)
4893 tag_pages_for_writeback(mapping, index, end);
4894 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 4895 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 4896 tag))) {
0b32f4bb
JB
4897 unsigned i;
4898
0b32f4bb
JB
4899 for (i = 0; i < nr_pages; i++) {
4900 struct page *page = pvec.pages[i];
4901
f91e0d0c
QW
4902 ret = submit_eb_page(page, wbc, &epd, &eb_context);
4903 if (ret == 0)
0b32f4bb 4904 continue;
f91e0d0c 4905 if (ret < 0) {
0b32f4bb 4906 done = 1;
0b32f4bb
JB
4907 break;
4908 }
0b32f4bb
JB
4909
4910 /*
4911 * the filesystem may choose to bump up nr_to_write.
4912 * We have to make sure to honor the new nr_to_write
4913 * at any time
4914 */
4915 nr_to_write_done = wbc->nr_to_write <= 0;
4916 }
4917 pagevec_release(&pvec);
4918 cond_resched();
4919 }
4920 if (!scanned && !done) {
4921 /*
4922 * We hit the last page and there is more work to be done: wrap
4923 * back to the start of the file
4924 */
4925 scanned = 1;
4926 index = 0;
4927 goto retry;
4928 }
2b952eea
QW
4929 if (ret < 0) {
4930 end_write_bio(&epd, ret);
0bc09ca1 4931 goto out;
2b952eea 4932 }
b3ff8f1d
QW
4933 /*
4934 * If something went wrong, don't allow any metadata write bio to be
4935 * submitted.
4936 *
4937 * This would prevent use-after-free if we had dirty pages not
 4938 	 * cleaned up, which can still happen with fuzzed images.
4939 *
4940 * - Bad extent tree
4941 * Allowing existing tree block to be allocated for other trees.
4942 *
4943 * - Log tree operations
 4944 	 *   Existing tree blocks get allocated to the log tree, which bumps their
 4945 	 *   generation, then they get cleaned in tree re-balance.
4946 * Such tree block will not be written back, since it's clean,
4947 * thus no WRITTEN flag set.
4948 * And after log writes back, this tree block is not traced by
4949 * any dirty extent_io_tree.
4950 *
4951 * - Offending tree block gets re-dirtied from its original owner
4952 * Since it has bumped generation, no WRITTEN flag, it can be
4953 * reused without COWing. This tree block will not be traced
4954 * by btrfs_transaction::dirty_pages.
4955 *
4956 * Now such dirty tree block will not be cleaned by any dirty
4957 * extent io tree. Thus we don't want to submit such wild eb
4958 * if the fs already has error.
4959 */
84961539 4960 if (!BTRFS_FS_ERROR(fs_info)) {
b3ff8f1d
QW
4961 ret = flush_write_bio(&epd);
4962 } else {
fbabd4a3 4963 ret = -EROFS;
b3ff8f1d
QW
4964 end_write_bio(&epd, ret);
4965 }
0bc09ca1
NA
4966out:
4967 btrfs_zoned_meta_io_unlock(fs_info);
0b32f4bb
JB
4968 return ret;
4969}
4970
d1310b2e 4971/**
3bed2da1
NB
4972 * Walk the list of dirty pages of the given address space and write all of them.
4973 *
d1310b2e 4974 * @mapping: address space structure to write
3bed2da1
NB
4975 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4976 * @epd: holds context for the write, namely the bio
d1310b2e
CM
4977 *
4978 * If a page is already under I/O, write_cache_pages() skips it, even
4979 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4980 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4981 * and msync() need to guarantee that all the data which was dirty at the time
4982 * the call was made get new I/O started against them. If wbc->sync_mode is
4983 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4984 * existing IO to complete.
4985 */
4242b64a 4986static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 4987 struct writeback_control *wbc,
aab6e9ed 4988 struct extent_page_data *epd)
d1310b2e 4989{
7fd1a3f7 4990 struct inode *inode = mapping->host;
d1310b2e
CM
4991 int ret = 0;
4992 int done = 0;
f85d7d6c 4993 int nr_to_write_done = 0;
d1310b2e
CM
4994 struct pagevec pvec;
4995 int nr_pages;
4996 pgoff_t index;
4997 pgoff_t end; /* Inclusive */
a9132667
LB
4998 pgoff_t done_index;
4999 int range_whole = 0;
d1310b2e 5000 int scanned = 0;
10bbd235 5001 xa_mark_t tag;
d1310b2e 5002
7fd1a3f7
JB
5003 /*
5004 * We have to hold onto the inode so that ordered extents can do their
5005 * work when the IO finishes. The alternative to this is failing to add
5006 * an ordered extent if the igrab() fails there and that is a huge pain
5007 * to deal with, so instead just hold onto the inode throughout the
5008 * writepages operation. If it fails here we are freeing up the inode
5009 * anyway and we'd rather not waste our time writing out stuff that is
5010 * going to be truncated anyway.
5011 */
5012 if (!igrab(inode))
5013 return 0;
5014
86679820 5015 pagevec_init(&pvec);
d1310b2e
CM
5016 if (wbc->range_cyclic) {
5017 index = mapping->writeback_index; /* Start from prev offset */
5018 end = -1;
556755a8
JB
5019 /*
5020 * Start from the beginning does not need to cycle over the
5021 * range, mark it as scanned.
5022 */
5023 scanned = (index == 0);
d1310b2e 5024 } else {
09cbfeaf
KS
5025 index = wbc->range_start >> PAGE_SHIFT;
5026 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
5027 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
5028 range_whole = 1;
d1310b2e
CM
5029 scanned = 1;
5030 }
3cd24c69
EL
5031
5032 /*
5033 * We do the tagged writepage as long as the snapshot flush bit is set
5034 * and we are the first one who do the filemap_flush() on this inode.
5035 *
5036 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
5037 * not race in and drop the bit.
5038 */
5039 if (range_whole && wbc->nr_to_write == LONG_MAX &&
5040 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
5041 &BTRFS_I(inode)->runtime_flags))
5042 wbc->tagged_writepages = 1;
5043
5044 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
5045 tag = PAGECACHE_TAG_TOWRITE;
5046 else
5047 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 5048retry:
3cd24c69 5049 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 5050 tag_pages_for_writeback(mapping, index, end);
a9132667 5051 done_index = index;
f85d7d6c 5052 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
5053 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
5054 &index, end, tag))) {
d1310b2e
CM
5055 unsigned i;
5056
d1310b2e
CM
5057 for (i = 0; i < nr_pages; i++) {
5058 struct page *page = pvec.pages[i];
5059
f7bddf1e 5060 done_index = page->index + 1;
d1310b2e 5061 /*
b93b0163
MW
5062 * At this point we hold neither the i_pages lock nor
5063 * the page lock: the page may be truncated or
5064 * invalidated (changing page->mapping to NULL),
5065 * or even swizzled back from swapper_space to
5066 * tmpfs file mapping
d1310b2e 5067 */
c8f2f24b 5068 if (!trylock_page(page)) {
f4340622
QW
5069 ret = flush_write_bio(epd);
5070 BUG_ON(ret < 0);
c8f2f24b 5071 lock_page(page);
01d658f2 5072 }
d1310b2e
CM
5073
5074 if (unlikely(page->mapping != mapping)) {
5075 unlock_page(page);
5076 continue;
5077 }
5078
d2c3f4f6 5079 if (wbc->sync_mode != WB_SYNC_NONE) {
f4340622
QW
5080 if (PageWriteback(page)) {
5081 ret = flush_write_bio(epd);
5082 BUG_ON(ret < 0);
5083 }
d1310b2e 5084 wait_on_page_writeback(page);
d2c3f4f6 5085 }
d1310b2e
CM
5086
5087 if (PageWriteback(page) ||
5088 !clear_page_dirty_for_io(page)) {
5089 unlock_page(page);
5090 continue;
5091 }
5092
aab6e9ed 5093 ret = __extent_writepage(page, wbc, epd);
a9132667 5094 if (ret < 0) {
a9132667
LB
5095 done = 1;
5096 break;
5097 }
f85d7d6c
CM
5098
5099 /*
5100 * the filesystem may choose to bump up nr_to_write.
5101 * We have to make sure to honor the new nr_to_write
5102 * at any time
5103 */
5104 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
5105 }
5106 pagevec_release(&pvec);
5107 cond_resched();
5108 }
894b36e3 5109 if (!scanned && !done) {
d1310b2e
CM
5110 /*
5111 * We hit the last page and there is more work to be done: wrap
5112 * back to the start of the file
5113 */
5114 scanned = 1;
5115 index = 0;
42ffb0bf
JB
5116
5117 /*
5118 * If we're looping we could run into a page that is locked by a
5119 * writer and that writer could be waiting on writeback for a
5120 * page in our current bio, and thus deadlock, so flush the
5121 * write bio here.
5122 */
5123 ret = flush_write_bio(epd);
5124 if (!ret)
5125 goto retry;
d1310b2e 5126 }
a9132667
LB
5127
5128 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
5129 mapping->writeback_index = done_index;
5130
7fd1a3f7 5131 btrfs_add_delayed_iput(inode);
894b36e3 5132 return ret;
d1310b2e 5133}
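/*
 * A sketch of the range and tag setup in extent_write_cache_pages():
 * integrity writeback (WB_SYNC_ALL) walks pages tagged TOWRITE so redirtied
 * pages cannot be missed, while cyclic background writeback resumes from the
 * stored index and wraps back to zero at most once.  The struct, enum and
 * PAGE_SHIFT_EX value are stand-ins for the example, not kernel definitions.
 */
#include <stdint.h>
#include <stdbool.h>

#define PAGE_SHIFT_EX 12	/* assume 4K pages for the example */

enum wb_tag { TAG_DIRTY, TAG_TOWRITE };

struct wb_plan {
	uint64_t index;		/* first page index to consider */
	uint64_t end;		/* last page index, inclusive */
	bool scanned;		/* no wrap back to index 0 needed */
	enum wb_tag tag;
};

static struct wb_plan plan_writeback(bool sync_all, bool tagged_writepages,
				     bool range_cyclic, uint64_t writeback_index,
				     uint64_t range_start, uint64_t range_end)
{
	struct wb_plan p;

	if (range_cyclic) {
		p.index = writeback_index;	/* resume from previous offset */
		p.end = UINT64_MAX;
		p.scanned = (p.index == 0);	/* starting at 0 already covers all */
	} else {
		p.index = range_start >> PAGE_SHIFT_EX;
		p.end = range_end >> PAGE_SHIFT_EX;
		p.scanned = true;		/* explicit ranges never wrap */
	}
	p.tag = (sync_all || tagged_writepages) ? TAG_TOWRITE : TAG_DIRTY;
	return p;
}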
d1310b2e 5134
0a9b0e53 5135int extent_write_full_page(struct page *page, struct writeback_control *wbc)
d1310b2e
CM
5136{
5137 int ret;
d1310b2e 5138 struct extent_page_data epd = {
390ed29b 5139 .bio_ctrl = { 0 },
771ed689 5140 .extent_locked = 0,
ffbd517d 5141 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e 5142 };
d1310b2e 5143
d1310b2e 5144 ret = __extent_writepage(page, wbc, &epd);
3065976b
QW
5145 ASSERT(ret <= 0);
5146 if (ret < 0) {
5147 end_write_bio(&epd, ret);
5148 return ret;
5149 }
d1310b2e 5150
3065976b
QW
5151 ret = flush_write_bio(&epd);
5152 ASSERT(ret <= 0);
d1310b2e
CM
5153 return ret;
5154}
d1310b2e 5155
2bd0fc93
QW
5156/*
 5157 * Submit the pages in the range to the bio, for call sites where the delalloc
 5158 * range has already been run (i.e. an ordered extent was inserted) and all
 5159 * pages are still locked.
5160 */
5161int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
771ed689 5162{
2bd0fc93
QW
5163 bool found_error = false;
5164 int first_error = 0;
771ed689
CM
5165 int ret = 0;
5166 struct address_space *mapping = inode->i_mapping;
5167 struct page *page;
2bd0fc93 5168 u64 cur = start;
66448b9d
QW
5169 unsigned long nr_pages;
5170 const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
771ed689 5171 struct extent_page_data epd = {
390ed29b 5172 .bio_ctrl = { 0 },
771ed689 5173 .extent_locked = 1,
2bd0fc93 5174 .sync_io = 1,
771ed689
CM
5175 };
5176 struct writeback_control wbc_writepages = {
2bd0fc93 5177 .sync_mode = WB_SYNC_ALL,
771ed689
CM
5178 .range_start = start,
5179 .range_end = end + 1,
ec39f769
CM
5180 /* We're called from an async helper function */
5181 .punt_to_cgroup = 1,
5182 .no_cgroup_owner = 1,
771ed689
CM
5183 };
5184
66448b9d
QW
5185 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
5186 nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
5187 PAGE_SHIFT;
5188 wbc_writepages.nr_to_write = nr_pages * 2;
5189
dbb70bec 5190 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
2bd0fc93 5191 while (cur <= end) {
66448b9d
QW
5192 u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
5193
2bd0fc93
QW
5194 page = find_get_page(mapping, cur >> PAGE_SHIFT);
5195 /*
5196 * All pages in the range are locked since
5197 * btrfs_run_delalloc_range(), thus there is no way to clear
5198 * the page dirty flag.
5199 */
66448b9d 5200 ASSERT(PageLocked(page));
2bd0fc93
QW
5201 ASSERT(PageDirty(page));
5202 clear_page_dirty_for_io(page);
5203 ret = __extent_writepage(page, &wbc_writepages, &epd);
5204 ASSERT(ret <= 0);
5205 if (ret < 0) {
5206 found_error = true;
5207 first_error = ret;
771ed689 5208 }
09cbfeaf 5209 put_page(page);
66448b9d 5210 cur = cur_end + 1;
771ed689
CM
5211 }
5212
2bd0fc93 5213 if (!found_error)
dbb70bec
CM
5214 ret = flush_write_bio(&epd);
5215 else
02c6db4f 5216 end_write_bio(&epd, ret);
dbb70bec
CM
5217
5218 wbc_detach_inode(&wbc_writepages);
2bd0fc93
QW
5219 if (found_error)
5220 return first_error;
771ed689
CM
5221 return ret;
5222}
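/*
 * A worked example of the nr_to_write sizing in extent_write_locked_range():
 * the page count of the locked range comes from rounding start down and the
 * inclusive end up to a page boundary, and is then doubled so writeback never
 * runs out of budget mid-range.  4K pages are assumed for the numbers.
 */
#include <stdint.h>

#define EX_PAGE_SIZE 4096ULL

static uint64_t locked_range_nr_to_write(uint64_t start, uint64_t end)
{
	uint64_t first = start / EX_PAGE_SIZE * EX_PAGE_SIZE;			/* round_down */
	uint64_t last = (end + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE * EX_PAGE_SIZE;	/* round_up */
	uint64_t nr_pages = (last - first) / EX_PAGE_SIZE;

	/* e.g. start = 0, end = 8191  ->  nr_pages = 2  ->  nr_to_write = 4 */
	return nr_pages * 2;
}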
d1310b2e 5223
8ae225a8 5224int extent_writepages(struct address_space *mapping,
d1310b2e
CM
5225 struct writeback_control *wbc)
5226{
35156d85 5227 struct inode *inode = mapping->host;
d1310b2e
CM
5228 int ret = 0;
5229 struct extent_page_data epd = {
390ed29b 5230 .bio_ctrl = { 0 },
771ed689 5231 .extent_locked = 0,
ffbd517d 5232 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
5233 };
5234
35156d85
JT
5235 /*
5236 * Allow only a single thread to do the reloc work in zoned mode to
5237 * protect the write pointer updates.
5238 */
869f4cdc 5239 btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
935db853 5240 ret = extent_write_cache_pages(mapping, wbc, &epd);
869f4cdc 5241 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
a2a72fbd
QW
5242 ASSERT(ret <= 0);
5243 if (ret < 0) {
5244 end_write_bio(&epd, ret);
5245 return ret;
5246 }
5247 ret = flush_write_bio(&epd);
d1310b2e
CM
5248 return ret;
5249}
d1310b2e 5250
ba206a02 5251void extent_readahead(struct readahead_control *rac)
d1310b2e 5252{
390ed29b 5253 struct btrfs_bio_ctrl bio_ctrl = { 0 };
67c9684f 5254 struct page *pagepool[16];
125bac01 5255 struct extent_map *em_cached = NULL;
808f80b4 5256 u64 prev_em_start = (u64)-1;
ba206a02 5257 int nr;
d1310b2e 5258
ba206a02 5259 while ((nr = readahead_page_batch(rac, pagepool))) {
32c0a6bc
MWO
5260 u64 contig_start = readahead_pos(rac);
5261 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
e65ef21e 5262
ba206a02 5263 contiguous_readpages(pagepool, nr, contig_start, contig_end,
390ed29b 5264 &em_cached, &bio_ctrl, &prev_em_start);
d1310b2e 5265 }
67c9684f 5266
125bac01
MX
5267 if (em_cached)
5268 free_extent_map(em_cached);
5269
390ed29b
QW
5270 if (bio_ctrl.bio) {
5271 if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
ba206a02
MWO
5272 return;
5273 }
d1310b2e 5274}
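/*
 * A sketch of the batch arithmetic in extent_readahead(): each batch of pages
 * covers the inclusive byte range [pos, pos + batch_bytes - 1].  The function
 * below is an invented stand-in; readahead_pos()/readahead_batch_length()
 * values are just passed in as plain integers.
 */
#include <stdint.h>
#include <stdio.h>

static void show_batch_range(uint64_t pos, uint64_t batch_bytes)
{
	uint64_t contig_start = pos;
	uint64_t contig_end = pos + batch_bytes - 1;	/* inclusive end */

	printf("readahead batch: [%llu, %llu]\n",
	       (unsigned long long)contig_start,
	       (unsigned long long)contig_end);
}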
d1310b2e
CM
5275
5276/*
895586eb
MWO
5277 * basic invalidate_folio code, this waits on any locked or writeback
5278 * ranges corresponding to the folio, and then deletes any extent state
d1310b2e
CM
5279 * records from the tree
5280 */
895586eb
MWO
5281int extent_invalidate_folio(struct extent_io_tree *tree,
5282 struct folio *folio, size_t offset)
d1310b2e 5283{
2ac55d41 5284 struct extent_state *cached_state = NULL;
895586eb
MWO
5285 u64 start = folio_pos(folio);
5286 u64 end = start + folio_size(folio) - 1;
5287 size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
d1310b2e 5288
829ddec9
QW
5289 /* This function is only called for the btree inode */
5290 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
5291
fda2832f 5292 start += ALIGN(offset, blocksize);
d1310b2e
CM
5293 if (start > end)
5294 return 0;
5295
ff13db41 5296 lock_extent_bits(tree, start, end, &cached_state);
895586eb 5297 folio_wait_writeback(folio);
829ddec9
QW
5298
5299 /*
5300 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
5301 * so here we only need to unlock the extent range to free any
5302 * existing extent state.
5303 */
5304 unlock_extent_cached(tree, start, end, &cached_state);
d1310b2e
CM
5305 return 0;
5306}
d1310b2e 5307
7b13b7b1
CM
5308/*
5309 * a helper for releasepage, this tests for areas of the page that
5310 * are locked or under IO and drops the related state bits if it is safe
5311 * to drop the page.
5312 */
29c68b2d 5313static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 5314 struct page *page, gfp_t mask)
7b13b7b1 5315{
4eee4fa4 5316 u64 start = page_offset(page);
09cbfeaf 5317 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
5318 int ret = 1;
5319
8882679e 5320 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 5321 ret = 0;
8882679e 5322 } else {
11ef160f 5323 /*
2766ff61
FM
5324 * At this point we can safely clear everything except the
5325 * locked bit, the nodatasum bit and the delalloc new bit.
5326 * The delalloc new bit will be cleared by ordered extent
5327 * completion.
11ef160f 5328 */
66b0c887 5329 ret = __clear_extent_bit(tree, start, end,
2766ff61
FM
5330 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5331 0, 0, NULL, mask, NULL);
e3f24cc5
CM
5332
5333 /* if clear_extent_bit failed for enomem reasons,
5334 * we can't allow the release to continue.
5335 */
5336 if (ret < 0)
5337 ret = 0;
5338 else
5339 ret = 1;
7b13b7b1
CM
5340 }
5341 return ret;
5342}
7b13b7b1 5343
d1310b2e
CM
5344/*
5345 * a helper for releasepage. As long as there are no locked extents
5346 * in the range corresponding to the page, both state records and extent
5347 * map records are removed
5348 */
477a30ba 5349int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
5350{
5351 struct extent_map *em;
4eee4fa4 5352 u64 start = page_offset(page);
09cbfeaf 5353 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
5354 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5355 struct extent_io_tree *tree = &btrfs_inode->io_tree;
5356 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 5357
d0164adc 5358 if (gfpflags_allow_blocking(mask) &&
ee22184b 5359 page->mapping->host->i_size > SZ_16M) {
39b5637f 5360 u64 len;
70dec807 5361 while (start <= end) {
fbc2bd7e
FM
5362 struct btrfs_fs_info *fs_info;
5363 u64 cur_gen;
5364
39b5637f 5365 len = end - start + 1;
890871be 5366 write_lock(&map->lock);
39b5637f 5367 em = lookup_extent_mapping(map, start, len);
285190d9 5368 if (!em) {
890871be 5369 write_unlock(&map->lock);
70dec807
CM
5370 break;
5371 }
7f3c74fb
CM
5372 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5373 em->start != start) {
890871be 5374 write_unlock(&map->lock);
70dec807
CM
5375 free_extent_map(em);
5376 break;
5377 }
3d6448e6
FM
5378 if (test_range_bit(tree, em->start,
5379 extent_map_end(em) - 1,
5380 EXTENT_LOCKED, 0, NULL))
5381 goto next;
5382 /*
5383 * If it's not in the list of modified extents, used
5384 * by a fast fsync, we can remove it. If it's being
5385 * logged we can safely remove it since fsync took an
5386 * extra reference on the em.
5387 */
5388 if (list_empty(&em->list) ||
fbc2bd7e
FM
5389 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5390 goto remove_em;
5391 /*
5392 * If it's in the list of modified extents, remove it
 5393 * only if its generation is older than the current one,
5394 * in which case we don't need it for a fast fsync.
5395 * Otherwise don't remove it, we could be racing with an
5396 * ongoing fast fsync that could miss the new extent.
5397 */
5398 fs_info = btrfs_inode->root->fs_info;
5399 spin_lock(&fs_info->trans_lock);
5400 cur_gen = fs_info->generation;
5401 spin_unlock(&fs_info->trans_lock);
5402 if (em->generation >= cur_gen)
5403 goto next;
5404remove_em:
5e548b32
FM
5405 /*
5406 * We only remove extent maps that are not in the list of
5407 * modified extents or that are in the list but with a
 5408 * generation lower than the current generation, so there
5409 * is no need to set the full fsync flag on the inode (it
5410 * hurts the fsync performance for workloads with a data
5411 * size that exceeds or is close to the system's memory).
5412 */
fbc2bd7e
FM
5413 remove_extent_mapping(map, em);
5414 /* once for the rb tree */
5415 free_extent_map(em);
3d6448e6 5416next:
70dec807 5417 start = extent_map_end(em);
890871be 5418 write_unlock(&map->lock);
70dec807
CM
5419
5420 /* once for us */
d1310b2e 5421 free_extent_map(em);
9f47eb54
PM
5422
5423 cond_resched(); /* Allow large-extent preemption. */
d1310b2e 5424 }
d1310b2e 5425 }
29c68b2d 5426 return try_release_extent_state(tree, page, mask);
d1310b2e 5427}
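/*
 * The decision made per extent map in try_release_extent_mapping(), restated
 * as a standalone predicate: pinned, locked, or still-needed-by-fast-fsync
 * maps have to stay cached.  The structure below is a simplified stand-in
 * for the example, not the kernel's struct extent_map.
 */
#include <stdbool.h>
#include <stdint.h>

struct em_sketch {
	bool pinned;		/* EXTENT_FLAG_PINNED */
	bool logging;		/* EXTENT_FLAG_LOGGING */
	bool on_modified_list;	/* !list_empty(&em->list) */
	bool range_locked;	/* EXTENT_LOCKED anywhere in the range */
	uint64_t generation;
};

static bool can_drop_extent_map(const struct em_sketch *em, uint64_t cur_gen)
{
	if (em->pinned || em->range_locked)
		return false;
	/* Not on the modified list, or already held by a log: safe to drop. */
	if (!em->on_modified_list || em->logging)
		return true;
	/* On the modified list: drop only if a fast fsync no longer needs it. */
	return em->generation < cur_gen;
}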
d1310b2e 5428
ec29ed5b
CM
5429/*
5430 * helper function for fiemap, which doesn't want to see any holes.
5431 * This maps until we find something past 'last'
5432 */
f1bbde8d 5433static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
e3350e16 5434 u64 offset, u64 last)
ec29ed5b 5435{
f1bbde8d 5436 u64 sectorsize = btrfs_inode_sectorsize(inode);
ec29ed5b
CM
5437 struct extent_map *em;
5438 u64 len;
5439
5440 if (offset >= last)
5441 return NULL;
5442
67871254 5443 while (1) {
ec29ed5b
CM
5444 len = last - offset;
5445 if (len == 0)
5446 break;
fda2832f 5447 len = ALIGN(len, sectorsize);
f1bbde8d 5448 em = btrfs_get_extent_fiemap(inode, offset, len);
6b5b7a41 5449 if (IS_ERR(em))
ec29ed5b
CM
5450 return em;
5451
5452 /* if this isn't a hole return it */
4a2d25cd 5453 if (em->block_start != EXTENT_MAP_HOLE)
ec29ed5b 5454 return em;
ec29ed5b
CM
5455
5456 /* this is a hole, advance to the next extent */
5457 offset = extent_map_end(em);
5458 free_extent_map(em);
5459 if (offset >= last)
5460 break;
5461 }
5462 return NULL;
5463}
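/*
 * A userspace sketch of the loop in get_extent_skip_holes(): walk mappings
 * forward from 'offset' and return the first one that is not a hole, or NULL
 * once 'last' is reached.  The sorted array stands in for repeated
 * btrfs_get_extent_fiemap() lookups; the types are invented for the example.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

struct map_sketch {
	uint64_t start;
	uint64_t len;
	bool hole;
};

static const struct map_sketch *skip_holes(const struct map_sketch *maps,
					   size_t nr, uint64_t offset,
					   uint64_t last)
{
	for (size_t i = 0; i < nr; i++) {
		const struct map_sketch *m = &maps[i];
		uint64_t m_end = m->start + m->len;

		if (offset >= last)
			break;
		if (m_end <= offset)
			continue;	/* mapping ends before the cursor */
		if (!m->hole)
			return m;	/* first real extent at or after offset */
		offset = m_end;		/* hole: advance past it and keep going */
	}
	return NULL;
}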
5464
4751832d
QW
5465/*
5466 * To cache previous fiemap extent
5467 *
5468 * Will be used for merging fiemap extent
5469 */
5470struct fiemap_cache {
5471 u64 offset;
5472 u64 phys;
5473 u64 len;
5474 u32 flags;
5475 bool cached;
5476};
5477
5478/*
5479 * Helper to submit fiemap extent.
5480 *
 5481 * Will try to merge the current fiemap extent, specified by @offset, @phys,
 5482 * @len and @flags, with the cached one.
 5483 * Only when the merge fails is the cached one submitted as a
 5484 * fiemap extent.
5485 *
5486 * Return value is the same as fiemap_fill_next_extent().
5487 */
5488static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5489 struct fiemap_cache *cache,
5490 u64 offset, u64 phys, u64 len, u32 flags)
5491{
5492 int ret = 0;
5493
5494 if (!cache->cached)
5495 goto assign;
5496
5497 /*
5498 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 5499 * fiemap extent won't overlap with cached one.
4751832d
QW
5500 * Not recoverable.
5501 *
5502 * NOTE: Physical address can overlap, due to compression
5503 */
5504 if (cache->offset + cache->len > offset) {
5505 WARN_ON(1);
5506 return -EINVAL;
5507 }
5508
5509 /*
 5510 * Only merge fiemap extents if:
 5511 * 1) Their logical addresses are contiguous
 5512 *
 5513 * 2) Their physical addresses are contiguous
 5514 * So truly compressed (physical size smaller than logical size)
 5515 * extents won't get merged with each other
 5516 *
 5517 * 3) They share the same flags except FIEMAP_EXTENT_LAST
 5518 * So a regular extent won't get merged with a prealloc extent
5519 */
5520 if (cache->offset + cache->len == offset &&
5521 cache->phys + cache->len == phys &&
5522 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5523 (flags & ~FIEMAP_EXTENT_LAST)) {
5524 cache->len += len;
5525 cache->flags |= flags;
5526 goto try_submit_last;
5527 }
5528
5529 /* Not mergeable, need to submit cached one */
5530 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5531 cache->len, cache->flags);
5532 cache->cached = false;
5533 if (ret)
5534 return ret;
5535assign:
5536 cache->cached = true;
5537 cache->offset = offset;
5538 cache->phys = phys;
5539 cache->len = len;
5540 cache->flags = flags;
5541try_submit_last:
5542 if (cache->flags & FIEMAP_EXTENT_LAST) {
5543 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5544 cache->phys, cache->len, cache->flags);
5545 cache->cached = false;
5546 }
5547 return ret;
5548}
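/*
 * The merge test in emit_fiemap_extent() in isolation: two fiemap extents can
 * be merged only when they are logically contiguous, physically contiguous,
 * and share the same flags apart from FIEMAP_EXTENT_LAST.  A standalone
 * sketch using the UAPI flag from <linux/fiemap.h>; the struct is an invented
 * mirror of the fiemap_cache fields above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <linux/fiemap.h>

struct cached_ext {
	uint64_t offset;	/* logical start */
	uint64_t phys;		/* physical start */
	uint64_t len;
	uint32_t flags;
};

static bool can_merge(const struct cached_ext *c,
		      uint64_t offset, uint64_t phys, uint32_t flags)
{
	return c->offset + c->len == offset &&
	       c->phys + c->len == phys &&
	       (c->flags & ~FIEMAP_EXTENT_LAST) == (flags & ~FIEMAP_EXTENT_LAST);
}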
5549
5550/*
848c23b7 5551 * Emit last fiemap cache
4751832d 5552 *
848c23b7
QW
5553 * The last fiemap cache may still be cached in the following case:
 5554 *   0                    4k                        8k
 5555 *   |<-- Fiemap range -->|
 5556 *   |<--------------- First extent --------------->|
5557 *
5558 * In this case, the first extent range will be cached but not emitted.
5559 * So we must emit it before ending extent_fiemap().
4751832d 5560 */
5c5aff98 5561static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 5562 struct fiemap_cache *cache)
4751832d
QW
5563{
5564 int ret;
5565
5566 if (!cache->cached)
5567 return 0;
5568
4751832d
QW
5569 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5570 cache->len, cache->flags);
5571 cache->cached = false;
5572 if (ret > 0)
5573 ret = 0;
5574 return ret;
5575}
5576
facee0a0 5577int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
bab16e21 5578 u64 start, u64 len)
1506fcc8 5579{
975f84fe 5580 int ret = 0;
15c7745c 5581 u64 off;
1506fcc8
YS
5582 u64 max = start + len;
5583 u32 flags = 0;
975f84fe
JB
5584 u32 found_type;
5585 u64 last;
ec29ed5b 5586 u64 last_for_get_extent = 0;
1506fcc8 5587 u64 disko = 0;
facee0a0 5588 u64 isize = i_size_read(&inode->vfs_inode);
975f84fe 5589 struct btrfs_key found_key;
1506fcc8 5590 struct extent_map *em = NULL;
2ac55d41 5591 struct extent_state *cached_state = NULL;
975f84fe 5592 struct btrfs_path *path;
facee0a0 5593 struct btrfs_root *root = inode->root;
4751832d 5594 struct fiemap_cache cache = { 0 };
5911c8fe
DS
5595 struct ulist *roots;
5596 struct ulist *tmp_ulist;
1506fcc8 5597 int end = 0;
ec29ed5b
CM
5598 u64 em_start = 0;
5599 u64 em_len = 0;
5600 u64 em_end = 0;
1506fcc8
YS
5601
5602 if (len == 0)
5603 return -EINVAL;
5604
975f84fe
JB
5605 path = btrfs_alloc_path();
5606 if (!path)
5607 return -ENOMEM;
975f84fe 5608
5911c8fe
DS
5609 roots = ulist_alloc(GFP_KERNEL);
5610 tmp_ulist = ulist_alloc(GFP_KERNEL);
5611 if (!roots || !tmp_ulist) {
5612 ret = -ENOMEM;
5613 goto out_free_ulist;
5614 }
5615
15c7745c
BB
5616 /*
5617 * We can't initialize that to 'start' as this could miss extents due
5618 * to extent item merging
5619 */
5620 off = 0;
facee0a0
NB
5621 start = round_down(start, btrfs_inode_sectorsize(inode));
5622 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4d479cf0 5623
ec29ed5b
CM
5624 /*
5625 * lookup the last file extent. We're not using i_size here
5626 * because there might be preallocation past i_size
5627 */
facee0a0
NB
5628 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5629 0);
975f84fe 5630 if (ret < 0) {
5911c8fe 5631 goto out_free_ulist;
2d324f59
LB
5632 } else {
5633 WARN_ON(!ret);
5634 if (ret == 1)
5635 ret = 0;
975f84fe 5636 }
2d324f59 5637
975f84fe 5638 path->slots[0]--;
975f84fe 5639 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
962a298f 5640 found_type = found_key.type;
975f84fe 5641
ec29ed5b 5642 /* No extents, but there might be delalloc bits */
facee0a0 5643 if (found_key.objectid != btrfs_ino(inode) ||
975f84fe 5644 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
5645 /* have to trust i_size as the end */
5646 last = (u64)-1;
5647 last_for_get_extent = isize;
5648 } else {
5649 /*
5650 * remember the start of the last extent. There are a
5651 * bunch of different factors that go into the length of the
 5652 * extent, so it's much less complex to remember where it started
5653 */
5654 last = found_key.offset;
5655 last_for_get_extent = last + 1;
975f84fe 5656 }
fe09e16c 5657 btrfs_release_path(path);
975f84fe 5658
ec29ed5b
CM
5659 /*
5660 * we might have some extents allocated but more delalloc past those
5661 * extents. so, we trust isize unless the start of the last extent is
5662 * beyond isize
5663 */
5664 if (last < isize) {
5665 last = (u64)-1;
5666 last_for_get_extent = isize;
5667 }
5668
facee0a0 5669 lock_extent_bits(&inode->io_tree, start, start + len - 1,
d0082371 5670 &cached_state);
ec29ed5b 5671
facee0a0 5672 em = get_extent_skip_holes(inode, start, last_for_get_extent);
1506fcc8
YS
5673 if (!em)
5674 goto out;
5675 if (IS_ERR(em)) {
5676 ret = PTR_ERR(em);
5677 goto out;
5678 }
975f84fe 5679
1506fcc8 5680 while (!end) {
b76bb701 5681 u64 offset_in_extent = 0;
ea8efc74
CM
5682
5683 /* break if the extent we found is outside the range */
5684 if (em->start >= max || extent_map_end(em) < off)
5685 break;
5686
5687 /*
5688 * get_extent may return an extent that starts before our
5689 * requested range. We have to make sure the ranges
5690 * we return to fiemap always move forward and don't
5691 * overlap, so adjust the offsets here
5692 */
5693 em_start = max(em->start, off);
1506fcc8 5694
ea8efc74
CM
5695 /*
5696 * record the offset from the start of the extent
b76bb701
JB
5697 * for adjusting the disk offset below. Only do this if the
5698 * extent isn't compressed since our in ram offset may be past
5699 * what we have actually allocated on disk.
ea8efc74 5700 */
b76bb701
JB
5701 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5702 offset_in_extent = em_start - em->start;
ec29ed5b 5703 em_end = extent_map_end(em);
ea8efc74 5704 em_len = em_end - em_start;
1506fcc8 5705 flags = 0;
f0986318
FM
5706 if (em->block_start < EXTENT_MAP_LAST_BYTE)
5707 disko = em->block_start + offset_in_extent;
5708 else
5709 disko = 0;
1506fcc8 5710
ea8efc74
CM
5711 /*
5712 * bump off for our next call to get_extent
5713 */
5714 off = extent_map_end(em);
5715 if (off >= max)
5716 end = 1;
5717
93dbfad7 5718 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
5719 end = 1;
5720 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 5721 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
5722 flags |= (FIEMAP_EXTENT_DATA_INLINE |
5723 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 5724 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
5725 flags |= (FIEMAP_EXTENT_DELALLOC |
5726 FIEMAP_EXTENT_UNKNOWN);
dc046b10
JB
5727 } else if (fieinfo->fi_extents_max) {
5728 u64 bytenr = em->block_start -
5729 (em->start - em->orig_start);
fe09e16c 5730
fe09e16c
LB
5731 /*
5732 * As btrfs supports shared space, this information
5733 * can be exported to userspace tools via
dc046b10
JB
5734 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
5735 * then we're just getting a count and we can skip the
5736 * lookup stuff.
fe09e16c 5737 */
facee0a0 5738 ret = btrfs_check_shared(root, btrfs_ino(inode),
5911c8fe 5739 bytenr, roots, tmp_ulist);
dc046b10 5740 if (ret < 0)
fe09e16c 5741 goto out_free;
dc046b10 5742 if (ret)
fe09e16c 5743 flags |= FIEMAP_EXTENT_SHARED;
dc046b10 5744 ret = 0;
1506fcc8
YS
5745 }
5746 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5747 flags |= FIEMAP_EXTENT_ENCODED;
0d2b2372
JB
5748 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5749 flags |= FIEMAP_EXTENT_UNWRITTEN;
1506fcc8 5750
1506fcc8
YS
5751 free_extent_map(em);
5752 em = NULL;
ec29ed5b
CM
5753 if ((em_start >= last) || em_len == (u64)-1 ||
5754 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
5755 flags |= FIEMAP_EXTENT_LAST;
5756 end = 1;
5757 }
5758
ec29ed5b 5759 /* now scan forward to see if this is really the last extent. */
facee0a0 5760 em = get_extent_skip_holes(inode, off, last_for_get_extent);
ec29ed5b
CM
5761 if (IS_ERR(em)) {
5762 ret = PTR_ERR(em);
5763 goto out;
5764 }
5765 if (!em) {
975f84fe
JB
5766 flags |= FIEMAP_EXTENT_LAST;
5767 end = 1;
5768 }
4751832d
QW
5769 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5770 em_len, flags);
26e726af
CS
5771 if (ret) {
5772 if (ret == 1)
5773 ret = 0;
ec29ed5b 5774 goto out_free;
26e726af 5775 }
1506fcc8
YS
5776 }
5777out_free:
4751832d 5778 if (!ret)
5c5aff98 5779 ret = emit_last_fiemap_cache(fieinfo, &cache);
1506fcc8
YS
5780 free_extent_map(em);
5781out:
facee0a0 5782 unlock_extent_cached(&inode->io_tree, start, start + len - 1,
e43bbe5e 5783 &cached_state);
5911c8fe
DS
5784
5785out_free_ulist:
e02d48ea 5786 btrfs_free_path(path);
5911c8fe
DS
5787 ulist_free(roots);
5788 ulist_free(tmp_ulist);
1506fcc8
YS
5789 return ret;
5790}
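/*
 * For reference, a minimal userspace program that exercises the fiemap path
 * above through the FS_IOC_FIEMAP ioctl.  The extent count of 32 is an
 * arbitrary choice for the example; error handling is kept minimal.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0) {
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("logical %llu physical %llu len %llu flags 0x%x\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);
	}

	free(fm);
	close(fd);
	return 0;
}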
5791
727011e0
CM
5792static void __free_extent_buffer(struct extent_buffer *eb)
5793{
727011e0
CM
5794 kmem_cache_free(extent_buffer_cache, eb);
5795}
5796
2b48966a 5797int extent_buffer_under_io(const struct extent_buffer *eb)
db7f3436
JB
5798{
5799 return (atomic_read(&eb->io_pages) ||
5800 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5801 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5802}
5803
8ff8466d 5804static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
db7f3436 5805{
8ff8466d 5806 struct btrfs_subpage *subpage;
db7f3436 5807
8ff8466d 5808 lockdep_assert_held(&page->mapping->private_lock);
db7f3436 5809
8ff8466d
QW
5810 if (PagePrivate(page)) {
5811 subpage = (struct btrfs_subpage *)page->private;
5812 if (atomic_read(&subpage->eb_refs))
5813 return true;
3d078efa
QW
5814 /*
5815 * Even there is no eb refs here, we may still have
5816 * end_page_read() call relying on page::private.
5817 */
5818 if (atomic_read(&subpage->readers))
5819 return true;
8ff8466d
QW
5820 }
5821 return false;
5822}
db7f3436 5823
8ff8466d
QW
5824static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5825{
5826 struct btrfs_fs_info *fs_info = eb->fs_info;
5827 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5828
5829 /*
5830 * For mapped eb, we're going to change the page private, which should
5831 * be done under the private_lock.
5832 */
5833 if (mapped)
5834 spin_lock(&page->mapping->private_lock);
5835
5836 if (!PagePrivate(page)) {
5d2361db 5837 if (mapped)
8ff8466d
QW
5838 spin_unlock(&page->mapping->private_lock);
5839 return;
5840 }
5841
fbca46eb 5842 if (fs_info->nodesize >= PAGE_SIZE) {
5d2361db
FL
5843 /*
5844 * We do this since we'll remove the pages after we've
5845 * removed the eb from the radix tree, so we could race
5846 * and have this page now attached to the new eb. So
5847 * only clear page_private if it's still connected to
5848 * this eb.
5849 */
5850 if (PagePrivate(page) &&
5851 page->private == (unsigned long)eb) {
5852 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5853 BUG_ON(PageDirty(page));
5854 BUG_ON(PageWriteback(page));
db7f3436 5855 /*
5d2361db
FL
 5856 * We need to make sure we haven't been attached
 5857 * to a new eb.
db7f3436 5858 */
d1b89bc0 5859 detach_page_private(page);
db7f3436 5860 }
5d2361db
FL
5861 if (mapped)
5862 spin_unlock(&page->mapping->private_lock);
8ff8466d
QW
5863 return;
5864 }
5865
5866 /*
5867 * For subpage, we can have dummy eb with page private. In this case,
5868 * we can directly detach the private as such page is only attached to
5869 * one dummy eb, no sharing.
5870 */
5871 if (!mapped) {
5872 btrfs_detach_subpage(fs_info, page);
5873 return;
5874 }
5875
5876 btrfs_page_dec_eb_refs(fs_info, page);
5877
5878 /*
5879 * We can only detach the page private if there are no other ebs in the
3d078efa 5880 * page range and no unfinished IO.
8ff8466d
QW
5881 */
5882 if (!page_range_has_eb(fs_info, page))
5883 btrfs_detach_subpage(fs_info, page);
5884
5885 spin_unlock(&page->mapping->private_lock);
5886}
5887
5888/* Release all pages attached to the extent buffer */
5889static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5890{
5891 int i;
5892 int num_pages;
5893
5894 ASSERT(!extent_buffer_under_io(eb));
5895
5896 num_pages = num_extent_pages(eb);
5897 for (i = 0; i < num_pages; i++) {
5898 struct page *page = eb->pages[i];
5899
5900 if (!page)
5901 continue;
5902
5903 detach_extent_buffer_page(eb, page);
5d2361db 5904
01327610 5905 /* One for when we allocated the page */
09cbfeaf 5906 put_page(page);
d64766fd 5907 }
db7f3436
JB
5908}
5909
5910/*
5911 * Helper for releasing the extent buffer.
5912 */
5913static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5914{
55ac0139 5915 btrfs_release_extent_buffer_pages(eb);
8c38938c 5916 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
db7f3436
JB
5917 __free_extent_buffer(eb);
5918}
5919
f28491e0
JB
5920static struct extent_buffer *
5921__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 5922 unsigned long len)
d1310b2e
CM
5923{
5924 struct extent_buffer *eb = NULL;
5925
d1b5c567 5926 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
5927 eb->start = start;
5928 eb->len = len;
f28491e0 5929 eb->fs_info = fs_info;
815a51c7 5930 eb->bflags = 0;
196d59ab 5931 init_rwsem(&eb->lock);
b4ce94de 5932
3fd63727
JB
5933 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5934 &fs_info->allocated_ebs);
d3575156 5935 INIT_LIST_HEAD(&eb->release_list);
6d49ba1b 5936
3083ee2e 5937 spin_lock_init(&eb->refs_lock);
d1310b2e 5938 atomic_set(&eb->refs, 1);
0b32f4bb 5939 atomic_set(&eb->io_pages, 0);
727011e0 5940
deb67895 5941 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
d1310b2e
CM
5942
5943 return eb;
5944}
5945
2b48966a 5946struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
815a51c7 5947{
cc5e31a4 5948 int i;
815a51c7 5949 struct extent_buffer *new;
cc5e31a4 5950 int num_pages = num_extent_pages(src);
dd137dd1 5951 int ret;
815a51c7 5952
3f556f78 5953 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
5954 if (new == NULL)
5955 return NULL;
5956
62c053fb
QW
5957 /*
5958 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
 5959 * btrfs_release_extent_buffer() has different behavior for an
 5960 * UNMAPPED subpage extent buffer.
5961 */
5962 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5963
dd137dd1
STD
5964 memset(new->pages, 0, sizeof(*new->pages) * num_pages);
5965 ret = btrfs_alloc_page_array(num_pages, new->pages);
5966 if (ret) {
5967 btrfs_release_extent_buffer(new);
5968 return NULL;
5969 }
5970
815a51c7 5971 for (i = 0; i < num_pages; i++) {
760f991f 5972 int ret;
dd137dd1 5973 struct page *p = new->pages[i];
760f991f 5974
760f991f
QW
5975 ret = attach_extent_buffer_page(new, p, NULL);
5976 if (ret < 0) {
760f991f
QW
5977 btrfs_release_extent_buffer(new);
5978 return NULL;
5979 }
815a51c7 5980 WARN_ON(PageDirty(p));
fba1acf9 5981 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7 5982 }
92d83e94 5983 set_extent_buffer_uptodate(new);
815a51c7
JS
5984
5985 return new;
5986}
5987
0f331229
OS
5988struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5989 u64 start, unsigned long len)
815a51c7
JS
5990{
5991 struct extent_buffer *eb;
cc5e31a4
DS
5992 int num_pages;
5993 int i;
dd137dd1 5994 int ret;
815a51c7 5995
3f556f78 5996 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
5997 if (!eb)
5998 return NULL;
5999
65ad0104 6000 num_pages = num_extent_pages(eb);
dd137dd1
STD
6001 ret = btrfs_alloc_page_array(num_pages, eb->pages);
6002 if (ret)
6003 goto err;
6004
815a51c7 6005 for (i = 0; i < num_pages; i++) {
dd137dd1 6006 struct page *p = eb->pages[i];
09bc1f0f 6007
dd137dd1 6008 ret = attach_extent_buffer_page(eb, p, NULL);
09bc1f0f
QW
6009 if (ret < 0)
6010 goto err;
815a51c7 6011 }
dd137dd1 6012
815a51c7
JS
6013 set_extent_buffer_uptodate(eb);
6014 btrfs_set_header_nritems(eb, 0);
b0132a3b 6015 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
6016
6017 return eb;
6018err:
dd137dd1
STD
6019 for (i = 0; i < num_pages; i++) {
6020 if (eb->pages[i]) {
6021 detach_extent_buffer_page(eb, eb->pages[i]);
6022 __free_page(eb->pages[i]);
6023 }
09bc1f0f 6024 }
815a51c7
JS
6025 __free_extent_buffer(eb);
6026 return NULL;
6027}
6028
0f331229 6029struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 6030 u64 start)
0f331229 6031{
da17066c 6032 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
6033}
6034
0b32f4bb
JB
6035static void check_buffer_tree_ref(struct extent_buffer *eb)
6036{
242e18c7 6037 int refs;
6bf9cd2e
BB
6038 /*
6039 * The TREE_REF bit is first set when the extent_buffer is added
6040 * to the radix tree. It is also reset, if unset, when a new reference
6041 * is created by find_extent_buffer.
0b32f4bb 6042 *
6bf9cd2e
BB
6043 * It is only cleared in two cases: freeing the last non-tree
6044 * reference to the extent_buffer when its STALE bit is set or
6045 * calling releasepage when the tree reference is the only reference.
0b32f4bb 6046 *
6bf9cd2e
BB
6047 * In both cases, care is taken to ensure that the extent_buffer's
6048 * pages are not under io. However, releasepage can be concurrently
6049 * called with creating new references, which is prone to race
6050 * conditions between the calls to check_buffer_tree_ref in those
6051 * codepaths and clearing TREE_REF in try_release_extent_buffer.
0b32f4bb 6052 *
6bf9cd2e
BB
6053 * The actual lifetime of the extent_buffer in the radix tree is
6054 * adequately protected by the refcount, but the TREE_REF bit and
6055 * its corresponding reference are not. To protect against this
6056 * class of races, we call check_buffer_tree_ref from the codepaths
6057 * which trigger io after they set eb->io_pages. Note that once io is
6058 * initiated, TREE_REF can no longer be cleared, so that is the
6059 * moment at which any such race is best fixed.
0b32f4bb 6060 */
242e18c7
CM
6061 refs = atomic_read(&eb->refs);
6062 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6063 return;
6064
594831c4
JB
6065 spin_lock(&eb->refs_lock);
6066 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 6067 atomic_inc(&eb->refs);
594831c4 6068 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
6069}
6070
2457aec6
MG
6071static void mark_extent_buffer_accessed(struct extent_buffer *eb,
6072 struct page *accessed)
5df4235e 6073{
cc5e31a4 6074 int num_pages, i;
5df4235e 6075
0b32f4bb
JB
6076 check_buffer_tree_ref(eb);
6077
65ad0104 6078 num_pages = num_extent_pages(eb);
5df4235e 6079 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
6080 struct page *p = eb->pages[i];
6081
2457aec6
MG
6082 if (p != accessed)
6083 mark_page_accessed(p);
5df4235e
JB
6084 }
6085}
6086
f28491e0
JB
6087struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
6088 u64 start)
452c75c3
CS
6089{
6090 struct extent_buffer *eb;
6091
2f3186d8
QW
6092 eb = find_extent_buffer_nolock(fs_info, start);
6093 if (!eb)
6094 return NULL;
6095 /*
6096 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
6097 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
6098 * another task running free_extent_buffer() might have seen that flag
6099 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
6100 * writeback flags not set) and it's still in the tree (flag
6101 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
6102 * decrementing the extent buffer's reference count twice. So here we
6103 * could race and increment the eb's reference count, clear its stale
6104 * flag, mark it as dirty and drop our reference before the other task
6105 * finishes executing free_extent_buffer, which would later result in
6106 * an attempt to free an extent buffer that is dirty.
6107 */
6108 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
6109 spin_lock(&eb->refs_lock);
6110 spin_unlock(&eb->refs_lock);
452c75c3 6111 }
2f3186d8
QW
6112 mark_extent_buffer_accessed(eb, NULL);
6113 return eb;
452c75c3
CS
6114}
6115
faa2dbf0
JB
6116#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6117struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 6118 u64 start)
faa2dbf0
JB
6119{
6120 struct extent_buffer *eb, *exists = NULL;
6121 int ret;
6122
6123 eb = find_extent_buffer(fs_info, start);
6124 if (eb)
6125 return eb;
da17066c 6126 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 6127 if (!eb)
b6293c82 6128 return ERR_PTR(-ENOMEM);
faa2dbf0
JB
6129 eb->fs_info = fs_info;
6130again:
e1860a77 6131 ret = radix_tree_preload(GFP_NOFS);
b6293c82
DC
6132 if (ret) {
6133 exists = ERR_PTR(ret);
faa2dbf0 6134 goto free_eb;
b6293c82 6135 }
faa2dbf0
JB
6136 spin_lock(&fs_info->buffer_lock);
6137 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 6138 start >> fs_info->sectorsize_bits, eb);
faa2dbf0
JB
6139 spin_unlock(&fs_info->buffer_lock);
6140 radix_tree_preload_end();
6141 if (ret == -EEXIST) {
6142 exists = find_extent_buffer(fs_info, start);
6143 if (exists)
6144 goto free_eb;
6145 else
6146 goto again;
6147 }
6148 check_buffer_tree_ref(eb);
6149 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
6150
faa2dbf0
JB
6151 return eb;
6152free_eb:
6153 btrfs_release_extent_buffer(eb);
6154 return exists;
6155}
6156#endif
6157
81982210
QW
6158static struct extent_buffer *grab_extent_buffer(
6159 struct btrfs_fs_info *fs_info, struct page *page)
c0f0a9e7
QW
6160{
6161 struct extent_buffer *exists;
6162
81982210
QW
6163 /*
6164 * For subpage case, we completely rely on radix tree to ensure we
6165 * don't try to insert two ebs for the same bytenr. So here we always
6166 * return NULL and just continue.
6167 */
fbca46eb 6168 if (fs_info->nodesize < PAGE_SIZE)
81982210
QW
6169 return NULL;
6170
c0f0a9e7
QW
6171 /* Page not yet attached to an extent buffer */
6172 if (!PagePrivate(page))
6173 return NULL;
6174
6175 /*
 6176 * We could have already allocated an eb for this page and attached one,
 6177 * so let's see if we can get a ref on the existing eb, and if we can we
 6178 * know it's good and we can just return that one; otherwise we know we can
6179 * just overwrite page->private.
6180 */
6181 exists = (struct extent_buffer *)page->private;
6182 if (atomic_inc_not_zero(&exists->refs))
6183 return exists;
6184
6185 WARN_ON(PageDirty(page));
6186 detach_page_private(page);
6187 return NULL;
6188}
6189
fbca46eb
QW
6190static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
6191{
6192 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
6193 btrfs_err(fs_info, "bad tree block start %llu", start);
6194 return -EINVAL;
6195 }
6196
6197 if (fs_info->nodesize < PAGE_SIZE &&
6198 offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
6199 btrfs_err(fs_info,
6200 "tree block crosses page boundary, start %llu nodesize %u",
6201 start, fs_info->nodesize);
6202 return -EINVAL;
6203 }
6204 if (fs_info->nodesize >= PAGE_SIZE &&
6205 !IS_ALIGNED(start, PAGE_SIZE)) {
6206 btrfs_err(fs_info,
6207 "tree block is not page aligned, start %llu nodesize %u",
6208 start, fs_info->nodesize);
6209 return -EINVAL;
6210 }
6211 return 0;
6212}
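/*
 * The rules in check_eb_alignment() reduce to power-of-two alignment tests.
 * A standalone restatement, assuming sectorsize and the page size are powers
 * of two; the macro and function names are invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>

#define ALIGNED_POW2(x, a) (((x) & ((uint64_t)(a) - 1)) == 0)

static bool eb_start_valid(uint64_t start, uint32_t sectorsize,
			   uint32_t nodesize, uint32_t page_size)
{
	if (!ALIGNED_POW2(start, sectorsize))
		return false;			/* must be sector aligned */
	if (nodesize < page_size &&
	    (start % page_size) + nodesize > page_size)
		return false;			/* subpage block crossing a page */
	if (nodesize >= page_size && !ALIGNED_POW2(start, page_size))
		return false;			/* full-page blocks need page alignment */
	return true;
}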
6213
f28491e0 6214struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3fbaf258 6215 u64 start, u64 owner_root, int level)
d1310b2e 6216{
da17066c 6217 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
6218 int num_pages;
6219 int i;
09cbfeaf 6220 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 6221 struct extent_buffer *eb;
6af118ce 6222 struct extent_buffer *exists = NULL;
d1310b2e 6223 struct page *p;
f28491e0 6224 struct address_space *mapping = fs_info->btree_inode->i_mapping;
d1310b2e 6225 int uptodate = 1;
19fe0a8b 6226 int ret;
d1310b2e 6227
fbca46eb 6228 if (check_eb_alignment(fs_info, start))
c871b0f2 6229 return ERR_PTR(-EINVAL);
c871b0f2 6230
e9306ad4
QW
6231#if BITS_PER_LONG == 32
6232 if (start >= MAX_LFS_FILESIZE) {
6233 btrfs_err_rl(fs_info,
6234 "extent buffer %llu is beyond 32bit page cache limit", start);
6235 btrfs_err_32bit_limit(fs_info);
6236 return ERR_PTR(-EOVERFLOW);
6237 }
6238 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6239 btrfs_warn_32bit_limit(fs_info);
6240#endif
6241
f28491e0 6242 eb = find_extent_buffer(fs_info, start);
452c75c3 6243 if (eb)
6af118ce 6244 return eb;
6af118ce 6245
23d79d81 6246 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 6247 if (!eb)
c871b0f2 6248 return ERR_PTR(-ENOMEM);
e114c545 6249 btrfs_set_buffer_lockdep_class(owner_root, eb, level);
d1310b2e 6250
65ad0104 6251 num_pages = num_extent_pages(eb);
727011e0 6252 for (i = 0; i < num_pages; i++, index++) {
760f991f
QW
6253 struct btrfs_subpage *prealloc = NULL;
6254
d1b5c567 6255 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
6256 if (!p) {
6257 exists = ERR_PTR(-ENOMEM);
6af118ce 6258 goto free_eb;
c871b0f2 6259 }
4f2de97a 6260
760f991f
QW
6261 /*
 6262 * Preallocate page->private for the subpage case, so that we won't
 6263 * allocate memory with private_lock held. The memory will be
 6264 * freed by attach_extent_buffer_page() or freed manually if
 6265 * we exit earlier.
 6266 *
 6267 * Although we have ensured one subpage eb can only have one
 6268 * page, that may change in the future for 16K page size
 6269 * support, so we still preallocate the memory in the loop.
6270 */
fbca46eb 6271 if (fs_info->nodesize < PAGE_SIZE) {
651fb419
QW
6272 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
6273 if (IS_ERR(prealloc)) {
6274 ret = PTR_ERR(prealloc);
fdf250db
QW
6275 unlock_page(p);
6276 put_page(p);
6277 exists = ERR_PTR(ret);
6278 goto free_eb;
6279 }
760f991f
QW
6280 }
6281
4f2de97a 6282 spin_lock(&mapping->private_lock);
81982210 6283 exists = grab_extent_buffer(fs_info, p);
c0f0a9e7
QW
6284 if (exists) {
6285 spin_unlock(&mapping->private_lock);
6286 unlock_page(p);
6287 put_page(p);
6288 mark_extent_buffer_accessed(exists, p);
760f991f 6289 btrfs_free_subpage(prealloc);
c0f0a9e7 6290 goto free_eb;
d1310b2e 6291 }
760f991f
QW
6292 /* Should not fail, as we have preallocated the memory */
6293 ret = attach_extent_buffer_page(eb, p, prealloc);
6294 ASSERT(!ret);
8ff8466d
QW
6295 /*
 6296 * To signal that we have an extra eb under allocation, so that
 6297 * detach_extent_buffer_page() won't release the page private
 6298 * when the eb hasn't yet been inserted into the radix tree.
 6299 *
 6300 * The ref will be decreased when the eb releases the page, in
 6301 * detach_extent_buffer_page().
 6302 * Thus it needs no special handling in the error path.
6303 */
6304 btrfs_page_inc_eb_refs(fs_info, p);
4f2de97a 6305 spin_unlock(&mapping->private_lock);
760f991f 6306
1e5eb3d6 6307 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
727011e0 6308 eb->pages[i] = p;
d1310b2e
CM
6309 if (!PageUptodate(p))
6310 uptodate = 0;
eb14ab8e
CM
6311
6312 /*
b16d011e
NB
6313 * We can't unlock the pages just yet since the extent buffer
6314 * hasn't been properly inserted in the radix tree, this
6315 * opens a race with btree_releasepage which can free a page
6316 * while we are still filling in all pages for the buffer and
6317 * we could crash.
eb14ab8e 6318 */
d1310b2e
CM
6319 }
6320 if (uptodate)
b4ce94de 6321 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
115391d2 6322again:
e1860a77 6323 ret = radix_tree_preload(GFP_NOFS);
c871b0f2
LB
6324 if (ret) {
6325 exists = ERR_PTR(ret);
19fe0a8b 6326 goto free_eb;
c871b0f2 6327 }
19fe0a8b 6328
f28491e0
JB
6329 spin_lock(&fs_info->buffer_lock);
6330 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 6331 start >> fs_info->sectorsize_bits, eb);
f28491e0 6332 spin_unlock(&fs_info->buffer_lock);
452c75c3 6333 radix_tree_preload_end();
19fe0a8b 6334 if (ret == -EEXIST) {
f28491e0 6335 exists = find_extent_buffer(fs_info, start);
452c75c3
CS
6336 if (exists)
6337 goto free_eb;
6338 else
115391d2 6339 goto again;
6af118ce 6340 }
6af118ce 6341 /* add one reference for the tree */
0b32f4bb 6342 check_buffer_tree_ref(eb);
34b41ace 6343 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
6344
6345 /*
b16d011e
NB
6346 * Now it's safe to unlock the pages because any calls to
6347 * btree_releasepage will correctly detect that a page belongs to a
6348 * live buffer and won't free them prematurely.
eb14ab8e 6349 */
28187ae5
NB
6350 for (i = 0; i < num_pages; i++)
6351 unlock_page(eb->pages[i]);
d1310b2e
CM
6352 return eb;
6353
6af118ce 6354free_eb:
5ca64f45 6355 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
6356 for (i = 0; i < num_pages; i++) {
6357 if (eb->pages[i])
6358 unlock_page(eb->pages[i]);
6359 }
eb14ab8e 6360
897ca6e9 6361 btrfs_release_extent_buffer(eb);
6af118ce 6362 return exists;
d1310b2e 6363}
d1310b2e 6364
3083ee2e
JB
6365static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6366{
6367 struct extent_buffer *eb =
6368 container_of(head, struct extent_buffer, rcu_head);
6369
6370 __free_extent_buffer(eb);
6371}
6372
f7a52a40 6373static int release_extent_buffer(struct extent_buffer *eb)
5ce48d0f 6374 __releases(&eb->refs_lock)
3083ee2e 6375{
07e21c4d
NB
6376 lockdep_assert_held(&eb->refs_lock);
6377
3083ee2e
JB
6378 WARN_ON(atomic_read(&eb->refs) == 0);
6379 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 6380 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 6381 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 6382
815a51c7 6383 spin_unlock(&eb->refs_lock);
3083ee2e 6384
f28491e0
JB
6385 spin_lock(&fs_info->buffer_lock);
6386 radix_tree_delete(&fs_info->buffer_radix,
478ef886 6387 eb->start >> fs_info->sectorsize_bits);
f28491e0 6388 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
6389 } else {
6390 spin_unlock(&eb->refs_lock);
815a51c7 6391 }
3083ee2e 6392
8c38938c 6393 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
3083ee2e 6394 /* Should be safe to release our pages at this point */
55ac0139 6395 btrfs_release_extent_buffer_pages(eb);
bcb7e449 6396#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 6397 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
6398 __free_extent_buffer(eb);
6399 return 1;
6400 }
6401#endif
3083ee2e 6402 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 6403 return 1;
3083ee2e
JB
6404 }
6405 spin_unlock(&eb->refs_lock);
e64860aa
JB
6406
6407 return 0;
3083ee2e
JB
6408}
6409
d1310b2e
CM
6410void free_extent_buffer(struct extent_buffer *eb)
6411{
242e18c7
CM
6412 int refs;
6413 int old;
d1310b2e
CM
6414 if (!eb)
6415 return;
6416
242e18c7
CM
6417 while (1) {
6418 refs = atomic_read(&eb->refs);
46cc775e
NB
6419 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
6420 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
6421 refs == 1))
242e18c7
CM
6422 break;
6423 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
6424 if (old == refs)
6425 return;
6426 }
6427
3083ee2e
JB
6428 spin_lock(&eb->refs_lock);
6429 if (atomic_read(&eb->refs) == 2 &&
6430 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 6431 !extent_buffer_under_io(eb) &&
3083ee2e
JB
6432 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6433 atomic_dec(&eb->refs);
6434
6435 /*
6436 * I know this is terrible, but it's temporary until we stop tracking
6437 * the uptodate bits and such for the extent buffers.
6438 */
f7a52a40 6439 release_extent_buffer(eb);
3083ee2e
JB
6440}
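/*
 * The lockless loop at the top of free_extent_buffer() is a standard
 * compare-and-swap pattern: drop the reference without a lock only while the
 * count is high enough that this cannot be the final put.  A C11-atomics
 * rendering of that pattern; the floor argument stands in for the kernel's
 * "refs <= 3" / "refs == 1" thresholds.
 */
#include <stdatomic.h>
#include <stdbool.h>

/* Returns true if the reference was dropped on the lock-free fast path. */
static bool put_ref_fast(atomic_int *refs, int floor)
{
	int cur = atomic_load(refs);

	while (cur > floor) {
		/* On failure, cur is reloaded with the current value. */
		if (atomic_compare_exchange_weak(refs, &cur, cur - 1))
			return true;
	}
	return false;	/* caller must take the slow, locked path */
}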
6441
6442void free_extent_buffer_stale(struct extent_buffer *eb)
6443{
6444 if (!eb)
d1310b2e
CM
6445 return;
6446
3083ee2e
JB
6447 spin_lock(&eb->refs_lock);
6448 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6449
0b32f4bb 6450 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
6451 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6452 atomic_dec(&eb->refs);
f7a52a40 6453 release_extent_buffer(eb);
d1310b2e 6454}
d1310b2e 6455
0d27797e
QW
6456static void btree_clear_page_dirty(struct page *page)
6457{
6458 ASSERT(PageDirty(page));
6459 ASSERT(PageLocked(page));
6460 clear_page_dirty_for_io(page);
6461 xa_lock_irq(&page->mapping->i_pages);
6462 if (!PageDirty(page))
6463 __xa_clear_mark(&page->mapping->i_pages,
6464 page_index(page), PAGECACHE_TAG_DIRTY);
6465 xa_unlock_irq(&page->mapping->i_pages);
6466}
6467
6468static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6469{
6470 struct btrfs_fs_info *fs_info = eb->fs_info;
6471 struct page *page = eb->pages[0];
6472 bool last;
6473
6474 /* btree_clear_page_dirty() needs page locked */
6475 lock_page(page);
6476 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6477 eb->len);
6478 if (last)
6479 btree_clear_page_dirty(page);
6480 unlock_page(page);
6481 WARN_ON(atomic_read(&eb->refs) == 0);
6482}
6483
2b48966a 6484void clear_extent_buffer_dirty(const struct extent_buffer *eb)
d1310b2e 6485{
cc5e31a4
DS
6486 int i;
6487 int num_pages;
d1310b2e
CM
6488 struct page *page;
6489
fbca46eb 6490 if (eb->fs_info->nodesize < PAGE_SIZE)
0d27797e
QW
6491 return clear_subpage_extent_buffer_dirty(eb);
6492
65ad0104 6493 num_pages = num_extent_pages(eb);
d1310b2e
CM
6494
6495 for (i = 0; i < num_pages; i++) {
fb85fc9a 6496 page = eb->pages[i];
b9473439 6497 if (!PageDirty(page))
d2c3f4f6 6498 continue;
a61e6f29 6499 lock_page(page);
0d27797e 6500 btree_clear_page_dirty(page);
bf0da8c1 6501 ClearPageError(page);
a61e6f29 6502 unlock_page(page);
d1310b2e 6503 }
0b32f4bb 6504 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 6505}
d1310b2e 6506
abb57ef3 6507bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 6508{
cc5e31a4
DS
6509 int i;
6510 int num_pages;
abb57ef3 6511 bool was_dirty;
d1310b2e 6512
0b32f4bb
JB
6513 check_buffer_tree_ref(eb);
6514
b9473439 6515 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 6516
65ad0104 6517 num_pages = num_extent_pages(eb);
3083ee2e 6518 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
6519 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6520
0d27797e 6521 if (!was_dirty) {
fbca46eb 6522 bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
51995c39 6523
0d27797e
QW
6524 /*
6525 * For subpage case, we can have other extent buffers in the
6526 * same page, and in clear_subpage_extent_buffer_dirty() we
6527 * have to clear page dirty without subpage lock held.
6528 * This can cause race where our page gets dirty cleared after
6529 * we just set it.
6530 *
6531 * Thankfully, clear_subpage_extent_buffer_dirty() has locked
6532 * its page for other reasons, we can use page lock to prevent
6533 * the above race.
6534 */
6535 if (subpage)
6536 lock_page(eb->pages[0]);
6537 for (i = 0; i < num_pages; i++)
6538 btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6539 eb->start, eb->len);
6540 if (subpage)
6541 unlock_page(eb->pages[0]);
6542 }
51995c39
LB
6543#ifdef CONFIG_BTRFS_DEBUG
6544 for (i = 0; i < num_pages; i++)
6545 ASSERT(PageDirty(eb->pages[i]));
6546#endif
6547
b9473439 6548 return was_dirty;
d1310b2e 6549}
d1310b2e 6550
69ba3927 6551void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 6552{
251f2acc 6553 struct btrfs_fs_info *fs_info = eb->fs_info;
1259ab75 6554 struct page *page;
cc5e31a4 6555 int num_pages;
251f2acc 6556 int i;
1259ab75 6557
b4ce94de 6558 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 6559 num_pages = num_extent_pages(eb);
1259ab75 6560 for (i = 0; i < num_pages; i++) {
fb85fc9a 6561 page = eb->pages[i];
fbca46eb
QW
6562 if (!page)
6563 continue;
6564
6565 /*
6566 * This is special handling for metadata subpage, as regular
6567 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6568 */
6569 if (fs_info->nodesize >= PAGE_SIZE)
6570 ClearPageUptodate(page);
6571 else
6572 btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
6573 eb->len);
1259ab75 6574 }
1259ab75
CM
6575}
6576
09c25a8c 6577void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 6578{
251f2acc 6579 struct btrfs_fs_info *fs_info = eb->fs_info;
d1310b2e 6580 struct page *page;
cc5e31a4 6581 int num_pages;
251f2acc 6582 int i;
d1310b2e 6583
0b32f4bb 6584 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 6585 num_pages = num_extent_pages(eb);
d1310b2e 6586 for (i = 0; i < num_pages; i++) {
fb85fc9a 6587 page = eb->pages[i];
fbca46eb
QW
6588
6589 /*
6590 * This is special handling for metadata subpage, as regular
6591 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6592 */
6593 if (fs_info->nodesize >= PAGE_SIZE)
6594 SetPageUptodate(page);
6595 else
6596 btrfs_subpage_set_uptodate(fs_info, page, eb->start,
6597 eb->len);
d1310b2e 6598 }
d1310b2e 6599}
d1310b2e 6600
4012daf7
QW
6601static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
6602 int mirror_num)
6603{
6604 struct btrfs_fs_info *fs_info = eb->fs_info;
6605 struct extent_io_tree *io_tree;
6606 struct page *page = eb->pages[0];
390ed29b 6607 struct btrfs_bio_ctrl bio_ctrl = { 0 };
4012daf7
QW
6608 int ret = 0;
6609
6610 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
6611 ASSERT(PagePrivate(page));
6612 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
6613
6614 if (wait == WAIT_NONE) {
dc56219f
GR
6615 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
6616 return -EAGAIN;
4012daf7
QW
6617 } else {
6618 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6619 if (ret < 0)
6620 return ret;
6621 }
6622
6623 ret = 0;
6624 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
6625 PageUptodate(page) ||
6626 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
6627 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6628 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6629 return ret;
6630 }
6631
6632 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6633 eb->read_mirror = 0;
6634 atomic_set(&eb->io_pages, 1);
6635 check_buffer_tree_ref(eb);
6636 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
6637
3d078efa 6638 btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
390ed29b
QW
6639 ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
6640 page, eb->start, eb->len,
6641 eb->start - page_offset(page),
6642 end_bio_extent_readpage, mirror_num, 0,
4012daf7
QW
6643 true);
6644 if (ret) {
6645 /*
6646 * In the endio function, if we hit something wrong we will
6647 * increase the io_pages, so here we need to decrease it for
6648 * error path.
6649 */
6650 atomic_dec(&eb->io_pages);
6651 }
390ed29b 6652 if (bio_ctrl.bio) {
4012daf7
QW
6653 int tmp;
6654
390ed29b
QW
6655 tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
6656 bio_ctrl.bio = NULL;
4012daf7
QW
6657 if (tmp < 0)
6658 return tmp;
6659 }
6660 if (ret || wait != WAIT_COMPLETE)
6661 return ret;
6662
6663 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
6664 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6665 ret = -EIO;
6666 return ret;
6667}
6668
c2ccfbc6 6669int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 6670{
cc5e31a4 6671 int i;
d1310b2e
CM
6672 struct page *page;
6673 int err;
6674 int ret = 0;
ce9adaa5
CM
6675 int locked_pages = 0;
6676 int all_uptodate = 1;
cc5e31a4 6677 int num_pages;
727011e0 6678 unsigned long num_reads = 0;
390ed29b 6679 struct btrfs_bio_ctrl bio_ctrl = { 0 };
a86c12c7 6680
b4ce94de 6681 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
6682 return 0;
6683
651740a5
JB
6684 /*
6685 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
6686 * operation, which could potentially still be in flight. In this case
6687 * we simply want to return an error.
6688 */
6689 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
6690 return -EIO;
6691
fbca46eb 6692 if (eb->fs_info->nodesize < PAGE_SIZE)
4012daf7
QW
6693 return read_extent_buffer_subpage(eb, wait, mirror_num);
6694
65ad0104 6695 num_pages = num_extent_pages(eb);
8436ea91 6696 for (i = 0; i < num_pages; i++) {
fb85fc9a 6697 page = eb->pages[i];
bb82ab88 6698 if (wait == WAIT_NONE) {
2c4d8cb7
QW
6699 /*
6700 * WAIT_NONE is only utilized by readahead. If we can't
6701 * acquire the lock atomically it means either the eb
6702 * is being read out or under modification.
6703 * Either way the eb will be or has been cached,
6704 * readahead can exit safely.
6705 */
2db04966 6706 if (!trylock_page(page))
ce9adaa5 6707 goto unlock_exit;
d1310b2e
CM
6708 } else {
6709 lock_page(page);
6710 }
ce9adaa5 6711 locked_pages++;
2571e739
LB
6712 }
6713 /*
 6714	 * We need to lock all pages first to make sure that
6715 * the uptodate bit of our pages won't be affected by
6716 * clear_extent_buffer_uptodate().
6717 */
8436ea91 6718 for (i = 0; i < num_pages; i++) {
2571e739 6719 page = eb->pages[i];
727011e0
CM
6720 if (!PageUptodate(page)) {
6721 num_reads++;
ce9adaa5 6722 all_uptodate = 0;
727011e0 6723 }
ce9adaa5 6724 }
2571e739 6725
ce9adaa5 6726 if (all_uptodate) {
8436ea91 6727 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
6728 goto unlock_exit;
6729 }
6730
656f30db 6731 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 6732 eb->read_mirror = 0;
0b32f4bb 6733 atomic_set(&eb->io_pages, num_reads);
6bf9cd2e
BB
6734 /*
6735 * It is possible for releasepage to clear the TREE_REF bit before we
6736 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
6737 */
6738 check_buffer_tree_ref(eb);
8436ea91 6739 for (i = 0; i < num_pages; i++) {
fb85fc9a 6740 page = eb->pages[i];
baf863b9 6741
ce9adaa5 6742 if (!PageUptodate(page)) {
baf863b9
LB
6743 if (ret) {
6744 atomic_dec(&eb->io_pages);
6745 unlock_page(page);
6746 continue;
6747 }
6748
f188591e 6749 ClearPageError(page);
0420177c 6750 err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
390ed29b
QW
6751 &bio_ctrl, page, page_offset(page),
6752 PAGE_SIZE, 0, end_bio_extent_readpage,
6753 mirror_num, 0, false);
baf863b9 6754 if (err) {
baf863b9 6755 /*
0420177c
NB
6756 * We failed to submit the bio so it's the
6757 * caller's responsibility to perform cleanup
 6758				 * i.e. unlock the page / set the error bit.
baf863b9 6759 */
0420177c
NB
6760 ret = err;
6761 SetPageError(page);
6762 unlock_page(page);
baf863b9
LB
6763 atomic_dec(&eb->io_pages);
6764 }
d1310b2e
CM
6765 } else {
6766 unlock_page(page);
6767 }
6768 }
6769
390ed29b
QW
6770 if (bio_ctrl.bio) {
6771 err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
6772 bio_ctrl.bio = NULL;
79787eaa
JM
6773 if (err)
6774 return err;
355808c2 6775 }
a86c12c7 6776
bb82ab88 6777 if (ret || wait != WAIT_COMPLETE)
d1310b2e 6778 return ret;
d397712b 6779
8436ea91 6780 for (i = 0; i < num_pages; i++) {
fb85fc9a 6781 page = eb->pages[i];
d1310b2e 6782 wait_on_page_locked(page);
d397712b 6783 if (!PageUptodate(page))
d1310b2e 6784 ret = -EIO;
d1310b2e 6785 }
d397712b 6786
d1310b2e 6787 return ret;
ce9adaa5
CM
6788
6789unlock_exit:
d397712b 6790 while (locked_pages > 0) {
ce9adaa5 6791 locked_pages--;
8436ea91
JB
6792 page = eb->pages[locked_pages];
6793 unlock_page(page);
ce9adaa5
CM
6794 }
6795 return ret;
d1310b2e 6796}
d1310b2e 6797
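/*
 * Usage sketch (illustrative only, not part of extent_io.c): a caller that
 * needs the buffer contents blocks with WAIT_COMPLETE and only then copies
 * data out with read_extent_buffer().  The helper name and the @hdr
 * destination are hypothetical.
 */
static int example_read_eb_header(struct extent_buffer *eb,
				  struct btrfs_header *hdr)
{
	int ret;

	/* Returns -EIO if any page failed to become uptodate. */
	ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, 0);
	if (ret)
		return ret;

	/* @start and @len are offsets inside the eb, not logical addresses. */
	read_extent_buffer(eb, hdr, 0, sizeof(*hdr));
	return 0;
}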
f98b6215
QW
6798static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6799 unsigned long len)
6800{
6801 btrfs_warn(eb->fs_info,
6802 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
6803 eb->start, eb->len, start, len);
6804 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6805
6806 return true;
6807}
6808
6809/*
6810 * Check if the [start, start + len) range is valid before reading/writing
6811 * the eb.
 6811 * NOTE: @start and @len are offsets inside the eb, not logical addresses.
6813 *
 6814 * Caller should not touch the dst/src memory if this function returns an error.
6815 */
6816static inline int check_eb_range(const struct extent_buffer *eb,
6817 unsigned long start, unsigned long len)
6818{
6819 unsigned long offset;
6820
6821 /* start, start + len should not go beyond eb->len nor overflow */
6822 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6823 return report_eb_range(eb, start, len);
6824
6825 return false;
6826}
6827
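/*
 * Worked example (illustrative): with eb->len == 16384,
 *   check_eb_range(eb, 16376, 8)      passes  (16376 + 8 == eb->len)
 *   check_eb_range(eb, 16380, 8)      fails   (16388 > eb->len)
 *   check_eb_range(eb, ULONG_MAX, 16) fails   (start + len overflows)
 */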
1cbb1f45
JM
6828void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6829 unsigned long start, unsigned long len)
d1310b2e
CM
6830{
6831 size_t cur;
6832 size_t offset;
6833 struct page *page;
6834 char *kaddr;
6835 char *dst = (char *)dstv;
884b07d0 6836 unsigned long i = get_eb_page_index(start);
d1310b2e 6837
f98b6215 6838 if (check_eb_range(eb, start, len))
f716abd5 6839 return;
d1310b2e 6840
884b07d0 6841 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6842
d397712b 6843 while (len > 0) {
fb85fc9a 6844 page = eb->pages[i];
d1310b2e 6845
09cbfeaf 6846 cur = min(len, (PAGE_SIZE - offset));
a6591715 6847 kaddr = page_address(page);
d1310b2e 6848 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
6849
6850 dst += cur;
6851 len -= cur;
6852 offset = 0;
6853 i++;
6854 }
6855}
d1310b2e 6856
a48b73ec
JB
6857int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6858 void __user *dstv,
6859 unsigned long start, unsigned long len)
550ac1d8
GH
6860{
6861 size_t cur;
6862 size_t offset;
6863 struct page *page;
6864 char *kaddr;
6865 char __user *dst = (char __user *)dstv;
884b07d0 6866 unsigned long i = get_eb_page_index(start);
550ac1d8
GH
6867 int ret = 0;
6868
6869 WARN_ON(start > eb->len);
6870 WARN_ON(start + len > eb->start + eb->len);
6871
884b07d0 6872 offset = get_eb_offset_in_page(eb, start);
550ac1d8
GH
6873
6874 while (len > 0) {
fb85fc9a 6875 page = eb->pages[i];
550ac1d8 6876
09cbfeaf 6877 cur = min(len, (PAGE_SIZE - offset));
550ac1d8 6878 kaddr = page_address(page);
a48b73ec 6879 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
550ac1d8
GH
6880 ret = -EFAULT;
6881 break;
6882 }
6883
6884 dst += cur;
6885 len -= cur;
6886 offset = 0;
6887 i++;
6888 }
6889
6890 return ret;
6891}
6892
1cbb1f45
JM
6893int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6894 unsigned long start, unsigned long len)
d1310b2e
CM
6895{
6896 size_t cur;
6897 size_t offset;
6898 struct page *page;
6899 char *kaddr;
6900 char *ptr = (char *)ptrv;
884b07d0 6901 unsigned long i = get_eb_page_index(start);
d1310b2e
CM
6902 int ret = 0;
6903
f98b6215
QW
6904 if (check_eb_range(eb, start, len))
6905 return -EINVAL;
d1310b2e 6906
884b07d0 6907 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6908
d397712b 6909 while (len > 0) {
fb85fc9a 6910 page = eb->pages[i];
d1310b2e 6911
09cbfeaf 6912 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 6913
a6591715 6914 kaddr = page_address(page);
d1310b2e 6915 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
6916 if (ret)
6917 break;
6918
6919 ptr += cur;
6920 len -= cur;
6921 offset = 0;
6922 i++;
6923 }
6924 return ret;
6925}
d1310b2e 6926
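/*
 * Usage sketch (illustrative only): compare a candidate fsid against the one
 * stored in the header of @eb.  This mirrors how write_extent_buffer_fsid()
 * below locates the field; the helper name is hypothetical.
 */
static bool example_fsid_matches(const struct extent_buffer *eb,
				 const u8 *fsid)
{
	return memcmp_extent_buffer(eb, fsid,
				    offsetof(struct btrfs_header, fsid),
				    BTRFS_FSID_SIZE) == 0;
}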
b8f95771
QW
6927/*
6928 * Check that the extent buffer is uptodate.
6929 *
6930 * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
6931 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6932 */
6933static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6934 struct page *page)
6935{
6936 struct btrfs_fs_info *fs_info = eb->fs_info;
6937
a50e1fcb
JB
6938 /*
6939 * If we are using the commit root we could potentially clear a page
6940 * Uptodate while we're using the extent buffer that we've previously
6941 * looked up. We don't want to complain in this case, as the page was
 6942	 * valid before; we just didn't write it out. Instead we want to catch
6943 * the case where we didn't actually read the block properly, which
6944 * would have !PageUptodate && !PageError, as we clear PageError before
6945 * reading.
6946 */
fbca46eb 6947 if (fs_info->nodesize < PAGE_SIZE) {
a50e1fcb 6948 bool uptodate, error;
b8f95771
QW
6949
6950 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6951 eb->start, eb->len);
a50e1fcb
JB
6952 error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
6953 WARN_ON(!uptodate && !error);
b8f95771 6954 } else {
a50e1fcb 6955 WARN_ON(!PageUptodate(page) && !PageError(page));
b8f95771
QW
6956 }
6957}
6958
2b48966a 6959void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
f157bf76
DS
6960 const void *srcv)
6961{
6962 char *kaddr;
6963
b8f95771 6964 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
6965 kaddr = page_address(eb->pages[0]) +
6966 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6967 chunk_tree_uuid));
6968 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
6969}
6970
2b48966a 6971void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
f157bf76
DS
6972{
6973 char *kaddr;
6974
b8f95771 6975 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
6976 kaddr = page_address(eb->pages[0]) +
6977 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6978 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
6979}
6980
2b48966a 6981void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
d1310b2e
CM
6982 unsigned long start, unsigned long len)
6983{
6984 size_t cur;
6985 size_t offset;
6986 struct page *page;
6987 char *kaddr;
6988 char *src = (char *)srcv;
884b07d0 6989 unsigned long i = get_eb_page_index(start);
d1310b2e 6990
d3575156
NA
6991 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6992
f98b6215
QW
6993 if (check_eb_range(eb, start, len))
6994 return;
d1310b2e 6995
884b07d0 6996 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6997
d397712b 6998 while (len > 0) {
fb85fc9a 6999 page = eb->pages[i];
b8f95771 7000 assert_eb_page_uptodate(eb, page);
d1310b2e 7001
09cbfeaf 7002 cur = min(len, PAGE_SIZE - offset);
a6591715 7003 kaddr = page_address(page);
d1310b2e 7004 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
7005
7006 src += cur;
7007 len -= cur;
7008 offset = 0;
7009 i++;
7010 }
7011}
d1310b2e 7012
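/*
 * Usage sketch (illustrative only): a read-modify-write of a little-endian
 * u64 stored at @offset inside @eb.  The helper name is hypothetical and the
 * caller is assumed to hold the proper tree lock.
 */
static void example_bump_le64(const struct extent_buffer *eb,
			      unsigned long offset)
{
	__le64 raw;

	read_extent_buffer(eb, &raw, offset, sizeof(raw));
	raw = cpu_to_le64(le64_to_cpu(raw) + 1);
	write_extent_buffer(eb, &raw, offset, sizeof(raw));
}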
2b48966a 7013void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
b159fa28 7014 unsigned long len)
d1310b2e
CM
7015{
7016 size_t cur;
7017 size_t offset;
7018 struct page *page;
7019 char *kaddr;
884b07d0 7020 unsigned long i = get_eb_page_index(start);
d1310b2e 7021
f98b6215
QW
7022 if (check_eb_range(eb, start, len))
7023 return;
d1310b2e 7024
884b07d0 7025 offset = get_eb_offset_in_page(eb, start);
d1310b2e 7026
d397712b 7027 while (len > 0) {
fb85fc9a 7028 page = eb->pages[i];
b8f95771 7029 assert_eb_page_uptodate(eb, page);
d1310b2e 7030
09cbfeaf 7031 cur = min(len, PAGE_SIZE - offset);
a6591715 7032 kaddr = page_address(page);
b159fa28 7033 memset(kaddr + offset, 0, cur);
d1310b2e
CM
7034
7035 len -= cur;
7036 offset = 0;
7037 i++;
7038 }
7039}
d1310b2e 7040
2b48966a
DS
7041void copy_extent_buffer_full(const struct extent_buffer *dst,
7042 const struct extent_buffer *src)
58e8012c
DS
7043{
7044 int i;
cc5e31a4 7045 int num_pages;
58e8012c
DS
7046
7047 ASSERT(dst->len == src->len);
7048
fbca46eb 7049 if (dst->fs_info->nodesize >= PAGE_SIZE) {
884b07d0
QW
7050 num_pages = num_extent_pages(dst);
7051 for (i = 0; i < num_pages; i++)
7052 copy_page(page_address(dst->pages[i]),
7053 page_address(src->pages[i]));
7054 } else {
7055 size_t src_offset = get_eb_offset_in_page(src, 0);
7056 size_t dst_offset = get_eb_offset_in_page(dst, 0);
7057
fbca46eb 7058 ASSERT(src->fs_info->nodesize < PAGE_SIZE);
884b07d0
QW
7059 memcpy(page_address(dst->pages[0]) + dst_offset,
7060 page_address(src->pages[0]) + src_offset,
7061 src->len);
7062 }
58e8012c
DS
7063}
7064
2b48966a
DS
7065void copy_extent_buffer(const struct extent_buffer *dst,
7066 const struct extent_buffer *src,
d1310b2e
CM
7067 unsigned long dst_offset, unsigned long src_offset,
7068 unsigned long len)
7069{
7070 u64 dst_len = dst->len;
7071 size_t cur;
7072 size_t offset;
7073 struct page *page;
7074 char *kaddr;
884b07d0 7075 unsigned long i = get_eb_page_index(dst_offset);
d1310b2e 7076
f98b6215
QW
7077 if (check_eb_range(dst, dst_offset, len) ||
7078 check_eb_range(src, src_offset, len))
7079 return;
7080
d1310b2e
CM
7081 WARN_ON(src->len != dst_len);
7082
884b07d0 7083 offset = get_eb_offset_in_page(dst, dst_offset);
d1310b2e 7084
d397712b 7085 while (len > 0) {
fb85fc9a 7086 page = dst->pages[i];
b8f95771 7087 assert_eb_page_uptodate(dst, page);
d1310b2e 7088
09cbfeaf 7089 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 7090
a6591715 7091 kaddr = page_address(page);
d1310b2e 7092 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
7093
7094 src_offset += cur;
7095 len -= cur;
7096 offset = 0;
7097 i++;
7098 }
7099}
d1310b2e 7100
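/*
 * Usage sketch (illustrative only): duplicate @len bytes from @src into @dst
 * at the same in-buffer offset; both ebs must have the same length.  The
 * helper name is hypothetical.
 */
static void example_copy_same_offset(const struct extent_buffer *dst,
				     const struct extent_buffer *src,
				     unsigned long offset, unsigned long len)
{
	copy_extent_buffer(dst, src, offset, offset, len);
}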
3e1e8bb7
OS
7101/*
7102 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
7103 * given bit number
7104 * @eb: the extent buffer
7105 * @start: offset of the bitmap item in the extent buffer
7106 * @nr: bit number
7107 * @page_index: return index of the page in the extent buffer that contains the
7108 * given bit number
7109 * @page_offset: return offset into the page given by page_index
7110 *
7111 * This helper hides the ugliness of finding the byte in an extent buffer which
7112 * contains a given bit.
7113 */
2b48966a 7114static inline void eb_bitmap_offset(const struct extent_buffer *eb,
3e1e8bb7
OS
7115 unsigned long start, unsigned long nr,
7116 unsigned long *page_index,
7117 size_t *page_offset)
7118{
3e1e8bb7
OS
7119 size_t byte_offset = BIT_BYTE(nr);
7120 size_t offset;
7121
7122 /*
7123 * The byte we want is the offset of the extent buffer + the offset of
7124 * the bitmap item in the extent buffer + the offset of the byte in the
7125 * bitmap item.
7126 */
884b07d0 7127 offset = start + offset_in_page(eb->start) + byte_offset;
3e1e8bb7 7128
09cbfeaf 7129 *page_index = offset >> PAGE_SHIFT;
7073017a 7130 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
7131}
7132
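/*
 * Worked example (illustrative, 4K pages, eb->start page aligned so
 * offset_in_page(eb->start) == 0): for a bitmap item at @start == 4000 and
 * bit @nr == 800:
 *
 *   byte_offset  = BIT_BYTE(800)        = 100
 *   offset       = 4000 + 0 + 100       = 4100
 *   *page_index  = 4100 >> PAGE_SHIFT   = 1
 *   *page_offset = offset_in_page(4100) = 4
 */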
7133/**
7134 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
7135 * @eb: the extent buffer
7136 * @start: offset of the bitmap item in the extent buffer
7137 * @nr: bit number to test
7138 */
2b48966a 7139int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
7140 unsigned long nr)
7141{
2fe1d551 7142 u8 *kaddr;
3e1e8bb7
OS
7143 struct page *page;
7144 unsigned long i;
7145 size_t offset;
7146
7147 eb_bitmap_offset(eb, start, nr, &i, &offset);
7148 page = eb->pages[i];
b8f95771 7149 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7150 kaddr = page_address(page);
7151 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
7152}
7153
7154/**
7155 * extent_buffer_bitmap_set - set an area of a bitmap
7156 * @eb: the extent buffer
7157 * @start: offset of the bitmap item in the extent buffer
7158 * @pos: bit number of the first bit
7159 * @len: number of bits to set
7160 */
2b48966a 7161void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
7162 unsigned long pos, unsigned long len)
7163{
2fe1d551 7164 u8 *kaddr;
3e1e8bb7
OS
7165 struct page *page;
7166 unsigned long i;
7167 size_t offset;
7168 const unsigned int size = pos + len;
7169 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 7170 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
7171
7172 eb_bitmap_offset(eb, start, pos, &i, &offset);
7173 page = eb->pages[i];
b8f95771 7174 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7175 kaddr = page_address(page);
7176
7177 while (len >= bits_to_set) {
7178 kaddr[offset] |= mask_to_set;
7179 len -= bits_to_set;
7180 bits_to_set = BITS_PER_BYTE;
9c894696 7181 mask_to_set = ~0;
09cbfeaf 7182 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
7183 offset = 0;
7184 page = eb->pages[++i];
b8f95771 7185 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7186 kaddr = page_address(page);
7187 }
7188 }
7189 if (len) {
7190 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
7191 kaddr[offset] |= mask_to_set;
7192 }
7193}
7194
7195
7196/**
7197 * extent_buffer_bitmap_clear - clear an area of a bitmap
7198 * @eb: the extent buffer
7199 * @start: offset of the bitmap item in the extent buffer
7200 * @pos: bit number of the first bit
7201 * @len: number of bits to clear
7202 */
2b48966a
DS
7203void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
7204 unsigned long start, unsigned long pos,
7205 unsigned long len)
3e1e8bb7 7206{
2fe1d551 7207 u8 *kaddr;
3e1e8bb7
OS
7208 struct page *page;
7209 unsigned long i;
7210 size_t offset;
7211 const unsigned int size = pos + len;
7212 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 7213 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
7214
7215 eb_bitmap_offset(eb, start, pos, &i, &offset);
7216 page = eb->pages[i];
b8f95771 7217 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7218 kaddr = page_address(page);
7219
7220 while (len >= bits_to_clear) {
7221 kaddr[offset] &= ~mask_to_clear;
7222 len -= bits_to_clear;
7223 bits_to_clear = BITS_PER_BYTE;
9c894696 7224 mask_to_clear = ~0;
09cbfeaf 7225 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
7226 offset = 0;
7227 page = eb->pages[++i];
b8f95771 7228 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7229 kaddr = page_address(page);
7230 }
7231 }
7232 if (len) {
7233 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
7234 kaddr[offset] &= ~mask_to_clear;
7235 }
7236}
7237
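/*
 * Usage sketch (illustrative only): set @count consecutive bits in a bitmap
 * item stored at @bitmap_start inside @eb, then verify the first one.  The
 * helper name and parameters are hypothetical.
 */
static void example_set_bits(const struct extent_buffer *eb,
			     unsigned long bitmap_start,
			     unsigned long first_bit, unsigned long count)
{
	extent_buffer_bitmap_set(eb, bitmap_start, first_bit, count);
	WARN_ON(!extent_buffer_test_bit(eb, bitmap_start, first_bit));
}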
3387206f
ST
7238static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
7239{
7240 unsigned long distance = (src > dst) ? src - dst : dst - src;
7241 return distance < len;
7242}
7243
d1310b2e
CM
7244static void copy_pages(struct page *dst_page, struct page *src_page,
7245 unsigned long dst_off, unsigned long src_off,
7246 unsigned long len)
7247{
a6591715 7248 char *dst_kaddr = page_address(dst_page);
d1310b2e 7249 char *src_kaddr;
727011e0 7250 int must_memmove = 0;
d1310b2e 7251
3387206f 7252 if (dst_page != src_page) {
a6591715 7253 src_kaddr = page_address(src_page);
3387206f 7254 } else {
d1310b2e 7255 src_kaddr = dst_kaddr;
727011e0
CM
7256 if (areas_overlap(src_off, dst_off, len))
7257 must_memmove = 1;
3387206f 7258 }
d1310b2e 7259
727011e0
CM
7260 if (must_memmove)
7261 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
7262 else
7263 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
7264}
7265
2b48966a
DS
7266void memcpy_extent_buffer(const struct extent_buffer *dst,
7267 unsigned long dst_offset, unsigned long src_offset,
7268 unsigned long len)
d1310b2e
CM
7269{
7270 size_t cur;
7271 size_t dst_off_in_page;
7272 size_t src_off_in_page;
d1310b2e
CM
7273 unsigned long dst_i;
7274 unsigned long src_i;
7275
f98b6215
QW
7276 if (check_eb_range(dst, dst_offset, len) ||
7277 check_eb_range(dst, src_offset, len))
7278 return;
d1310b2e 7279
d397712b 7280 while (len > 0) {
884b07d0
QW
7281 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
7282 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
d1310b2e 7283
884b07d0
QW
7284 dst_i = get_eb_page_index(dst_offset);
7285 src_i = get_eb_page_index(src_offset);
d1310b2e 7286
09cbfeaf 7287 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
7288 src_off_in_page));
7289 cur = min_t(unsigned long, cur,
09cbfeaf 7290 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 7291
fb85fc9a 7292 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
7293 dst_off_in_page, src_off_in_page, cur);
7294
7295 src_offset += cur;
7296 dst_offset += cur;
7297 len -= cur;
7298 }
7299}
d1310b2e 7300
2b48966a
DS
7301void memmove_extent_buffer(const struct extent_buffer *dst,
7302 unsigned long dst_offset, unsigned long src_offset,
7303 unsigned long len)
d1310b2e
CM
7304{
7305 size_t cur;
7306 size_t dst_off_in_page;
7307 size_t src_off_in_page;
7308 unsigned long dst_end = dst_offset + len - 1;
7309 unsigned long src_end = src_offset + len - 1;
d1310b2e
CM
7310 unsigned long dst_i;
7311 unsigned long src_i;
7312
f98b6215
QW
7313 if (check_eb_range(dst, dst_offset, len) ||
7314 check_eb_range(dst, src_offset, len))
7315 return;
727011e0 7316 if (dst_offset < src_offset) {
d1310b2e
CM
7317 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
7318 return;
7319 }
d397712b 7320 while (len > 0) {
884b07d0
QW
7321 dst_i = get_eb_page_index(dst_end);
7322 src_i = get_eb_page_index(src_end);
d1310b2e 7323
884b07d0
QW
7324 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
7325 src_off_in_page = get_eb_offset_in_page(dst, src_end);
d1310b2e
CM
7326
7327 cur = min_t(unsigned long, len, src_off_in_page + 1);
7328 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 7329 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
7330 dst_off_in_page - cur + 1,
7331 src_off_in_page - cur + 1, cur);
7332
7333 dst_end -= cur;
7334 src_end -= cur;
7335 len -= cur;
7336 }
7337}
6af118ce 7338
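/*
 * Usage sketch (illustrative only): open a @size byte gap at @pos by shifting
 * @used valid bytes towards the end of the eb.  Overlapping ranges are fine;
 * memmove_extent_buffer() copies backwards when dst_offset > src_offset.
 * Names are hypothetical.
 */
static void example_make_gap(const struct extent_buffer *eb,
			     unsigned long pos, unsigned long used,
			     unsigned long size)
{
	memmove_extent_buffer(eb, pos + size, pos, used);
}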
72a69cd0 7339#define GANG_LOOKUP_SIZE 16
d1e86e3f
QW
7340static struct extent_buffer *get_next_extent_buffer(
7341 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
7342{
72a69cd0 7343 struct extent_buffer *gang[GANG_LOOKUP_SIZE];
d1e86e3f
QW
7344 struct extent_buffer *found = NULL;
7345 u64 page_start = page_offset(page);
72a69cd0 7346 u64 cur = page_start;
d1e86e3f
QW
7347
7348 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
d1e86e3f
QW
7349 lockdep_assert_held(&fs_info->buffer_lock);
7350
72a69cd0
QW
7351 while (cur < page_start + PAGE_SIZE) {
7352 int ret;
7353 int i;
7354
7355 ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
7356 (void **)gang, cur >> fs_info->sectorsize_bits,
7357 min_t(unsigned int, GANG_LOOKUP_SIZE,
7358 PAGE_SIZE / fs_info->nodesize));
7359 if (ret == 0)
7360 goto out;
7361 for (i = 0; i < ret; i++) {
7362 /* Already beyond page end */
7363 if (gang[i]->start >= page_start + PAGE_SIZE)
7364 goto out;
7365 /* Found one */
7366 if (gang[i]->start >= bytenr) {
7367 found = gang[i];
7368 goto out;
7369 }
d1e86e3f 7370 }
72a69cd0 7371 cur = gang[ret - 1]->start + gang[ret - 1]->len;
d1e86e3f 7372 }
72a69cd0 7373out:
d1e86e3f
QW
7374 return found;
7375}
7376
7377static int try_release_subpage_extent_buffer(struct page *page)
7378{
7379 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7380 u64 cur = page_offset(page);
7381 const u64 end = page_offset(page) + PAGE_SIZE;
7382 int ret;
7383
7384 while (cur < end) {
7385 struct extent_buffer *eb = NULL;
7386
7387 /*
 7388		 * Unlike try_release_extent_buffer(), which uses page->private
 7389		 * to grab the buffer, in the subpage case we rely on the radix
 7390		 * tree, thus we need to ensure radix tree consistency.
7391 *
7392 * We also want an atomic snapshot of the radix tree, thus go
7393 * with spinlock rather than RCU.
7394 */
7395 spin_lock(&fs_info->buffer_lock);
7396 eb = get_next_extent_buffer(fs_info, page, cur);
7397 if (!eb) {
7398 /* No more eb in the page range after or at cur */
7399 spin_unlock(&fs_info->buffer_lock);
7400 break;
7401 }
7402 cur = eb->start + eb->len;
7403
7404 /*
7405 * The same as try_release_extent_buffer(), to ensure the eb
7406 * won't disappear out from under us.
7407 */
7408 spin_lock(&eb->refs_lock);
7409 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7410 spin_unlock(&eb->refs_lock);
7411 spin_unlock(&fs_info->buffer_lock);
7412 break;
7413 }
7414 spin_unlock(&fs_info->buffer_lock);
7415
7416 /*
7417 * If tree ref isn't set then we know the ref on this eb is a
7418 * real ref, so just return, this eb will likely be freed soon
7419 * anyway.
7420 */
7421 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7422 spin_unlock(&eb->refs_lock);
7423 break;
7424 }
7425
7426 /*
7427 * Here we don't care about the return value, we will always
7428 * check the page private at the end. And
7429 * release_extent_buffer() will release the refs_lock.
7430 */
7431 release_extent_buffer(eb);
7432 }
7433 /*
 7434	 * Finally, check whether page private has been cleared: if we have
 7435	 * released all ebs in the page, page private should be cleared by now.
7436 */
7437 spin_lock(&page->mapping->private_lock);
7438 if (!PagePrivate(page))
7439 ret = 1;
7440 else
7441 ret = 0;
7442 spin_unlock(&page->mapping->private_lock);
7443 return ret;
7444
7445}
7446
f7a52a40 7447int try_release_extent_buffer(struct page *page)
19fe0a8b 7448{
6af118ce 7449 struct extent_buffer *eb;
6af118ce 7450
fbca46eb 7451 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
d1e86e3f
QW
7452 return try_release_subpage_extent_buffer(page);
7453
3083ee2e 7454 /*
d1e86e3f
QW
7455 * We need to make sure nobody is changing page->private, as we rely on
7456 * page->private as the pointer to extent buffer.
3083ee2e
JB
7457 */
7458 spin_lock(&page->mapping->private_lock);
7459 if (!PagePrivate(page)) {
7460 spin_unlock(&page->mapping->private_lock);
4f2de97a 7461 return 1;
45f49bce 7462 }
6af118ce 7463
3083ee2e
JB
7464 eb = (struct extent_buffer *)page->private;
7465 BUG_ON(!eb);
19fe0a8b
MX
7466
7467 /*
3083ee2e
JB
7468 * This is a little awful but should be ok, we need to make sure that
7469 * the eb doesn't disappear out from under us while we're looking at
7470 * this page.
19fe0a8b 7471 */
3083ee2e 7472 spin_lock(&eb->refs_lock);
0b32f4bb 7473 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
7474 spin_unlock(&eb->refs_lock);
7475 spin_unlock(&page->mapping->private_lock);
7476 return 0;
b9473439 7477 }
3083ee2e 7478 spin_unlock(&page->mapping->private_lock);
897ca6e9 7479
19fe0a8b 7480 /*
3083ee2e
JB
7481 * If tree ref isn't set then we know the ref on this eb is a real ref,
7482 * so just return, this page will likely be freed soon anyway.
19fe0a8b 7483 */
3083ee2e
JB
7484 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7485 spin_unlock(&eb->refs_lock);
7486 return 0;
b9473439 7487 }
19fe0a8b 7488
f7a52a40 7489 return release_extent_buffer(eb);
6af118ce 7490}
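/*
 * Usage sketch (illustrative only, loosely modeled on the btree inode's
 * address_space hook in disk-io.c): never release dirty or writeback pages,
 * otherwise let try_release_extent_buffer() decide.  The function name is
 * hypothetical.
 */
static int example_btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;
	return try_release_extent_buffer(page);
}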
bfb484d9
JB
7491
7492/*
7493 * btrfs_readahead_tree_block - attempt to readahead a child block
7494 * @fs_info: the fs_info
7495 * @bytenr: bytenr to read
3fbaf258 7496 * @owner_root: objectid of the root that owns this eb
bfb484d9 7497 * @gen: generation for the uptodate check, can be 0
3fbaf258 7498 * @level: level for the eb
bfb484d9
JB
7499 *
7500 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
7501 * normal uptodate check of the eb, without checking the generation. If we have
7502 * to read the block we will not block on anything.
7503 */
7504void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
3fbaf258 7505 u64 bytenr, u64 owner_root, u64 gen, int level)
bfb484d9
JB
7506{
7507 struct extent_buffer *eb;
7508 int ret;
7509
3fbaf258 7510 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
bfb484d9
JB
7511 if (IS_ERR(eb))
7512 return;
7513
7514 if (btrfs_buffer_uptodate(eb, gen, 1)) {
7515 free_extent_buffer(eb);
7516 return;
7517 }
7518
7519 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7520 if (ret < 0)
7521 free_extent_buffer_stale(eb);
7522 else
7523 free_extent_buffer(eb);
7524}
7525
7526/*
7527 * btrfs_readahead_node_child - readahead a node's child block
7528 * @node: parent node we're reading from
7529 * @slot: slot in the parent node for the child we want to read
7530 *
7531 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
7532 * the slot in the node provided.
7533 */
7534void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7535{
7536 btrfs_readahead_tree_block(node->fs_info,
7537 btrfs_node_blockptr(node, slot),
3fbaf258
JB
7538 btrfs_header_owner(node),
7539 btrfs_node_ptr_generation(node, slot),
7540 btrfs_header_level(node) - 1);
bfb484d9 7541}
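/*
 * Usage sketch (illustrative only): read ahead every child of an interior
 * node before walking its subtree.  Assumes btrfs_header_nritems() as
 * declared in ctree.h; the helper name is hypothetical.
 */
static void example_readahead_all_children(struct extent_buffer *node)
{
	int nritems = btrfs_header_nritems(node);
	int slot;

	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}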