btrfs: cleanup for extent_write_locked_range()
linux-block.git / fs/btrfs/extent_io.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add(spinlock_t *lock,
					struct list_head *new,
					struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(lock, flags);
}

static inline void btrfs_leak_debug_del(spinlock_t *lock,
					struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(lock, flags);
}

void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	unsigned long flags;

	/*
	 * If we didn't get into open_ctree our allocated_ebs will not be
	 * initialized, so just skip this.
	 */
	if (!fs_info->allocated_ebs.next)
		return;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	while (!list_empty(&fs_info->allocated_ebs)) {
		eb = list_first_entry(&fs_info->allocated_ebs,
				      struct extent_buffer, leak_list);
		pr_err(
	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
		       btrfs_header_owner(eb));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(lock, new, head)		do {} while (0)
#define btrfs_leak_debug_del(lock, entry)		do {} while (0)
#define btrfs_extent_state_leak_debug_check()		do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct btrfs_bio_ctrl bio_ctrl;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, u32 bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	/* Caller should ensure the bio has at least some range added */
	ASSERT(bio->bi_iter.bi_size);
	if (is_data_inode(tree->private_data))
		ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
					    bio_flags);
	else
		ret = btrfs_submit_metadata_bio(tree->private_data, bio,
						mirror_num, bio_flags);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		bio->bi_status = errno_to_blk_status(ret);
		bio_endio(bio);
		epd->bio_ctrl.bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		ret = submit_one_bio(bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio_ctrl.bio = NULL;
	}
	return ret;
}

int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because for our io_tree we
 * hold the tree lock and get the inode lock when setting delalloc. These two
 * things are unrelated, so make a class for the file_extent_tree so we don't
 * get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}

void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

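/*
 * Illustrative sketch, not part of the original file: how an embedder might
 * initialize one of its io trees. IO_TREE_INODE_FILE_EXTENT is the owner id
 * referenced above; the helper name is hypothetical, and private_data is
 * left NULL here purely for the example (real callers may pass the owning
 * inode).
 */
static inline void example_init_file_extent_tree(struct btrfs_fs_info *fs_info,
						 struct extent_io_tree *tree)
{
	extent_io_tree_init(fs_info, tree, IO_TREE_INODE_FILE_EXTENT, NULL);
}
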
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

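/*
 * Illustrative sketch, not part of the original file: the reference counting
 * pattern the rest of this file follows for extent states. A caller that
 * wants to keep a state alive (e.g. as a cached state) pins it with
 * refcount_inc() and later releases it with free_extent_state(), which only
 * frees the struct once the last reference is dropped. The helper name is
 * hypothetical.
 */
static inline struct extent_state *example_pin_state(struct extent_state *state)
{
	refcount_inc(&state->refs);	/* paired with a later free_extent_state() */
	return state;
}
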
static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @next_ret:   pointer to the first entry whose range ends after @offset
 * @prev_ret:   pointer to the first entry whose range begins before @offset
 * @p_ret:      pointer where new node should be anchored (used when inserting
 *              an entry in the tree)
 * @parent_ret: points to entry which would have been the parent of the entry,
 *              containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

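/*
 * Illustrative sketch, not part of the original file: how the search helpers
 * above are typically used. tree_search() returns the first entry that ends
 * at or after @offset, so a caller holding tree->lock can start walking the
 * states from there. The helper name is hypothetical.
 */
static inline struct extent_state *example_first_state_from(
		struct extent_io_tree *tree, u64 offset)
{
	struct rb_node *node = tree_search(tree, offset);

	return node ? rb_entry(node, struct extent_state, rb_node) : NULL;
}
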
/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree. Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, u32 *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			u32 *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
	"locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree. This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again. It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 *bits, struct extent_changeset *changeset)
{
	u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree. This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set. The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive. This takes the tree lock.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
		   u32 exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask,
		   struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL);
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again. It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *                      another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range. If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits. This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY. This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again. It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

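/*
 * Illustrative sketch, not part of the original file: converting the bits on
 * a range in one pass, matching the DELALLOC to DIRTY example from the
 * comment above. The bit names are existing extent-io-tree bits; the helper
 * itself is hypothetical.
 */
static inline int example_delalloc_to_dirty(struct extent_io_tree *tree,
					    u64 start, u64 end,
					    struct extent_state **cached)
{
	/* set EXTENT_DIRTY and clear EXTENT_DELALLOC on [start, end] */
	return convert_extent_bit(tree, start, end, EXTENT_DIRTY,
				  EXTENT_DELALLOC, cached);
}
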
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
			      changeset);
}

int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
			      GFP_NOWAIT, NULL);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     u32 bits, int wake, int delete,
		     struct extent_state **cached)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, GFP_NOFS, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}

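/*
 * Illustrative sketch, not part of the original file: a hypothetical caller
 * clearing EXTENT_DELALLOC over a byte range through the wrapper above,
 * reusing a cached extent_state so the common case avoids the tree search.
 */
static inline void example_clear_delalloc(struct extent_io_tree *tree,
					  u64 start, u64 end,
					  struct extent_state **cached)
{
	/* wake == 1 wakes any waiters, delete == 0 leaves other bits alone */
	clear_extent_bit(tree, start, end, EXTENT_DELALLOC, 1, 0, cached);
}
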
/*
 * either insert or lock state struct between start and end; use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	int err;
	u64 failed_start;

	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, GFP_NOFS, NULL);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, GFP_NOFS, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL);
		return 0;
	}
	return 1;
}

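/*
 * Illustrative sketch, not part of the original file: the usual pattern
 * around lock_extent_bits(). The range is locked (sleeping until any
 * conflicting lock goes away), the caller does its work, and the lock is
 * dropped by clearing EXTENT_LOCKED again (normally via the unlock_extent()
 * wrapper in the headers). The helper name is hypothetical.
 */
static inline int example_locked_op(struct extent_io_tree *tree,
				    u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	lock_extent_bits(tree, start, end, &cached);
	/* ... operate on [start, end] while it is locked ... */
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, &cached);
}
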
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		__set_page_dirty_nobuffers(page);
		account_page_redirty(page);
		put_page(page);
		index++;
	}
}

/* find the first state struct with 'bits' set after 'start', and
 * return it. tree->lock must be held. NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return 0 if we find something, and update @start_ret and @end_ret.
 * Return 1 if we found nothing.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, u32 bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

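/*
 * Illustrative sketch, not part of the original file: using
 * find_first_extent_bit() as an iterator over every range that has any of
 * the given bits set. The helper and its counter are hypothetical.
 */
static inline u64 example_count_ranges(struct extent_io_tree *tree, u32 bits)
{
	u64 start = 0, found_start, found_end;
	u64 nr = 0;

	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      bits, NULL)) {
		nr++;
		if (found_end == (u64)-1)
			break;
		start = found_end + 1;
	}
	return nr;
}
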
/**
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the final contiguous range of the bits that were set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again. During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits. We will search to the first bit we find, and
 * then walk down the tree until we find a non-contiguous area. The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

45bfcfc1 1638/**
3bed2da1
NB
1639 * Find the first range that has @bits not set. This range could start before
1640 * @start.
45bfcfc1 1641 *
3bed2da1
NB
1642 * @tree: the tree to search
1643 * @start: offset at/after which the found extent should start
1644 * @start_ret: records the beginning of the range
1645 * @end_ret: records the end of the range (inclusive)
1646 * @bits: the set of bits which must be unset
45bfcfc1
NB
1647 *
1648 * Since unallocated range is also considered one which doesn't have the bits
1649 * set it's possible that @end_ret contains -1, this happens in case the range
1650 * spans (last_range_end, end of device]. In this case it's up to the caller to
1651 * trim @end_ret to the appropriate size.
1652 */
1653void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
f97e27e9 1654 u64 *start_ret, u64 *end_ret, u32 bits)
45bfcfc1
NB
1655{
1656 struct extent_state *state;
1657 struct rb_node *node, *prev = NULL, *next;
1658
1659 spin_lock(&tree->lock);
1660
1661 /* Find first extent with bits cleared */
1662 while (1) {
1663 node = __etree_search(tree, start, &next, &prev, NULL, NULL);
5750c375
NB
1664 if (!node && !next && !prev) {
1665 /*
1666 * Tree is completely empty, send full range and let
1667 * caller deal with it
1668 */
1669 *start_ret = 0;
1670 *end_ret = -1;
1671 goto out;
1672 } else if (!node && !next) {
1673 /*
1674 * We are past the last allocated chunk, set start at
1675 * the end of the last extent.
1676 */
1677 state = rb_entry(prev, struct extent_state, rb_node);
1678 *start_ret = state->end + 1;
1679 *end_ret = -1;
1680 goto out;
1681 } else if (!node) {
45bfcfc1 1682 node = next;
45bfcfc1 1683 }
1eaebb34
NB
1684 /*
1685 * At this point 'node' either contains 'start' or start is
1686 * before 'node'
1687 */
45bfcfc1 1688 state = rb_entry(node, struct extent_state, rb_node);
1eaebb34
NB
1689
1690 if (in_range(start, state->start, state->end - state->start + 1)) {
1691 if (state->state & bits) {
1692 /*
1693 * |--range with bits sets--|
1694 * |
1695 * start
1696 */
1697 start = state->end + 1;
1698 } else {
1699 /*
1700 * 'start' falls within a range that doesn't
1701 * have the bits set, so take its start as
1702 * the beginning of the desired range
1703 *
1704 * |--range with bits cleared----|
1705 * |
1706 * start
1707 */
1708 *start_ret = state->start;
1709 break;
1710 }
45bfcfc1 1711 } else {
1eaebb34
NB
1712 /*
1713 * |---prev range---|---hole/unset---|---node range---|
1714 * |
1715 * start
1716 *
1717 * or
1718 *
1719 * |---hole/unset--||--first node--|
1720 * 0 |
1721 * start
1722 */
1723 if (prev) {
1724 state = rb_entry(prev, struct extent_state,
1725 rb_node);
1726 *start_ret = state->end + 1;
1727 } else {
1728 *start_ret = 0;
1729 }
45bfcfc1
NB
1730 break;
1731 }
1732 }
1733
1734 /*
1735 * Find the longest stretch from start until an entry which has the
1736 * bits set
1737 */
1738 while (1) {
1739 state = rb_entry(node, struct extent_state, rb_node);
1740 if (state->end >= start && !(state->state & bits)) {
1741 *end_ret = state->end;
1742 } else {
1743 *end_ret = state->start - 1;
1744 break;
1745 }
1746
1747 node = rb_next(node);
1748 if (!node)
1749 break;
1750 }
1751out:
1752 spin_unlock(&tree->lock);
1753}
1754
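/*
 * Illustrative sketch, not part of the kernel tree: as the comment above
 * notes, *end_ret may come back as -1 when the hole runs past the last
 * tracked extent, so the caller is expected to trim it.  'tracked_end' is a
 * made-up upper bound for the example and EXTENT_DIRTY is only a sample bit.
 */
static void example_find_hole(struct extent_io_tree *tree, u64 start,
			      u64 tracked_end)
{
	u64 hole_start;
	u64 hole_end;

	find_first_clear_extent_bit(tree, start, &hole_start, &hole_end,
				    EXTENT_DIRTY);
	if (hole_end == (u64)-1 || hole_end > tracked_end)
		hole_end = tracked_end;
	/* [hole_start, hole_end] now has EXTENT_DIRTY cleared everywhere */
}
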
d352ac68
CM
1755/*
1756 * find a contiguous range of bytes in the file marked as delalloc, not
1757 * more than 'max_bytes'. 'start' and 'end' are used to return the range.
1758 *
3522e903 1759 * true is returned if we find something, false if nothing was in the tree
d352ac68 1760 */
083e75e7
JB
1761bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1762 u64 *end, u64 max_bytes,
1763 struct extent_state **cached_state)
d1310b2e
CM
1764{
1765 struct rb_node *node;
1766 struct extent_state *state;
1767 u64 cur_start = *start;
3522e903 1768 bool found = false;
d1310b2e
CM
1769 u64 total_bytes = 0;
1770
cad321ad 1771 spin_lock(&tree->lock);
c8b97818 1772
d1310b2e
CM
1773 /*
1774 * this search will find all the extents that end after
1775 * our range starts.
1776 */
80ea96b1 1777 node = tree_search(tree, cur_start);
2b114d1d 1778 if (!node) {
3522e903 1779 *end = (u64)-1;
d1310b2e
CM
1780 goto out;
1781 }
1782
d397712b 1783 while (1) {
d1310b2e 1784 state = rb_entry(node, struct extent_state, rb_node);
5b21f2ed
ZY
1785 if (found && (state->start != cur_start ||
1786 (state->state & EXTENT_BOUNDARY))) {
d1310b2e
CM
1787 goto out;
1788 }
1789 if (!(state->state & EXTENT_DELALLOC)) {
1790 if (!found)
1791 *end = state->end;
1792 goto out;
1793 }
c2a128d2 1794 if (!found) {
d1310b2e 1795 *start = state->start;
c2a128d2 1796 *cached_state = state;
b7ac31b7 1797 refcount_inc(&state->refs);
c2a128d2 1798 }
3522e903 1799 found = true;
d1310b2e
CM
1800 *end = state->end;
1801 cur_start = state->end + 1;
1802 node = rb_next(node);
d1310b2e 1803 total_bytes += state->end - state->start + 1;
7bf811a5 1804 if (total_bytes >= max_bytes)
573aecaf 1805 break;
573aecaf 1806 if (!node)
d1310b2e
CM
1807 break;
1808 }
1809out:
cad321ad 1810 spin_unlock(&tree->lock);
d1310b2e
CM
1811 return found;
1812}
1813
ed8f13bf
QW
1814/*
1815 * Process one page for __process_pages_contig().
1816 *
1817 * Return >0 if we hit @page == @locked_page.
1818 * Return 0 if we updated the page status.
1819 * Return -EAGAIN if we need to try again.
1820 * (For the PAGE_LOCK case, when the page is dirty or no longer belongs to the mapping.)
1821 */
e38992be
QW
1822static int process_one_page(struct btrfs_fs_info *fs_info,
1823 struct address_space *mapping,
ed8f13bf 1824 struct page *page, struct page *locked_page,
e38992be 1825 unsigned long page_ops, u64 start, u64 end)
ed8f13bf 1826{
e38992be
QW
1827 u32 len;
1828
1829 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
1830 len = end + 1 - start;
1831
ed8f13bf 1832 if (page_ops & PAGE_SET_ORDERED)
b945a463 1833 btrfs_page_clamp_set_ordered(fs_info, page, start, len);
ed8f13bf 1834 if (page_ops & PAGE_SET_ERROR)
e38992be 1835 btrfs_page_clamp_set_error(fs_info, page, start, len);
ed8f13bf 1836 if (page_ops & PAGE_START_WRITEBACK) {
e38992be
QW
1837 btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
1838 btrfs_page_clamp_set_writeback(fs_info, page, start, len);
ed8f13bf
QW
1839 }
1840 if (page_ops & PAGE_END_WRITEBACK)
e38992be 1841 btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
a33a8e9a
QW
1842
1843 if (page == locked_page)
1844 return 1;
1845
ed8f13bf 1846 if (page_ops & PAGE_LOCK) {
1e1de387
QW
1847 int ret;
1848
1849 ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
1850 if (ret)
1851 return ret;
ed8f13bf 1852 if (!PageDirty(page) || page->mapping != mapping) {
1e1de387 1853 btrfs_page_end_writer_lock(fs_info, page, start, len);
ed8f13bf
QW
1854 return -EAGAIN;
1855 }
1856 }
1857 if (page_ops & PAGE_UNLOCK)
1e1de387 1858 btrfs_page_end_writer_lock(fs_info, page, start, len);
ed8f13bf
QW
1859 return 0;
1860}
1861
da2c7009
LB
1862static int __process_pages_contig(struct address_space *mapping,
1863 struct page *locked_page,
98af9ab1 1864 u64 start, u64 end, unsigned long page_ops,
ed8f13bf
QW
1865 u64 *processed_end)
1866{
e38992be 1867 struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
ed8f13bf
QW
1868 pgoff_t start_index = start >> PAGE_SHIFT;
1869 pgoff_t end_index = end >> PAGE_SHIFT;
1870 pgoff_t index = start_index;
1871 unsigned long nr_pages = end_index - start_index + 1;
1872 unsigned long pages_processed = 0;
1873 struct page *pages[16];
1874 int err = 0;
1875 int i;
1876
1877 if (page_ops & PAGE_LOCK) {
1878 ASSERT(page_ops == PAGE_LOCK);
1879 ASSERT(processed_end && *processed_end == start);
1880 }
1881
1882 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1883 mapping_set_error(mapping, -EIO);
1884
1885 while (nr_pages > 0) {
1886 int found_pages;
1887
1888 found_pages = find_get_pages_contig(mapping, index,
1889 min_t(unsigned long,
1890 nr_pages, ARRAY_SIZE(pages)), pages);
1891 if (found_pages == 0) {
1892 /*
1893 * Only when we're going to lock these pages can we find
1894 * nothing at @index.
1895 */
1896 ASSERT(page_ops & PAGE_LOCK);
1897 err = -EAGAIN;
1898 goto out;
1899 }
1900
1901 for (i = 0; i < found_pages; i++) {
1902 int process_ret;
1903
e38992be
QW
1904 process_ret = process_one_page(fs_info, mapping,
1905 pages[i], locked_page, page_ops,
1906 start, end);
ed8f13bf
QW
1907 if (process_ret < 0) {
1908 for (; i < found_pages; i++)
1909 put_page(pages[i]);
1910 err = -EAGAIN;
1911 goto out;
1912 }
1913 put_page(pages[i]);
1914 pages_processed++;
1915 }
1916 nr_pages -= found_pages;
1917 index += found_pages;
1918 cond_resched();
1919 }
1920out:
1921 if (err && processed_end) {
1922 /*
1923 * Update @processed_end. I know this is awful since it has
1924 * two different return value patterns (inclusive vs exclusive).
1925 *
1926 * But the exclusive pattern is necessary if @start is 0, otherwise
1927 * we would underflow and the check against processed_end would not
1928 * work as expected.
1929 */
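		/*
		 * Worked example (illustrative, assuming 4K pages): with
		 * start_index == 0 and pages_processed == 3, the branch below
		 * computes min(end, (3 << PAGE_SHIFT) - 1) == min(end, 12287),
		 * i.e. the inclusive end offset of the three pages that were
		 * actually locked.
		 */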
1930 if (pages_processed)
1931 *processed_end = min(end,
1932 ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
1933 else
1934 *processed_end = start;
1935 }
1936 return err;
1937}
da2c7009 1938
143bede5
JM
1939static noinline void __unlock_for_delalloc(struct inode *inode,
1940 struct page *locked_page,
1941 u64 start, u64 end)
c8b97818 1942{
09cbfeaf
KS
1943 unsigned long index = start >> PAGE_SHIFT;
1944 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 1945
76c0021d 1946 ASSERT(locked_page);
c8b97818 1947 if (index == locked_page->index && end_index == index)
143bede5 1948 return;
c8b97818 1949
98af9ab1 1950 __process_pages_contig(inode->i_mapping, locked_page, start, end,
76c0021d 1951 PAGE_UNLOCK, NULL);
c8b97818
CM
1952}
1953
1954static noinline int lock_delalloc_pages(struct inode *inode,
1955 struct page *locked_page,
1956 u64 delalloc_start,
1957 u64 delalloc_end)
1958{
09cbfeaf 1959 unsigned long index = delalloc_start >> PAGE_SHIFT;
09cbfeaf 1960 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
98af9ab1 1961 u64 processed_end = delalloc_start;
c8b97818 1962 int ret;
c8b97818 1963
76c0021d 1964 ASSERT(locked_page);
c8b97818
CM
1965 if (index == locked_page->index && index == end_index)
1966 return 0;
1967
98af9ab1
QW
1968 ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
1969 delalloc_end, PAGE_LOCK, &processed_end);
1970 if (ret == -EAGAIN && processed_end > delalloc_start)
76c0021d 1971 __unlock_for_delalloc(inode, locked_page, delalloc_start,
98af9ab1 1972 processed_end);
c8b97818
CM
1973 return ret;
1974}
1975
1976/*
3522e903
LF
1977 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1978 * more than @max_bytes. @start and @end are used to return the range.
c8b97818 1979 *
3522e903
LF
1980 * Return: true if we find something
1981 * false if nothing was in the tree
c8b97818 1982 */
ce9f967f 1983EXPORT_FOR_TESTS
3522e903 1984noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 1985 struct page *locked_page, u64 *start,
917aacec 1986 u64 *end)
c8b97818 1987{
9978059b 1988 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
917aacec 1989 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
1990 u64 delalloc_start;
1991 u64 delalloc_end;
3522e903 1992 bool found;
9655d298 1993 struct extent_state *cached_state = NULL;
c8b97818
CM
1994 int ret;
1995 int loops = 0;
1996
1997again:
1998 /* step one, find a bunch of delalloc bytes starting at start */
1999 delalloc_start = *start;
2000 delalloc_end = 0;
083e75e7
JB
2001 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
2002 max_bytes, &cached_state);
70b99e69 2003 if (!found || delalloc_end <= *start) {
c8b97818
CM
2004 *start = delalloc_start;
2005 *end = delalloc_end;
c2a128d2 2006 free_extent_state(cached_state);
3522e903 2007 return false;
c8b97818
CM
2008 }
2009
70b99e69
CM
2010 /*
2011 * start comes from the offset of locked_page. We have to lock
2012 * pages in order, so we can't process delalloc bytes before
2013 * locked_page
2014 */
d397712b 2015 if (delalloc_start < *start)
70b99e69 2016 delalloc_start = *start;
70b99e69 2017
c8b97818
CM
2018 /*
2019 * make sure to limit the number of pages we try to lock down
c8b97818 2020 */
7bf811a5
JB
2021 if (delalloc_end + 1 - delalloc_start > max_bytes)
2022 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 2023
c8b97818
CM
2024 /* step two, lock all the pages after the page that has start */
2025 ret = lock_delalloc_pages(inode, locked_page,
2026 delalloc_start, delalloc_end);
9bfd61d9 2027 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
2028 if (ret == -EAGAIN) {
2029 /* some of the pages are gone, let's avoid looping by
2030 * shortening the size of the delalloc range we're searching
2031 */
9655d298 2032 free_extent_state(cached_state);
7d788742 2033 cached_state = NULL;
c8b97818 2034 if (!loops) {
09cbfeaf 2035 max_bytes = PAGE_SIZE;
c8b97818
CM
2036 loops = 1;
2037 goto again;
2038 } else {
3522e903 2039 found = false;
c8b97818
CM
2040 goto out_failed;
2041 }
2042 }
c8b97818
CM
2043
2044 /* step three, lock the state bits for the whole range */
ff13db41 2045 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
2046
2047 /* then test to make sure it is all still delalloc */
2048 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 2049 EXTENT_DELALLOC, 1, cached_state);
c8b97818 2050 if (!ret) {
9655d298 2051 unlock_extent_cached(tree, delalloc_start, delalloc_end,
e43bbe5e 2052 &cached_state);
c8b97818
CM
2053 __unlock_for_delalloc(inode, locked_page,
2054 delalloc_start, delalloc_end);
2055 cond_resched();
2056 goto again;
2057 }
9655d298 2058 free_extent_state(cached_state);
c8b97818
CM
2059 *start = delalloc_start;
2060 *end = delalloc_end;
2061out_failed:
2062 return found;
2063}
2064
ad7ff17b 2065void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
74e9194a 2066 struct page *locked_page,
f97e27e9 2067 u32 clear_bits, unsigned long page_ops)
873695b3 2068{
ad7ff17b 2069 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
873695b3 2070
ad7ff17b 2071 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
98af9ab1 2072 start, end, page_ops, NULL);
873695b3
LB
2073}
2074
d352ac68
CM
2075/*
2076 * count the number of bytes in the tree that have a given bit(s)
2077 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2078 * cached. The total number found is returned.
2079 */
d1310b2e
CM
2080u64 count_range_bits(struct extent_io_tree *tree,
2081 u64 *start, u64 search_end, u64 max_bytes,
f97e27e9 2082 u32 bits, int contig)
d1310b2e
CM
2083{
2084 struct rb_node *node;
2085 struct extent_state *state;
2086 u64 cur_start = *start;
2087 u64 total_bytes = 0;
ec29ed5b 2088 u64 last = 0;
d1310b2e
CM
2089 int found = 0;
2090
fae7f21c 2091 if (WARN_ON(search_end <= cur_start))
d1310b2e 2092 return 0;
d1310b2e 2093
cad321ad 2094 spin_lock(&tree->lock);
d1310b2e
CM
2095 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2096 total_bytes = tree->dirty_bytes;
2097 goto out;
2098 }
2099 /*
2100 * this search will find all the extents that end after
2101 * our range starts.
2102 */
80ea96b1 2103 node = tree_search(tree, cur_start);
d397712b 2104 if (!node)
d1310b2e 2105 goto out;
d1310b2e 2106
d397712b 2107 while (1) {
d1310b2e
CM
2108 state = rb_entry(node, struct extent_state, rb_node);
2109 if (state->start > search_end)
2110 break;
ec29ed5b
CM
2111 if (contig && found && state->start > last + 1)
2112 break;
2113 if (state->end >= cur_start && (state->state & bits) == bits) {
d1310b2e
CM
2114 total_bytes += min(search_end, state->end) + 1 -
2115 max(cur_start, state->start);
2116 if (total_bytes >= max_bytes)
2117 break;
2118 if (!found) {
af60bed2 2119 *start = max(cur_start, state->start);
d1310b2e
CM
2120 found = 1;
2121 }
ec29ed5b
CM
2122 last = state->end;
2123 } else if (contig && found) {
2124 break;
d1310b2e
CM
2125 }
2126 node = rb_next(node);
2127 if (!node)
2128 break;
2129 }
2130out:
cad321ad 2131 spin_unlock(&tree->lock);
d1310b2e
CM
2132 return total_bytes;
2133}
b2950863 2134
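/*
 * Illustrative sketch, not part of the kernel tree: counting delalloc bytes
 * with count_range_bits().  With contig == 1 the count stops at the first
 * gap after the initial hit; with contig == 0 every matching extent up to
 * 'search_end' is summed.  The wrapper name is made up for the example.
 */
static u64 example_count_delalloc(struct extent_io_tree *tree, u64 start,
				  u64 search_end)
{
	u64 cur = start;

	/* Sum all EXTENT_DELALLOC bytes in [start, search_end], gaps allowed */
	return count_range_bits(tree, &cur, search_end, (u64)-1,
				EXTENT_DELALLOC, 0);
}
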
d352ac68
CM
2135/*
2136 * set the private field for a given byte offset in the tree. If there isn't
2137 * an extent_state there already, this does nothing.
2138 */
b3f167aa
JB
2139int set_state_failrec(struct extent_io_tree *tree, u64 start,
2140 struct io_failure_record *failrec)
d1310b2e
CM
2141{
2142 struct rb_node *node;
2143 struct extent_state *state;
2144 int ret = 0;
2145
cad321ad 2146 spin_lock(&tree->lock);
d1310b2e
CM
2147 /*
2148 * this search will find all the extents that end after
2149 * our range starts.
2150 */
80ea96b1 2151 node = tree_search(tree, start);
2b114d1d 2152 if (!node) {
d1310b2e
CM
2153 ret = -ENOENT;
2154 goto out;
2155 }
2156 state = rb_entry(node, struct extent_state, rb_node);
2157 if (state->start != start) {
2158 ret = -ENOENT;
2159 goto out;
2160 }
47dc196a 2161 state->failrec = failrec;
d1310b2e 2162out:
cad321ad 2163 spin_unlock(&tree->lock);
d1310b2e
CM
2164 return ret;
2165}
2166
2279a270 2167struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
d1310b2e
CM
2168{
2169 struct rb_node *node;
2170 struct extent_state *state;
2279a270 2171 struct io_failure_record *failrec;
d1310b2e 2172
cad321ad 2173 spin_lock(&tree->lock);
d1310b2e
CM
2174 /*
2175 * this search will find all the extents that end after
2176 * our range starts.
2177 */
80ea96b1 2178 node = tree_search(tree, start);
2b114d1d 2179 if (!node) {
2279a270 2180 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2181 goto out;
2182 }
2183 state = rb_entry(node, struct extent_state, rb_node);
2184 if (state->start != start) {
2279a270 2185 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2186 goto out;
2187 }
2279a270
NB
2188
2189 failrec = state->failrec;
d1310b2e 2190out:
cad321ad 2191 spin_unlock(&tree->lock);
2279a270 2192 return failrec;
d1310b2e
CM
2193}
2194
2195/*
2196 * searches a range in the state tree for a given mask.
70dec807 2197 * If 'filled' == 1, this returns 1 only if every extent in the range
d1310b2e
CM
2198 * has the bits set. Otherwise, 1 is returned if any bit in the
2199 * range is found set.
2200 */
2201int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 2202 u32 bits, int filled, struct extent_state *cached)
d1310b2e
CM
2203{
2204 struct extent_state *state = NULL;
2205 struct rb_node *node;
2206 int bitset = 0;
d1310b2e 2207
cad321ad 2208 spin_lock(&tree->lock);
27a3507d 2209 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
df98b6e2 2210 cached->end > start)
9655d298
CM
2211 node = &cached->rb_node;
2212 else
2213 node = tree_search(tree, start);
d1310b2e
CM
2214 while (node && start <= end) {
2215 state = rb_entry(node, struct extent_state, rb_node);
2216
2217 if (filled && state->start > start) {
2218 bitset = 0;
2219 break;
2220 }
2221
2222 if (state->start > end)
2223 break;
2224
2225 if (state->state & bits) {
2226 bitset = 1;
2227 if (!filled)
2228 break;
2229 } else if (filled) {
2230 bitset = 0;
2231 break;
2232 }
46562cec
CM
2233
2234 if (state->end == (u64)-1)
2235 break;
2236
d1310b2e
CM
2237 start = state->end + 1;
2238 if (start > end)
2239 break;
2240 node = rb_next(node);
2241 if (!node) {
2242 if (filled)
2243 bitset = 0;
2244 break;
2245 }
2246 }
cad321ad 2247 spin_unlock(&tree->lock);
d1310b2e
CM
2248 return bitset;
2249}
d1310b2e 2250
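/*
 * Illustrative sketch, not part of the kernel tree: the 'filled' argument of
 * test_range_bit() selects between "every byte has the bits" (1) and "any
 * byte has the bits" (0).  The wrapper below is made up for the example.
 */
static bool example_range_fully_delalloc(struct extent_io_tree *tree,
					 u64 start, u64 end)
{
	/* filled == 1: all of [start, end] must have EXTENT_DELALLOC set */
	return test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, NULL);
}
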
7870d082
JB
2251int free_io_failure(struct extent_io_tree *failure_tree,
2252 struct extent_io_tree *io_tree,
2253 struct io_failure_record *rec)
4a54c8c1
JS
2254{
2255 int ret;
2256 int err = 0;
4a54c8c1 2257
47dc196a 2258 set_state_failrec(failure_tree, rec->start, NULL);
4a54c8c1
JS
2259 ret = clear_extent_bits(failure_tree, rec->start,
2260 rec->start + rec->len - 1,
91166212 2261 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1
JS
2262 if (ret)
2263 err = ret;
2264
7870d082 2265 ret = clear_extent_bits(io_tree, rec->start,
53b381b3 2266 rec->start + rec->len - 1,
91166212 2267 EXTENT_DAMAGED);
53b381b3
DW
2268 if (ret && !err)
2269 err = ret;
4a54c8c1
JS
2270
2271 kfree(rec);
2272 return err;
2273}
2274
4a54c8c1
JS
2275/*
2276 * this bypasses the standard btrfs submit functions deliberately, as
2277 * the standard behavior is to write all copies in a raid setup. here we only
2278 * want to write the one bad copy. so we do the mapping for ourselves and issue
2279 * submit_bio directly.
3ec706c8 2280 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
2281 * actually prevents the read that triggered the error from finishing.
2282 * currently, there can be no more than two copies of every data bit. thus,
2283 * exactly one rewrite is required.
2284 */
38d5e541
QW
2285static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2286 u64 length, u64 logical, struct page *page,
2287 unsigned int pg_offset, int mirror_num)
4a54c8c1
JS
2288{
2289 struct bio *bio;
2290 struct btrfs_device *dev;
4a54c8c1
JS
2291 u64 map_length = 0;
2292 u64 sector;
4c664611 2293 struct btrfs_io_context *bioc = NULL;
4a54c8c1
JS
2294 int ret;
2295
1751e8a6 2296 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
2297 BUG_ON(!mirror_num);
2298
f7ef5287
NA
2299 if (btrfs_is_zoned(fs_info))
2300 return btrfs_repair_one_zone(fs_info, logical);
2301
c3a3b19b 2302 bio = btrfs_bio_alloc(1);
4f024f37 2303 bio->bi_iter.bi_size = 0;
4a54c8c1
JS
2304 map_length = length;
2305
b5de8d0d 2306 /*
4c664611 2307 * Avoid races with device replace and make sure our bioc has devices
b5de8d0d
FM
2308 * associated to its stripes that don't go away while we are doing the
2309 * read repair operation.
2310 */
2311 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 2312 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
2313 /*
2314 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2315 * to update all raid stripes, but here we just want to correct
2316 * the bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2317 * stripe's dev and sector.
2318 */
2319 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
4c664611 2320 &map_length, &bioc, 0);
c725328c
LB
2321 if (ret) {
2322 btrfs_bio_counter_dec(fs_info);
2323 bio_put(bio);
2324 return -EIO;
2325 }
4c664611 2326 ASSERT(bioc->mirror_num == 1);
c725328c
LB
2327 } else {
2328 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
4c664611 2329 &map_length, &bioc, mirror_num);
c725328c
LB
2330 if (ret) {
2331 btrfs_bio_counter_dec(fs_info);
2332 bio_put(bio);
2333 return -EIO;
2334 }
4c664611 2335 BUG_ON(mirror_num != bioc->mirror_num);
4a54c8c1 2336 }
c725328c 2337
4c664611 2338 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
4f024f37 2339 bio->bi_iter.bi_sector = sector;
4c664611
QW
2340 dev = bioc->stripes[bioc->mirror_num - 1].dev;
2341 btrfs_put_bioc(bioc);
ebbede42
AJ
2342 if (!dev || !dev->bdev ||
2343 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
b5de8d0d 2344 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2345 bio_put(bio);
2346 return -EIO;
2347 }
74d46992 2348 bio_set_dev(bio, dev->bdev);
70fd7614 2349 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ffdd2018 2350 bio_add_page(bio, page, length, pg_offset);
4a54c8c1 2351
4e49ea4a 2352 if (btrfsic_submit_bio_wait(bio)) {
4a54c8c1 2353 /* try to remap that extent elsewhere? */
b5de8d0d 2354 btrfs_bio_counter_dec(fs_info);
4a54c8c1 2355 bio_put(bio);
442a4f63 2356 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4a54c8c1
JS
2357 return -EIO;
2358 }
2359
b14af3b4
DS
2360 btrfs_info_rl_in_rcu(fs_info,
2361 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 2362 ino, start,
1203b681 2363 rcu_str_deref(dev->name), sector);
b5de8d0d 2364 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2365 bio_put(bio);
2366 return 0;
2367}
2368
2b48966a 2369int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
ea466794 2370{
20a1fbf9 2371 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 2372 u64 start = eb->start;
cc5e31a4 2373 int i, num_pages = num_extent_pages(eb);
d95603b2 2374 int ret = 0;
ea466794 2375
bc98a42c 2376 if (sb_rdonly(fs_info->sb))
908960c6
ID
2377 return -EROFS;
2378
ea466794 2379 for (i = 0; i < num_pages; i++) {
fb85fc9a 2380 struct page *p = eb->pages[i];
1203b681 2381
6ec656bc 2382 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 2383 start - page_offset(p), mirror_num);
ea466794
JB
2384 if (ret)
2385 break;
09cbfeaf 2386 start += PAGE_SIZE;
ea466794
JB
2387 }
2388
2389 return ret;
2390}
2391
4a54c8c1
JS
2392/*
2393 * each time an IO finishes, we do a fast check in the IO failure tree
2394 * to see if we need to process or clean up an io_failure_record
2395 */
7870d082
JB
2396int clean_io_failure(struct btrfs_fs_info *fs_info,
2397 struct extent_io_tree *failure_tree,
2398 struct extent_io_tree *io_tree, u64 start,
2399 struct page *page, u64 ino, unsigned int pg_offset)
4a54c8c1
JS
2400{
2401 u64 private;
4a54c8c1 2402 struct io_failure_record *failrec;
4a54c8c1
JS
2403 struct extent_state *state;
2404 int num_copies;
4a54c8c1 2405 int ret;
4a54c8c1
JS
2406
2407 private = 0;
7870d082
JB
2408 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2409 EXTENT_DIRTY, 0);
4a54c8c1
JS
2410 if (!ret)
2411 return 0;
2412
2279a270
NB
2413 failrec = get_state_failrec(failure_tree, start);
2414 if (IS_ERR(failrec))
4a54c8c1
JS
2415 return 0;
2416
4a54c8c1
JS
2417 BUG_ON(!failrec->this_mirror);
2418
bc98a42c 2419 if (sb_rdonly(fs_info->sb))
908960c6 2420 goto out;
4a54c8c1 2421
7870d082
JB
2422 spin_lock(&io_tree->lock);
2423 state = find_first_extent_bit_state(io_tree,
4a54c8c1
JS
2424 failrec->start,
2425 EXTENT_LOCKED);
7870d082 2426 spin_unlock(&io_tree->lock);
4a54c8c1 2427
883d0de4
MX
2428 if (state && state->start <= failrec->start &&
2429 state->end >= failrec->start + failrec->len - 1) {
3ec706c8
SB
2430 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2431 failrec->len);
4a54c8c1 2432 if (num_copies > 1) {
7870d082
JB
2433 repair_io_failure(fs_info, ino, start, failrec->len,
2434 failrec->logical, page, pg_offset,
2435 failrec->failed_mirror);
4a54c8c1
JS
2436 }
2437 }
2438
2439out:
7870d082 2440 free_io_failure(failure_tree, io_tree, failrec);
4a54c8c1 2441
454ff3de 2442 return 0;
4a54c8c1
JS
2443}
2444
f612496b
MX
2445/*
2446 * Can be called when
2447 * - hold extent lock
2448 * - under ordered extent
2449 * - the inode is freeing
2450 */
7ab7956e 2451void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 2452{
7ab7956e 2453 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
f612496b
MX
2454 struct io_failure_record *failrec;
2455 struct extent_state *state, *next;
2456
2457 if (RB_EMPTY_ROOT(&failure_tree->state))
2458 return;
2459
2460 spin_lock(&failure_tree->lock);
2461 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2462 while (state) {
2463 if (state->start > end)
2464 break;
2465
2466 ASSERT(state->end <= end);
2467
2468 next = next_state(state);
2469
47dc196a 2470 failrec = state->failrec;
f612496b
MX
2471 free_extent_state(state);
2472 kfree(failrec);
2473
2474 state = next;
2475 }
2476 spin_unlock(&failure_tree->lock);
2477}
2478
3526302f 2479static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
150e4b05 2480 u64 start)
4a54c8c1 2481{
ab8d0fc4 2482 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2483 struct io_failure_record *failrec;
4a54c8c1 2484 struct extent_map *em;
4a54c8c1
JS
2485 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2486 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2487 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
150e4b05 2488 const u32 sectorsize = fs_info->sectorsize;
4a54c8c1 2489 int ret;
4a54c8c1
JS
2490 u64 logical;
2491
2279a270 2492 failrec = get_state_failrec(failure_tree, start);
3526302f 2493 if (!IS_ERR(failrec)) {
ab8d0fc4 2494 btrfs_debug(fs_info,
1245835d
QW
2495 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2496 failrec->logical, failrec->start, failrec->len);
4a54c8c1
JS
2497 /*
2498 * when data can be on disk more than twice, add to failrec here
2499 * (e.g. with a list for failed_mirror) to make
2500 * clean_io_failure() clean all those errors at once.
2501 */
3526302f
NB
2502
2503 return failrec;
4a54c8c1 2504 }
2fe6303e 2505
3526302f
NB
2506 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2507 if (!failrec)
2508 return ERR_PTR(-ENOMEM);
2fe6303e 2509
3526302f 2510 failrec->start = start;
150e4b05 2511 failrec->len = sectorsize;
3526302f
NB
2512 failrec->this_mirror = 0;
2513 failrec->bio_flags = 0;
3526302f
NB
2514
2515 read_lock(&em_tree->lock);
2516 em = lookup_extent_mapping(em_tree, start, failrec->len);
2517 if (!em) {
2518 read_unlock(&em_tree->lock);
2519 kfree(failrec);
2520 return ERR_PTR(-EIO);
2521 }
2522
2523 if (em->start > start || em->start + em->len <= start) {
2524 free_extent_map(em);
2525 em = NULL;
2526 }
2527 read_unlock(&em_tree->lock);
2528 if (!em) {
2529 kfree(failrec);
2530 return ERR_PTR(-EIO);
2531 }
2532
2533 logical = start - em->start;
2534 logical = em->block_start + logical;
2535 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2536 logical = em->block_start;
2537 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2538 extent_set_compress_type(&failrec->bio_flags, em->compress_type);
2539 }
2540
2541 btrfs_debug(fs_info,
2542 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2543 logical, start, failrec->len);
2544
2545 failrec->logical = logical;
2546 free_extent_map(em);
2547
2548 /* Set the bits in the private failure tree */
150e4b05 2549 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
3526302f
NB
2550 EXTENT_LOCKED | EXTENT_DIRTY);
2551 if (ret >= 0) {
2552 ret = set_state_failrec(failure_tree, start, failrec);
2553 /* Set the bits in the inode's tree */
150e4b05
QW
2554 ret = set_extent_bits(tree, start, start + sectorsize - 1,
2555 EXTENT_DAMAGED);
3526302f
NB
2556 } else if (ret < 0) {
2557 kfree(failrec);
2558 return ERR_PTR(ret);
2559 }
2560
2561 return failrec;
2fe6303e
MX
2562}
2563
1245835d 2564static bool btrfs_check_repairable(struct inode *inode,
ce06d3ec
OS
2565 struct io_failure_record *failrec,
2566 int failed_mirror)
2fe6303e 2567{
ab8d0fc4 2568 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2569 int num_copies;
2570
ab8d0fc4 2571 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
4a54c8c1
JS
2572 if (num_copies == 1) {
2573 /*
2574 * we only have a single copy of the data, so don't bother with
2575 * all the retry and error correction code that follows. no
2576 * matter what the error is, it is very likely to persist.
2577 */
ab8d0fc4
JM
2578 btrfs_debug(fs_info,
2579 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2580 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2581 return false;
4a54c8c1
JS
2582 }
2583
1245835d
QW
2584 /* The failure record should only contain one sector */
2585 ASSERT(failrec->len == fs_info->sectorsize);
2586
4a54c8c1 2587 /*
1245835d
QW
2588 * There are two premises:
2589 * a) deliver good data to the caller
2590 * b) correct the bad sectors on disk
2591 *
2592 * Since we're only doing repair for one sector, we only need to get
2593 * a good copy of the failed sector and if we succeed, we have setup
2594 * everything for repair_io_failure to do the rest for us.
4a54c8c1 2595 */
1245835d
QW
2596 failrec->failed_mirror = failed_mirror;
2597 failrec->this_mirror++;
2598 if (failrec->this_mirror == failed_mirror)
4a54c8c1 2599 failrec->this_mirror++;
4a54c8c1 2600
facc8a22 2601 if (failrec->this_mirror > num_copies) {
ab8d0fc4
JM
2602 btrfs_debug(fs_info,
2603 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2604 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2605 return false;
4a54c8c1
JS
2606 }
2607
c3cfb656 2608 return true;
2fe6303e
MX
2609}
2610
150e4b05
QW
2611int btrfs_repair_one_sector(struct inode *inode,
2612 struct bio *failed_bio, u32 bio_offset,
2613 struct page *page, unsigned int pgoff,
2614 u64 start, int failed_mirror,
2615 submit_bio_hook_t *submit_bio_hook)
2fe6303e
MX
2616{
2617 struct io_failure_record *failrec;
77d5d689 2618 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2619 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
7870d082 2620 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
c3a3b19b 2621 struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
7ffd27e3 2622 const int icsum = bio_offset >> fs_info->sectorsize_bits;
77d5d689 2623 struct bio *repair_bio;
c3a3b19b 2624 struct btrfs_bio *repair_bbio;
4e4cbee9 2625 blk_status_t status;
2fe6303e 2626
77d5d689
OS
2627 btrfs_debug(fs_info,
2628 "repair read error: read error at %llu", start);
2fe6303e 2629
1f7ad75b 2630 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e 2631
150e4b05 2632 failrec = btrfs_get_io_failure_record(inode, start);
3526302f 2633 if (IS_ERR(failrec))
150e4b05 2634 return PTR_ERR(failrec);
2fe6303e 2635
1245835d
QW
2636
2637 if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
7870d082 2638 free_io_failure(failure_tree, tree, failrec);
150e4b05 2639 return -EIO;
2fe6303e
MX
2640 }
2641
c3a3b19b
QW
2642 repair_bio = btrfs_bio_alloc(1);
2643 repair_bbio = btrfs_bio(repair_bio);
77d5d689 2644 repair_bio->bi_opf = REQ_OP_READ;
77d5d689
OS
2645 repair_bio->bi_end_io = failed_bio->bi_end_io;
2646 repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2647 repair_bio->bi_private = failed_bio->bi_private;
2fe6303e 2648
c3a3b19b 2649 if (failed_bbio->csum) {
223486c2 2650 const u32 csum_size = fs_info->csum_size;
77d5d689 2651
c3a3b19b
QW
2652 repair_bbio->csum = repair_bbio->csum_inline;
2653 memcpy(repair_bbio->csum,
2654 failed_bbio->csum + csum_size * icsum, csum_size);
77d5d689 2655 }
2fe6303e 2656
77d5d689 2657 bio_add_page(repair_bio, page, failrec->len, pgoff);
c3a3b19b
QW
2658 repair_bbio->logical = failrec->start;
2659 repair_bbio->iter = repair_bio->bi_iter;
4a54c8c1 2660
ab8d0fc4 2661 btrfs_debug(btrfs_sb(inode->i_sb),
1245835d
QW
2662 "repair read error: submitting new read to mirror %d",
2663 failrec->this_mirror);
4a54c8c1 2664
77d5d689
OS
2665 status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
2666 failrec->bio_flags);
4e4cbee9 2667 if (status) {
7870d082 2668 free_io_failure(failure_tree, tree, failrec);
77d5d689 2669 bio_put(repair_bio);
6c387ab2 2670 }
150e4b05
QW
2671 return blk_status_to_errno(status);
2672}
2673
2674static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2675{
2676 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2677
2678 ASSERT(page_offset(page) <= start &&
2679 start + len <= page_offset(page) + PAGE_SIZE);
2680
150e4b05 2681 if (uptodate) {
14605409
BB
2682 if (fsverity_active(page->mapping->host) &&
2683 !PageError(page) &&
2684 !PageUptodate(page) &&
2685 start < i_size_read(page->mapping->host) &&
2686 !fsverity_verify_page(page)) {
2687 btrfs_page_set_error(fs_info, page, start, len);
2688 } else {
2689 btrfs_page_set_uptodate(fs_info, page, start, len);
2690 }
150e4b05
QW
2691 } else {
2692 btrfs_page_clear_uptodate(fs_info, page, start, len);
2693 btrfs_page_set_error(fs_info, page, start, len);
2694 }
2695
2696 if (fs_info->sectorsize == PAGE_SIZE)
2697 unlock_page(page);
3d078efa 2698 else
150e4b05
QW
2699 btrfs_subpage_end_reader(fs_info, page, start, len);
2700}
2701
2702static blk_status_t submit_read_repair(struct inode *inode,
2703 struct bio *failed_bio, u32 bio_offset,
2704 struct page *page, unsigned int pgoff,
2705 u64 start, u64 end, int failed_mirror,
2706 unsigned int error_bitmap,
2707 submit_bio_hook_t *submit_bio_hook)
2708{
2709 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2710 const u32 sectorsize = fs_info->sectorsize;
2711 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2712 int error = 0;
2713 int i;
2714
2715 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2716
2717 /* We're here because we had some read errors or csum mismatch */
2718 ASSERT(error_bitmap);
2719
2720 /*
2721 * We only get called on buffered IO, thus page must be mapped and bio
2722 * must not be cloned.
2723 */
2724 ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
2725
2726 /* Iterate through all the sectors in the range */
2727 for (i = 0; i < nr_bits; i++) {
2728 const unsigned int offset = i * sectorsize;
2729 struct extent_state *cached = NULL;
2730 bool uptodate = false;
2731 int ret;
2732
2733 if (!(error_bitmap & (1U << i))) {
2734 /*
2735 * This sector has no error, just end the page read
2736 * and unlock the range.
2737 */
2738 uptodate = true;
2739 goto next;
2740 }
2741
2742 ret = btrfs_repair_one_sector(inode, failed_bio,
2743 bio_offset + offset,
2744 page, pgoff + offset, start + offset,
2745 failed_mirror, submit_bio_hook);
2746 if (!ret) {
2747 /*
2748 * We have submitted the read repair, the page release
2749 * will be handled by the endio function of the
2750 * submitted repair bio.
2751 * Thus we don't need to do anything here.
2752 */
2753 continue;
2754 }
2755 /*
2756 * Repair failed, just record the error but still continue.
2757 * Otherwise the remaining sectors would not be properly unlocked.
2758 */
2759 if (!error)
2760 error = ret;
2761next:
2762 end_page_read(page, uptodate, start + offset, sectorsize);
2763 if (uptodate)
2764 set_extent_uptodate(&BTRFS_I(inode)->io_tree,
2765 start + offset,
2766 start + offset + sectorsize - 1,
2767 &cached, GFP_ATOMIC);
2768 unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
2769 start + offset,
2770 start + offset + sectorsize - 1,
2771 &cached);
2772 }
2773 return errno_to_blk_status(error);
4a54c8c1
JS
2774}
2775
d1310b2e
CM
2776/* lots and lots of room for performance fixes in the end_bio funcs */
2777
b5227c07 2778void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0 2779{
38a39ac7 2780 struct btrfs_inode *inode;
25c1252a 2781 const bool uptodate = (err == 0);
3e2426bd 2782 int ret = 0;
87826df0 2783
38a39ac7
QW
2784 ASSERT(page && page->mapping);
2785 inode = BTRFS_I(page->mapping->host);
2786 btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
87826df0 2787
87826df0 2788 if (!uptodate) {
963e4db8
QW
2789 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
2790 u32 len;
2791
2792 ASSERT(end + 1 - start <= U32_MAX);
2793 len = end + 1 - start;
2794
2795 btrfs_page_clear_uptodate(fs_info, page, start, len);
2796 btrfs_page_set_error(fs_info, page, start, len);
bff5baf8 2797 ret = err < 0 ? err : -EIO;
5dca6eea 2798 mapping_set_error(page->mapping, ret);
87826df0 2799 }
87826df0
JM
2800}
2801
d1310b2e
CM
2802/*
2803 * after a writepage IO is done, we need to:
2804 * clear the uptodate bits on error
2805 * clear the writeback bits in the extent tree for this IO
2806 * end_page_writeback if the page has no more pending IO
2807 *
2808 * Scheduling is not allowed, so the extent state tree is expected
2809 * to have one and only one object corresponding to this IO.
2810 */
4246a0b6 2811static void end_bio_extent_writepage(struct bio *bio)
d1310b2e 2812{
4e4cbee9 2813 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 2814 struct bio_vec *bvec;
d1310b2e
CM
2815 u64 start;
2816 u64 end;
6dc4f100 2817 struct bvec_iter_all iter_all;
d8e3fb10 2818 bool first_bvec = true;
d1310b2e 2819
c09abff8 2820 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2821 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2822 struct page *page = bvec->bv_page;
0b246afa
JM
2823 struct inode *inode = page->mapping->host;
2824 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
321a02db
QW
2825 const u32 sectorsize = fs_info->sectorsize;
2826
2827 /* Our read/write should always be sector aligned. */
2828 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2829 btrfs_err(fs_info,
2830 "partial page write in btrfs with offset %u and length %u",
2831 bvec->bv_offset, bvec->bv_len);
2832 else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
2833 btrfs_info(fs_info,
2834 "incomplete page write with offset %u and length %u",
2835 bvec->bv_offset, bvec->bv_len);
2836
2837 start = page_offset(page) + bvec->bv_offset;
2838 end = start + bvec->bv_len - 1;
d1310b2e 2839
d8e3fb10
NA
2840 if (first_bvec) {
2841 btrfs_record_physical_zoned(inode, start, bio);
2842 first_bvec = false;
2843 }
2844
4e4cbee9 2845 end_extent_writepage(page, error, start, end);
9047e317
QW
2846
2847 btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
2c30c71b 2848 }
2b1f55b0 2849
d1310b2e 2850 bio_put(bio);
d1310b2e
CM
2851}
2852
94e8c95c
QW
2853/*
2854 * Record previously processed extent range
2855 *
2856 * For endio_readpage_release_extent() to handle a full extent range, reducing
2857 * the extent io operations.
2858 */
2859struct processed_extent {
2860 struct btrfs_inode *inode;
2861 /* Start of the range in @inode */
2862 u64 start;
2e626e56 2863 /* End of the range in @inode */
94e8c95c
QW
2864 u64 end;
2865 bool uptodate;
2866};
2867
2868/*
2869 * Try to release processed extent range
2870 *
2871 * May not release the extent range right now if the current range is
2872 * contiguous to processed extent.
2873 *
2874 * Will release processed extent when any of @inode, @uptodate, the range is
2875 * no longer contiguous to the processed range.
2876 *
2877 * Passing @inode == NULL will force processed extent to be released.
2878 */
2879static void endio_readpage_release_extent(struct processed_extent *processed,
2880 struct btrfs_inode *inode, u64 start, u64 end,
2881 bool uptodate)
883d0de4
MX
2882{
2883 struct extent_state *cached = NULL;
94e8c95c
QW
2884 struct extent_io_tree *tree;
2885
2886 /* The first extent, initialize @processed */
2887 if (!processed->inode)
2888 goto update;
883d0de4 2889
94e8c95c
QW
2890 /*
2891 * Contiguous to processed extent, just uptodate the end.
2892 *
2893 * Several things to notice:
2894 *
2895 * - bio can be merged as long as on-disk bytenr is contiguous
2896 * This means we can have pages belonging to other inodes, thus we need to
2897 * check if the inode still matches.
2898 * - bvec can contain range beyond current page for multi-page bvec
2899 * Thus we need to do processed->end + 1 >= start check
2900 */
2901 if (processed->inode == inode && processed->uptodate == uptodate &&
2902 processed->end + 1 >= start && end >= processed->end) {
2903 processed->end = end;
2904 return;
2905 }
2906
2907 tree = &processed->inode->io_tree;
2908 /*
2909 * Now we don't have range contiguous to the processed range, release
2910 * the processed range now.
2911 */
2912 if (processed->uptodate && tree->track_uptodate)
2913 set_extent_uptodate(tree, processed->start, processed->end,
2914 &cached, GFP_ATOMIC);
2915 unlock_extent_cached_atomic(tree, processed->start, processed->end,
2916 &cached);
2917
2918update:
2919 /* Update processed to current range */
2920 processed->inode = inode;
2921 processed->start = start;
2922 processed->end = end;
2923 processed->uptodate = uptodate;
883d0de4
MX
2924}
2925
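/*
 * Worked example (illustrative): if three read bvecs of the same inode
 * complete with ranges [0, 4095], [4096, 8191] and [8192, 12287], all
 * uptodate, the calls above merge them into a single processed extent
 * [0, 12287], so the extent tree is unlocked once instead of three times.
 */
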
92082d40
QW
2926static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2927{
2928 ASSERT(PageLocked(page));
2929 if (fs_info->sectorsize == PAGE_SIZE)
2930 return;
2931
2932 ASSERT(PagePrivate(page));
2933 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2934}
2935
d9bb77d5
QW
2936/*
2937 * Find extent buffer for a given bytenr.
2938 *
2939 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2940 * in endio context.
2941 */
2942static struct extent_buffer *find_extent_buffer_readpage(
2943 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2944{
2945 struct extent_buffer *eb;
2946
2947 /*
2948 * For regular sectorsize, we can use page->private to grab extent
2949 * buffer
2950 */
2951 if (fs_info->sectorsize == PAGE_SIZE) {
2952 ASSERT(PagePrivate(page) && page->private);
2953 return (struct extent_buffer *)page->private;
2954 }
2955
2956 /* For subpage case, we need to lookup buffer radix tree */
2957 rcu_read_lock();
2958 eb = radix_tree_lookup(&fs_info->buffer_radix,
2959 bytenr >> fs_info->sectorsize_bits);
2960 rcu_read_unlock();
2961 ASSERT(eb);
2962 return eb;
2963}
2964
d1310b2e
CM
2965/*
2966 * after a readpage IO is done, we need to:
2967 * clear the uptodate bits on error
2968 * set the uptodate bits if things worked
2969 * set the page up to date if all extents in the tree are uptodate
2970 * clear the lock bit in the extent tree
2971 * unlock the page if there are no other extents locked for it
2972 *
2973 * Scheduling is not allowed, so the extent state tree is expected
2974 * to have one and only one object corresponding to this IO.
2975 */
4246a0b6 2976static void end_bio_extent_readpage(struct bio *bio)
d1310b2e 2977{
2c30c71b 2978 struct bio_vec *bvec;
c3a3b19b 2979 struct btrfs_bio *bbio = btrfs_bio(bio);
7870d082 2980 struct extent_io_tree *tree, *failure_tree;
94e8c95c 2981 struct processed_extent processed = { 0 };
7ffd27e3
QW
2982 /*
2983 * The offset to the beginning of a bio, since one bio can never be
2984 * larger than UINT_MAX, u32 here is enough.
2985 */
2986 u32 bio_offset = 0;
5cf1ab56 2987 int mirror;
d1310b2e 2988 int ret;
6dc4f100 2989 struct bvec_iter_all iter_all;
d1310b2e 2990
c09abff8 2991 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2992 bio_for_each_segment_all(bvec, bio, iter_all) {
150e4b05 2993 bool uptodate = !bio->bi_status;
d1310b2e 2994 struct page *page = bvec->bv_page;
a71754fc 2995 struct inode *inode = page->mapping->host;
ab8d0fc4 2996 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7ffd27e3 2997 const u32 sectorsize = fs_info->sectorsize;
150e4b05 2998 unsigned int error_bitmap = (unsigned int)-1;
7ffd27e3
QW
2999 u64 start;
3000 u64 end;
3001 u32 len;
507903b8 3002
ab8d0fc4
JM
3003 btrfs_debug(fs_info,
3004 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
1201b58b 3005 bio->bi_iter.bi_sector, bio->bi_status,
c3a3b19b 3006 bbio->mirror_num);
a71754fc 3007 tree = &BTRFS_I(inode)->io_tree;
7870d082 3008 failure_tree = &BTRFS_I(inode)->io_failure_tree;
902b22f3 3009
8b8bbd46
QW
3010 /*
3011 * We always issue full-sector reads, but if some block in a
3012 * page fails to read, blk_update_request() will advance
3013 * bv_offset and adjust bv_len to compensate. Print a warning
3014 * for unaligned offsets, and an error if they don't add up to
3015 * a full sector.
3016 */
3017 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
3018 btrfs_err(fs_info,
3019 "partial page read in btrfs with offset %u and length %u",
3020 bvec->bv_offset, bvec->bv_len);
3021 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
3022 sectorsize))
3023 btrfs_info(fs_info,
3024 "incomplete page read with offset %u and length %u",
3025 bvec->bv_offset, bvec->bv_len);
3026
3027 start = page_offset(page) + bvec->bv_offset;
3028 end = start + bvec->bv_len - 1;
facc8a22 3029 len = bvec->bv_len;
d1310b2e 3030
c3a3b19b 3031 mirror = bbio->mirror_num;
78e62c02 3032 if (likely(uptodate)) {
150e4b05 3033 if (is_data_inode(inode)) {
c3a3b19b 3034 error_bitmap = btrfs_verify_data_csum(bbio,
5e295768 3035 bio_offset, page, start, end);
150e4b05
QW
3036 ret = error_bitmap;
3037 } else {
c3a3b19b 3038 ret = btrfs_validate_metadata_buffer(bbio,
8e1dc982 3039 page, start, end, mirror);
150e4b05 3040 }
5ee0844d 3041 if (ret)
150e4b05 3042 uptodate = false;
5ee0844d 3043 else
7870d082
JB
3044 clean_io_failure(BTRFS_I(inode)->root->fs_info,
3045 failure_tree, tree, start,
3046 page,
3047 btrfs_ino(BTRFS_I(inode)), 0);
d1310b2e 3048 }
ea466794 3049
f2a09da9
MX
3050 if (likely(uptodate))
3051 goto readpage_ok;
3052
be17b3af 3053 if (is_data_inode(inode)) {
f4a8e656 3054 /*
150e4b05
QW
3055 * submit_read_repair() will handle all the good
3056 * and bad sectors, we just continue to the next bvec.
f4a8e656 3057 */
150e4b05
QW
3058 submit_read_repair(inode, bio, bio_offset, page,
3059 start - page_offset(page), start,
3060 end, mirror, error_bitmap,
3061 btrfs_submit_data_bio);
3062
3063 ASSERT(bio_offset + len > bio_offset);
3064 bio_offset += len;
3065 continue;
78e62c02
NB
3066 } else {
3067 struct extent_buffer *eb;
3068
d9bb77d5 3069 eb = find_extent_buffer_readpage(fs_info, page, start);
78e62c02
NB
3070 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3071 eb->read_mirror = mirror;
3072 atomic_dec(&eb->io_pages);
3073 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
3074 &eb->bflags))
3075 btree_readahead_hook(eb, -EIO);
7e38326f 3076 }
f2a09da9 3077readpage_ok:
883d0de4 3078 if (likely(uptodate)) {
a71754fc 3079 loff_t i_size = i_size_read(inode);
09cbfeaf 3080 pgoff_t end_index = i_size >> PAGE_SHIFT;
a71754fc 3081
c28ea613
QW
3082 /*
3083 * Zero out the remaining part if this range straddles
3084 * i_size.
3085 *
3086 * Here we should only zero the range inside the bvec,
3087 * not touch anything else.
3088 *
3089 * NOTE: i_size is exclusive while end is inclusive.
3090 */
3091 if (page->index == end_index && i_size <= end) {
3092 u32 zero_start = max(offset_in_page(i_size),
d2dcc8ed 3093 offset_in_page(start));
c28ea613
QW
3094
3095 zero_user_segment(page, zero_start,
3096 offset_in_page(end) + 1);
3097 }
70dec807 3098 }
7ffd27e3
QW
3099 ASSERT(bio_offset + len > bio_offset);
3100 bio_offset += len;
883d0de4 3101
e09caaf9 3102 /* Update page status and unlock */
92082d40 3103 end_page_read(page, uptodate, start, len);
94e8c95c 3104 endio_readpage_release_extent(&processed, BTRFS_I(inode),
14605409 3105 start, end, PageUptodate(page));
2c30c71b 3106 }
94e8c95c
QW
3107 /* Release the last extent */
3108 endio_readpage_release_extent(&processed, NULL, 0, 0, false);
c3a3b19b 3109 btrfs_bio_free_csum(bbio);
d1310b2e 3110 bio_put(bio);
d1310b2e
CM
3111}
3112
9be3395b 3113/*
184f999e
DS
3114 * Initialize the members up to but not including 'bio'. Use this after
3115 * allocating a new bio with bio_alloc_bioset, as that does not initialize
3116 * the bytes outside of 'bio' (use of __GFP_ZERO is not supported).
9be3395b 3117 */
c3a3b19b 3118static inline void btrfs_bio_init(struct btrfs_bio *bbio)
d1310b2e 3119{
c3a3b19b 3120 memset(bbio, 0, offsetof(struct btrfs_bio, bio));
184f999e 3121}
d1310b2e 3122
9be3395b 3123/*
cd8e0cca
QW
3124 * Allocate a btrfs_bio-backed bio, with @nr_iovecs as the maximum number of iovecs.
3125 *
3126 * The bio allocation is backed by bioset and does not fail.
9be3395b 3127 */
c3a3b19b 3128struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
d1310b2e
CM
3129{
3130 struct bio *bio;
d1310b2e 3131
cd8e0cca
QW
3132 ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
3133 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
c3a3b19b 3134 btrfs_bio_init(btrfs_bio(bio));
d1310b2e
CM
3135 return bio;
3136}
3137
8b6c1d56 3138struct bio *btrfs_bio_clone(struct bio *bio)
9be3395b 3139{
c3a3b19b 3140 struct btrfs_bio *bbio;
23ea8e5a 3141 struct bio *new;
9be3395b 3142
6e707bcd 3143 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 3144 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
c3a3b19b
QW
3145 bbio = btrfs_bio(new);
3146 btrfs_bio_init(bbio);
3147 bbio->iter = bio->bi_iter;
23ea8e5a
MX
3148 return new;
3149}
9be3395b 3150
21dda654 3151struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
2f8e9140
LB
3152{
3153 struct bio *bio;
c3a3b19b 3154 struct btrfs_bio *bbio;
2f8e9140 3155
21dda654
CK
3156 ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
3157
2f8e9140 3158 /* this will never fail when it's backed by a bioset */
8ac9f7c1 3159 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2f8e9140
LB
3160 ASSERT(bio);
3161
c3a3b19b
QW
3162 bbio = btrfs_bio(bio);
3163 btrfs_bio_init(bbio);
2f8e9140
LB
3164
3165 bio_trim(bio, offset >> 9, size >> 9);
c3a3b19b 3166 bbio->iter = bio->bi_iter;
2f8e9140
LB
3167 return bio;
3168}
9be3395b 3169
953651eb
NA
3170/**
3171 * Attempt to add a page to bio
3172 *
3173 * @bio_ctrl: bio control structure holding the destination bio
3174 * @page: page to add to the bio
3175 * @disk_bytenr: offset of the new bio or to check whether we are adding
3176 * a contiguous page to the previous one
3177 * @pg_offset: starting offset in the page
3178 * @size: portion of page that we want to write
3179 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
3180 * @bio_flags: flags of the current bio to see if we can merge them
953651eb
NA
3181 *
3182 * Attempt to add a page to bio considering stripe alignment etc.
3183 *
e0eefe07
QW
3184 * Return >= 0 for the number of bytes added to the bio.
3185 * Can return 0 if the current bio is already at stripe/zone boundary.
3186 * Return <0 for error.
953651eb 3187 */
e0eefe07
QW
3188static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3189 struct page *page,
3190 u64 disk_bytenr, unsigned int size,
3191 unsigned int pg_offset,
3192 unsigned long bio_flags)
953651eb 3193{
390ed29b
QW
3194 struct bio *bio = bio_ctrl->bio;
3195 u32 bio_size = bio->bi_iter.bi_size;
e0eefe07 3196 u32 real_size;
953651eb
NA
3197 const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3198 bool contig;
e1326f03 3199 int ret;
953651eb 3200
390ed29b
QW
3201 ASSERT(bio);
3202 /* The limit should be calculated when bio_ctrl->bio is allocated */
3203 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3204 if (bio_ctrl->bio_flags != bio_flags)
e0eefe07 3205 return 0;
953651eb 3206
390ed29b 3207 if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
953651eb
NA
3208 contig = bio->bi_iter.bi_sector == sector;
3209 else
3210 contig = bio_end_sector(bio) == sector;
3211 if (!contig)
e0eefe07 3212 return 0;
953651eb 3213
e0eefe07
QW
3214 real_size = min(bio_ctrl->len_to_oe_boundary,
3215 bio_ctrl->len_to_stripe_boundary) - bio_size;
3216 real_size = min(real_size, size);
3217
3218 /*
3219 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
3220 * bio will still execute its endio function on the page!
3221 */
3222 if (real_size == 0)
3223 return 0;
953651eb 3224
390ed29b 3225 if (bio_op(bio) == REQ_OP_ZONE_APPEND)
e0eefe07 3226 ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
390ed29b 3227 else
e0eefe07 3228 ret = bio_add_page(bio, page, real_size, pg_offset);
e1326f03 3229
e0eefe07 3230 return ret;
953651eb
NA
3231}
3232
390ed29b 3233static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
939c7feb 3234 struct btrfs_inode *inode, u64 file_offset)
390ed29b
QW
3235{
3236 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3237 struct btrfs_io_geometry geom;
3238 struct btrfs_ordered_extent *ordered;
3239 struct extent_map *em;
3240 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3241 int ret;
3242
3243 /*
3244 * Pages for compressed extent are never submitted to disk directly,
3245 * thus it has no real boundary, just set them to U32_MAX.
3246 *
3247 * The split happens for real compressed bio, which happens in
3248 * btrfs_submit_compressed_read/write().
3249 */
3250 if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
3251 bio_ctrl->len_to_oe_boundary = U32_MAX;
3252 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3253 return 0;
3254 }
3255 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3256 if (IS_ERR(em))
3257 return PTR_ERR(em);
3258 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3259 logical, &geom);
3260 free_extent_map(em);
3261 if (ret < 0) {
3262 return ret;
3263 }
3264 if (geom.len > U32_MAX)
3265 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3266 else
3267 bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3268
3269 if (!btrfs_is_zoned(fs_info) ||
3270 bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
3271 bio_ctrl->len_to_oe_boundary = U32_MAX;
3272 return 0;
3273 }
3274
390ed29b 3275 /* Ordered extent not yet created, so we're good */
939c7feb 3276 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
390ed29b
QW
3277 if (!ordered) {
3278 bio_ctrl->len_to_oe_boundary = U32_MAX;
3279 return 0;
3280 }
3281
3282 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3283 ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3284 btrfs_put_ordered_extent(ordered);
3285 return 0;
3286}
3287
e0eefe07
QW
3288static int alloc_new_bio(struct btrfs_inode *inode,
3289 struct btrfs_bio_ctrl *bio_ctrl,
3290 struct writeback_control *wbc,
3291 unsigned int opf,
3292 bio_end_io_t end_io_func,
939c7feb 3293 u64 disk_bytenr, u32 offset, u64 file_offset,
e0eefe07
QW
3294 unsigned long bio_flags)
3295{
3296 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3297 struct bio *bio;
3298 int ret;
3299
c3a3b19b 3300 bio = btrfs_bio_alloc(BIO_MAX_VECS);
e0eefe07
QW
3301 /*
3302 * For compressed page range, its disk_bytenr is always @disk_bytenr
3303 * passed in, no matter if we have added any range into previous bio.
3304 */
3305 if (bio_flags & EXTENT_BIO_COMPRESSED)
cd8e0cca 3306 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
e0eefe07 3307 else
cd8e0cca 3308 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
e0eefe07
QW
3309 bio_ctrl->bio = bio;
3310 bio_ctrl->bio_flags = bio_flags;
e0eefe07
QW
3311 bio->bi_end_io = end_io_func;
3312 bio->bi_private = &inode->io_tree;
3313 bio->bi_write_hint = inode->vfs_inode.i_write_hint;
3314 bio->bi_opf = opf;
939c7feb
NA
3315 ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
3316 if (ret < 0)
3317 goto error;
e0eefe07
QW
3318 if (wbc) {
3319 struct block_device *bdev;
3320
d24fa5c1 3321 bdev = fs_info->fs_devices->latest_dev->bdev;
e0eefe07
QW
3322 bio_set_dev(bio, bdev);
3323 wbc_init_bio(wbc, bio);
3324 }
3325 if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
3326 struct btrfs_device *device;
3327
3328 device = btrfs_zoned_get_device(fs_info, disk_bytenr,
3329 fs_info->sectorsize);
3330 if (IS_ERR(device)) {
3331 ret = PTR_ERR(device);
3332 goto error;
3333 }
3334
c3a3b19b 3335 btrfs_bio(bio)->device = device;
e0eefe07
QW
3336 }
3337 return 0;
3338error:
3339 bio_ctrl->bio = NULL;
3340 bio->bi_status = errno_to_blk_status(ret);
3341 bio_endio(bio);
3342 return ret;
3343}
3344
4b81ba48
DS
3345/*
3346 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625
DS
3347 * @wbc: optional writeback control for io accounting
3348 * @page: page to add to the bio
0c64c33c
QW
3349 * @disk_bytenr: logical bytenr where the write will be
3350 * @size: portion of page that we want to write to
b8b3d625
DS
3351 * @pg_offset: offset within @page at which the data to be added
3352 * starts
5c2b1fd7 3353 * @bio_ctrl: must be a valid pointer, the newly allocated bio is stored in bio_ctrl->bio
b8b3d625
DS
3354 * @end_io_func: end_io callback for new bio
3355 * @mirror_num: desired mirror to read/write
3356 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
3357 * @bio_flags: flags of the current bio to see if we can merge them
4b81ba48 3358 */
0ceb34bf 3359static int submit_extent_page(unsigned int opf,
da2f0f74 3360 struct writeback_control *wbc,
390ed29b 3361 struct btrfs_bio_ctrl *bio_ctrl,
0c64c33c 3362 struct page *page, u64 disk_bytenr,
6c5a4e2c 3363 size_t size, unsigned long pg_offset,
f188591e 3364 bio_end_io_t end_io_func,
c8b97818 3365 int mirror_num,
005efedf
FM
3366 unsigned long bio_flags,
3367 bool force_bio_submit)
d1310b2e
CM
3368{
3369 int ret = 0;
e1326f03 3370 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
e0eefe07 3371 unsigned int cur = pg_offset;
d1310b2e 3372
390ed29b 3373 ASSERT(bio_ctrl);
5c2b1fd7 3374
390ed29b
QW
3375 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3376 pg_offset + size <= PAGE_SIZE);
e0eefe07
QW
3377 if (force_bio_submit && bio_ctrl->bio) {
3378 ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
3379 bio_ctrl->bio = NULL;
3380 if (ret < 0)
3381 return ret;
3382 }
3383
3384 while (cur < pg_offset + size) {
3385 u32 offset = cur - pg_offset;
3386 int added;
3387
3388 /* Allocate new bio if needed */
3389 if (!bio_ctrl->bio) {
3390 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3391 end_io_func, disk_bytenr, offset,
939c7feb 3392 page_offset(page) + cur,
e0eefe07
QW
3393 bio_flags);
3394 if (ret < 0)
3395 return ret;
3396 }
3397 /*
3398 * We must go through btrfs_bio_add_page() to ensure each
3399 * page range won't cross various boundaries.
3400 */
3401 if (bio_flags & EXTENT_BIO_COMPRESSED)
3402 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
3403 size - offset, pg_offset + offset,
3404 bio_flags);
3405 else
3406 added = btrfs_bio_add_page(bio_ctrl, page,
3407 disk_bytenr + offset, size - offset,
3408 pg_offset + offset, bio_flags);
3409
3410 /* Metadata page range should never be split */
3411 if (!is_data_inode(&inode->vfs_inode))
3412 ASSERT(added == 0 || added == size - offset);
3413
3414 /* We have added at least one page, update the accounting */
3415 if (wbc && added)
3416 wbc_account_cgroup_owner(wbc, page, added);
3417
3418 /* We have reached boundary, submit right now */
3419 if (added < size - offset) {
3420 /* The bio should contain some page(s) */
3421 ASSERT(bio_ctrl->bio->bi_iter.bi_size);
3422 ret = submit_one_bio(bio_ctrl->bio, mirror_num,
3423 bio_ctrl->bio_flags);
390ed29b
QW
3424 bio_ctrl->bio = NULL;
3425 if (ret < 0)
79787eaa 3426 return ret;
d1310b2e 3427 }
e0eefe07 3428 cur += added;
d1310b2e 3429 }
e0eefe07 3430 return 0;
d1310b2e
CM
3431}
3432
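/*
 * Hedged illustration only: a minimal sketch of how a read path could feed a
 * single sector into the shared bio_ctrl through submit_extent_page() above.
 * The example_* helper and its parameter values are hypothetical; only the
 * submit_extent_page() call pattern mirrors btrfs_do_readpage() below.
 */
static int example_submit_one_sector(struct btrfs_bio_ctrl *bio_ctrl,
				     struct page *page, u64 disk_bytenr,
				     u32 sectorsize, unsigned long pg_offset)
{
	/* No wbc for reads, mirror 0 lets the lower layers pick a copy. */
	return submit_extent_page(REQ_OP_READ, NULL, bio_ctrl, page,
				  disk_bytenr, sectorsize, pg_offset,
				  end_bio_extent_readpage, 0, 0, false);
}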
760f991f
QW
3433static int attach_extent_buffer_page(struct extent_buffer *eb,
3434 struct page *page,
3435 struct btrfs_subpage *prealloc)
d1310b2e 3436{
760f991f
QW
3437 struct btrfs_fs_info *fs_info = eb->fs_info;
3438 int ret = 0;
3439
0d01e247
QW
3440 /*
3441 * If the page is mapped to btree inode, we should hold the private
3442 * lock to prevent race.
3443 * For cloned or dummy extent buffers, their pages are not mapped and
3444 * will not race with any other ebs.
3445 */
3446 if (page->mapping)
3447 lockdep_assert_held(&page->mapping->private_lock);
3448
760f991f
QW
3449 if (fs_info->sectorsize == PAGE_SIZE) {
3450 if (!PagePrivate(page))
3451 attach_page_private(page, eb);
3452 else
3453 WARN_ON(page->private != (unsigned long)eb);
3454 return 0;
3455 }
3456
3457 /* Already mapped, just free prealloc */
3458 if (PagePrivate(page)) {
3459 btrfs_free_subpage(prealloc);
3460 return 0;
3461 }
3462
3463 if (prealloc)
3464 /* Has preallocated memory for subpage */
3465 attach_page_private(page, prealloc);
d1b89bc0 3466 else
760f991f
QW
3467 /* Do new allocation to attach subpage */
3468 ret = btrfs_attach_subpage(fs_info, page,
3469 BTRFS_SUBPAGE_METADATA);
3470 return ret;
d1310b2e
CM
3471}
3472
32443de3 3473int set_page_extent_mapped(struct page *page)
d1310b2e 3474{
32443de3
QW
3475 struct btrfs_fs_info *fs_info;
3476
3477 ASSERT(page->mapping);
3478
3479 if (PagePrivate(page))
3480 return 0;
3481
3482 fs_info = btrfs_sb(page->mapping->host->i_sb);
3483
3484 if (fs_info->sectorsize < PAGE_SIZE)
3485 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3486
3487 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3488 return 0;
3489}
3490
3491void clear_page_extent_mapped(struct page *page)
3492{
3493 struct btrfs_fs_info *fs_info;
3494
3495 ASSERT(page->mapping);
3496
d1b89bc0 3497 if (!PagePrivate(page))
32443de3
QW
3498 return;
3499
3500 fs_info = btrfs_sb(page->mapping->host->i_sb);
3501 if (fs_info->sectorsize < PAGE_SIZE)
3502 return btrfs_detach_subpage(fs_info, page);
3503
3504 detach_page_private(page);
d1310b2e
CM
3505}
3506
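/*
 * Hedged illustration only: the expected pairing of the two helpers above for
 * a data page.  The example_* wrapper and its call site are hypothetical; in
 * the real code set_page_extent_mapped() is called before extent IO and
 * clear_page_extent_mapped() when the page is released/invalidated.
 */
static int example_prepare_data_page(struct page *page)
{
	int ret;

	/* Attach btrfs private data (subpage struct or marker) before any IO. */
	ret = set_page_extent_mapped(page);
	if (ret < 0)
		return ret;

	/* ... extent IO on the page happens here ... */

	/* Detach again once the page is leaving the page cache. */
	clear_page_extent_mapped(page);
	return 0;
}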
125bac01
MX
3507static struct extent_map *
3508__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
1a5ee1e6 3509 u64 start, u64 len, struct extent_map **em_cached)
125bac01
MX
3510{
3511 struct extent_map *em;
3512
3513 if (em_cached && *em_cached) {
3514 em = *em_cached;
cbc0e928 3515 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 3516 start < extent_map_end(em)) {
490b54d6 3517 refcount_inc(&em->refs);
125bac01
MX
3518 return em;
3519 }
3520
3521 free_extent_map(em);
3522 *em_cached = NULL;
3523 }
3524
1a5ee1e6 3525 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
125bac01
MX
3526 if (em_cached && !IS_ERR_OR_NULL(em)) {
3527 BUG_ON(*em_cached);
490b54d6 3528 refcount_inc(&em->refs);
125bac01
MX
3529 *em_cached = em;
3530 }
3531 return em;
3532}
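/*
 * Hedged illustration only: the *em_cached contract of __get_extent_map()
 * above.  Consecutive lookups in the same file range can be served from one
 * cached extent_map; the caller drops the cached reference at the end.  The
 * example_* helper and the offsets used are hypothetical.
 */
static void example_cached_lookup(struct inode *inode, struct page *page,
				  u64 start, u64 sectorsize)
{
	struct extent_map *em_cached = NULL;
	struct extent_map *em;

	em = __get_extent_map(inode, page, 0, start, sectorsize, &em_cached);
	if (!IS_ERR_OR_NULL(em))
		free_extent_map(em);

	/* Second lookup may be satisfied from em_cached without a tree search. */
	em = __get_extent_map(inode, page, 0, start + sectorsize, sectorsize,
			      &em_cached);
	if (!IS_ERR_OR_NULL(em))
		free_extent_map(em);

	/* Drop the reference held by the cache itself. */
	if (em_cached)
		free_extent_map(em_cached);
}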
d1310b2e
CM
3533/*
3534 * Basic readpage implementation. Locked extent state structs are inserted
3535 * into the tree and removed when the IO is done (by the end_io
3536 * handlers).
79787eaa 3537 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 3538 * Return 0 on success, otherwise return an error.
d1310b2e 3539 */
0f208812 3540int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
390ed29b 3541 struct btrfs_bio_ctrl *bio_ctrl,
0f208812 3542 unsigned int read_flags, u64 *prev_em_start)
d1310b2e
CM
3543{
3544 struct inode *inode = page->mapping->host;
92082d40 3545 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4eee4fa4 3546 u64 start = page_offset(page);
8eec8296 3547 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3548 u64 cur = start;
3549 u64 extent_offset;
3550 u64 last_byte = i_size_read(inode);
3551 u64 block_start;
3552 u64 cur_end;
d1310b2e 3553 struct extent_map *em;
baf863b9 3554 int ret = 0;
d1310b2e 3555 int nr = 0;
306e16ce 3556 size_t pg_offset = 0;
d1310b2e
CM
3557 size_t iosize;
3558 size_t blocksize = inode->i_sb->s_blocksize;
f657a31c 3559 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ae6957eb 3560
32443de3
QW
3561 ret = set_page_extent_mapped(page);
3562 if (ret < 0) {
3563 unlock_extent(tree, start, end);
92082d40
QW
3564 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3565 unlock_page(page);
32443de3
QW
3566 goto out;
3567 }
d1310b2e 3568
90a887c9
DM
3569 if (!PageUptodate(page)) {
3570 if (cleancache_get_page(page) == 0) {
3571 BUG_ON(blocksize != PAGE_SIZE);
9974090b 3572 unlock_extent(tree, start, end);
92082d40 3573 unlock_page(page);
90a887c9
DM
3574 goto out;
3575 }
3576 }
3577
09cbfeaf 3578 if (page->index == last_byte >> PAGE_SHIFT) {
7073017a 3579 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
3580
3581 if (zero_offset) {
09cbfeaf 3582 iosize = PAGE_SIZE - zero_offset;
d048b9c2 3583 memzero_page(page, zero_offset, iosize);
c8b97818 3584 flush_dcache_page(page);
c8b97818
CM
3585 }
3586 }
92082d40 3587 begin_page_read(fs_info, page);
d1310b2e 3588 while (cur <= end) {
4c37a793 3589 unsigned long this_bio_flag = 0;
005efedf 3590 bool force_bio_submit = false;
0c64c33c 3591 u64 disk_bytenr;
c8f2f24b 3592
6a404910 3593 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
d1310b2e 3594 if (cur >= last_byte) {
507903b8
AJ
3595 struct extent_state *cached = NULL;
3596
09cbfeaf 3597 iosize = PAGE_SIZE - pg_offset;
d048b9c2 3598 memzero_page(page, pg_offset, iosize);
d1310b2e 3599 flush_dcache_page(page);
d1310b2e 3600 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3601 &cached, GFP_NOFS);
7f042a83 3602 unlock_extent_cached(tree, cur,
e43bbe5e 3603 cur + iosize - 1, &cached);
92082d40 3604 end_page_read(page, true, cur, iosize);
d1310b2e
CM
3605 break;
3606 }
125bac01 3607 em = __get_extent_map(inode, page, pg_offset, cur,
1a5ee1e6 3608 end - cur + 1, em_cached);
c704005d 3609 if (IS_ERR_OR_NULL(em)) {
7f042a83 3610 unlock_extent(tree, cur, end);
92082d40 3611 end_page_read(page, false, cur, end + 1 - cur);
d1310b2e
CM
3612 break;
3613 }
d1310b2e
CM
3614 extent_offset = cur - em->start;
3615 BUG_ON(extent_map_end(em) <= cur);
3616 BUG_ON(end < cur);
3617
261507a0 3618 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4b384318 3619 this_bio_flag |= EXTENT_BIO_COMPRESSED;
261507a0
LZ
3620 extent_set_compress_type(&this_bio_flag,
3621 em->compress_type);
3622 }
c8b97818 3623
d1310b2e
CM
3624 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3625 cur_end = min(extent_map_end(em) - 1, end);
fda2832f 3626 iosize = ALIGN(iosize, blocksize);
949b3273 3627 if (this_bio_flag & EXTENT_BIO_COMPRESSED)
0c64c33c 3628 disk_bytenr = em->block_start;
949b3273 3629 else
0c64c33c 3630 disk_bytenr = em->block_start + extent_offset;
d1310b2e 3631 block_start = em->block_start;
d899e052
YZ
3632 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3633 block_start = EXTENT_MAP_HOLE;
005efedf
FM
3634
3635 /*
3636 * If we have a file range that points to a compressed extent
260db43c 3637 * and it's followed by a consecutive file range that points
005efedf
FM
3638 * to the same compressed extent (possibly with a different
3639 * offset and/or length, so it either points to the whole extent
3640 * or only part of it), we must make sure we do not submit a
3641 * single bio to populate the pages for the 2 ranges because
3642 * this makes the compressed extent read zero out the pages
3643 * belonging to the 2nd range. Imagine the following scenario:
3644 *
3645 * File layout
3646 * [0 - 8K] [8K - 24K]
3647 * | |
3648 * | |
3649 * points to extent X, points to extent X,
3650 * offset 4K, length of 8K offset 0, length 16K
3651 *
3652 * [extent X, compressed length = 4K uncompressed length = 16K]
3653 *
3654 * If the bio to read the compressed extent covers both ranges,
3655 * it will decompress extent X into the pages belonging to the
3656 * first range and then it will stop, zeroing out the remaining
3657 * pages that belong to the other range that points to extent X.
3658 * So here we make sure we submit 2 bios, one for the first
3659 * range and another one for the second range. Both will target
3660 * the same physical extent from disk, but we can't currently
3661 * make the compressed bio endio callback populate the pages
3662 * for both ranges because each compressed bio is tightly
3663 * coupled with a single extent map, and each range can have
3664 * an extent map with a different offset value relative to the
3665 * uncompressed data of our extent and different lengths. This
3666 * is a corner case so we prioritize correctness over
3667 * non-optimal behavior (submitting 2 bios for the same extent).
3668 */
3669 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3670 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 3671 *prev_em_start != em->start)
005efedf
FM
3672 force_bio_submit = true;
3673
3674 if (prev_em_start)
8e928218 3675 *prev_em_start = em->start;
005efedf 3676
d1310b2e
CM
3677 free_extent_map(em);
3678 em = NULL;
3679
3680 /* we've found a hole, just zero and go on */
3681 if (block_start == EXTENT_MAP_HOLE) {
507903b8
AJ
3682 struct extent_state *cached = NULL;
3683
d048b9c2 3684 memzero_page(page, pg_offset, iosize);
d1310b2e 3685 flush_dcache_page(page);
d1310b2e
CM
3686
3687 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3688 &cached, GFP_NOFS);
7f042a83 3689 unlock_extent_cached(tree, cur,
e43bbe5e 3690 cur + iosize - 1, &cached);
92082d40 3691 end_page_read(page, true, cur, iosize);
d1310b2e 3692 cur = cur + iosize;
306e16ce 3693 pg_offset += iosize;
d1310b2e
CM
3694 continue;
3695 }
3696 /* the get_extent function already copied into the page */
9655d298
CM
3697 if (test_range_bit(tree, cur, cur_end,
3698 EXTENT_UPTODATE, 1, NULL)) {
7f042a83 3699 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3700 end_page_read(page, true, cur, iosize);
d1310b2e 3701 cur = cur + iosize;
306e16ce 3702 pg_offset += iosize;
d1310b2e
CM
3703 continue;
3704 }
70dec807
CM
3705 /*
3706 * We have an inline extent but it didn't get marked uptodate. Error out.
3707 */
3708 if (block_start == EXTENT_MAP_INLINE) {
7f042a83 3709 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3710 end_page_read(page, false, cur, iosize);
70dec807 3711 cur = cur + iosize;
306e16ce 3712 pg_offset += iosize;
70dec807
CM
3713 continue;
3714 }
d1310b2e 3715
0ceb34bf 3716 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
390ed29b
QW
3717 bio_ctrl, page, disk_bytenr, iosize,
3718 pg_offset,
fd513000 3719 end_bio_extent_readpage, 0,
005efedf
FM
3720 this_bio_flag,
3721 force_bio_submit);
c8f2f24b
JB
3722 if (!ret) {
3723 nr++;
c8f2f24b 3724 } else {
7f042a83 3725 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3726 end_page_read(page, false, cur, iosize);
baf863b9 3727 goto out;
edd33c99 3728 }
d1310b2e 3729 cur = cur + iosize;
306e16ce 3730 pg_offset += iosize;
d1310b2e 3731 }
90a887c9 3732out:
baf863b9 3733 return ret;
d1310b2e
CM
3734}
3735
b6660e80 3736static inline void contiguous_readpages(struct page *pages[], int nr_pages,
390ed29b
QW
3737 u64 start, u64 end,
3738 struct extent_map **em_cached,
3739 struct btrfs_bio_ctrl *bio_ctrl,
3740 u64 *prev_em_start)
9974090b 3741{
23d31bd4 3742 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
9974090b
MX
3743 int index;
3744
b272ae22 3745 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
3746
3747 for (index = 0; index < nr_pages; index++) {
390ed29b 3748 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
0f208812 3749 REQ_RAHEAD, prev_em_start);
09cbfeaf 3750 put_page(pages[index]);
9974090b
MX
3751 }
3752}
3753
3d4b9496 3754static void update_nr_written(struct writeback_control *wbc,
a9132667 3755 unsigned long nr_written)
11c8349b
CM
3756{
3757 wbc->nr_to_write -= nr_written;
11c8349b
CM
3758}
3759
d1310b2e 3760/*
40f76580
CM
3761 * helper for __extent_writepage, doing all of the delayed allocation setup.
3762 *
5eaad97a 3763 * This returns 1 if btrfs_run_delalloc_range() did all the work required
40f76580
CM
3764 * to write the page (copy into inline extent). In this case the IO has
3765 * been started and the page is already unlocked.
3766 *
3767 * This returns 0 if all went well (page still locked)
3768 * This returns < 0 if there were errors (page still locked)
d1310b2e 3769 */
cd4c0bf9 3770static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
8cc0237a 3771 struct page *page, struct writeback_control *wbc,
cf3075fb 3772 unsigned long *nr_written)
40f76580 3773{
cf3075fb 3774 u64 page_end = page_offset(page) + PAGE_SIZE - 1;
3522e903 3775 bool found;
cf3075fb 3776 u64 delalloc_start = page_offset(page);
40f76580
CM
3777 u64 delalloc_to_write = 0;
3778 u64 delalloc_end = 0;
3779 int ret;
3780 int page_started = 0;
3781
40f76580
CM
3782
3783 while (delalloc_end < page_end) {
cd4c0bf9 3784 found = find_lock_delalloc_range(&inode->vfs_inode, page,
40f76580 3785 &delalloc_start,
917aacec 3786 &delalloc_end);
3522e903 3787 if (!found) {
40f76580
CM
3788 delalloc_start = delalloc_end + 1;
3789 continue;
3790 }
cd4c0bf9 3791 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
5eaad97a 3792 delalloc_end, &page_started, nr_written, wbc);
40f76580 3793 if (ret) {
963e4db8
QW
3794 btrfs_page_set_error(inode->root->fs_info, page,
3795 page_offset(page), PAGE_SIZE);
7361b4ae 3796 return ret;
40f76580
CM
3797 }
3798 /*
ea1754a0
KS
3799 * delalloc_end is already one less than the total length, so
3800 * we don't subtract one from PAGE_SIZE
40f76580
CM
3801 */
3802 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 3803 PAGE_SIZE) >> PAGE_SHIFT;
40f76580
CM
3804 delalloc_start = delalloc_end + 1;
3805 }
3806 if (wbc->nr_to_write < delalloc_to_write) {
3807 int thresh = 8192;
3808
3809 if (delalloc_to_write < thresh * 2)
3810 thresh = delalloc_to_write;
3811 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3812 thresh);
3813 }
3814
3815 /*
3816 * Did the delalloc fill function already unlock and start the IO?
3817 */
3818 if (page_started) {
3819 /*
3820 * we've unlocked the page, so we can't update
3821 * the mapping's writeback index, just update
3822 * nr_to_write.
3823 */
3824 wbc->nr_to_write -= *nr_written;
3825 return 1;
3826 }
3827
b69d1ee9 3828 return 0;
40f76580
CM
3829}
3830
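/*
 * Hedged illustration only: the return-code contract documented above.
 * 1 means the delalloc hook already wrote and unlocked the page, 0 means the
 * caller continues with __extent_writepage_io(), <0 is an error with the page
 * still locked.  The example_* wrapper is hypothetical.
 */
static int example_handle_delalloc(struct btrfs_inode *inode, struct page *page,
				   struct writeback_control *wbc,
				   unsigned long *nr_written)
{
	int ret = writepage_delalloc(inode, page, wbc, nr_written);

	if (ret == 1)
		return 0;	/* IO already started, page already unlocked */
	if (ret < 0)
		return ret;	/* error, the caller still owns the page lock */
	return 0;		/* proceed to map extents and submit bios */
}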
c5ef5c6c
QW
3831/*
3832 * Find the first byte we need to write.
3833 *
3834 * For subpage, one page can contain several sectors, and
3835 * __extent_writepage_io() will just grab all extent maps in the page
3836 * range and try to submit all non-inline/non-compressed extents.
3837 *
3838 * This is a big problem for subpage, as we shouldn't re-submit already
3839 * written data at all.
3840 * This function will look up the subpage dirty bitmap to find which range
3841 * we really need to submit.
3842 *
3843 * Return the next dirty range in [@start, @end).
3844 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
3845 */
3846static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
3847 struct page *page, u64 *start, u64 *end)
3848{
3849 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
72a69cd0 3850 struct btrfs_subpage_info *spi = fs_info->subpage_info;
c5ef5c6c
QW
3851 u64 orig_start = *start;
3852 /* Declare as unsigned long so we can use bitmap ops */
c5ef5c6c 3853 unsigned long flags;
72a69cd0 3854 int range_start_bit;
c5ef5c6c
QW
3855 int range_end_bit;
3856
3857 /*
3858 * For regular sector size == page size case, since one page only
3859 * contains one sector, we return the page offset directly.
3860 */
3861 if (fs_info->sectorsize == PAGE_SIZE) {
3862 *start = page_offset(page);
3863 *end = page_offset(page) + PAGE_SIZE;
3864 return;
3865 }
3866
72a69cd0
QW
3867 range_start_bit = spi->dirty_offset +
3868 (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
3869
c5ef5c6c
QW
3870 /* We should have the page locked, but just in case */
3871 spin_lock_irqsave(&subpage->lock, flags);
72a69cd0
QW
3872 bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
3873 spi->dirty_offset + spi->bitmap_nr_bits);
c5ef5c6c
QW
3874 spin_unlock_irqrestore(&subpage->lock, flags);
3875
72a69cd0
QW
3876 range_start_bit -= spi->dirty_offset;
3877 range_end_bit -= spi->dirty_offset;
3878
c5ef5c6c
QW
3879 *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
3880 *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
3881}
3882
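/*
 * Hedged illustration only: walking every dirty sub-range of one page with
 * find_next_dirty_byte() above.  The example_* helper is hypothetical; the
 * loop shape mirrors the writeback loop in __extent_writepage_io() below.
 */
static void example_walk_dirty_ranges(struct btrfs_fs_info *fs_info,
				      struct page *page)
{
	const u64 page_end = page_offset(page) + PAGE_SIZE;
	u64 cur = page_offset(page);

	while (cur < page_end) {
		u64 dirty_start = cur;
		u64 dirty_end;

		find_next_dirty_byte(fs_info, page, &dirty_start, &dirty_end);
		if (dirty_start >= page_end)
			break;	/* no more dirty sectors in this page */

		/* [dirty_start, dirty_end) would be submitted for writeback. */
		cur = dirty_end;
	}
}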
40f76580
CM
3883/*
3884 * helper for __extent_writepage. This calls the writepage start hooks,
3885 * and does the loop to map the page into extents and bios.
3886 *
3887 * We return 1 if the IO is started and the page is unlocked,
3888 * 0 if all went well (page still locked)
3889 * < 0 if there were errors (page still locked)
3890 */
d4580fe2 3891static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
40f76580
CM
3892 struct page *page,
3893 struct writeback_control *wbc,
3894 struct extent_page_data *epd,
3895 loff_t i_size,
3896 unsigned long nr_written,
57e5ffeb 3897 int *nr_ret)
d1310b2e 3898{
6bc5636a 3899 struct btrfs_fs_info *fs_info = inode->root->fs_info;
a129ffb8
QW
3900 u64 cur = page_offset(page);
3901 u64 end = cur + PAGE_SIZE - 1;
d1310b2e 3902 u64 extent_offset;
d1310b2e 3903 u64 block_start;
d1310b2e 3904 struct extent_map *em;
40f76580
CM
3905 int ret = 0;
3906 int nr = 0;
d8e3fb10 3907 u32 opf = REQ_OP_WRITE;
57e5ffeb 3908 const unsigned int write_flags = wbc_to_write_flags(wbc);
40f76580 3909 bool compressed;
c8b97818 3910
a129ffb8 3911 ret = btrfs_writepage_cow_fixup(page);
d75855b4
NB
3912 if (ret) {
3913 /* Fixup worker will requeue */
5ab58055 3914 redirty_page_for_writepage(wbc, page);
d75855b4
NB
3915 update_nr_written(wbc, nr_written);
3916 unlock_page(page);
3917 return 1;
247e743c
CM
3918 }
3919
11c8349b
CM
3920 /*
3921 * we don't want to touch the inode after unlocking the page,
3922 * so we update the mapping writeback index now
3923 */
3d4b9496 3924 update_nr_written(wbc, nr_written + 1);
771ed689 3925
d1310b2e 3926 while (cur <= end) {
0c64c33c 3927 u64 disk_bytenr;
40f76580 3928 u64 em_end;
c5ef5c6c
QW
3929 u64 dirty_range_start = cur;
3930 u64 dirty_range_end;
6bc5636a 3931 u32 iosize;
58409edd 3932
40f76580 3933 if (cur >= i_size) {
38a39ac7 3934 btrfs_writepage_endio_finish_ordered(inode, page, cur,
25c1252a 3935 end, true);
cc1d0d93
QW
3936 /*
3937 * This range is beyond i_size, thus we don't need to
3938 * bother writing back.
3939 * But we still need to clear the dirty subpage bit, or
3940 * the next time the page gets dirtied, we will try to
3941 * writeback the sectors with subpage dirty bits,
3942 * causing writeback without ordered extent.
3943 */
3944 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
d1310b2e
CM
3945 break;
3946 }
c5ef5c6c
QW
3947
3948 find_next_dirty_byte(fs_info, page, &dirty_range_start,
3949 &dirty_range_end);
3950 if (cur < dirty_range_start) {
3951 cur = dirty_range_start;
3952 continue;
3953 }
3954
d4580fe2 3955 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
c704005d 3956 if (IS_ERR_OR_NULL(em)) {
c5ef5c6c 3957 btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
61391d56 3958 ret = PTR_ERR_OR_ZERO(em);
d1310b2e
CM
3959 break;
3960 }
3961
3962 extent_offset = cur - em->start;
40f76580 3963 em_end = extent_map_end(em);
6bc5636a
QW
3964 ASSERT(cur <= em_end);
3965 ASSERT(cur < end);
3966 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
3967 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
d1310b2e 3968 block_start = em->block_start;
c8b97818 3969 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6bc5636a
QW
3970 disk_bytenr = em->block_start + extent_offset;
3971
c5ef5c6c
QW
3972 /*
3973 * Note that em_end from extent_map_end() and dirty_range_end from
3974 * find_next_dirty_byte() are all exclusive
3975 */
3976 iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
d8e3fb10 3977
e380adfc 3978 if (btrfs_use_zone_append(inode, em->block_start))
d8e3fb10
NA
3979 opf = REQ_OP_ZONE_APPEND;
3980
d1310b2e
CM
3981 free_extent_map(em);
3982 em = NULL;
3983
c8b97818
CM
3984 /*
3985 * compressed and inline extents are written through other
3986 * paths in the FS
3987 */
3988 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 3989 block_start == EXTENT_MAP_INLINE) {
c8b04030 3990 if (compressed)
c8b97818 3991 nr++;
c8b04030 3992 else
38a39ac7 3993 btrfs_writepage_endio_finish_ordered(inode,
25c1252a 3994 page, cur, cur + iosize - 1, true);
cc1d0d93 3995 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
c8b97818 3996 cur += iosize;
d1310b2e
CM
3997 continue;
3998 }
c8b97818 3999
d2a91064 4000 btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
58409edd 4001 if (!PageWriteback(page)) {
d4580fe2 4002 btrfs_err(inode->root->fs_info,
58409edd
DS
4003 "page %lu not writeback, cur %llu end %llu",
4004 page->index, cur, end);
d1310b2e 4005 }
7f3c74fb 4006
c5ef5c6c
QW
4007 /*
4008 * Although the PageDirty bit is cleared before entering this
4009 * function, subpage dirty bit is not cleared.
4010 * So clear subpage dirty bit here so next time we won't submit
4011 * page for range already written to disk.
4012 */
4013 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
4014
390ed29b
QW
4015 ret = submit_extent_page(opf | write_flags, wbc,
4016 &epd->bio_ctrl, page,
d8e3fb10 4017 disk_bytenr, iosize,
390ed29b 4018 cur - page_offset(page),
58409edd 4019 end_bio_extent_writepage,
390ed29b 4020 0, 0, false);
fe01aa65 4021 if (ret) {
c5ef5c6c 4022 btrfs_page_set_error(fs_info, page, cur, iosize);
fe01aa65 4023 if (PageWriteback(page))
c5ef5c6c
QW
4024 btrfs_page_clear_writeback(fs_info, page, cur,
4025 iosize);
fe01aa65 4026 }
d1310b2e 4027
6bc5636a 4028 cur += iosize;
d1310b2e
CM
4029 nr++;
4030 }
cc1d0d93
QW
4031 /*
4032 * If we finish without problem, we should not only clear page dirty,
4033 * but also empty subpage dirty bits
4034 */
4035 if (!ret)
4036 btrfs_page_assert_not_dirty(fs_info, page);
40f76580 4037 *nr_ret = nr;
40f76580
CM
4038 return ret;
4039}
4040
4041/*
4042 * The writepage semantics are similar to regular writepage. Extent
4043 * records are inserted to lock ranges in the tree, and as dirty areas
4044 * are found, they are marked writeback. Then the lock bits are removed
4045 * and the end_io handler clears the writeback ranges.
3065976b
QW
4046 *
4047 * Return 0 if everything goes well.
4048 * Return <0 for error.
40f76580
CM
4049 */
4050static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 4051 struct extent_page_data *epd)
40f76580
CM
4052{
4053 struct inode *inode = page->mapping->host;
cf3075fb
QW
4054 const u64 page_start = page_offset(page);
4055 const u64 page_end = page_start + PAGE_SIZE - 1;
40f76580
CM
4056 int ret;
4057 int nr = 0;
eb70d222 4058 size_t pg_offset;
40f76580 4059 loff_t i_size = i_size_read(inode);
09cbfeaf 4060 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580
CM
4061 unsigned long nr_written = 0;
4062
40f76580
CM
4063 trace___extent_writepage(page, inode, wbc);
4064
4065 WARN_ON(!PageLocked(page));
4066
963e4db8
QW
4067 btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
4068 page_offset(page), PAGE_SIZE);
40f76580 4069
7073017a 4070 pg_offset = offset_in_page(i_size);
40f76580
CM
4071 if (page->index > end_index ||
4072 (page->index == end_index && !pg_offset)) {
09cbfeaf 4073 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
40f76580
CM
4074 unlock_page(page);
4075 return 0;
4076 }
4077
4078 if (page->index == end_index) {
d048b9c2 4079 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
40f76580
CM
4080 flush_dcache_page(page);
4081 }
4082
32443de3
QW
4083 ret = set_page_extent_mapped(page);
4084 if (ret < 0) {
4085 SetPageError(page);
4086 goto done;
4087 }
40f76580 4088
7789a55a 4089 if (!epd->extent_locked) {
cf3075fb 4090 ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
7789a55a 4091 if (ret == 1)
169d2c87 4092 return 0;
7789a55a
NB
4093 if (ret)
4094 goto done;
4095 }
40f76580 4096
d4580fe2
NB
4097 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
4098 nr_written, &nr);
40f76580 4099 if (ret == 1)
169d2c87 4100 return 0;
40f76580 4101
d1310b2e
CM
4102done:
4103 if (nr == 0) {
4104 /* make sure the mapping tag for page dirty gets cleared */
4105 set_page_writeback(page);
4106 end_page_writeback(page);
4107 }
963e4db8
QW
4108 /*
4109 * Here we used to have a check for PageError() and then set @ret and
4110 * call end_extent_writepage().
4111 *
4112 * But in fact setting @ret here will cause different error paths
4113 * between subpage and regular sectorsize.
4114 *
4115 * For regular page size, we never submit current page, but only add
4116 * current page to current bio.
4117 * The bio submission can only happen in next page.
4118 * Thus if we hit the PageError() branch, @ret is already set to
4119 * non-zero value and will not get updated for regular sectorsize.
4120 *
4121 * But for subpage case, it's possible we submit part of current page,
4122 * thus can get PageError() set by submitted bio of the same page,
4123 * while our @ret is still 0.
4124 *
4125 * So here we unify the behavior and don't set @ret.
4126 * Error can still be properly passed to higher layer as page will
4127 * be set error, here we just don't handle the IO failure.
4128 *
4129 * NOTE: This is just a hotfix for subpage.
4130 * The root fix will be properly ending ordered extent when we hit
4131 * an error during writeback.
4132 *
4133 * But that needs a bigger refactoring, as we not only need to grab the
4134 * submitted OE, but also need to know exactly at which bytenr we hit
4135 * the error.
4136 * Currently the full page based __extent_writepage_io() is not
4137 * capable of that.
4138 */
4139 if (PageError(page))
cf3075fb 4140 end_extent_writepage(page, ret, page_start, page_end);
d1310b2e 4141 unlock_page(page);
3065976b 4142 ASSERT(ret <= 0);
40f76580 4143 return ret;
d1310b2e
CM
4144}
4145
fd8b2b61 4146void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 4147{
74316201
N
4148 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
4149 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
4150}
4151
18dfa711
FM
4152static void end_extent_buffer_writeback(struct extent_buffer *eb)
4153{
be1a1d7a
NA
4154 if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
4155 btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
4156
18dfa711
FM
4157 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4158 smp_mb__after_atomic();
4159 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
4160}
4161
2e3c2513 4162/*
a3efb2f0 4163 * Lock extent buffer status and pages for writeback.
2e3c2513 4164 *
a3efb2f0
QW
4165 * May try to flush write bio if we can't get the lock.
4166 *
4167 * Return 0 if the extent buffer doesn't need to be submitted.
4168 * (E.g. the extent buffer is not dirty)
4169 * Return >0 if the extent buffer is submitted to bio.
4170 * Return <0 if something went wrong, no page is locked.
2e3c2513 4171 */
9df76fb5 4172static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 4173 struct extent_page_data *epd)
0b32f4bb 4174{
9df76fb5 4175 struct btrfs_fs_info *fs_info = eb->fs_info;
2e3c2513 4176 int i, num_pages, failed_page_nr;
0b32f4bb
JB
4177 int flush = 0;
4178 int ret = 0;
4179
4180 if (!btrfs_try_tree_write_lock(eb)) {
f4340622 4181 ret = flush_write_bio(epd);
2e3c2513
QW
4182 if (ret < 0)
4183 return ret;
4184 flush = 1;
0b32f4bb
JB
4185 btrfs_tree_lock(eb);
4186 }
4187
4188 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
4189 btrfs_tree_unlock(eb);
4190 if (!epd->sync_io)
4191 return 0;
4192 if (!flush) {
f4340622 4193 ret = flush_write_bio(epd);
2e3c2513
QW
4194 if (ret < 0)
4195 return ret;
0b32f4bb
JB
4196 flush = 1;
4197 }
a098d8e8
CM
4198 while (1) {
4199 wait_on_extent_buffer_writeback(eb);
4200 btrfs_tree_lock(eb);
4201 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
4202 break;
0b32f4bb 4203 btrfs_tree_unlock(eb);
0b32f4bb
JB
4204 }
4205 }
4206
51561ffe
JB
4207 /*
4208 * We need to do this to prevent races in people who check if the eb is
4209 * under IO since we can end up having no IO bits set for a short period
4210 * of time.
4211 */
4212 spin_lock(&eb->refs_lock);
0b32f4bb
JB
4213 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4214 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 4215 spin_unlock(&eb->refs_lock);
0b32f4bb 4216 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
4217 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4218 -eb->len,
4219 fs_info->dirty_metadata_batch);
0b32f4bb 4220 ret = 1;
51561ffe
JB
4221 } else {
4222 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
4223 }
4224
4225 btrfs_tree_unlock(eb);
4226
f3156df9
QW
4227 /*
4228 * Either we don't need to submit any tree block, or we're submitting
4229 * subpage eb.
4230 * Subpage metadata doesn't use page locking at all, so we can skip
4231 * the page locking.
4232 */
4233 if (!ret || fs_info->sectorsize < PAGE_SIZE)
0b32f4bb
JB
4234 return ret;
4235
65ad0104 4236 num_pages = num_extent_pages(eb);
0b32f4bb 4237 for (i = 0; i < num_pages; i++) {
fb85fc9a 4238 struct page *p = eb->pages[i];
0b32f4bb
JB
4239
4240 if (!trylock_page(p)) {
4241 if (!flush) {
18dfa711
FM
4242 int err;
4243
4244 err = flush_write_bio(epd);
4245 if (err < 0) {
4246 ret = err;
2e3c2513
QW
4247 failed_page_nr = i;
4248 goto err_unlock;
4249 }
0b32f4bb
JB
4250 flush = 1;
4251 }
4252 lock_page(p);
4253 }
4254 }
4255
4256 return ret;
2e3c2513
QW
4257err_unlock:
4258 /* Unlock already locked pages */
4259 for (i = 0; i < failed_page_nr; i++)
4260 unlock_page(eb->pages[i]);
18dfa711
FM
4261 /*
4262 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
4263 * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
4264 * be made and undo everything done before.
4265 */
4266 btrfs_tree_lock(eb);
4267 spin_lock(&eb->refs_lock);
4268 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4269 end_extent_buffer_writeback(eb);
4270 spin_unlock(&eb->refs_lock);
4271 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
4272 fs_info->dirty_metadata_batch);
4273 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4274 btrfs_tree_unlock(eb);
2e3c2513 4275 return ret;
0b32f4bb
JB
4276}
4277
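/*
 * Hedged illustration only: the return-value contract of
 * lock_extent_buffer_for_io() above.  0 means the eb is clean and only needs
 * its reference dropped, >0 means it is now marked for writeback and must be
 * submitted (e.g. via write_one_eb() further below), <0 is an error with
 * nothing left locked.  The example_* helper is hypothetical.
 */
static int example_classify_eb_lock(struct extent_buffer *eb,
				    struct extent_page_data *epd)
{
	int ret = lock_extent_buffer_for_io(eb, epd);

	if (ret == 0) {
		/* Clean eb, nothing to write, just drop our reference. */
		free_extent_buffer(eb);
		return 0;
	}
	if (ret < 0) {
		/* Flushing the pending write bio failed. */
		free_extent_buffer(eb);
		return ret;
	}
	/* ret > 0: eb is marked EXTENT_BUFFER_WRITEBACK, submit it now. */
	return 1;
}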
5a2c6075 4278static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
656f30db 4279{
5a2c6075 4280 struct btrfs_fs_info *fs_info = eb->fs_info;
656f30db 4281
5a2c6075 4282 btrfs_page_set_error(fs_info, page, eb->start, eb->len);
656f30db
FM
4283 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4284 return;
4285
eb5b64f1
DZ
4286 /*
4287 * If we error out, we should add back the dirty_metadata_bytes
4288 * to make it consistent.
4289 */
eb5b64f1
DZ
4290 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4291 eb->len, fs_info->dirty_metadata_batch);
4292
656f30db
FM
4293 /*
4294 * If writeback for a btree extent that doesn't belong to a log tree
4295 * failed, increment the counter transaction->eb_write_errors.
4296 * We do this because while the transaction is running and before it's
4297 * committing (when we call filemap_fdata[write|wait]_range against
4298 * the btree inode), we might have
4299 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4300 * returns an error or an error happens during writeback, when we're
4301 * committing the transaction we wouldn't know about it, since the pages
4302 * can be no longer dirty nor marked anymore for writeback (if a
4303 * subsequent modification to the extent buffer didn't happen before the
4304 * transaction commit), which makes filemap_fdata[write|wait]_range not
4305 * able to find the pages tagged with SetPageError at transaction
4306 * commit time. So if this happens we must abort the transaction,
4307 * otherwise we commit a super block with btree roots that point to
4308 * btree nodes/leafs whose content on disk is invalid - either garbage
4309 * or the content of some node/leaf from a past generation that got
4310 * cowed or deleted and is no longer valid.
4311 *
4312 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4313 * not be enough - we need to distinguish between log tree extents vs
4314 * non-log tree extents, and the next filemap_fdatawait_range() call
4315 * will catch and clear such errors in the mapping - and that call might
4316 * be from a log sync and not from a transaction commit. Also, checking
4317 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4318 * not done and would not be reliable - the eb might have been released
4319 * from memory and reading it back again means that flag would not be
4320 * set (since it's a runtime flag, not persisted on disk).
4321 *
4322 * Using the flags below in the btree inode also makes us achieve the
4323 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4324 * writeback for all dirty pages and before filemap_fdatawait_range()
4325 * is called, the writeback for all dirty pages had already finished
4326 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4327 * filemap_fdatawait_range() would return success, as it could not know
4328 * that writeback errors happened (the pages were no longer tagged for
4329 * writeback).
4330 */
4331 switch (eb->log_index) {
4332 case -1:
5a2c6075 4333 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
656f30db
FM
4334 break;
4335 case 0:
5a2c6075 4336 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
656f30db
FM
4337 break;
4338 case 1:
5a2c6075 4339 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
656f30db
FM
4340 break;
4341 default:
4342 BUG(); /* unexpected, logic error */
4343 }
4344}
4345
2f3186d8
QW
4346/*
4347 * The endio specific version which won't touch any unsafe spinlock in endio
4348 * context.
4349 */
4350static struct extent_buffer *find_extent_buffer_nolock(
4351 struct btrfs_fs_info *fs_info, u64 start)
4352{
4353 struct extent_buffer *eb;
4354
4355 rcu_read_lock();
4356 eb = radix_tree_lookup(&fs_info->buffer_radix,
4357 start >> fs_info->sectorsize_bits);
4358 if (eb && atomic_inc_not_zero(&eb->refs)) {
4359 rcu_read_unlock();
4360 return eb;
4361 }
4362 rcu_read_unlock();
4363 return NULL;
4364}
4365
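/*
 * Hedged illustration only: looking up an eb from endio context with the
 * lockless helper above and dropping the extra reference with a plain
 * atomic_dec(), since free_extent_buffer() may take locks that are unsafe in
 * endio context.  The example_* helper is hypothetical; the pattern mirrors
 * end_bio_subpage_eb_writepage() below.
 */
static void example_endio_lookup(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct extent_buffer *eb;

	eb = find_extent_buffer_nolock(fs_info, bytenr);
	if (!eb)
		return;		/* eb already released, nothing to do */

	/* ... inspect/clear eb state that is safe to touch in endio ... */

	/* Plain refcount drop, safe in endio context. */
	atomic_dec(&eb->refs);
}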
4366/*
4367 * The endio function for subpage extent buffer write.
4368 *
4369 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
4371 * after all extent buffers in the page have finished their writeback.
4371 */
fa04c165 4372static void end_bio_subpage_eb_writepage(struct bio *bio)
2f3186d8 4373{
fa04c165 4374 struct btrfs_fs_info *fs_info;
2f3186d8
QW
4375 struct bio_vec *bvec;
4376 struct bvec_iter_all iter_all;
4377
fa04c165
QW
4378 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
4379 ASSERT(fs_info->sectorsize < PAGE_SIZE);
4380
2f3186d8
QW
4381 ASSERT(!bio_flagged(bio, BIO_CLONED));
4382 bio_for_each_segment_all(bvec, bio, iter_all) {
4383 struct page *page = bvec->bv_page;
4384 u64 bvec_start = page_offset(page) + bvec->bv_offset;
4385 u64 bvec_end = bvec_start + bvec->bv_len - 1;
4386 u64 cur_bytenr = bvec_start;
4387
4388 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4389
4390 /* Iterate through all extent buffers in the range */
4391 while (cur_bytenr <= bvec_end) {
4392 struct extent_buffer *eb;
4393 int done;
4394
4395 /*
4396 * Here we can't use find_extent_buffer(), as it may
4397 * try to lock eb->refs_lock, which is not safe in endio
4398 * context.
4399 */
4400 eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4401 ASSERT(eb);
4402
4403 cur_bytenr = eb->start + eb->len;
4404
4405 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4406 done = atomic_dec_and_test(&eb->io_pages);
4407 ASSERT(done);
4408
4409 if (bio->bi_status ||
4410 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4411 ClearPageUptodate(page);
4412 set_btree_ioerr(page, eb);
4413 }
4414
4415 btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4416 eb->len);
4417 end_extent_buffer_writeback(eb);
4418 /*
4419 * free_extent_buffer() will grab spinlock which is not
4420 * safe in endio context. Thus here we manually dec
4421 * the ref.
4422 */
4423 atomic_dec(&eb->refs);
4424 }
4425 }
4426 bio_put(bio);
4427}
4428
4246a0b6 4429static void end_bio_extent_buffer_writepage(struct bio *bio)
0b32f4bb 4430{
2c30c71b 4431 struct bio_vec *bvec;
0b32f4bb 4432 struct extent_buffer *eb;
2b070cfe 4433 int done;
6dc4f100 4434 struct bvec_iter_all iter_all;
0b32f4bb 4435
c09abff8 4436 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 4437 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
4438 struct page *page = bvec->bv_page;
4439
0b32f4bb
JB
4440 eb = (struct extent_buffer *)page->private;
4441 BUG_ON(!eb);
4442 done = atomic_dec_and_test(&eb->io_pages);
4443
4e4cbee9 4444 if (bio->bi_status ||
4246a0b6 4445 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 4446 ClearPageUptodate(page);
5a2c6075 4447 set_btree_ioerr(page, eb);
0b32f4bb
JB
4448 }
4449
4450 end_page_writeback(page);
4451
4452 if (!done)
4453 continue;
4454
4455 end_extent_buffer_writeback(eb);
2c30c71b 4456 }
0b32f4bb
JB
4457
4458 bio_put(bio);
0b32f4bb
JB
4459}
4460
fa04c165
QW
4461static void prepare_eb_write(struct extent_buffer *eb)
4462{
4463 u32 nritems;
4464 unsigned long start;
4465 unsigned long end;
4466
4467 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4468 atomic_set(&eb->io_pages, num_extent_pages(eb));
4469
4470 /* Set btree blocks beyond nritems with 0 to avoid stale content */
4471 nritems = btrfs_header_nritems(eb);
4472 if (btrfs_header_level(eb) > 0) {
4473 end = btrfs_node_key_ptr_offset(nritems);
4474 memzero_extent_buffer(eb, end, eb->len - end);
4475 } else {
4476 /*
4477 * Leaf:
4478 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4479 */
4480 start = btrfs_item_nr_offset(nritems);
4481 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4482 memzero_extent_buffer(eb, start, end - start);
4483 }
4484}
4485
35b6ddfa
QW
4486/*
4487 * Unlike the work in write_one_eb(), we rely completely on extent locking.
4488 * Page locking is only utilized at minimum to keep the VMM code happy.
35b6ddfa
QW
4489 */
4490static int write_one_subpage_eb(struct extent_buffer *eb,
4491 struct writeback_control *wbc,
4492 struct extent_page_data *epd)
4493{
4494 struct btrfs_fs_info *fs_info = eb->fs_info;
4495 struct page *page = eb->pages[0];
4496 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
4497 bool no_dirty_ebs = false;
4498 int ret;
4499
fa04c165
QW
4500 prepare_eb_write(eb);
4501
35b6ddfa
QW
4502 /* clear_page_dirty_for_io() in subpage helper needs page locked */
4503 lock_page(page);
4504 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4505
4506 /* Check if this is the last dirty bit to update nr_written */
4507 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4508 eb->start, eb->len);
4509 if (no_dirty_ebs)
4510 clear_page_dirty_for_io(page);
4511
390ed29b
QW
4512 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4513 &epd->bio_ctrl, page, eb->start, eb->len,
4514 eb->start - page_offset(page),
fa04c165 4515 end_bio_subpage_eb_writepage, 0, 0, false);
35b6ddfa
QW
4516 if (ret) {
4517 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4518 set_btree_ioerr(page, eb);
4519 unlock_page(page);
4520
4521 if (atomic_dec_and_test(&eb->io_pages))
4522 end_extent_buffer_writeback(eb);
4523 return -EIO;
4524 }
4525 unlock_page(page);
4526 /*
4527 * Submission finished without problem, if no range of the page is
4528 * dirty anymore, we have submitted a page. Update nr_written in wbc.
4529 */
4530 if (no_dirty_ebs)
4531 update_nr_written(wbc, 1);
4532 return ret;
4533}
4534
0e378df1 4535static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
4536 struct writeback_control *wbc,
4537 struct extent_page_data *epd)
4538{
0c64c33c 4539 u64 disk_bytenr = eb->start;
cc5e31a4 4540 int i, num_pages;
ff40adf7 4541 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
d7dbe9e7 4542 int ret = 0;
0b32f4bb 4543
fa04c165 4544 prepare_eb_write(eb);
35b6ddfa 4545
fa04c165 4546 num_pages = num_extent_pages(eb);
0b32f4bb 4547 for (i = 0; i < num_pages; i++) {
fb85fc9a 4548 struct page *p = eb->pages[i];
0b32f4bb
JB
4549
4550 clear_page_dirty_for_io(p);
4551 set_page_writeback(p);
0ceb34bf 4552 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
390ed29b
QW
4553 &epd->bio_ctrl, p, disk_bytenr,
4554 PAGE_SIZE, 0,
1f7ad75b 4555 end_bio_extent_buffer_writepage,
390ed29b 4556 0, 0, false);
0b32f4bb 4557 if (ret) {
5a2c6075 4558 set_btree_ioerr(p, eb);
fe01aa65
TK
4559 if (PageWriteback(p))
4560 end_page_writeback(p);
0b32f4bb
JB
4561 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4562 end_extent_buffer_writeback(eb);
4563 ret = -EIO;
4564 break;
4565 }
0c64c33c 4566 disk_bytenr += PAGE_SIZE;
3d4b9496 4567 update_nr_written(wbc, 1);
0b32f4bb
JB
4568 unlock_page(p);
4569 }
4570
4571 if (unlikely(ret)) {
4572 for (; i < num_pages; i++) {
bbf65cf0 4573 struct page *p = eb->pages[i];
81465028 4574 clear_page_dirty_for_io(p);
0b32f4bb
JB
4575 unlock_page(p);
4576 }
4577 }
4578
4579 return ret;
4580}
4581
c4aec299
QW
4582/*
4583 * Submit one subpage btree page.
4584 *
4585 * The main difference to submit_eb_page() is:
4586 * - Page locking
4587 * For subpage, we don't rely on page locking at all.
4588 *
4589 * - Flush write bio
4590 * We only flush bio if we may be unable to fit current extent buffers into
4591 * current bio.
4592 *
4593 * Return >=0 for the number of submitted extent buffers.
4594 * Return <0 for fatal error.
4595 */
4596static int submit_eb_subpage(struct page *page,
4597 struct writeback_control *wbc,
4598 struct extent_page_data *epd)
4599{
4600 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4601 int submitted = 0;
4602 u64 page_start = page_offset(page);
4603 int bit_start = 0;
c4aec299
QW
4604 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4605 int ret;
4606
4607 /* Lock and write each dirty extent buffers in the range */
72a69cd0 4608 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
c4aec299
QW
4609 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4610 struct extent_buffer *eb;
4611 unsigned long flags;
4612 u64 start;
4613
4614 /*
4615 * Take private lock to ensure the subpage won't be detached
4616 * in the meantime.
4617 */
4618 spin_lock(&page->mapping->private_lock);
4619 if (!PagePrivate(page)) {
4620 spin_unlock(&page->mapping->private_lock);
4621 break;
4622 }
4623 spin_lock_irqsave(&subpage->lock, flags);
72a69cd0
QW
4624 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
4625 subpage->bitmaps)) {
c4aec299
QW
4626 spin_unlock_irqrestore(&subpage->lock, flags);
4627 spin_unlock(&page->mapping->private_lock);
4628 bit_start++;
4629 continue;
4630 }
4631
4632 start = page_start + bit_start * fs_info->sectorsize;
4633 bit_start += sectors_per_node;
4634
4635 /*
4636 * Here we just want to grab the eb without touching extra
4637 * spin locks, so call find_extent_buffer_nolock().
4638 */
4639 eb = find_extent_buffer_nolock(fs_info, start);
4640 spin_unlock_irqrestore(&subpage->lock, flags);
4641 spin_unlock(&page->mapping->private_lock);
4642
4643 /*
4644 * The eb has already reached 0 refs thus find_extent_buffer()
4645 * doesn't return it. We don't need to write back such eb
4646 * anyway.
4647 */
4648 if (!eb)
4649 continue;
4650
4651 ret = lock_extent_buffer_for_io(eb, epd);
4652 if (ret == 0) {
4653 free_extent_buffer(eb);
4654 continue;
4655 }
4656 if (ret < 0) {
4657 free_extent_buffer(eb);
4658 goto cleanup;
4659 }
fa04c165 4660 ret = write_one_subpage_eb(eb, wbc, epd);
c4aec299
QW
4661 free_extent_buffer(eb);
4662 if (ret < 0)
4663 goto cleanup;
4664 submitted++;
4665 }
4666 return submitted;
4667
4668cleanup:
4669 /* We hit error, end bio for the submitted extent buffers */
4670 end_write_bio(epd, ret);
4671 return ret;
4672}
4673
f91e0d0c
QW
4674/*
4675 * Submit all page(s) of one extent buffer.
4676 *
4677 * @page: the page of one extent buffer
4678 * @eb_context: to determine if we need to submit this page; if the current
4679 * page belongs to this eb, we don't need to submit it
4680 *
4681 * The caller should pass each page in their bytenr order, and here we use
4682 * @eb_context to determine if we have submitted pages of one extent buffer.
4683 *
4684 * If we have, we just skip until we hit a new page that doesn't belong to
4685 * current @eb_context.
4686 *
4687 * If not, we submit all the page(s) of the extent buffer.
4688 *
4689 * Return >0 if we have submitted the extent buffer successfully.
4690 * Return 0 if we don't need to submit the page, as it's already submitted by
4691 * previous call.
4692 * Return <0 for fatal error.
4693 */
4694static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4695 struct extent_page_data *epd,
4696 struct extent_buffer **eb_context)
4697{
4698 struct address_space *mapping = page->mapping;
0bc09ca1 4699 struct btrfs_block_group *cache = NULL;
f91e0d0c
QW
4700 struct extent_buffer *eb;
4701 int ret;
4702
4703 if (!PagePrivate(page))
4704 return 0;
4705
c4aec299
QW
4706 if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
4707 return submit_eb_subpage(page, wbc, epd);
4708
f91e0d0c
QW
4709 spin_lock(&mapping->private_lock);
4710 if (!PagePrivate(page)) {
4711 spin_unlock(&mapping->private_lock);
4712 return 0;
4713 }
4714
4715 eb = (struct extent_buffer *)page->private;
4716
4717 /*
4718 * Shouldn't happen and normally this would be a BUG_ON but no point
4719 * crashing the machine for something we can survive anyway.
4720 */
4721 if (WARN_ON(!eb)) {
4722 spin_unlock(&mapping->private_lock);
4723 return 0;
4724 }
4725
4726 if (eb == *eb_context) {
4727 spin_unlock(&mapping->private_lock);
4728 return 0;
4729 }
4730 ret = atomic_inc_not_zero(&eb->refs);
4731 spin_unlock(&mapping->private_lock);
4732 if (!ret)
4733 return 0;
4734
0bc09ca1
NA
4735 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4736 /*
4737 * If for_sync, this hole will be filled with
4738 * trasnsaction commit.
4739 */
4740 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4741 ret = -EAGAIN;
4742 else
4743 ret = 0;
4744 free_extent_buffer(eb);
4745 return ret;
4746 }
4747
f91e0d0c
QW
4748 *eb_context = eb;
4749
4750 ret = lock_extent_buffer_for_io(eb, epd);
4751 if (ret <= 0) {
0bc09ca1
NA
4752 btrfs_revert_meta_write_pointer(cache, eb);
4753 if (cache)
4754 btrfs_put_block_group(cache);
f91e0d0c
QW
4755 free_extent_buffer(eb);
4756 return ret;
4757 }
be1a1d7a
NA
4758 if (cache) {
4759 /* Implies write in zoned mode */
0bc09ca1 4760 btrfs_put_block_group(cache);
be1a1d7a
NA
4761 /* Mark the last eb in a block group */
4762 if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
4763 set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
4764 }
f91e0d0c
QW
4765 ret = write_one_eb(eb, wbc, epd);
4766 free_extent_buffer(eb);
4767 if (ret < 0)
4768 return ret;
4769 return 1;
4770}
4771
0b32f4bb
JB
4772int btree_write_cache_pages(struct address_space *mapping,
4773 struct writeback_control *wbc)
4774{
f91e0d0c 4775 struct extent_buffer *eb_context = NULL;
0b32f4bb 4776 struct extent_page_data epd = {
390ed29b 4777 .bio_ctrl = { 0 },
0b32f4bb
JB
4778 .extent_locked = 0,
4779 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4780 };
b3ff8f1d 4781 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
0b32f4bb
JB
4782 int ret = 0;
4783 int done = 0;
4784 int nr_to_write_done = 0;
4785 struct pagevec pvec;
4786 int nr_pages;
4787 pgoff_t index;
4788 pgoff_t end; /* Inclusive */
4789 int scanned = 0;
10bbd235 4790 xa_mark_t tag;
0b32f4bb 4791
86679820 4792 pagevec_init(&pvec);
0b32f4bb
JB
4793 if (wbc->range_cyclic) {
4794 index = mapping->writeback_index; /* Start from prev offset */
4795 end = -1;
556755a8
JB
4796 /*
4797 * Start from the beginning does not need to cycle over the
4798 * range, mark it as scanned.
4799 */
4800 scanned = (index == 0);
0b32f4bb 4801 } else {
09cbfeaf
KS
4802 index = wbc->range_start >> PAGE_SHIFT;
4803 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
4804 scanned = 1;
4805 }
4806 if (wbc->sync_mode == WB_SYNC_ALL)
4807 tag = PAGECACHE_TAG_TOWRITE;
4808 else
4809 tag = PAGECACHE_TAG_DIRTY;
0bc09ca1 4810 btrfs_zoned_meta_io_lock(fs_info);
0b32f4bb
JB
4811retry:
4812 if (wbc->sync_mode == WB_SYNC_ALL)
4813 tag_pages_for_writeback(mapping, index, end);
4814 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 4815 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 4816 tag))) {
0b32f4bb
JB
4817 unsigned i;
4818
0b32f4bb
JB
4819 for (i = 0; i < nr_pages; i++) {
4820 struct page *page = pvec.pages[i];
4821
f91e0d0c
QW
4822 ret = submit_eb_page(page, wbc, &epd, &eb_context);
4823 if (ret == 0)
0b32f4bb 4824 continue;
f91e0d0c 4825 if (ret < 0) {
0b32f4bb 4826 done = 1;
0b32f4bb
JB
4827 break;
4828 }
0b32f4bb
JB
4829
4830 /*
4831 * the filesystem may choose to bump up nr_to_write.
4832 * We have to make sure to honor the new nr_to_write
4833 * at any time
4834 */
4835 nr_to_write_done = wbc->nr_to_write <= 0;
4836 }
4837 pagevec_release(&pvec);
4838 cond_resched();
4839 }
4840 if (!scanned && !done) {
4841 /*
4842 * We hit the last page and there is more work to be done: wrap
4843 * back to the start of the file
4844 */
4845 scanned = 1;
4846 index = 0;
4847 goto retry;
4848 }
2b952eea
QW
4849 if (ret < 0) {
4850 end_write_bio(&epd, ret);
0bc09ca1 4851 goto out;
2b952eea 4852 }
b3ff8f1d
QW
4853 /*
4854 * If something went wrong, don't allow any metadata write bio to be
4855 * submitted.
4856 *
4857 * This would prevent use-after-free if we had dirty pages not
4858 * cleaned up, which can still happen with fuzzed images.
4859 *
4860 * - Bad extent tree
4861 * Allowing existing tree block to be allocated for other trees.
4862 *
4863 * - Log tree operations
4864 * Existing tree blocks get allocated to the log tree, which bumps their
4865 * generation; they then get cleaned in tree re-balance.
4866 * Such tree block will not be written back, since it's clean,
4867 * thus no WRITTEN flag set.
4868 * And after log writes back, this tree block is not traced by
4869 * any dirty extent_io_tree.
4870 *
4871 * - Offending tree block gets re-dirtied from its original owner
4872 * Since it has bumped generation, no WRITTEN flag, it can be
4873 * reused without COWing. This tree block will not be traced
4874 * by btrfs_transaction::dirty_pages.
4875 *
4876 * Now such dirty tree block will not be cleaned by any dirty
4877 * extent io tree. Thus we don't want to submit such wild eb
4878 * if the fs already has error.
4879 */
4880 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4881 ret = flush_write_bio(&epd);
4882 } else {
fbabd4a3 4883 ret = -EROFS;
b3ff8f1d
QW
4884 end_write_bio(&epd, ret);
4885 }
0bc09ca1
NA
4886out:
4887 btrfs_zoned_meta_io_unlock(fs_info);
0b32f4bb
JB
4888 return ret;
4889}
4890
d1310b2e 4891/**
3bed2da1
NB
4892 * Walk the list of dirty pages of the given address space and write all of them.
4893 *
d1310b2e 4894 * @mapping: address space structure to write
3bed2da1
NB
4895 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4896 * @epd: holds context for the write, namely the bio
d1310b2e
CM
4897 *
4898 * If a page is already under I/O, write_cache_pages() skips it, even
4899 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4900 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4901 * and msync() need to guarantee that all the data which was dirty at the time
4902 * the call was made get new I/O started against them. If wbc->sync_mode is
4903 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4904 * existing IO to complete.
4905 */
4242b64a 4906static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 4907 struct writeback_control *wbc,
aab6e9ed 4908 struct extent_page_data *epd)
d1310b2e 4909{
7fd1a3f7 4910 struct inode *inode = mapping->host;
d1310b2e
CM
4911 int ret = 0;
4912 int done = 0;
f85d7d6c 4913 int nr_to_write_done = 0;
d1310b2e
CM
4914 struct pagevec pvec;
4915 int nr_pages;
4916 pgoff_t index;
4917 pgoff_t end; /* Inclusive */
a9132667
LB
4918 pgoff_t done_index;
4919 int range_whole = 0;
d1310b2e 4920 int scanned = 0;
10bbd235 4921 xa_mark_t tag;
d1310b2e 4922
7fd1a3f7
JB
4923 /*
4924 * We have to hold onto the inode so that ordered extents can do their
4925 * work when the IO finishes. The alternative to this is failing to add
4926 * an ordered extent if the igrab() fails there and that is a huge pain
4927 * to deal with, so instead just hold onto the inode throughout the
4928 * writepages operation. If it fails here we are freeing up the inode
4929 * anyway and we'd rather not waste our time writing out stuff that is
4930 * going to be truncated anyway.
4931 */
4932 if (!igrab(inode))
4933 return 0;
4934
86679820 4935 pagevec_init(&pvec);
d1310b2e
CM
4936 if (wbc->range_cyclic) {
4937 index = mapping->writeback_index; /* Start from prev offset */
4938 end = -1;
556755a8
JB
4939 /*
4940 * Start from the beginning does not need to cycle over the
4941 * range, mark it as scanned.
4942 */
4943 scanned = (index == 0);
d1310b2e 4944 } else {
09cbfeaf
KS
4945 index = wbc->range_start >> PAGE_SHIFT;
4946 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
4947 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4948 range_whole = 1;
d1310b2e
CM
4949 scanned = 1;
4950 }
3cd24c69
EL
4951
4952 /*
4953 * We do the tagged writepage as long as the snapshot flush bit is set
4954 * and we are the first one who do the filemap_flush() on this inode.
4955 *
4956 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4957 * not race in and drop the bit.
4958 */
4959 if (range_whole && wbc->nr_to_write == LONG_MAX &&
4960 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4961 &BTRFS_I(inode)->runtime_flags))
4962 wbc->tagged_writepages = 1;
4963
4964 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
4965 tag = PAGECACHE_TAG_TOWRITE;
4966 else
4967 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 4968retry:
3cd24c69 4969 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 4970 tag_pages_for_writeback(mapping, index, end);
a9132667 4971 done_index = index;
f85d7d6c 4972 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
4973 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4974 &index, end, tag))) {
d1310b2e
CM
4975 unsigned i;
4976
d1310b2e
CM
4977 for (i = 0; i < nr_pages; i++) {
4978 struct page *page = pvec.pages[i];
4979
f7bddf1e 4980 done_index = page->index + 1;
d1310b2e 4981 /*
b93b0163
MW
4982 * At this point we hold neither the i_pages lock nor
4983 * the page lock: the page may be truncated or
4984 * invalidated (changing page->mapping to NULL),
4985 * or even swizzled back from swapper_space to
4986 * tmpfs file mapping
d1310b2e 4987 */
c8f2f24b 4988 if (!trylock_page(page)) {
f4340622
QW
4989 ret = flush_write_bio(epd);
4990 BUG_ON(ret < 0);
c8f2f24b 4991 lock_page(page);
01d658f2 4992 }
d1310b2e
CM
4993
4994 if (unlikely(page->mapping != mapping)) {
4995 unlock_page(page);
4996 continue;
4997 }
4998
d2c3f4f6 4999 if (wbc->sync_mode != WB_SYNC_NONE) {
f4340622
QW
5000 if (PageWriteback(page)) {
5001 ret = flush_write_bio(epd);
5002 BUG_ON(ret < 0);
5003 }
d1310b2e 5004 wait_on_page_writeback(page);
d2c3f4f6 5005 }
d1310b2e
CM
5006
5007 if (PageWriteback(page) ||
5008 !clear_page_dirty_for_io(page)) {
5009 unlock_page(page);
5010 continue;
5011 }
5012
aab6e9ed 5013 ret = __extent_writepage(page, wbc, epd);
a9132667 5014 if (ret < 0) {
a9132667
LB
5015 done = 1;
5016 break;
5017 }
f85d7d6c
CM
5018
5019 /*
5020 * the filesystem may choose to bump up nr_to_write.
5021 * We have to make sure to honor the new nr_to_write
5022 * at any time
5023 */
5024 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
5025 }
5026 pagevec_release(&pvec);
5027 cond_resched();
5028 }
894b36e3 5029 if (!scanned && !done) {
d1310b2e
CM
5030 /*
5031 * We hit the last page and there is more work to be done: wrap
5032 * back to the start of the file
5033 */
5034 scanned = 1;
5035 index = 0;
42ffb0bf
JB
5036
5037 /*
5038 * If we're looping we could run into a page that is locked by a
5039 * writer and that writer could be waiting on writeback for a
5040 * page in our current bio, and thus deadlock, so flush the
5041 * write bio here.
5042 */
5043 ret = flush_write_bio(epd);
5044 if (!ret)
5045 goto retry;
d1310b2e 5046 }
a9132667
LB
5047
5048 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
5049 mapping->writeback_index = done_index;
5050
7fd1a3f7 5051 btrfs_add_delayed_iput(inode);
894b36e3 5052 return ret;
d1310b2e 5053}
d1310b2e 5054
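/*
 * Write a single dirty page through __extent_writepage() and submit the
 * resulting bio. On error the pending write bio is ended with that error,
 * otherwise it is flushed before returning.
 */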
0a9b0e53 5055int extent_write_full_page(struct page *page, struct writeback_control *wbc)
d1310b2e
CM
5056{
5057 int ret;
d1310b2e 5058 struct extent_page_data epd = {
390ed29b 5059 .bio_ctrl = { 0 },
771ed689 5060 .extent_locked = 0,
ffbd517d 5061 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e 5062 };
d1310b2e 5063
d1310b2e 5064 ret = __extent_writepage(page, wbc, &epd);
3065976b
QW
5065 ASSERT(ret <= 0);
5066 if (ret < 0) {
5067 end_write_bio(&epd, ret);
5068 return ret;
5069 }
d1310b2e 5070
3065976b
QW
5071 ret = flush_write_bio(&epd);
5072 ASSERT(ret <= 0);
d1310b2e
CM
5073 return ret;
5074}
d1310b2e 5075
2bd0fc93
QW
5076/*
5077 * Submit the pages in the range to bio for call sites whose delalloc range
5078 * has already been run (aka, the ordered extent was inserted) and all pages
5079 * are still locked.
5080 */
5081int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
771ed689 5082{
2bd0fc93
QW
5083 bool found_error = false;
5084 int first_error = 0;
771ed689
CM
5085 int ret = 0;
5086 struct address_space *mapping = inode->i_mapping;
5087 struct page *page;
2bd0fc93 5088 u64 cur = start;
09cbfeaf
KS
5089 unsigned long nr_pages = (end - start + PAGE_SIZE) >>
5090 PAGE_SHIFT;
771ed689 5091 struct extent_page_data epd = {
390ed29b 5092 .bio_ctrl = { 0 },
771ed689 5093 .extent_locked = 1,
2bd0fc93 5094 .sync_io = 1,
771ed689
CM
5095 };
5096 struct writeback_control wbc_writepages = {
771ed689 5097 .nr_to_write = nr_pages * 2,
2bd0fc93 5098 .sync_mode = WB_SYNC_ALL,
771ed689
CM
5099 .range_start = start,
5100 .range_end = end + 1,
ec39f769
CM
5101 /* We're called from an async helper function */
5102 .punt_to_cgroup = 1,
5103 .no_cgroup_owner = 1,
771ed689
CM
5104 };
5105
dbb70bec 5106 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
2bd0fc93
QW
5107 while (cur <= end) {
5108 page = find_get_page(mapping, cur >> PAGE_SHIFT);
5109 /*
5110 * All pages in the range are locked since
5111 * btrfs_run_delalloc_range(), thus no one else can clear
5112 * the page dirty flag.
5113 */
5114 ASSERT(PageDirty(page));
5115 clear_page_dirty_for_io(page);
5116 ret = __extent_writepage(page, &wbc_writepages, &epd);
5117 ASSERT(ret <= 0);
5118 if (ret < 0) {
5119 found_error = true;
5120 first_error = ret;
771ed689 5121 }
09cbfeaf 5122 put_page(page);
2bd0fc93 5123 cur += PAGE_SIZE;
771ed689
CM
5124 }
5125
2bd0fc93 5126 if (!found_error)
dbb70bec
CM
5127 ret = flush_write_bio(&epd);
5128 else
02c6db4f 5129 end_write_bio(&epd, ret);
dbb70bec
CM
5130
5131 wbc_detach_inode(&wbc_writepages);
2bd0fc93
QW
5132 if (found_error)
5133 return first_error;
771ed689
CM
5134 return ret;
5135}
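/*
 * A minimal usage sketch (illustrative only; the caller and variable names
 * below are assumptions, not taken from this file): a delalloc submission
 * helper that already created the ordered extent for [start, end] and still
 * holds the locked pages could call:
 *
 *	ret = extent_write_locked_range(&inode->vfs_inode, start, end);
 *	if (ret < 0)
 *		mapping_set_error(inode->vfs_inode.i_mapping, ret);
 *
 * where 'inode' is assumed to be a struct btrfs_inode. The pages are
 * unlocked by the writepage path as they are submitted.
 */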
d1310b2e 5136
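/*
 * Writepages address space operation for data inodes: write back the dirty
 * pages of @mapping via extent_write_cache_pages(). On zoned filesystems
 * the data relocation inode is serialized with the inode lock so that only
 * one thread updates the write pointer at a time.
 */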
8ae225a8 5137int extent_writepages(struct address_space *mapping,
d1310b2e
CM
5138 struct writeback_control *wbc)
5139{
35156d85
JT
5140 struct inode *inode = mapping->host;
5141 const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
5142 const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
d1310b2e
CM
5143 int ret = 0;
5144 struct extent_page_data epd = {
390ed29b 5145 .bio_ctrl = { 0 },
771ed689 5146 .extent_locked = 0,
ffbd517d 5147 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
5148 };
5149
35156d85
JT
5150 /*
5151 * Allow only a single thread to do the reloc work in zoned mode to
5152 * protect the write pointer updates.
5153 */
5154 if (data_reloc && zoned)
5155 btrfs_inode_lock(inode, 0);
935db853 5156 ret = extent_write_cache_pages(mapping, wbc, &epd);
35156d85
JT
5157 if (data_reloc && zoned)
5158 btrfs_inode_unlock(inode, 0);
a2a72fbd
QW
5159 ASSERT(ret <= 0);
5160 if (ret < 0) {
5161 end_write_bio(&epd, ret);
5162 return ret;
5163 }
5164 ret = flush_write_bio(&epd);
d1310b2e
CM
5165 return ret;
5166}
d1310b2e 5167
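/*
 * Readahead address space operation: take pages from the readahead control
 * in batches and submit reads for each contiguous range, reusing a cached
 * extent map across pages where possible.
 */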
ba206a02 5168void extent_readahead(struct readahead_control *rac)
d1310b2e 5169{
390ed29b 5170 struct btrfs_bio_ctrl bio_ctrl = { 0 };
67c9684f 5171 struct page *pagepool[16];
125bac01 5172 struct extent_map *em_cached = NULL;
808f80b4 5173 u64 prev_em_start = (u64)-1;
ba206a02 5174 int nr;
d1310b2e 5175
ba206a02 5176 while ((nr = readahead_page_batch(rac, pagepool))) {
32c0a6bc
MWO
5177 u64 contig_start = readahead_pos(rac);
5178 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
e65ef21e 5179
ba206a02 5180 contiguous_readpages(pagepool, nr, contig_start, contig_end,
390ed29b 5181 &em_cached, &bio_ctrl, &prev_em_start);
d1310b2e 5182 }
67c9684f 5183
125bac01
MX
5184 if (em_cached)
5185 free_extent_map(em_cached);
5186
390ed29b
QW
5187 if (bio_ctrl.bio) {
5188 if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
ba206a02
MWO
5189 return;
5190 }
d1310b2e 5191}
d1310b2e
CM
5192
5193/*
5194 * Basic invalidatepage code. This waits on any locked or writeback
5195 * ranges corresponding to the page, and then deletes any extent state
5196 * records from the tree.
5197 */
5198int extent_invalidatepage(struct extent_io_tree *tree,
5199 struct page *page, unsigned long offset)
5200{
2ac55d41 5201 struct extent_state *cached_state = NULL;
4eee4fa4 5202 u64 start = page_offset(page);
09cbfeaf 5203 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
5204 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
5205
829ddec9
QW
5206 /* This function is only called for the btree inode */
5207 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
5208
fda2832f 5209 start += ALIGN(offset, blocksize);
d1310b2e
CM
5210 if (start > end)
5211 return 0;
5212
ff13db41 5213 lock_extent_bits(tree, start, end, &cached_state);
1edbb734 5214 wait_on_page_writeback(page);
829ddec9
QW
5215
5216 /*
5217 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
5218 * so here we only need to unlock the extent range to free any
5219 * existing extent state.
5220 */
5221 unlock_extent_cached(tree, start, end, &cached_state);
d1310b2e
CM
5222 return 0;
5223}
d1310b2e 5224
7b13b7b1
CM
5225/*
5226 * A helper for releasepage. This tests for areas of the page that
5227 * are locked or under IO and drops the related state bits if it is safe
5228 * to drop the page.
5229 */
29c68b2d 5230static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 5231 struct page *page, gfp_t mask)
7b13b7b1 5232{
4eee4fa4 5233 u64 start = page_offset(page);
09cbfeaf 5234 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
5235 int ret = 1;
5236
8882679e 5237 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 5238 ret = 0;
8882679e 5239 } else {
11ef160f 5240 /*
2766ff61
FM
5241 * At this point we can safely clear everything except the
5242 * locked bit, the nodatasum bit and the delalloc new bit.
5243 * The delalloc new bit will be cleared by ordered extent
5244 * completion.
11ef160f 5245 */
66b0c887 5246 ret = __clear_extent_bit(tree, start, end,
2766ff61
FM
5247 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5248 0, 0, NULL, mask, NULL);
e3f24cc5
CM
5249
5250 /* if clear_extent_bit failed for enomem reasons,
5251 * we can't allow the release to continue.
5252 */
5253 if (ret < 0)
5254 ret = 0;
5255 else
5256 ret = 1;
7b13b7b1
CM
5257 }
5258 return ret;
5259}
7b13b7b1 5260
d1310b2e
CM
5261/*
5262 * A helper for releasepage. As long as there are no locked extents
5263 * in the range corresponding to the page, both state records and extent
5264 * map records are removed.
5265 */
477a30ba 5266int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
5267{
5268 struct extent_map *em;
4eee4fa4 5269 u64 start = page_offset(page);
09cbfeaf 5270 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
5271 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5272 struct extent_io_tree *tree = &btrfs_inode->io_tree;
5273 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 5274
d0164adc 5275 if (gfpflags_allow_blocking(mask) &&
ee22184b 5276 page->mapping->host->i_size > SZ_16M) {
39b5637f 5277 u64 len;
70dec807 5278 while (start <= end) {
fbc2bd7e
FM
5279 struct btrfs_fs_info *fs_info;
5280 u64 cur_gen;
5281
39b5637f 5282 len = end - start + 1;
890871be 5283 write_lock(&map->lock);
39b5637f 5284 em = lookup_extent_mapping(map, start, len);
285190d9 5285 if (!em) {
890871be 5286 write_unlock(&map->lock);
70dec807
CM
5287 break;
5288 }
7f3c74fb
CM
5289 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5290 em->start != start) {
890871be 5291 write_unlock(&map->lock);
70dec807
CM
5292 free_extent_map(em);
5293 break;
5294 }
3d6448e6
FM
5295 if (test_range_bit(tree, em->start,
5296 extent_map_end(em) - 1,
5297 EXTENT_LOCKED, 0, NULL))
5298 goto next;
5299 /*
5300 * If it's not in the list of modified extents, used
5301 * by a fast fsync, we can remove it. If it's being
5302 * logged we can safely remove it since fsync took an
5303 * extra reference on the em.
5304 */
5305 if (list_empty(&em->list) ||
fbc2bd7e
FM
5306 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5307 goto remove_em;
5308 /*
5309 * If it's in the list of modified extents, remove it
5310 * only if its generation is older than the current one,
5311 * in which case we don't need it for a fast fsync.
5312 * Otherwise don't remove it, as we could be racing with an
5313 * ongoing fast fsync that could miss the new extent.
5314 */
5315 fs_info = btrfs_inode->root->fs_info;
5316 spin_lock(&fs_info->trans_lock);
5317 cur_gen = fs_info->generation;
5318 spin_unlock(&fs_info->trans_lock);
5319 if (em->generation >= cur_gen)
5320 goto next;
5321remove_em:
5e548b32
FM
5322 /*
5323 * We only remove extent maps that are not in the list of
5324 * modified extents or that are in the list but with a
5325 * generation lower than the current generation, so there
5326 * is no need to set the full fsync flag on the inode (it
5327 * hurts the fsync performance for workloads with a data
5328 * size that exceeds or is close to the system's memory).
5329 */
fbc2bd7e
FM
5330 remove_extent_mapping(map, em);
5331 /* once for the rb tree */
5332 free_extent_map(em);
3d6448e6 5333next:
70dec807 5334 start = extent_map_end(em);
890871be 5335 write_unlock(&map->lock);
70dec807
CM
5336
5337 /* once for us */
d1310b2e 5338 free_extent_map(em);
9f47eb54
PM
5339
5340 cond_resched(); /* Allow large-extent preemption. */
d1310b2e 5341 }
d1310b2e 5342 }
29c68b2d 5343 return try_release_extent_state(tree, page, mask);
d1310b2e 5344}
d1310b2e 5345
ec29ed5b
CM
5346/*
5347 * Helper function for fiemap, which doesn't want to see any holes.
5348 * This maps until we find something past 'last'.
5349 */
f1bbde8d 5350static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
e3350e16 5351 u64 offset, u64 last)
ec29ed5b 5352{
f1bbde8d 5353 u64 sectorsize = btrfs_inode_sectorsize(inode);
ec29ed5b
CM
5354 struct extent_map *em;
5355 u64 len;
5356
5357 if (offset >= last)
5358 return NULL;
5359
67871254 5360 while (1) {
ec29ed5b
CM
5361 len = last - offset;
5362 if (len == 0)
5363 break;
fda2832f 5364 len = ALIGN(len, sectorsize);
f1bbde8d 5365 em = btrfs_get_extent_fiemap(inode, offset, len);
c704005d 5366 if (IS_ERR_OR_NULL(em))
ec29ed5b
CM
5367 return em;
5368
5369 /* if this isn't a hole return it */
4a2d25cd 5370 if (em->block_start != EXTENT_MAP_HOLE)
ec29ed5b 5371 return em;
ec29ed5b
CM
5372
5373 /* this is a hole, advance to the next extent */
5374 offset = extent_map_end(em);
5375 free_extent_map(em);
5376 if (offset >= last)
5377 break;
5378 }
5379 return NULL;
5380}
5381
4751832d
QW
5382/*
5383 * To cache previous fiemap extent
5384 *
5385 * Will be used for merging fiemap extent
5386 */
5387struct fiemap_cache {
5388 u64 offset;
5389 u64 phys;
5390 u64 len;
5391 u32 flags;
5392 bool cached;
5393};
5394
5395/*
5396 * Helper to submit fiemap extent.
5397 *
5398 * Will try to merge the current fiemap extent specified by @offset, @phys,
5399 * @len and @flags with the cached one.
5400 * Only when we fail to merge will the cached one be submitted as a
5401 * fiemap extent.
5402 *
5403 * Return value is the same as fiemap_fill_next_extent().
5404 */
5405static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5406 struct fiemap_cache *cache,
5407 u64 offset, u64 phys, u64 len, u32 flags)
5408{
5409 int ret = 0;
5410
5411 if (!cache->cached)
5412 goto assign;
5413
5414 /*
5415 * Sanity check: extent_fiemap() should have ensured that the new
52042d8e 5416 * fiemap extent won't overlap with the cached one.
4751832d
QW
5417 * Not recoverable.
5418 *
5419 * NOTE: Physical addresses can overlap, due to compression
5420 */
5421 if (cache->offset + cache->len > offset) {
5422 WARN_ON(1);
5423 return -EINVAL;
5424 }
5425
5426 /*
5427 * Only merges fiemap extents if
5428 * 1) Their logical addresses are contiguous
5429 *
5430 * 2) Their physical addresses are contiguous
5431 * So truly compressed (physical size smaller than logical size)
5432 * extents won't get merged with each other
5433 *
5434 * 3) They share the same flags except FIEMAP_EXTENT_LAST
5435 * So a regular extent won't get merged with a prealloc extent
5436 */
5437 if (cache->offset + cache->len == offset &&
5438 cache->phys + cache->len == phys &&
5439 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5440 (flags & ~FIEMAP_EXTENT_LAST)) {
5441 cache->len += len;
5442 cache->flags |= flags;
5443 goto try_submit_last;
5444 }
5445
5446 /* Not mergeable, need to submit cached one */
5447 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5448 cache->len, cache->flags);
5449 cache->cached = false;
5450 if (ret)
5451 return ret;
5452assign:
5453 cache->cached = true;
5454 cache->offset = offset;
5455 cache->phys = phys;
5456 cache->len = len;
5457 cache->flags = flags;
5458try_submit_last:
5459 if (cache->flags & FIEMAP_EXTENT_LAST) {
5460 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5461 cache->phys, cache->len, cache->flags);
5462 cache->cached = false;
5463 }
5464 return ret;
5465}
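/*
 * Worked example of the merge rules above (the numbers are illustrative,
 * not taken from the original source): with a cached entry of offset=0,
 * phys=1M, len=4K and a new extent of offset=4K, phys=1M+4K, len=8K that
 * carries the same flags, the cache simply grows to len=12K. A gap in the
 * logical or physical addresses, or a flag mismatch other than
 * FIEMAP_EXTENT_LAST, instead flushes the cached entry through
 * fiemap_fill_next_extent() before the new one is cached.
 */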
5466
5467/*
848c23b7 5468 * Emit last fiemap cache
4751832d 5469 *
848c23b7
QW
5470 * The last fiemap extent may still be cached in the following case:
5471 * 0 4k 8k
5472 * |<- Fiemap range ->|
5473 * |<------------ First extent ----------->|
5474 *
5475 * In this case, the first extent range will be cached but not emitted.
5476 * So we must emit it before ending extent_fiemap().
4751832d 5477 */
5c5aff98 5478static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 5479 struct fiemap_cache *cache)
4751832d
QW
5480{
5481 int ret;
5482
5483 if (!cache->cached)
5484 return 0;
5485
4751832d
QW
5486 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5487 cache->len, cache->flags);
5488 cache->cached = false;
5489 if (ret > 0)
5490 ret = 0;
5491 return ret;
5492}
5493
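/*
 * Fiemap implementation: walk the extent maps covering [start, start + len),
 * convert them to fiemap extents (merging adjacent ones through the cache
 * above) and report them with the appropriate shared, compressed, inline,
 * delalloc and preallocated flags.
 */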
facee0a0 5494int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
bab16e21 5495 u64 start, u64 len)
1506fcc8 5496{
975f84fe 5497 int ret = 0;
15c7745c 5498 u64 off;
1506fcc8
YS
5499 u64 max = start + len;
5500 u32 flags = 0;
975f84fe
JB
5501 u32 found_type;
5502 u64 last;
ec29ed5b 5503 u64 last_for_get_extent = 0;
1506fcc8 5504 u64 disko = 0;
facee0a0 5505 u64 isize = i_size_read(&inode->vfs_inode);
975f84fe 5506 struct btrfs_key found_key;
1506fcc8 5507 struct extent_map *em = NULL;
2ac55d41 5508 struct extent_state *cached_state = NULL;
975f84fe 5509 struct btrfs_path *path;
facee0a0 5510 struct btrfs_root *root = inode->root;
4751832d 5511 struct fiemap_cache cache = { 0 };
5911c8fe
DS
5512 struct ulist *roots;
5513 struct ulist *tmp_ulist;
1506fcc8 5514 int end = 0;
ec29ed5b
CM
5515 u64 em_start = 0;
5516 u64 em_len = 0;
5517 u64 em_end = 0;
1506fcc8
YS
5518
5519 if (len == 0)
5520 return -EINVAL;
5521
975f84fe
JB
5522 path = btrfs_alloc_path();
5523 if (!path)
5524 return -ENOMEM;
975f84fe 5525
5911c8fe
DS
5526 roots = ulist_alloc(GFP_KERNEL);
5527 tmp_ulist = ulist_alloc(GFP_KERNEL);
5528 if (!roots || !tmp_ulist) {
5529 ret = -ENOMEM;
5530 goto out_free_ulist;
5531 }
5532
15c7745c
BB
5533 /*
5534 * We can't initialize that to 'start' as this could miss extents due
5535 * to extent item merging
5536 */
5537 off = 0;
facee0a0
NB
5538 start = round_down(start, btrfs_inode_sectorsize(inode));
5539 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4d479cf0 5540
ec29ed5b
CM
5541 /*
5542 * lookup the last file extent. We're not using i_size here
5543 * because there might be preallocation past i_size
5544 */
facee0a0
NB
5545 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5546 0);
975f84fe 5547 if (ret < 0) {
5911c8fe 5548 goto out_free_ulist;
2d324f59
LB
5549 } else {
5550 WARN_ON(!ret);
5551 if (ret == 1)
5552 ret = 0;
975f84fe 5553 }
2d324f59 5554
975f84fe 5555 path->slots[0]--;
975f84fe 5556 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
962a298f 5557 found_type = found_key.type;
975f84fe 5558
ec29ed5b 5559 /* No extents, but there might be delalloc bits */
facee0a0 5560 if (found_key.objectid != btrfs_ino(inode) ||
975f84fe 5561 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
5562 /* have to trust i_size as the end */
5563 last = (u64)-1;
5564 last_for_get_extent = isize;
5565 } else {
5566 /*
5567 * Remember the start of the last extent. There are a
5568 * bunch of different factors that go into the length of the
5569 * extent, so it's much less complex to remember where it started.
5570 */
5571 last = found_key.offset;
5572 last_for_get_extent = last + 1;
975f84fe 5573 }
fe09e16c 5574 btrfs_release_path(path);
975f84fe 5575
ec29ed5b
CM
5576 /*
5577 * We might have some extents allocated but more delalloc past those
5578 * extents. So, we trust isize unless the start of the last extent is
5579 * beyond isize.
5580 */
5581 if (last < isize) {
5582 last = (u64)-1;
5583 last_for_get_extent = isize;
5584 }
5585
facee0a0 5586 lock_extent_bits(&inode->io_tree, start, start + len - 1,
d0082371 5587 &cached_state);
ec29ed5b 5588
facee0a0 5589 em = get_extent_skip_holes(inode, start, last_for_get_extent);
1506fcc8
YS
5590 if (!em)
5591 goto out;
5592 if (IS_ERR(em)) {
5593 ret = PTR_ERR(em);
5594 goto out;
5595 }
975f84fe 5596
1506fcc8 5597 while (!end) {
b76bb701 5598 u64 offset_in_extent = 0;
ea8efc74
CM
5599
5600 /* break if the extent we found is outside the range */
5601 if (em->start >= max || extent_map_end(em) < off)
5602 break;
5603
5604 /*
5605 * get_extent may return an extent that starts before our
5606 * requested range. We have to make sure the ranges
5607 * we return to fiemap always move forward and don't
5608 * overlap, so adjust the offsets here
5609 */
5610 em_start = max(em->start, off);
1506fcc8 5611
ea8efc74
CM
5612 /*
5613 * record the offset from the start of the extent
b76bb701
JB
5614 * for adjusting the disk offset below. Only do this if the
5615 * extent isn't compressed since our in ram offset may be past
5616 * what we have actually allocated on disk.
ea8efc74 5617 */
b76bb701
JB
5618 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5619 offset_in_extent = em_start - em->start;
ec29ed5b 5620 em_end = extent_map_end(em);
ea8efc74 5621 em_len = em_end - em_start;
1506fcc8 5622 flags = 0;
f0986318
FM
5623 if (em->block_start < EXTENT_MAP_LAST_BYTE)
5624 disko = em->block_start + offset_in_extent;
5625 else
5626 disko = 0;
1506fcc8 5627
ea8efc74
CM
5628 /*
5629 * bump off for our next call to get_extent
5630 */
5631 off = extent_map_end(em);
5632 if (off >= max)
5633 end = 1;
5634
93dbfad7 5635 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
5636 end = 1;
5637 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 5638 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
5639 flags |= (FIEMAP_EXTENT_DATA_INLINE |
5640 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 5641 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
5642 flags |= (FIEMAP_EXTENT_DELALLOC |
5643 FIEMAP_EXTENT_UNKNOWN);
dc046b10
JB
5644 } else if (fieinfo->fi_extents_max) {
5645 u64 bytenr = em->block_start -
5646 (em->start - em->orig_start);
fe09e16c 5647
fe09e16c
LB
5648 /*
5649 * As btrfs supports shared space, this information
5650 * can be exported to userspace tools via
dc046b10
JB
5651 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
5652 * then we're just getting a count and we can skip the
5653 * lookup stuff.
fe09e16c 5654 */
facee0a0 5655 ret = btrfs_check_shared(root, btrfs_ino(inode),
5911c8fe 5656 bytenr, roots, tmp_ulist);
dc046b10 5657 if (ret < 0)
fe09e16c 5658 goto out_free;
dc046b10 5659 if (ret)
fe09e16c 5660 flags |= FIEMAP_EXTENT_SHARED;
dc046b10 5661 ret = 0;
1506fcc8
YS
5662 }
5663 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5664 flags |= FIEMAP_EXTENT_ENCODED;
0d2b2372
JB
5665 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5666 flags |= FIEMAP_EXTENT_UNWRITTEN;
1506fcc8 5667
1506fcc8
YS
5668 free_extent_map(em);
5669 em = NULL;
ec29ed5b
CM
5670 if ((em_start >= last) || em_len == (u64)-1 ||
5671 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
5672 flags |= FIEMAP_EXTENT_LAST;
5673 end = 1;
5674 }
5675
ec29ed5b 5676 /* now scan forward to see if this is really the last extent. */
facee0a0 5677 em = get_extent_skip_holes(inode, off, last_for_get_extent);
ec29ed5b
CM
5678 if (IS_ERR(em)) {
5679 ret = PTR_ERR(em);
5680 goto out;
5681 }
5682 if (!em) {
975f84fe
JB
5683 flags |= FIEMAP_EXTENT_LAST;
5684 end = 1;
5685 }
4751832d
QW
5686 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5687 em_len, flags);
26e726af
CS
5688 if (ret) {
5689 if (ret == 1)
5690 ret = 0;
ec29ed5b 5691 goto out_free;
26e726af 5692 }
1506fcc8
YS
5693 }
5694out_free:
4751832d 5695 if (!ret)
5c5aff98 5696 ret = emit_last_fiemap_cache(fieinfo, &cache);
1506fcc8
YS
5697 free_extent_map(em);
5698out:
facee0a0 5699 unlock_extent_cached(&inode->io_tree, start, start + len - 1,
e43bbe5e 5700 &cached_state);
5911c8fe
DS
5701
5702out_free_ulist:
e02d48ea 5703 btrfs_free_path(path);
5911c8fe
DS
5704 ulist_free(roots);
5705 ulist_free(tmp_ulist);
1506fcc8
YS
5706 return ret;
5707}
5708
727011e0
CM
5709static void __free_extent_buffer(struct extent_buffer *eb)
5710{
727011e0
CM
5711 kmem_cache_free(extent_buffer_cache, eb);
5712}
5713
2b48966a 5714int extent_buffer_under_io(const struct extent_buffer *eb)
db7f3436
JB
5715{
5716 return (atomic_read(&eb->io_pages) ||
5717 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5718 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5719}
5720
8ff8466d 5721static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
db7f3436 5722{
8ff8466d 5723 struct btrfs_subpage *subpage;
db7f3436 5724
8ff8466d 5725 lockdep_assert_held(&page->mapping->private_lock);
db7f3436 5726
8ff8466d
QW
5727 if (PagePrivate(page)) {
5728 subpage = (struct btrfs_subpage *)page->private;
5729 if (atomic_read(&subpage->eb_refs))
5730 return true;
3d078efa
QW
5731 /*
5732 * Even if there are no eb refs here, we may still have an
5733 * end_page_read() call relying on page::private.
5734 */
5735 if (atomic_read(&subpage->readers))
5736 return true;
8ff8466d
QW
5737 }
5738 return false;
5739}
db7f3436 5740
8ff8466d
QW
5741static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5742{
5743 struct btrfs_fs_info *fs_info = eb->fs_info;
5744 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5745
5746 /*
5747 * For mapped eb, we're going to change the page private, which should
5748 * be done under the private_lock.
5749 */
5750 if (mapped)
5751 spin_lock(&page->mapping->private_lock);
5752
5753 if (!PagePrivate(page)) {
5d2361db 5754 if (mapped)
8ff8466d
QW
5755 spin_unlock(&page->mapping->private_lock);
5756 return;
5757 }
5758
5759 if (fs_info->sectorsize == PAGE_SIZE) {
5d2361db
FL
5760 /*
5761 * We do this since we'll remove the pages after we've
5762 * removed the eb from the radix tree, so we could race
5763 * and have this page now attached to the new eb. So
5764 * only clear page_private if it's still connected to
5765 * this eb.
5766 */
5767 if (PagePrivate(page) &&
5768 page->private == (unsigned long)eb) {
5769 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5770 BUG_ON(PageDirty(page));
5771 BUG_ON(PageWriteback(page));
db7f3436 5772 /*
5d2361db
FL
5773 * We need to make sure we haven't been attached
5774 * to a new eb.
db7f3436 5775 */
d1b89bc0 5776 detach_page_private(page);
db7f3436 5777 }
5d2361db
FL
5778 if (mapped)
5779 spin_unlock(&page->mapping->private_lock);
8ff8466d
QW
5780 return;
5781 }
5782
5783 /*
5784 * For subpage, we can have a dummy eb with page private attached. In
5785 * this case, we can directly detach the private as such a page is only
5786 * attached to one dummy eb, no sharing.
5787 */
5788 if (!mapped) {
5789 btrfs_detach_subpage(fs_info, page);
5790 return;
5791 }
5792
5793 btrfs_page_dec_eb_refs(fs_info, page);
5794
5795 /*
5796 * We can only detach the page private if there are no other ebs in the
3d078efa 5797 * page range and no unfinished IO.
8ff8466d
QW
5798 */
5799 if (!page_range_has_eb(fs_info, page))
5800 btrfs_detach_subpage(fs_info, page);
5801
5802 spin_unlock(&page->mapping->private_lock);
5803}
5804
5805/* Release all pages attached to the extent buffer */
5806static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5807{
5808 int i;
5809 int num_pages;
5810
5811 ASSERT(!extent_buffer_under_io(eb));
5812
5813 num_pages = num_extent_pages(eb);
5814 for (i = 0; i < num_pages; i++) {
5815 struct page *page = eb->pages[i];
5816
5817 if (!page)
5818 continue;
5819
5820 detach_extent_buffer_page(eb, page);
5d2361db 5821
01327610 5822 /* One for when we allocated the page */
09cbfeaf 5823 put_page(page);
d64766fd 5824 }
db7f3436
JB
5825}
5826
5827/*
5828 * Helper for releasing the extent buffer.
5829 */
5830static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5831{
55ac0139 5832 btrfs_release_extent_buffer_pages(eb);
8c38938c 5833 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
db7f3436
JB
5834 __free_extent_buffer(eb);
5835}
5836
f28491e0
JB
5837static struct extent_buffer *
5838__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 5839 unsigned long len)
d1310b2e
CM
5840{
5841 struct extent_buffer *eb = NULL;
5842
d1b5c567 5843 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
5844 eb->start = start;
5845 eb->len = len;
f28491e0 5846 eb->fs_info = fs_info;
815a51c7 5847 eb->bflags = 0;
196d59ab 5848 init_rwsem(&eb->lock);
b4ce94de 5849
3fd63727
JB
5850 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5851 &fs_info->allocated_ebs);
d3575156 5852 INIT_LIST_HEAD(&eb->release_list);
6d49ba1b 5853
3083ee2e 5854 spin_lock_init(&eb->refs_lock);
d1310b2e 5855 atomic_set(&eb->refs, 1);
0b32f4bb 5856 atomic_set(&eb->io_pages, 0);
727011e0 5857
deb67895 5858 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
d1310b2e
CM
5859
5860 return eb;
5861}
5862
2b48966a 5863struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
815a51c7 5864{
cc5e31a4 5865 int i;
815a51c7
JS
5866 struct page *p;
5867 struct extent_buffer *new;
cc5e31a4 5868 int num_pages = num_extent_pages(src);
815a51c7 5869
3f556f78 5870 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
5871 if (new == NULL)
5872 return NULL;
5873
62c053fb
QW
5874 /*
5875 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5876 * btrfs_release_extent_buffer() has different behavior for an
5877 * UNMAPPED subpage extent buffer.
5878 */
5879 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5880
815a51c7 5881 for (i = 0; i < num_pages; i++) {
760f991f
QW
5882 int ret;
5883
9ec72677 5884 p = alloc_page(GFP_NOFS);
db7f3436
JB
5885 if (!p) {
5886 btrfs_release_extent_buffer(new);
5887 return NULL;
5888 }
760f991f
QW
5889 ret = attach_extent_buffer_page(new, p, NULL);
5890 if (ret < 0) {
5891 put_page(p);
5892 btrfs_release_extent_buffer(new);
5893 return NULL;
5894 }
815a51c7 5895 WARN_ON(PageDirty(p));
815a51c7 5896 new->pages[i] = p;
fba1acf9 5897 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7 5898 }
92d83e94 5899 set_extent_buffer_uptodate(new);
815a51c7
JS
5900
5901 return new;
5902}
5903
0f331229
OS
5904struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5905 u64 start, unsigned long len)
815a51c7
JS
5906{
5907 struct extent_buffer *eb;
cc5e31a4
DS
5908 int num_pages;
5909 int i;
815a51c7 5910
3f556f78 5911 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
5912 if (!eb)
5913 return NULL;
5914
65ad0104 5915 num_pages = num_extent_pages(eb);
815a51c7 5916 for (i = 0; i < num_pages; i++) {
09bc1f0f
QW
5917 int ret;
5918
9ec72677 5919 eb->pages[i] = alloc_page(GFP_NOFS);
815a51c7
JS
5920 if (!eb->pages[i])
5921 goto err;
09bc1f0f
QW
5922 ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
5923 if (ret < 0)
5924 goto err;
815a51c7
JS
5925 }
5926 set_extent_buffer_uptodate(eb);
5927 btrfs_set_header_nritems(eb, 0);
b0132a3b 5928 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
5929
5930 return eb;
5931err:
09bc1f0f
QW
5932 for (; i > 0; i--) {
5933 detach_extent_buffer_page(eb, eb->pages[i - 1]);
84167d19 5934 __free_page(eb->pages[i - 1]);
09bc1f0f 5935 }
815a51c7
JS
5936 __free_extent_buffer(eb);
5937 return NULL;
5938}
5939
0f331229 5940struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5941 u64 start)
0f331229 5942{
da17066c 5943 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
5944}
5945
0b32f4bb
JB
5946static void check_buffer_tree_ref(struct extent_buffer *eb)
5947{
242e18c7 5948 int refs;
6bf9cd2e
BB
5949 /*
5950 * The TREE_REF bit is first set when the extent_buffer is added
5951 * to the radix tree. It is also reset, if unset, when a new reference
5952 * is created by find_extent_buffer.
0b32f4bb 5953 *
6bf9cd2e
BB
5954 * It is only cleared in two cases: freeing the last non-tree
5955 * reference to the extent_buffer when its STALE bit is set or
5956 * calling releasepage when the tree reference is the only reference.
0b32f4bb 5957 *
6bf9cd2e
BB
5958 * In both cases, care is taken to ensure that the extent_buffer's
5959 * pages are not under io. However, releasepage can be concurrently
5960 * called with creating new references, which is prone to race
5961 * conditions between the calls to check_buffer_tree_ref in those
5962 * codepaths and clearing TREE_REF in try_release_extent_buffer.
0b32f4bb 5963 *
6bf9cd2e
BB
5964 * The actual lifetime of the extent_buffer in the radix tree is
5965 * adequately protected by the refcount, but the TREE_REF bit and
5966 * its corresponding reference are not. To protect against this
5967 * class of races, we call check_buffer_tree_ref from the codepaths
5968 * which trigger io after they set eb->io_pages. Note that once io is
5969 * initiated, TREE_REF can no longer be cleared, so that is the
5970 * moment at which any such race is best fixed.
0b32f4bb 5971 */
242e18c7
CM
5972 refs = atomic_read(&eb->refs);
5973 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5974 return;
5975
594831c4
JB
5976 spin_lock(&eb->refs_lock);
5977 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 5978 atomic_inc(&eb->refs);
594831c4 5979 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
5980}
5981
2457aec6
MG
5982static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5983 struct page *accessed)
5df4235e 5984{
cc5e31a4 5985 int num_pages, i;
5df4235e 5986
0b32f4bb
JB
5987 check_buffer_tree_ref(eb);
5988
65ad0104 5989 num_pages = num_extent_pages(eb);
5df4235e 5990 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
5991 struct page *p = eb->pages[i];
5992
2457aec6
MG
5993 if (p != accessed)
5994 mark_page_accessed(p);
5df4235e
JB
5995 }
5996}
5997
f28491e0
JB
5998struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5999 u64 start)
452c75c3
CS
6000{
6001 struct extent_buffer *eb;
6002
2f3186d8
QW
6003 eb = find_extent_buffer_nolock(fs_info, start);
6004 if (!eb)
6005 return NULL;
6006 /*
6007 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
6008 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
6009 * another task running free_extent_buffer() might have seen that flag
6010 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
6011 * writeback flags not set) and it's still in the tree (flag
6012 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
6013 * decrementing the extent buffer's reference count twice. So here we
6014 * could race and increment the eb's reference count, clear its stale
6015 * flag, mark it as dirty and drop our reference before the other task
6016 * finishes executing free_extent_buffer, which would later result in
6017 * an attempt to free an extent buffer that is dirty.
6018 */
6019 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
6020 spin_lock(&eb->refs_lock);
6021 spin_unlock(&eb->refs_lock);
452c75c3 6022 }
2f3186d8
QW
6023 mark_extent_buffer_accessed(eb, NULL);
6024 return eb;
452c75c3
CS
6025}
6026
faa2dbf0
JB
6027#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6028struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 6029 u64 start)
faa2dbf0
JB
6030{
6031 struct extent_buffer *eb, *exists = NULL;
6032 int ret;
6033
6034 eb = find_extent_buffer(fs_info, start);
6035 if (eb)
6036 return eb;
da17066c 6037 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 6038 if (!eb)
b6293c82 6039 return ERR_PTR(-ENOMEM);
faa2dbf0
JB
6040 eb->fs_info = fs_info;
6041again:
e1860a77 6042 ret = radix_tree_preload(GFP_NOFS);
b6293c82
DC
6043 if (ret) {
6044 exists = ERR_PTR(ret);
faa2dbf0 6045 goto free_eb;
b6293c82 6046 }
faa2dbf0
JB
6047 spin_lock(&fs_info->buffer_lock);
6048 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 6049 start >> fs_info->sectorsize_bits, eb);
faa2dbf0
JB
6050 spin_unlock(&fs_info->buffer_lock);
6051 radix_tree_preload_end();
6052 if (ret == -EEXIST) {
6053 exists = find_extent_buffer(fs_info, start);
6054 if (exists)
6055 goto free_eb;
6056 else
6057 goto again;
6058 }
6059 check_buffer_tree_ref(eb);
6060 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
6061
faa2dbf0
JB
6062 return eb;
6063free_eb:
6064 btrfs_release_extent_buffer(eb);
6065 return exists;
6066}
6067#endif
6068
81982210
QW
6069static struct extent_buffer *grab_extent_buffer(
6070 struct btrfs_fs_info *fs_info, struct page *page)
c0f0a9e7
QW
6071{
6072 struct extent_buffer *exists;
6073
81982210
QW
6074 /*
6075 * For subpage case, we completely rely on radix tree to ensure we
6076 * don't try to insert two ebs for the same bytenr. So here we always
6077 * return NULL and just continue.
6078 */
6079 if (fs_info->sectorsize < PAGE_SIZE)
6080 return NULL;
6081
c0f0a9e7
QW
6082 /* Page not yet attached to an extent buffer */
6083 if (!PagePrivate(page))
6084 return NULL;
6085
6086 /*
6087 * We could have already allocated an eb for this page and attached one,
6088 * so let's see if we can get a ref on the existing eb. If we can, we
6089 * know it's good and we can just return that one; otherwise we know we
6090 * can just overwrite page->private.
6091 */
6092 exists = (struct extent_buffer *)page->private;
6093 if (atomic_inc_not_zero(&exists->refs))
6094 return exists;
6095
6096 WARN_ON(PageDirty(page));
6097 detach_page_private(page);
6098 return NULL;
6099}
6100
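/*
 * Return the extent buffer for the tree block at bytenr @start: either grab
 * an existing one from the buffer radix tree or allocate a new one, attach
 * its pages and insert it into the tree. A race with a concurrent insertion
 * of the same block is resolved by retrying and reusing the existing buffer.
 */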
f28491e0 6101struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3fbaf258 6102 u64 start, u64 owner_root, int level)
d1310b2e 6103{
da17066c 6104 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
6105 int num_pages;
6106 int i;
09cbfeaf 6107 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 6108 struct extent_buffer *eb;
6af118ce 6109 struct extent_buffer *exists = NULL;
d1310b2e 6110 struct page *p;
f28491e0 6111 struct address_space *mapping = fs_info->btree_inode->i_mapping;
d1310b2e 6112 int uptodate = 1;
19fe0a8b 6113 int ret;
d1310b2e 6114
da17066c 6115 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
c871b0f2
LB
6116 btrfs_err(fs_info, "bad tree block start %llu", start);
6117 return ERR_PTR(-EINVAL);
6118 }
6119
e9306ad4
QW
6120#if BITS_PER_LONG == 32
6121 if (start >= MAX_LFS_FILESIZE) {
6122 btrfs_err_rl(fs_info,
6123 "extent buffer %llu is beyond 32bit page cache limit", start);
6124 btrfs_err_32bit_limit(fs_info);
6125 return ERR_PTR(-EOVERFLOW);
6126 }
6127 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6128 btrfs_warn_32bit_limit(fs_info);
6129#endif
6130
1aaac38c
QW
6131 if (fs_info->sectorsize < PAGE_SIZE &&
6132 offset_in_page(start) + len > PAGE_SIZE) {
6133 btrfs_err(fs_info,
6134 "tree block crosses page boundary, start %llu nodesize %lu",
6135 start, len);
6136 return ERR_PTR(-EINVAL);
6137 }
6138
f28491e0 6139 eb = find_extent_buffer(fs_info, start);
452c75c3 6140 if (eb)
6af118ce 6141 return eb;
6af118ce 6142
23d79d81 6143 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 6144 if (!eb)
c871b0f2 6145 return ERR_PTR(-ENOMEM);
e114c545 6146 btrfs_set_buffer_lockdep_class(owner_root, eb, level);
d1310b2e 6147
65ad0104 6148 num_pages = num_extent_pages(eb);
727011e0 6149 for (i = 0; i < num_pages; i++, index++) {
760f991f
QW
6150 struct btrfs_subpage *prealloc = NULL;
6151
d1b5c567 6152 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
6153 if (!p) {
6154 exists = ERR_PTR(-ENOMEM);
6af118ce 6155 goto free_eb;
c871b0f2 6156 }
4f2de97a 6157
760f991f
QW
6158 /*
6159 * Preallocate page->private for the subpage case, so that we won't
6160 * allocate memory with private_lock held. The memory will be
6161 * freed by attach_extent_buffer_page() or freed manually if
6162 * we exit earlier.
6163 *
6164 * Although we have ensured one subpage eb can only have one
6165 * page, it may change in the future for 16K page size
6166 * support, so we still preallocate the memory in the loop.
6167 */
fdf250db 6168 if (fs_info->sectorsize < PAGE_SIZE) {
651fb419
QW
6169 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
6170 if (IS_ERR(prealloc)) {
6171 ret = PTR_ERR(prealloc);
fdf250db
QW
6172 unlock_page(p);
6173 put_page(p);
6174 exists = ERR_PTR(ret);
6175 goto free_eb;
6176 }
760f991f
QW
6177 }
6178
4f2de97a 6179 spin_lock(&mapping->private_lock);
81982210 6180 exists = grab_extent_buffer(fs_info, p);
c0f0a9e7
QW
6181 if (exists) {
6182 spin_unlock(&mapping->private_lock);
6183 unlock_page(p);
6184 put_page(p);
6185 mark_extent_buffer_accessed(exists, p);
760f991f 6186 btrfs_free_subpage(prealloc);
c0f0a9e7 6187 goto free_eb;
d1310b2e 6188 }
760f991f
QW
6189 /* Should not fail, as we have preallocated the memory */
6190 ret = attach_extent_buffer_page(eb, p, prealloc);
6191 ASSERT(!ret);
8ff8466d
QW
6192 /*
6193 * To inform that we have an extra eb under allocation, so that
6194 * detach_extent_buffer_page() won't release the page private
6195 * when the eb hasn't yet been inserted into the radix tree.
6196 *
6197 * The ref will be decreased when the eb releases the page, in
6198 * detach_extent_buffer_page().
6199 * Thus it needs no special handling in the error path.
6200 */
6201 btrfs_page_inc_eb_refs(fs_info, p);
4f2de97a 6202 spin_unlock(&mapping->private_lock);
760f991f 6203
1e5eb3d6 6204 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
727011e0 6205 eb->pages[i] = p;
d1310b2e
CM
6206 if (!PageUptodate(p))
6207 uptodate = 0;
eb14ab8e
CM
6208
6209 /*
b16d011e
NB
6210 * We can't unlock the pages just yet since the extent buffer
6211 * hasn't been properly inserted in the radix tree; this
6212 * opens a race with btree_releasepage which can free a page
6213 * while we are still filling in all pages for the buffer and
6214 * we could crash.
eb14ab8e 6215 */
d1310b2e
CM
6216 }
6217 if (uptodate)
b4ce94de 6218 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
115391d2 6219again:
e1860a77 6220 ret = radix_tree_preload(GFP_NOFS);
c871b0f2
LB
6221 if (ret) {
6222 exists = ERR_PTR(ret);
19fe0a8b 6223 goto free_eb;
c871b0f2 6224 }
19fe0a8b 6225
f28491e0
JB
6226 spin_lock(&fs_info->buffer_lock);
6227 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 6228 start >> fs_info->sectorsize_bits, eb);
f28491e0 6229 spin_unlock(&fs_info->buffer_lock);
452c75c3 6230 radix_tree_preload_end();
19fe0a8b 6231 if (ret == -EEXIST) {
f28491e0 6232 exists = find_extent_buffer(fs_info, start);
452c75c3
CS
6233 if (exists)
6234 goto free_eb;
6235 else
115391d2 6236 goto again;
6af118ce 6237 }
6af118ce 6238 /* add one reference for the tree */
0b32f4bb 6239 check_buffer_tree_ref(eb);
34b41ace 6240 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
6241
6242 /*
b16d011e
NB
6243 * Now it's safe to unlock the pages because any calls to
6244 * btree_releasepage will correctly detect that a page belongs to a
6245 * live buffer and won't free them prematurely.
eb14ab8e 6246 */
28187ae5
NB
6247 for (i = 0; i < num_pages; i++)
6248 unlock_page(eb->pages[i]);
d1310b2e
CM
6249 return eb;
6250
6af118ce 6251free_eb:
5ca64f45 6252 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
6253 for (i = 0; i < num_pages; i++) {
6254 if (eb->pages[i])
6255 unlock_page(eb->pages[i]);
6256 }
eb14ab8e 6257
897ca6e9 6258 btrfs_release_extent_buffer(eb);
6af118ce 6259 return exists;
d1310b2e 6260}
d1310b2e 6261
3083ee2e
JB
6262static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6263{
6264 struct extent_buffer *eb =
6265 container_of(head, struct extent_buffer, rcu_head);
6266
6267 __free_extent_buffer(eb);
6268}
6269
f7a52a40 6270static int release_extent_buffer(struct extent_buffer *eb)
5ce48d0f 6271 __releases(&eb->refs_lock)
3083ee2e 6272{
07e21c4d
NB
6273 lockdep_assert_held(&eb->refs_lock);
6274
3083ee2e
JB
6275 WARN_ON(atomic_read(&eb->refs) == 0);
6276 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 6277 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 6278 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 6279
815a51c7 6280 spin_unlock(&eb->refs_lock);
3083ee2e 6281
f28491e0
JB
6282 spin_lock(&fs_info->buffer_lock);
6283 radix_tree_delete(&fs_info->buffer_radix,
478ef886 6284 eb->start >> fs_info->sectorsize_bits);
f28491e0 6285 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
6286 } else {
6287 spin_unlock(&eb->refs_lock);
815a51c7 6288 }
3083ee2e 6289
8c38938c 6290 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
3083ee2e 6291 /* Should be safe to release our pages at this point */
55ac0139 6292 btrfs_release_extent_buffer_pages(eb);
bcb7e449 6293#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 6294 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
6295 __free_extent_buffer(eb);
6296 return 1;
6297 }
6298#endif
3083ee2e 6299 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 6300 return 1;
3083ee2e
JB
6301 }
6302 spin_unlock(&eb->refs_lock);
e64860aa
JB
6303
6304 return 0;
3083ee2e
JB
6305}
6306
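/*
 * Drop a reference on @eb. When the last reference (or the last non-tree
 * reference of a stale buffer) goes away, the pages are released and the
 * buffer is freed.
 */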
d1310b2e
CM
6307void free_extent_buffer(struct extent_buffer *eb)
6308{
242e18c7
CM
6309 int refs;
6310 int old;
d1310b2e
CM
6311 if (!eb)
6312 return;
6313
242e18c7
CM
6314 while (1) {
6315 refs = atomic_read(&eb->refs);
46cc775e
NB
6316 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
6317 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
6318 refs == 1))
242e18c7
CM
6319 break;
6320 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
6321 if (old == refs)
6322 return;
6323 }
6324
3083ee2e
JB
6325 spin_lock(&eb->refs_lock);
6326 if (atomic_read(&eb->refs) == 2 &&
6327 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 6328 !extent_buffer_under_io(eb) &&
3083ee2e
JB
6329 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6330 atomic_dec(&eb->refs);
6331
6332 /*
6333 * I know this is terrible, but it's temporary until we stop tracking
6334 * the uptodate bits and such for the extent buffers.
6335 */
f7a52a40 6336 release_extent_buffer(eb);
3083ee2e
JB
6337}
6338
6339void free_extent_buffer_stale(struct extent_buffer *eb)
6340{
6341 if (!eb)
d1310b2e
CM
6342 return;
6343
3083ee2e
JB
6344 spin_lock(&eb->refs_lock);
6345 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6346
0b32f4bb 6347 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
6348 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6349 atomic_dec(&eb->refs);
f7a52a40 6350 release_extent_buffer(eb);
d1310b2e 6351}
d1310b2e 6352
0d27797e
QW
6353static void btree_clear_page_dirty(struct page *page)
6354{
6355 ASSERT(PageDirty(page));
6356 ASSERT(PageLocked(page));
6357 clear_page_dirty_for_io(page);
6358 xa_lock_irq(&page->mapping->i_pages);
6359 if (!PageDirty(page))
6360 __xa_clear_mark(&page->mapping->i_pages,
6361 page_index(page), PAGECACHE_TAG_DIRTY);
6362 xa_unlock_irq(&page->mapping->i_pages);
6363}
6364
6365static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6366{
6367 struct btrfs_fs_info *fs_info = eb->fs_info;
6368 struct page *page = eb->pages[0];
6369 bool last;
6370
6371 /* btree_clear_page_dirty() needs page locked */
6372 lock_page(page);
6373 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6374 eb->len);
6375 if (last)
6376 btree_clear_page_dirty(page);
6377 unlock_page(page);
6378 WARN_ON(atomic_read(&eb->refs) == 0);
6379}
6380
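/*
 * Clear the dirty bits of all pages backing @eb (or only of the covered
 * range for subpage sector sizes) so the buffer is no longer queued for
 * writeback.
 */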
2b48966a 6381void clear_extent_buffer_dirty(const struct extent_buffer *eb)
d1310b2e 6382{
cc5e31a4
DS
6383 int i;
6384 int num_pages;
d1310b2e
CM
6385 struct page *page;
6386
0d27797e
QW
6387 if (eb->fs_info->sectorsize < PAGE_SIZE)
6388 return clear_subpage_extent_buffer_dirty(eb);
6389
65ad0104 6390 num_pages = num_extent_pages(eb);
d1310b2e
CM
6391
6392 for (i = 0; i < num_pages; i++) {
fb85fc9a 6393 page = eb->pages[i];
b9473439 6394 if (!PageDirty(page))
d2c3f4f6 6395 continue;
a61e6f29 6396 lock_page(page);
0d27797e 6397 btree_clear_page_dirty(page);
bf0da8c1 6398 ClearPageError(page);
a61e6f29 6399 unlock_page(page);
d1310b2e 6400 }
0b32f4bb 6401 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 6402}
d1310b2e 6403
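/*
 * Mark all pages backing @eb dirty, taking the extra tree reference if
 * needed. Returns true if the buffer was already dirty.
 */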
abb57ef3 6404bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 6405{
cc5e31a4
DS
6406 int i;
6407 int num_pages;
abb57ef3 6408 bool was_dirty;
d1310b2e 6409
0b32f4bb
JB
6410 check_buffer_tree_ref(eb);
6411
b9473439 6412 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 6413
65ad0104 6414 num_pages = num_extent_pages(eb);
3083ee2e 6415 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
6416 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6417
0d27797e
QW
6418 if (!was_dirty) {
6419 bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
51995c39 6420
0d27797e
QW
6421 /*
6422 * For subpage case, we can have other extent buffers in the
6423 * same page, and in clear_subpage_extent_buffer_dirty() we
6424 * have to clear page dirty without subpage lock held.
6425 * This can cause a race where our page gets its dirty bit
6426 * cleared right after we set it.
6427 *
6428 * Thankfully, clear_subpage_extent_buffer_dirty() locks its
6429 * page for other reasons, so we can use the page lock to prevent
6430 * the above race.
6431 */
6432 if (subpage)
6433 lock_page(eb->pages[0]);
6434 for (i = 0; i < num_pages; i++)
6435 btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6436 eb->start, eb->len);
6437 if (subpage)
6438 unlock_page(eb->pages[0]);
6439 }
51995c39
LB
6440#ifdef CONFIG_BTRFS_DEBUG
6441 for (i = 0; i < num_pages; i++)
6442 ASSERT(PageDirty(eb->pages[i]));
6443#endif
6444
b9473439 6445 return was_dirty;
d1310b2e 6446}
d1310b2e 6447
69ba3927 6448void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 6449{
251f2acc 6450 struct btrfs_fs_info *fs_info = eb->fs_info;
1259ab75 6451 struct page *page;
cc5e31a4 6452 int num_pages;
251f2acc 6453 int i;
1259ab75 6454
b4ce94de 6455 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 6456 num_pages = num_extent_pages(eb);
1259ab75 6457 for (i = 0; i < num_pages; i++) {
fb85fc9a 6458 page = eb->pages[i];
33958dc6 6459 if (page)
251f2acc
QW
6460 btrfs_page_clear_uptodate(fs_info, page,
6461 eb->start, eb->len);
1259ab75 6462 }
1259ab75
CM
6463}
6464
09c25a8c 6465void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 6466{
251f2acc 6467 struct btrfs_fs_info *fs_info = eb->fs_info;
d1310b2e 6468 struct page *page;
cc5e31a4 6469 int num_pages;
251f2acc 6470 int i;
d1310b2e 6471
0b32f4bb 6472 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 6473 num_pages = num_extent_pages(eb);
d1310b2e 6474 for (i = 0; i < num_pages; i++) {
fb85fc9a 6475 page = eb->pages[i];
251f2acc 6476 btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
d1310b2e 6477 }
d1310b2e 6478}
d1310b2e 6479
4012daf7
QW
6480static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
6481 int mirror_num)
6482{
6483 struct btrfs_fs_info *fs_info = eb->fs_info;
6484 struct extent_io_tree *io_tree;
6485 struct page *page = eb->pages[0];
390ed29b 6486 struct btrfs_bio_ctrl bio_ctrl = { 0 };
4012daf7
QW
6487 int ret = 0;
6488
6489 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
6490 ASSERT(PagePrivate(page));
6491 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
6492
6493 if (wait == WAIT_NONE) {
dc56219f
GR
6494 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
6495 return -EAGAIN;
4012daf7
QW
6496 } else {
6497 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6498 if (ret < 0)
6499 return ret;
6500 }
6501
6502 ret = 0;
6503 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
6504 PageUptodate(page) ||
6505 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
6506 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6507 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6508 return ret;
6509 }
6510
6511 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6512 eb->read_mirror = 0;
6513 atomic_set(&eb->io_pages, 1);
6514 check_buffer_tree_ref(eb);
6515 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
6516
3d078efa 6517 btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
390ed29b
QW
6518 ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
6519 page, eb->start, eb->len,
6520 eb->start - page_offset(page),
6521 end_bio_extent_readpage, mirror_num, 0,
4012daf7
QW
6522 true);
6523 if (ret) {
6524 /*
6525 * In the endio function, if we hit something wrong we will
6526 * increase the io_pages, so here we need to decrease it for
6527 * error path.
6528 */
6529 atomic_dec(&eb->io_pages);
6530 }
390ed29b 6531 if (bio_ctrl.bio) {
4012daf7
QW
6532 int tmp;
6533
390ed29b
QW
6534 tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
6535 bio_ctrl.bio = NULL;
4012daf7
QW
6536 if (tmp < 0)
6537 return tmp;
6538 }
6539 if (ret || wait != WAIT_COMPLETE)
6540 return ret;
6541
6542 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
6543 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6544 ret = -EIO;
6545 return ret;
6546}
6547
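/*
 * Read the pages of @eb from disk if they are not uptodate yet. With
 * WAIT_COMPLETE the IO is waited for and -EIO is returned if the buffer did
 * not become uptodate; with WAIT_NONE only reads that can be started
 * without blocking are submitted.
 */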
c2ccfbc6 6548int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 6549{
cc5e31a4 6550 int i;
d1310b2e
CM
6551 struct page *page;
6552 int err;
6553 int ret = 0;
ce9adaa5
CM
6554 int locked_pages = 0;
6555 int all_uptodate = 1;
cc5e31a4 6556 int num_pages;
727011e0 6557 unsigned long num_reads = 0;
390ed29b 6558 struct btrfs_bio_ctrl bio_ctrl = { 0 };
a86c12c7 6559
b4ce94de 6560 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
6561 return 0;
6562
4012daf7
QW
6563 if (eb->fs_info->sectorsize < PAGE_SIZE)
6564 return read_extent_buffer_subpage(eb, wait, mirror_num);
6565
65ad0104 6566 num_pages = num_extent_pages(eb);
8436ea91 6567 for (i = 0; i < num_pages; i++) {
fb85fc9a 6568 page = eb->pages[i];
bb82ab88 6569 if (wait == WAIT_NONE) {
2c4d8cb7
QW
6570 /*
6571 * WAIT_NONE is only utilized by readahead. If we can't
6572 * acquire the lock atomically it means either the eb
6573 * is being read out or under modification.
6574 * Either way the eb will be or has been cached, so
6575 * readahead can exit safely.
6576 */
2db04966 6577 if (!trylock_page(page))
ce9adaa5 6578 goto unlock_exit;
d1310b2e
CM
6579 } else {
6580 lock_page(page);
6581 }
ce9adaa5 6582 locked_pages++;
2571e739
LB
6583 }
6584 /*
6585 * We need to lock all pages first to make sure that
6586 * the uptodate bit of our pages won't be affected by
6587 * clear_extent_buffer_uptodate().
6588 */
8436ea91 6589 for (i = 0; i < num_pages; i++) {
2571e739 6590 page = eb->pages[i];
727011e0
CM
6591 if (!PageUptodate(page)) {
6592 num_reads++;
ce9adaa5 6593 all_uptodate = 0;
727011e0 6594 }
ce9adaa5 6595 }
2571e739 6596
ce9adaa5 6597 if (all_uptodate) {
8436ea91 6598 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
6599 goto unlock_exit;
6600 }
6601
656f30db 6602 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 6603 eb->read_mirror = 0;
0b32f4bb 6604 atomic_set(&eb->io_pages, num_reads);
6bf9cd2e
BB
6605 /*
6606 * It is possible for releasepage to clear the TREE_REF bit before we
6607 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
6608 */
6609 check_buffer_tree_ref(eb);
8436ea91 6610 for (i = 0; i < num_pages; i++) {
fb85fc9a 6611 page = eb->pages[i];
baf863b9 6612
ce9adaa5 6613 if (!PageUptodate(page)) {
baf863b9
LB
6614 if (ret) {
6615 atomic_dec(&eb->io_pages);
6616 unlock_page(page);
6617 continue;
6618 }
6619
f188591e 6620 ClearPageError(page);
0420177c 6621 err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
390ed29b
QW
6622 &bio_ctrl, page, page_offset(page),
6623 PAGE_SIZE, 0, end_bio_extent_readpage,
6624 mirror_num, 0, false);
baf863b9 6625 if (err) {
baf863b9 6626 /*
0420177c
NB
6627 * We failed to submit the bio so it's the
6628 * caller's responsibility to perform cleanup
6629 * i.e. unlock the page/set the error bit.
baf863b9 6630 */
0420177c
NB
6631 ret = err;
6632 SetPageError(page);
6633 unlock_page(page);
baf863b9
LB
6634 atomic_dec(&eb->io_pages);
6635 }
d1310b2e
CM
6636 } else {
6637 unlock_page(page);
6638 }
6639 }
6640
390ed29b
QW
6641 if (bio_ctrl.bio) {
6642 err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
6643 bio_ctrl.bio = NULL;
79787eaa
JM
6644 if (err)
6645 return err;
355808c2 6646 }
a86c12c7 6647
bb82ab88 6648 if (ret || wait != WAIT_COMPLETE)
d1310b2e 6649 return ret;
d397712b 6650
8436ea91 6651 for (i = 0; i < num_pages; i++) {
fb85fc9a 6652 page = eb->pages[i];
d1310b2e 6653 wait_on_page_locked(page);
d397712b 6654 if (!PageUptodate(page))
d1310b2e 6655 ret = -EIO;
d1310b2e 6656 }
d397712b 6657
d1310b2e 6658 return ret;
ce9adaa5
CM
6659
6660unlock_exit:
d397712b 6661 while (locked_pages > 0) {
ce9adaa5 6662 locked_pages--;
8436ea91
JB
6663 page = eb->pages[locked_pages];
6664 unlock_page(page);
ce9adaa5
CM
6665 }
6666 return ret;
d1310b2e 6667}
d1310b2e 6668
f98b6215
QW
6669static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6670 unsigned long len)
6671{
6672 btrfs_warn(eb->fs_info,
6673 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
6674 eb->start, eb->len, start, len);
6675 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6676
6677 return true;
6678}
6679
6680/*
6681 * Check if the [start, start + len) range is valid before reading/writing
6682 * the eb.
6683 * NOTE: @start and @len are offsets inside the eb, not logical addresses.
6684 *
6685 * Caller should not touch the dst/src memory if this function returns an error.
6686 */
6687static inline int check_eb_range(const struct extent_buffer *eb,
6688 unsigned long start, unsigned long len)
6689{
6690 unsigned long offset;
6691
6692 /* start, start + len should not go beyond eb->len nor overflow */
6693 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6694 return report_eb_range(eb, start, len);
6695
6696 return false;
6697}
6698
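/*
 * Copy @len bytes starting at offset @start within @eb into @dstv, walking
 * the backing pages one by one. Out of range requests are reported by
 * check_eb_range() and ignored.
 */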
1cbb1f45
JM
6699void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6700 unsigned long start, unsigned long len)
d1310b2e
CM
6701{
6702 size_t cur;
6703 size_t offset;
6704 struct page *page;
6705 char *kaddr;
6706 char *dst = (char *)dstv;
884b07d0 6707 unsigned long i = get_eb_page_index(start);
d1310b2e 6708
f98b6215 6709 if (check_eb_range(eb, start, len))
f716abd5 6710 return;
d1310b2e 6711
884b07d0 6712 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6713
d397712b 6714 while (len > 0) {
fb85fc9a 6715 page = eb->pages[i];
d1310b2e 6716
09cbfeaf 6717 cur = min(len, (PAGE_SIZE - offset));
a6591715 6718 kaddr = page_address(page);
d1310b2e 6719 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
6720
6721 dst += cur;
6722 len -= cur;
6723 offset = 0;
6724 i++;
6725 }
6726}
d1310b2e 6727
a48b73ec
JB
6728int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6729 void __user *dstv,
6730 unsigned long start, unsigned long len)
550ac1d8
GH
6731{
6732 size_t cur;
6733 size_t offset;
6734 struct page *page;
6735 char *kaddr;
6736 char __user *dst = (char __user *)dstv;
884b07d0 6737 unsigned long i = get_eb_page_index(start);
550ac1d8
GH
6738 int ret = 0;
6739
6740 WARN_ON(start > eb->len);
6741	WARN_ON(start + len > eb->len);
6742
884b07d0 6743 offset = get_eb_offset_in_page(eb, start);
550ac1d8
GH
6744
6745 while (len > 0) {
fb85fc9a 6746 page = eb->pages[i];
550ac1d8 6747
09cbfeaf 6748 cur = min(len, (PAGE_SIZE - offset));
550ac1d8 6749 kaddr = page_address(page);
a48b73ec 6750 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
550ac1d8
GH
6751 ret = -EFAULT;
6752 break;
6753 }
6754
6755 dst += cur;
6756 len -= cur;
6757 offset = 0;
6758 i++;
6759 }
6760
6761 return ret;
6762}
6763
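/*
 * Illustrative caller pattern (editor's sketch, not original text): the
 * _nofault variant above is meant for contexts that must not take page
 * faults, e.g. while btrfs locks are held.  A caller would typically do:
 *
 *	ret = read_extent_buffer_to_user_nofault(eb, udst, offset, len);
 *	if (ret == -EFAULT)
 *		(drop the locks, fault in the destination pages, retry)
 */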
1cbb1f45
JM
6764int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6765 unsigned long start, unsigned long len)
d1310b2e
CM
6766{
6767 size_t cur;
6768 size_t offset;
6769 struct page *page;
6770 char *kaddr;
6771 char *ptr = (char *)ptrv;
884b07d0 6772 unsigned long i = get_eb_page_index(start);
d1310b2e
CM
6773 int ret = 0;
6774
f98b6215
QW
6775 if (check_eb_range(eb, start, len))
6776 return -EINVAL;
d1310b2e 6777
884b07d0 6778 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6779
d397712b 6780 while (len > 0) {
fb85fc9a 6781 page = eb->pages[i];
d1310b2e 6782
09cbfeaf 6783 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 6784
a6591715 6785 kaddr = page_address(page);
d1310b2e 6786 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
6787 if (ret)
6788 break;
6789
6790 ptr += cur;
6791 len -= cur;
6792 offset = 0;
6793 i++;
6794 }
6795 return ret;
6796}
d1310b2e 6797
b8f95771
QW
6798/*
6799 * Check that the extent buffer is uptodate.
6800 *
6801 * For the regular case (sector size == PAGE_SIZE), check if @page is uptodate.
6802 * For the subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6803 */
6804static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6805 struct page *page)
6806{
6807 struct btrfs_fs_info *fs_info = eb->fs_info;
6808
6809 if (fs_info->sectorsize < PAGE_SIZE) {
6810 bool uptodate;
6811
6812 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6813 eb->start, eb->len);
6814 WARN_ON(!uptodate);
6815 } else {
6816 WARN_ON(!PageUptodate(page));
6817 }
6818}
6819
2b48966a 6820void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
f157bf76
DS
6821 const void *srcv)
6822{
6823 char *kaddr;
6824
b8f95771 6825 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
6826 kaddr = page_address(eb->pages[0]) +
6827 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6828 chunk_tree_uuid));
6829 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
6830}
6831
2b48966a 6832void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
f157bf76
DS
6833{
6834 char *kaddr;
6835
b8f95771 6836 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
6837 kaddr = page_address(eb->pages[0]) +
6838 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6839 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
6840}
6841
2b48966a 6842void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
d1310b2e
CM
6843 unsigned long start, unsigned long len)
6844{
6845 size_t cur;
6846 size_t offset;
6847 struct page *page;
6848 char *kaddr;
6849 char *src = (char *)srcv;
884b07d0 6850 unsigned long i = get_eb_page_index(start);
d1310b2e 6851
d3575156
NA
6852 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6853
f98b6215
QW
6854 if (check_eb_range(eb, start, len))
6855 return;
d1310b2e 6856
884b07d0 6857 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6858
d397712b 6859 while (len > 0) {
fb85fc9a 6860 page = eb->pages[i];
b8f95771 6861 assert_eb_page_uptodate(eb, page);
d1310b2e 6862
09cbfeaf 6863 cur = min(len, PAGE_SIZE - offset);
a6591715 6864 kaddr = page_address(page);
d1310b2e 6865 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
6866
6867 src += cur;
6868 len -= cur;
6869 offset = 0;
6870 i++;
6871 }
6872}
d1310b2e 6873
2b48966a 6874void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
b159fa28 6875 unsigned long len)
d1310b2e
CM
6876{
6877 size_t cur;
6878 size_t offset;
6879 struct page *page;
6880 char *kaddr;
884b07d0 6881 unsigned long i = get_eb_page_index(start);
d1310b2e 6882
f98b6215
QW
6883 if (check_eb_range(eb, start, len))
6884 return;
d1310b2e 6885
884b07d0 6886 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6887
d397712b 6888 while (len > 0) {
fb85fc9a 6889 page = eb->pages[i];
b8f95771 6890 assert_eb_page_uptodate(eb, page);
d1310b2e 6891
09cbfeaf 6892 cur = min(len, PAGE_SIZE - offset);
a6591715 6893 kaddr = page_address(page);
b159fa28 6894 memset(kaddr + offset, 0, cur);
d1310b2e
CM
6895
6896 len -= cur;
6897 offset = 0;
6898 i++;
6899 }
6900}
d1310b2e 6901
2b48966a
DS
6902void copy_extent_buffer_full(const struct extent_buffer *dst,
6903 const struct extent_buffer *src)
58e8012c
DS
6904{
6905 int i;
cc5e31a4 6906 int num_pages;
58e8012c
DS
6907
6908 ASSERT(dst->len == src->len);
6909
884b07d0
QW
6910 if (dst->fs_info->sectorsize == PAGE_SIZE) {
6911 num_pages = num_extent_pages(dst);
6912 for (i = 0; i < num_pages; i++)
6913 copy_page(page_address(dst->pages[i]),
6914 page_address(src->pages[i]));
6915 } else {
6916 size_t src_offset = get_eb_offset_in_page(src, 0);
6917 size_t dst_offset = get_eb_offset_in_page(dst, 0);
6918
6919 ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
6920 memcpy(page_address(dst->pages[0]) + dst_offset,
6921 page_address(src->pages[0]) + src_offset,
6922 src->len);
6923 }
58e8012c
DS
6924}
6925
2b48966a
DS
6926void copy_extent_buffer(const struct extent_buffer *dst,
6927 const struct extent_buffer *src,
d1310b2e
CM
6928 unsigned long dst_offset, unsigned long src_offset,
6929 unsigned long len)
6930{
6931 u64 dst_len = dst->len;
6932 size_t cur;
6933 size_t offset;
6934 struct page *page;
6935 char *kaddr;
884b07d0 6936 unsigned long i = get_eb_page_index(dst_offset);
d1310b2e 6937
f98b6215
QW
6938 if (check_eb_range(dst, dst_offset, len) ||
6939 check_eb_range(src, src_offset, len))
6940 return;
6941
d1310b2e
CM
6942 WARN_ON(src->len != dst_len);
6943
884b07d0 6944 offset = get_eb_offset_in_page(dst, dst_offset);
d1310b2e 6945
d397712b 6946 while (len > 0) {
fb85fc9a 6947 page = dst->pages[i];
b8f95771 6948 assert_eb_page_uptodate(dst, page);
d1310b2e 6949
09cbfeaf 6950 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 6951
a6591715 6952 kaddr = page_address(page);
d1310b2e 6953 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
6954
6955 src_offset += cur;
6956 len -= cur;
6957 offset = 0;
6958 i++;
6959 }
6960}
d1310b2e 6961
3e1e8bb7
OS
6962/*
6963 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
6964 * given bit number
6965 * @eb: the extent buffer
6966 * @start: offset of the bitmap item in the extent buffer
6967 * @nr: bit number
6968 * @page_index: return index of the page in the extent buffer that contains the
6969 * given bit number
6970 * @page_offset: return offset into the page given by page_index
6971 *
6972 * This helper hides the ugliness of finding the byte in an extent buffer which
6973 * contains a given bit.
6974 */
2b48966a 6975static inline void eb_bitmap_offset(const struct extent_buffer *eb,
3e1e8bb7
OS
6976 unsigned long start, unsigned long nr,
6977 unsigned long *page_index,
6978 size_t *page_offset)
6979{
3e1e8bb7
OS
6980 size_t byte_offset = BIT_BYTE(nr);
6981 size_t offset;
6982
6983 /*
6984 * The byte we want is the offset of the extent buffer + the offset of
6985 * the bitmap item in the extent buffer + the offset of the byte in the
6986 * bitmap item.
6987 */
884b07d0 6988 offset = start + offset_in_page(eb->start) + byte_offset;
3e1e8bb7 6989
09cbfeaf 6990 *page_index = offset >> PAGE_SHIFT;
7073017a 6991 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
6992}
6993
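/*
 * Worked example for eb_bitmap_offset() above (editor's illustration,
 * assuming PAGE_SIZE == 4096 and an eb that starts on a page boundary,
 * i.e. offset_in_page(eb->start) == 0):
 *
 *	start = 128 (offset of the bitmap item), nr = 70
 *	byte_offset  = BIT_BYTE(70) = 70 / 8    = 8
 *	offset       = 128 + 0 + 8              = 136
 *	*page_index  = 136 >> PAGE_SHIFT        = 0
 *	*page_offset = offset_in_page(136)      = 136
 *
 * So bit 70 of that bitmap lives in byte 136 of the eb's first page.
 */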
6994/**
6995 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
6996 * @eb: the extent buffer
6997 * @start: offset of the bitmap item in the extent buffer
6998 * @nr: bit number to test
6999 */
2b48966a 7000int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
7001 unsigned long nr)
7002{
2fe1d551 7003 u8 *kaddr;
3e1e8bb7
OS
7004 struct page *page;
7005 unsigned long i;
7006 size_t offset;
7007
7008 eb_bitmap_offset(eb, start, nr, &i, &offset);
7009 page = eb->pages[i];
b8f95771 7010 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7011 kaddr = page_address(page);
7012 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
7013}
7014
7015/**
7016 * extent_buffer_bitmap_set - set an area of a bitmap
7017 * @eb: the extent buffer
7018 * @start: offset of the bitmap item in the extent buffer
7019 * @pos: bit number of the first bit
7020 * @len: number of bits to set
7021 */
2b48966a 7022void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
7023 unsigned long pos, unsigned long len)
7024{
2fe1d551 7025 u8 *kaddr;
3e1e8bb7
OS
7026 struct page *page;
7027 unsigned long i;
7028 size_t offset;
7029 const unsigned int size = pos + len;
7030 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 7031 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
7032
7033 eb_bitmap_offset(eb, start, pos, &i, &offset);
7034 page = eb->pages[i];
b8f95771 7035 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7036 kaddr = page_address(page);
7037
7038 while (len >= bits_to_set) {
7039 kaddr[offset] |= mask_to_set;
7040 len -= bits_to_set;
7041 bits_to_set = BITS_PER_BYTE;
9c894696 7042 mask_to_set = ~0;
09cbfeaf 7043 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
7044 offset = 0;
7045 page = eb->pages[++i];
b8f95771 7046 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7047 kaddr = page_address(page);
7048 }
7049 }
7050 if (len) {
7051 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
7052 kaddr[offset] |= mask_to_set;
7053 }
7054}
7055
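/*
 * Worked example for the mask handling above (editor's illustration):
 * setting len = 8 bits starting at pos = 3 touches two bytes.  With
 * little-endian bitmaps BITMAP_FIRST_BYTE_MASK(3) == 0xf8, so the first
 * iteration sets bits 3-7 of the first byte (bits_to_set == 5) and leaves
 * len == 3.  The tail is then handled by BITMAP_LAST_BYTE_MASK(size) ==
 * BITMAP_LAST_BYTE_MASK(11) == 0x07, setting bits 0-2 of the second byte.
 */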
7056
7057/**
7058 * extent_buffer_bitmap_clear - clear an area of a bitmap
7059 * @eb: the extent buffer
7060 * @start: offset of the bitmap item in the extent buffer
7061 * @pos: bit number of the first bit
7062 * @len: number of bits to clear
7063 */
2b48966a
DS
7064void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
7065 unsigned long start, unsigned long pos,
7066 unsigned long len)
3e1e8bb7 7067{
2fe1d551 7068 u8 *kaddr;
3e1e8bb7
OS
7069 struct page *page;
7070 unsigned long i;
7071 size_t offset;
7072 const unsigned int size = pos + len;
7073 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 7074 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
7075
7076 eb_bitmap_offset(eb, start, pos, &i, &offset);
7077 page = eb->pages[i];
b8f95771 7078 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7079 kaddr = page_address(page);
7080
7081 while (len >= bits_to_clear) {
7082 kaddr[offset] &= ~mask_to_clear;
7083 len -= bits_to_clear;
7084 bits_to_clear = BITS_PER_BYTE;
9c894696 7085 mask_to_clear = ~0;
09cbfeaf 7086 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
7087 offset = 0;
7088 page = eb->pages[++i];
b8f95771 7089 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
7090 kaddr = page_address(page);
7091 }
7092 }
7093 if (len) {
7094 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
7095 kaddr[offset] &= ~mask_to_clear;
7096 }
7097}
7098
3387206f
ST
7099static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
7100{
7101 unsigned long distance = (src > dst) ? src - dst : dst - src;
7102 return distance < len;
7103}
7104
d1310b2e
CM
7105static void copy_pages(struct page *dst_page, struct page *src_page,
7106 unsigned long dst_off, unsigned long src_off,
7107 unsigned long len)
7108{
a6591715 7109 char *dst_kaddr = page_address(dst_page);
d1310b2e 7110 char *src_kaddr;
727011e0 7111 int must_memmove = 0;
d1310b2e 7112
3387206f 7113 if (dst_page != src_page) {
a6591715 7114 src_kaddr = page_address(src_page);
3387206f 7115 } else {
d1310b2e 7116 src_kaddr = dst_kaddr;
727011e0
CM
7117 if (areas_overlap(src_off, dst_off, len))
7118 must_memmove = 1;
3387206f 7119 }
d1310b2e 7120
727011e0
CM
7121 if (must_memmove)
7122 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
7123 else
7124 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
7125}
7126
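/*
 * Illustrative example of the overlap decision above (editor's addition):
 * within a single page, src_off = 100 and dst_off = 150 give distance = 50.
 * For len = 80 the ranges overlap (50 < 80), so memmove() is used; for
 * len = 40 they do not (50 >= 40), so the cheaper memcpy() is used.  Copies
 * between two different pages can never overlap and always take the
 * memcpy() path.
 */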
2b48966a
DS
7127void memcpy_extent_buffer(const struct extent_buffer *dst,
7128 unsigned long dst_offset, unsigned long src_offset,
7129 unsigned long len)
d1310b2e
CM
7130{
7131 size_t cur;
7132 size_t dst_off_in_page;
7133 size_t src_off_in_page;
d1310b2e
CM
7134 unsigned long dst_i;
7135 unsigned long src_i;
7136
f98b6215
QW
7137 if (check_eb_range(dst, dst_offset, len) ||
7138 check_eb_range(dst, src_offset, len))
7139 return;
d1310b2e 7140
d397712b 7141 while (len > 0) {
884b07d0
QW
7142 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
7143 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
d1310b2e 7144
884b07d0
QW
7145 dst_i = get_eb_page_index(dst_offset);
7146 src_i = get_eb_page_index(src_offset);
d1310b2e 7147
09cbfeaf 7148 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
7149 src_off_in_page));
7150 cur = min_t(unsigned long, cur,
09cbfeaf 7151 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 7152
fb85fc9a 7153 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
7154 dst_off_in_page, src_off_in_page, cur);
7155
7156 src_offset += cur;
7157 dst_offset += cur;
7158 len -= cur;
7159 }
7160}
d1310b2e 7161
2b48966a
DS
7162void memmove_extent_buffer(const struct extent_buffer *dst,
7163 unsigned long dst_offset, unsigned long src_offset,
7164 unsigned long len)
d1310b2e
CM
7165{
7166 size_t cur;
7167 size_t dst_off_in_page;
7168 size_t src_off_in_page;
7169 unsigned long dst_end = dst_offset + len - 1;
7170 unsigned long src_end = src_offset + len - 1;
d1310b2e
CM
7171 unsigned long dst_i;
7172 unsigned long src_i;
7173
f98b6215
QW
7174 if (check_eb_range(dst, dst_offset, len) ||
7175 check_eb_range(dst, src_offset, len))
7176 return;
727011e0 7177 if (dst_offset < src_offset) {
d1310b2e
CM
7178 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
7179 return;
7180 }
d397712b 7181 while (len > 0) {
884b07d0
QW
7182 dst_i = get_eb_page_index(dst_end);
7183 src_i = get_eb_page_index(src_end);
d1310b2e 7184
884b07d0
QW
7185 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
7186 src_off_in_page = get_eb_offset_in_page(dst, src_end);
d1310b2e
CM
7187
7188 cur = min_t(unsigned long, len, src_off_in_page + 1);
7189 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 7190 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
7191 dst_off_in_page - cur + 1,
7192 src_off_in_page - cur + 1, cur);
7193
7194 dst_end -= cur;
7195 src_end -= cur;
7196 len -= cur;
7197 }
7198}
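
/*
 * Example of the backward copy above (editor's illustration): moving
 * len = 32 bytes from src_offset = 0 to dst_offset = 8 within one eb has
 * dst > src with overlap, so the loop starts from the tail (dst_end and
 * src_end) and works backwards; copy_pages() then picks memmove() whenever
 * source and destination land in the same page and overlap.  For
 * dst_offset < src_offset the function simply defers to
 * memcpy_extent_buffer().
 */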
6af118ce 7199
72a69cd0 7200#define GANG_LOOKUP_SIZE 16
d1e86e3f
QW
7201static struct extent_buffer *get_next_extent_buffer(
7202 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
7203{
72a69cd0 7204 struct extent_buffer *gang[GANG_LOOKUP_SIZE];
d1e86e3f
QW
7205 struct extent_buffer *found = NULL;
7206 u64 page_start = page_offset(page);
72a69cd0 7207 u64 cur = page_start;
d1e86e3f
QW
7208
7209 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
d1e86e3f
QW
7210 lockdep_assert_held(&fs_info->buffer_lock);
7211
72a69cd0
QW
7212 while (cur < page_start + PAGE_SIZE) {
7213 int ret;
7214 int i;
7215
7216 ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
7217 (void **)gang, cur >> fs_info->sectorsize_bits,
7218 min_t(unsigned int, GANG_LOOKUP_SIZE,
7219 PAGE_SIZE / fs_info->nodesize));
7220 if (ret == 0)
7221 goto out;
7222 for (i = 0; i < ret; i++) {
7223 /* Already beyond page end */
7224 if (gang[i]->start >= page_start + PAGE_SIZE)
7225 goto out;
7226 /* Found one */
7227 if (gang[i]->start >= bytenr) {
7228 found = gang[i];
7229 goto out;
7230 }
d1e86e3f 7231 }
72a69cd0 7232 cur = gang[ret - 1]->start + gang[ret - 1]->len;
d1e86e3f 7233 }
72a69cd0 7234out:
d1e86e3f
QW
7235 return found;
7236}
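
/*
 * Illustrative note (editor's addition): in the subpage case a single page
 * can back several extent buffers, e.g. four 16K nodes in one 64K page.
 * The gang lookup above walks buffer_radix, which is keyed by
 * bytenr >> sectorsize_bits, and returns the first eb whose start is at or
 * after @bytenr but still inside the page, or NULL if there is none.
 */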
7237
7238static int try_release_subpage_extent_buffer(struct page *page)
7239{
7240 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7241 u64 cur = page_offset(page);
7242 const u64 end = page_offset(page) + PAGE_SIZE;
7243 int ret;
7244
7245 while (cur < end) {
7246 struct extent_buffer *eb = NULL;
7247
7248 /*
7249		 * Unlike try_release_extent_buffer(), which uses page->private to
7250		 * grab the buffer, the subpage case relies on the radix tree, so we
7251		 * need to ensure radix tree consistency.
7252		 *
7253		 * We also want an atomic snapshot of the radix tree, so we go with a
7254		 * spinlock rather than RCU.
7255 */
7256 spin_lock(&fs_info->buffer_lock);
7257 eb = get_next_extent_buffer(fs_info, page, cur);
7258 if (!eb) {
7259			/* No more ebs in the page range at or after cur */
7260 spin_unlock(&fs_info->buffer_lock);
7261 break;
7262 }
7263 cur = eb->start + eb->len;
7264
7265 /*
7266 * The same as try_release_extent_buffer(), to ensure the eb
7267 * won't disappear out from under us.
7268 */
7269 spin_lock(&eb->refs_lock);
7270 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7271 spin_unlock(&eb->refs_lock);
7272 spin_unlock(&fs_info->buffer_lock);
7273 break;
7274 }
7275 spin_unlock(&fs_info->buffer_lock);
7276
7277 /*
7278		 * If the tree ref isn't set then we know the ref on this eb is a
7279		 * real ref, so just return; this eb will likely be freed soon
7280		 * anyway.
7281 */
7282 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7283 spin_unlock(&eb->refs_lock);
7284 break;
7285 }
7286
7287 /*
7288		 * Here we don't care about the return value; we will always
7289		 * check the page private at the end, and
7290		 * release_extent_buffer() will release the refs_lock.
7291 */
7292 release_extent_buffer(eb);
7293 }
7294 /*
7295	 * Finally, check whether page private has been cleared: if we have
7296	 * released all ebs in the page, the page private should be cleared by now.
7297 */
7298 spin_lock(&page->mapping->private_lock);
7299 if (!PagePrivate(page))
7300 ret = 1;
7301 else
7302 ret = 0;
7303 spin_unlock(&page->mapping->private_lock);
7304 return ret;
7305
7306}
7307
f7a52a40 7308int try_release_extent_buffer(struct page *page)
19fe0a8b 7309{
6af118ce 7310 struct extent_buffer *eb;
6af118ce 7311
d1e86e3f
QW
7312 if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
7313 return try_release_subpage_extent_buffer(page);
7314
3083ee2e 7315 /*
d1e86e3f
QW
7316 * We need to make sure nobody is changing page->private, as we rely on
7317	 * page->private as the pointer to the extent buffer.
3083ee2e
JB
7318 */
7319 spin_lock(&page->mapping->private_lock);
7320 if (!PagePrivate(page)) {
7321 spin_unlock(&page->mapping->private_lock);
4f2de97a 7322 return 1;
45f49bce 7323 }
6af118ce 7324
3083ee2e
JB
7325 eb = (struct extent_buffer *)page->private;
7326 BUG_ON(!eb);
19fe0a8b
MX
7327
7328 /*
3083ee2e
JB
7329	 * This is a little awful but should be OK: we need to make sure that
7330 * the eb doesn't disappear out from under us while we're looking at
7331 * this page.
19fe0a8b 7332 */
3083ee2e 7333 spin_lock(&eb->refs_lock);
0b32f4bb 7334 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
7335 spin_unlock(&eb->refs_lock);
7336 spin_unlock(&page->mapping->private_lock);
7337 return 0;
b9473439 7338 }
3083ee2e 7339 spin_unlock(&page->mapping->private_lock);
897ca6e9 7340
19fe0a8b 7341 /*
3083ee2e
JB
7342	 * If the tree ref isn't set then we know the ref on this eb is a real ref,
7343	 * so just return; this page will likely be freed soon anyway.
19fe0a8b 7344 */
3083ee2e
JB
7345 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7346 spin_unlock(&eb->refs_lock);
7347 return 0;
b9473439 7348 }
19fe0a8b 7349
f7a52a40 7350 return release_extent_buffer(eb);
6af118ce 7351}
bfb484d9
JB
7352
7353/*
7354 * btrfs_readahead_tree_block - attempt to readahead a child block
7355 * @fs_info: the fs_info
7356 * @bytenr: bytenr to read
3fbaf258 7357 * @owner_root: objectid of the root that owns this eb
bfb484d9 7358 * @gen: generation for the uptodate check, can be 0
3fbaf258 7359 * @level: level for the eb
bfb484d9
JB
7360 *
7361 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
7362 * normal uptodate check of the eb, without checking the generation. If we have
7363 * to read the block we will not block on anything.
7364 */
7365void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
3fbaf258 7366 u64 bytenr, u64 owner_root, u64 gen, int level)
bfb484d9
JB
7367{
7368 struct extent_buffer *eb;
7369 int ret;
7370
3fbaf258 7371 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
bfb484d9
JB
7372 if (IS_ERR(eb))
7373 return;
7374
7375 if (btrfs_buffer_uptodate(eb, gen, 1)) {
7376 free_extent_buffer(eb);
7377 return;
7378 }
7379
7380 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7381 if (ret < 0)
7382 free_extent_buffer_stale(eb);
7383 else
7384 free_extent_buffer(eb);
7385}
7386
7387/*
7388 * btrfs_readahead_node_child - readahead a node's child block
7389 * @node: parent node we're reading from
7390 * @slot: slot in the parent node for the child we want to read
7391 *
7392 * A helper for btrfs_readahead_tree_block(); we simply read the bytenr pointed
7393 * to by the given slot in the node provided.
7394 */
7395void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7396{
7397 btrfs_readahead_tree_block(node->fs_info,
7398 btrfs_node_blockptr(node, slot),
3fbaf258
JB
7399 btrfs_header_owner(node),
7400 btrfs_node_ptr_generation(node, slot),
7401 btrfs_header_level(node) - 1);
bfb484d9 7402}