btrfs: make subpage metadata write path call its own endio functions
[linux-block.git] / fs/btrfs/extent_io.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add(spinlock_t *lock,
					struct list_head *new,
					struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(lock, flags);
}

static inline void btrfs_leak_debug_del(spinlock_t *lock,
					struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(lock, flags);
}

void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	unsigned long flags;

	/*
	 * If we didn't get into open_ctree our allocated_ebs will not be
	 * initialized, so just skip this.
	 */
	if (!fs_info->allocated_ebs.next)
		return;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	while (!list_empty(&fs_info->allocated_ebs)) {
		eb = list_first_entry(&fs_info->allocated_ebs,
				      struct extent_buffer, leak_list);
		pr_err(
	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
		       btrfs_header_owner(eb));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(lock, new, head)	do {} while (0)
#define btrfs_leak_debug_del(lock, entry)	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct btrfs_bio_ctrl bio_ctrl;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, u32 bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	if (is_data_inode(tree->private_data))
		ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
					    bio_flags);
	else
		ret = btrfs_submit_metadata_bio(tree->private_data, bio,
						mirror_num, bio_flags);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		bio->bi_status = errno_to_blk_status(ret);
		bio_endio(bio);
		epd->bio_ctrl.bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		ret = submit_one_bio(bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio_ctrl.bio = NULL;
	}
	return ret;
}

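/*
 * Illustrative sketch of how the two helpers above are typically paired by a
 * writeback caller (hedged: the surrounding caller shown here is an
 * assumption, not part of this file's API):
 *
 *	struct extent_page_data epd = {
 *		.extent_locked = 0,
 *		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 *	};
 *	int ret;
 *
 *	ret = write_the_dirty_pages(&epd);	// accumulates into epd.bio_ctrl.bio
 *	if (ret)
 *		end_write_bio(&epd, ret);	// fail the bio that was never submitted
 *	else
 *		ret = flush_write_bio(&epd);	// submit whatever is still pending
 */
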
int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because our io_tree we hold
 * the tree lock and get the inode lock when setting delalloc. These two things
 * are unrelated, so make a class for the file_extent_tree so we don't get the
 * two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}

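/*
 * Hedged usage sketch for extent_io_tree_init(): an inode's main io_tree is
 * set up roughly like this during inode initialization (the owner constant is
 * one of the IO_TREE_* values from extent-io-tree.h):
 *
 *	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
 *			    IO_TREE_INODE_IO, inode);
 *
 * Trees that are not backed by an inode pass NULL as private_data, which is
 * why the helpers in this file keep checking tree->private_data before
 * treating it as an inode.
 */
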
void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @next_ret:   pointer to the first entry whose range ends after @offset
 * @prev_ret:   pointer to the first entry whose range begins before @offset
 * @p_ret:      pointer where new node should be anchored (used when inserting
 *              an entry in the tree)
 * @parent_ret: points to entry which would have been the parent of the entry,
 *              containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree. Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, u32 *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			u32 *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

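/*
 * Concrete example of the split performed by split_state(): if 'orig' covers
 * [0, 8191] and split == 4096, the tree afterwards holds prealloc as
 * [0, 4095] and orig as [4096, 8191], both carrying the same state bits that
 * 'orig' had before the call.
 */
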
static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
	"locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree. This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again. It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

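/*
 * Hedged usage sketch for the clearing path above: unlocking a previously
 * locked range is just clearing EXTENT_LOCKED with wake == 1, so that anyone
 * sleeping in wait_extent_bit() on the range is woken up:
 *
 *	__clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0,
 *			   &cached, GFP_NOFS, NULL);
 *
 * Passing delete == 1 instead wipes the range from the tree regardless of
 * which bits are currently set (the truncate case mentioned above).
 */
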
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 *bits, struct extent_changeset *changeset)
{
	u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree. This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set. The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
		   u32 exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask,
		   struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL);
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again. It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

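/*
 * Hedged usage sketch for set_extent_bit(): most callers go through the thin
 * wrappers in extent-io-tree.h, which boil down to calls of this shape to tag
 * a byte range without any exclusive bits:
 *
 *	set_extent_bit(tree, start, end, EXTENT_DELALLOC, 0, NULL,
 *		       &cached_state, GFP_NOFS, NULL);
 *
 * Passing a non-zero exclusive_bits (as lock_extent_bits() below does with
 * EXTENT_LOCKED) turns the call into a conditional set that reports the first
 * conflicting offset back through failed_start.
 */
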
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range. If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits. This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY. This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again. It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
			      changeset);
}

int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
			      GFP_NOWAIT, NULL);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     u32 bits, int wake, int delete,
		     struct extent_state **cached)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, GFP_NOFS, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}

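/*
 * Hedged sketch of the changeset variants above: a caller that needs to know
 * how many bytes actually changed state (qgroup-style accounting) passes an
 * extent_changeset and reads bytes_changed back afterwards. The allocation
 * helpers and the bit named here live outside this file, so treat the exact
 * names as assumptions:
 *
 *	struct extent_changeset *changeset = extent_changeset_alloc();
 *
 *	ret = set_record_extent_bits(tree, start, end, EXTENT_QGROUP_RESERVED,
 *				     changeset);
 *	if (!ret)
 *		reserved_bytes = changeset->bytes_changed;
 *	extent_changeset_free(changeset);
 */
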
/*
 * either insert or lock state struct between start and end use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	int err;
	u64 failed_start;

	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, GFP_NOFS, NULL);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, GFP_NOFS, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL);
		return 0;
	}
	return 1;
}

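/*
 * Hedged sketch of the usual lock/unlock pairing around an I/O range,
 * assuming the unlock_extent_cached() helper from extent-io-tree.h:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, &cached);
 *	// ... operate on the range while it cannot change under us ...
 *	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end, &cached);
 *
 * try_lock_extent() is the non-blocking variant: it returns 1 on success and
 * 0 (after undoing any partial lock it took) if part of the range was already
 * locked by someone else.
 */
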
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		__set_page_dirty_nobuffers(page);
		account_page_redirty(page);
		put_page(page);
		index++;
	}
}

/* find the first state struct with 'bits' set after 'start', and
 * return it. tree->lock must be held. NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return 0 if we find something, and update @start_ret and @end_ret.
 * Return 1 if we found nothing.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, u32 bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

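/*
 * Hedged sketch: walking every range with a given bit set is typically done
 * by calling find_first_extent_bit() in a loop and advancing past the range
 * it returned:
 *
 *	u64 start = 0, found_start, found_end;
 *
 *	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
 *				      EXTENT_DIRTY, NULL)) {
 *		// ... process [found_start, found_end] ...
 *		start = found_end + 1;
 *	}
 */
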
/**
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the final contiguous range of the bits that were set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again. During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits. We will search to the first bit we find, and
 * then walk down the tree until we find a non-contiguous area. The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

45bfcfc1 1635/**
3bed2da1
NB
1636 * Find the first range that has @bits not set. This range could start before
1637 * @start.
45bfcfc1 1638 *
3bed2da1
NB
1639 * @tree: the tree to search
1640 * @start: offset at/after which the found extent should start
1641 * @start_ret: records the beginning of the range
1642 * @end_ret: records the end of the range (inclusive)
1643 * @bits: the set of bits which must be unset
45bfcfc1
NB
1644 *
1645 * Since unallocated range is also considered one which doesn't have the bits
1646 * set it's possible that @end_ret contains -1, this happens in case the range
1647 * spans (last_range_end, end of device]. In this case it's up to the caller to
1648 * trim @end_ret to the appropriate size.
1649 */
1650void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
f97e27e9 1651 u64 *start_ret, u64 *end_ret, u32 bits)
45bfcfc1
NB
1652{
1653 struct extent_state *state;
1654 struct rb_node *node, *prev = NULL, *next;
1655
1656 spin_lock(&tree->lock);
1657
1658 /* Find first extent with bits cleared */
1659 while (1) {
1660 node = __etree_search(tree, start, &next, &prev, NULL, NULL);
5750c375
NB
1661 if (!node && !next && !prev) {
1662 /*
1663 * Tree is completely empty, send full range and let
1664 * caller deal with it
1665 */
1666 *start_ret = 0;
1667 *end_ret = -1;
1668 goto out;
1669 } else if (!node && !next) {
1670 /*
1671 * We are past the last allocated chunk, set start at
1672 * the end of the last extent.
1673 */
1674 state = rb_entry(prev, struct extent_state, rb_node);
1675 *start_ret = state->end + 1;
1676 *end_ret = -1;
1677 goto out;
1678 } else if (!node) {
45bfcfc1 1679 node = next;
45bfcfc1 1680 }
1eaebb34
NB
1681 /*
1682 * At this point 'node' either contains 'start' or start is
1683 * before 'node'
1684 */
45bfcfc1 1685 state = rb_entry(node, struct extent_state, rb_node);
1eaebb34
NB
1686
1687 if (in_range(start, state->start, state->end - state->start + 1)) {
1688 if (state->state & bits) {
1689 /*
1690 * |--range with bits sets--|
1691 * |
1692 * start
1693 */
1694 start = state->end + 1;
1695 } else {
1696 /*
1697 * 'start' falls within a range that doesn't
1698 * have the bits set, so take its start as
1699 * the beginning of the desired range
1700 *
1701 * |--range with bits cleared----|
1702 * |
1703 * start
1704 */
1705 *start_ret = state->start;
1706 break;
1707 }
45bfcfc1 1708 } else {
1eaebb34
NB
1709 /*
1710 * |---prev range---|---hole/unset---|---node range---|
1711 * |
1712 * start
1713 *
1714 * or
1715 *
1716 * |---hole/unset--||--first node--|
1717 * 0 |
1718 * start
1719 */
1720 if (prev) {
1721 state = rb_entry(prev, struct extent_state,
1722 rb_node);
1723 *start_ret = state->end + 1;
1724 } else {
1725 *start_ret = 0;
1726 }
45bfcfc1
NB
1727 break;
1728 }
1729 }
1730
1731 /*
1732 * Find the longest stretch from start until an entry which has the
1733 * bits set
1734 */
1735 while (1) {
1736 state = rb_entry(node, struct extent_state, rb_node);
1737 if (state->end >= start && !(state->state & bits)) {
1738 *end_ret = state->end;
1739 } else {
1740 *end_ret = state->start - 1;
1741 break;
1742 }
1743
1744 node = rb_next(node);
1745 if (!node)
1746 break;
1747 }
1748out:
1749 spin_unlock(&tree->lock);
1750}
1751
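/*
 * Illustrative sketch, not part of the original file: one way a caller could
 * consume find_first_clear_extent_bit().  EXTENT_DIRTY is used here purely as
 * an example bit and @range_limit is a hypothetical upper bound supplied by
 * the caller; as documented above, @end_ret may come back as -1 when the
 * clear range runs past the last allocated extent, so the caller trims it.
 */
static void example_first_clear_range(struct extent_io_tree *tree, u64 start,
				      u64 range_limit)
{
	u64 clear_start;
	u64 clear_end;

	find_first_clear_extent_bit(tree, start, &clear_start, &clear_end,
				    EXTENT_DIRTY);
	if (clear_end == (u64)-1)
		clear_end = range_limit;
	/* [clear_start, clear_end] now has EXTENT_DIRTY unset everywhere */
}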
d352ac68
CM
1752/*
1753 * find a contiguous range of bytes in the file marked as delalloc, not
1754 * more than 'max_bytes'. start and end are used to return the range.
1755 *
3522e903 1756 * true is returned if we find something, false if nothing was in the tree
d352ac68 1757 */
083e75e7
JB
1758bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1759 u64 *end, u64 max_bytes,
1760 struct extent_state **cached_state)
d1310b2e
CM
1761{
1762 struct rb_node *node;
1763 struct extent_state *state;
1764 u64 cur_start = *start;
3522e903 1765 bool found = false;
d1310b2e
CM
1766 u64 total_bytes = 0;
1767
cad321ad 1768 spin_lock(&tree->lock);
c8b97818 1769
d1310b2e
CM
1770 /*
1771 * this search will find all the extents that end after
1772 * our range starts.
1773 */
80ea96b1 1774 node = tree_search(tree, cur_start);
2b114d1d 1775 if (!node) {
3522e903 1776 *end = (u64)-1;
d1310b2e
CM
1777 goto out;
1778 }
1779
d397712b 1780 while (1) {
d1310b2e 1781 state = rb_entry(node, struct extent_state, rb_node);
5b21f2ed
ZY
1782 if (found && (state->start != cur_start ||
1783 (state->state & EXTENT_BOUNDARY))) {
d1310b2e
CM
1784 goto out;
1785 }
1786 if (!(state->state & EXTENT_DELALLOC)) {
1787 if (!found)
1788 *end = state->end;
1789 goto out;
1790 }
c2a128d2 1791 if (!found) {
d1310b2e 1792 *start = state->start;
c2a128d2 1793 *cached_state = state;
b7ac31b7 1794 refcount_inc(&state->refs);
c2a128d2 1795 }
3522e903 1796 found = true;
d1310b2e
CM
1797 *end = state->end;
1798 cur_start = state->end + 1;
1799 node = rb_next(node);
d1310b2e 1800 total_bytes += state->end - state->start + 1;
7bf811a5 1801 if (total_bytes >= max_bytes)
573aecaf 1802 break;
573aecaf 1803 if (!node)
d1310b2e
CM
1804 break;
1805 }
1806out:
cad321ad 1807 spin_unlock(&tree->lock);
d1310b2e
CM
1808 return found;
1809}
1810
da2c7009
LB
1811static int __process_pages_contig(struct address_space *mapping,
1812 struct page *locked_page,
1813 pgoff_t start_index, pgoff_t end_index,
1814 unsigned long page_ops, pgoff_t *index_ret);
1815
143bede5
JM
1816static noinline void __unlock_for_delalloc(struct inode *inode,
1817 struct page *locked_page,
1818 u64 start, u64 end)
c8b97818 1819{
09cbfeaf
KS
1820 unsigned long index = start >> PAGE_SHIFT;
1821 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 1822
76c0021d 1823 ASSERT(locked_page);
c8b97818 1824 if (index == locked_page->index && end_index == index)
143bede5 1825 return;
c8b97818 1826
76c0021d
LB
1827 __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1828 PAGE_UNLOCK, NULL);
c8b97818
CM
1829}
1830
1831static noinline int lock_delalloc_pages(struct inode *inode,
1832 struct page *locked_page,
1833 u64 delalloc_start,
1834 u64 delalloc_end)
1835{
09cbfeaf 1836 unsigned long index = delalloc_start >> PAGE_SHIFT;
76c0021d 1837 unsigned long index_ret = index;
09cbfeaf 1838 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
c8b97818 1839 int ret;
c8b97818 1840
76c0021d 1841 ASSERT(locked_page);
c8b97818
CM
1842 if (index == locked_page->index && index == end_index)
1843 return 0;
1844
76c0021d
LB
1845 ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1846 end_index, PAGE_LOCK, &index_ret);
1847 if (ret == -EAGAIN)
1848 __unlock_for_delalloc(inode, locked_page, delalloc_start,
1849 (u64)index_ret << PAGE_SHIFT);
c8b97818
CM
1850 return ret;
1851}
1852
1853/*
3522e903
LF
1854 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1855 * more than @max_bytes. @start and @end are used to return the range.
c8b97818 1856 *
3522e903
LF
1857 * Return: true if we find something
1858 * false if nothing was in the tree
c8b97818 1859 */
ce9f967f 1860EXPORT_FOR_TESTS
3522e903 1861noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 1862 struct page *locked_page, u64 *start,
917aacec 1863 u64 *end)
c8b97818 1864{
9978059b 1865 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
917aacec 1866 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
1867 u64 delalloc_start;
1868 u64 delalloc_end;
3522e903 1869 bool found;
9655d298 1870 struct extent_state *cached_state = NULL;
c8b97818
CM
1871 int ret;
1872 int loops = 0;
1873
1874again:
1875 /* step one, find a bunch of delalloc bytes starting at start */
1876 delalloc_start = *start;
1877 delalloc_end = 0;
083e75e7
JB
1878 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1879 max_bytes, &cached_state);
70b99e69 1880 if (!found || delalloc_end <= *start) {
c8b97818
CM
1881 *start = delalloc_start;
1882 *end = delalloc_end;
c2a128d2 1883 free_extent_state(cached_state);
3522e903 1884 return false;
c8b97818
CM
1885 }
1886
70b99e69
CM
1887 /*
1888 * start comes from the offset of locked_page. We have to lock
1889 * pages in order, so we can't process delalloc bytes before
1890 * locked_page
1891 */
d397712b 1892 if (delalloc_start < *start)
70b99e69 1893 delalloc_start = *start;
70b99e69 1894
c8b97818
CM
1895 /*
1896 * make sure to limit the number of pages we try to lock down
c8b97818 1897 */
7bf811a5
JB
1898 if (delalloc_end + 1 - delalloc_start > max_bytes)
1899 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 1900
c8b97818
CM
1901 /* step two, lock all the pages after the page that has start */
1902 ret = lock_delalloc_pages(inode, locked_page,
1903 delalloc_start, delalloc_end);
9bfd61d9 1904 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
1905 if (ret == -EAGAIN) {
1906 /* some of the pages are gone, lets avoid looping by
1907 * shortening the size of the delalloc range we're searching
1908 */
9655d298 1909 free_extent_state(cached_state);
7d788742 1910 cached_state = NULL;
c8b97818 1911 if (!loops) {
09cbfeaf 1912 max_bytes = PAGE_SIZE;
c8b97818
CM
1913 loops = 1;
1914 goto again;
1915 } else {
3522e903 1916 found = false;
c8b97818
CM
1917 goto out_failed;
1918 }
1919 }
c8b97818
CM
1920
1921 /* step three, lock the state bits for the whole range */
ff13db41 1922 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
1923
1924 /* then test to make sure it is all still delalloc */
1925 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 1926 EXTENT_DELALLOC, 1, cached_state);
c8b97818 1927 if (!ret) {
9655d298 1928 unlock_extent_cached(tree, delalloc_start, delalloc_end,
e43bbe5e 1929 &cached_state);
c8b97818
CM
1930 __unlock_for_delalloc(inode, locked_page,
1931 delalloc_start, delalloc_end);
1932 cond_resched();
1933 goto again;
1934 }
9655d298 1935 free_extent_state(cached_state);
c8b97818
CM
1936 *start = delalloc_start;
1937 *end = delalloc_end;
1938out_failed:
1939 return found;
1940}
1941
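/*
 * Illustrative sketch, not part of the original file: roughly how a writeback
 * path can drive find_lock_delalloc_range() across a page.  The helper
 * process_one_delalloc_range() is hypothetical and stands in for whatever the
 * caller does with the locked range; it is also assumed to unlock the pages
 * and extent bits when it is done.
 */
static void example_walk_delalloc(struct inode *inode, struct page *locked_page)
{
	const u64 page_start = page_offset(locked_page);
	const u64 page_end = page_start + PAGE_SIZE - 1;
	u64 start = page_start;

	while (start <= page_end) {
		u64 found_start = start;
		u64 found_end = 0;

		if (!find_lock_delalloc_range(inode, locked_page,
					      &found_start, &found_end))
			break;
		/* Pages and extent bits of the range are locked here */
		process_one_delalloc_range(inode, found_start, found_end);
		start = found_end + 1;
	}
}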
da2c7009
LB
1942static int __process_pages_contig(struct address_space *mapping,
1943 struct page *locked_page,
1944 pgoff_t start_index, pgoff_t end_index,
1945 unsigned long page_ops, pgoff_t *index_ret)
c8b97818 1946{
873695b3 1947 unsigned long nr_pages = end_index - start_index + 1;
12e3360f 1948 unsigned long pages_processed = 0;
873695b3 1949 pgoff_t index = start_index;
c8b97818 1950 struct page *pages[16];
873695b3 1951 unsigned ret;
da2c7009 1952 int err = 0;
c8b97818 1953 int i;
771ed689 1954
da2c7009
LB
1955 if (page_ops & PAGE_LOCK) {
1956 ASSERT(page_ops == PAGE_LOCK);
1957 ASSERT(index_ret && *index_ret == start_index);
1958 }
1959
704de49d 1960 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
873695b3 1961 mapping_set_error(mapping, -EIO);
704de49d 1962
d397712b 1963 while (nr_pages > 0) {
873695b3 1964 ret = find_get_pages_contig(mapping, index,
5b050f04
CM
1965 min_t(unsigned long,
1966 nr_pages, ARRAY_SIZE(pages)), pages);
da2c7009
LB
1967 if (ret == 0) {
1968 /*
1969 * We can only find nothing at @index if we're
1970 * going to lock these pages.
1971 */
1972 ASSERT(page_ops & PAGE_LOCK);
49d4a334
LB
1973 err = -EAGAIN;
1974 goto out;
da2c7009 1975 }
8b62b72b 1976
da2c7009 1977 for (i = 0; i < ret; i++) {
c2790a2e 1978 if (page_ops & PAGE_SET_PRIVATE2)
8b62b72b
CM
1979 SetPagePrivate2(pages[i]);
1980
1d53c9e6 1981 if (locked_page && pages[i] == locked_page) {
09cbfeaf 1982 put_page(pages[i]);
12e3360f 1983 pages_processed++;
c8b97818
CM
1984 continue;
1985 }
6869b0a8 1986 if (page_ops & PAGE_START_WRITEBACK) {
c8b97818 1987 clear_page_dirty_for_io(pages[i]);
c8b97818 1988 set_page_writeback(pages[i]);
6869b0a8 1989 }
704de49d
FM
1990 if (page_ops & PAGE_SET_ERROR)
1991 SetPageError(pages[i]);
c2790a2e 1992 if (page_ops & PAGE_END_WRITEBACK)
c8b97818 1993 end_page_writeback(pages[i]);
c2790a2e 1994 if (page_ops & PAGE_UNLOCK)
771ed689 1995 unlock_page(pages[i]);
da2c7009
LB
1996 if (page_ops & PAGE_LOCK) {
1997 lock_page(pages[i]);
1998 if (!PageDirty(pages[i]) ||
1999 pages[i]->mapping != mapping) {
2000 unlock_page(pages[i]);
5909ca11
RK
2001 for (; i < ret; i++)
2002 put_page(pages[i]);
da2c7009
LB
2003 err = -EAGAIN;
2004 goto out;
2005 }
2006 }
09cbfeaf 2007 put_page(pages[i]);
12e3360f 2008 pages_processed++;
c8b97818
CM
2009 }
2010 nr_pages -= ret;
2011 index += ret;
2012 cond_resched();
2013 }
da2c7009
LB
2014out:
2015 if (err && index_ret)
12e3360f 2016 *index_ret = start_index + pages_processed - 1;
da2c7009 2017 return err;
c8b97818 2018}
c8b97818 2019
ad7ff17b 2020void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
74e9194a 2021 struct page *locked_page,
f97e27e9 2022 u32 clear_bits, unsigned long page_ops)
873695b3 2023{
ad7ff17b 2024 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
873695b3 2025
ad7ff17b 2026 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
873695b3 2027 start >> PAGE_SHIFT, end >> PAGE_SHIFT,
da2c7009 2028 page_ops, NULL);
873695b3
LB
2029}
2030
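/*
 * Illustrative sketch, not part of the original file: how an error path in
 * the delalloc machinery might combine clear_bits and page_ops when backing
 * out of a range.  The exact bit combinations vary per caller and the ones
 * below are only an assumption; this mainly shows the calling convention of
 * extent_clear_unlock_delalloc().
 */
static void example_cleanup_delalloc_range(struct btrfs_inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
}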
d352ac68
CM
2031/*
2032 * count the number of bytes in the tree that have a given bit(s)
2033 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2034 * cached. The total number found is returned.
2035 */
d1310b2e
CM
2036u64 count_range_bits(struct extent_io_tree *tree,
2037 u64 *start, u64 search_end, u64 max_bytes,
f97e27e9 2038 u32 bits, int contig)
d1310b2e
CM
2039{
2040 struct rb_node *node;
2041 struct extent_state *state;
2042 u64 cur_start = *start;
2043 u64 total_bytes = 0;
ec29ed5b 2044 u64 last = 0;
d1310b2e
CM
2045 int found = 0;
2046
fae7f21c 2047 if (WARN_ON(search_end <= cur_start))
d1310b2e 2048 return 0;
d1310b2e 2049
cad321ad 2050 spin_lock(&tree->lock);
d1310b2e
CM
2051 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2052 total_bytes = tree->dirty_bytes;
2053 goto out;
2054 }
2055 /*
2056 * this search will find all the extents that end after
2057 * our range starts.
2058 */
80ea96b1 2059 node = tree_search(tree, cur_start);
d397712b 2060 if (!node)
d1310b2e 2061 goto out;
d1310b2e 2062
d397712b 2063 while (1) {
d1310b2e
CM
2064 state = rb_entry(node, struct extent_state, rb_node);
2065 if (state->start > search_end)
2066 break;
ec29ed5b
CM
2067 if (contig && found && state->start > last + 1)
2068 break;
2069 if (state->end >= cur_start && (state->state & bits) == bits) {
d1310b2e
CM
2070 total_bytes += min(search_end, state->end) + 1 -
2071 max(cur_start, state->start);
2072 if (total_bytes >= max_bytes)
2073 break;
2074 if (!found) {
af60bed2 2075 *start = max(cur_start, state->start);
d1310b2e
CM
2076 found = 1;
2077 }
ec29ed5b
CM
2078 last = state->end;
2079 } else if (contig && found) {
2080 break;
d1310b2e
CM
2081 }
2082 node = rb_next(node);
2083 if (!node)
2084 break;
2085 }
2086out:
cad321ad 2087 spin_unlock(&tree->lock);
d1310b2e
CM
2088 return total_bytes;
2089}
b2950863 2090
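/*
 * Illustrative sketch, not part of the original file: counting how many bytes
 * in [start, search_end] carry EXTENT_DELALLOC.  With contig == 1 the count
 * would stop at the first gap between matching extents; with contig == 0, as
 * used here, every matching extent is summed up to @max_bytes.  On return,
 * @found_start has been advanced to the first matching offset.
 */
static u64 example_count_delalloc_bytes(struct extent_io_tree *tree, u64 start,
					u64 search_end)
{
	u64 found_start = start;

	return count_range_bits(tree, &found_start, search_end, (u64)-1,
				EXTENT_DELALLOC, 0);
}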
d352ac68
CM
2091/*
2092 * set the private field for a given byte offset in the tree. If there isn't
2093 * an extent_state there already, this does nothing.
2094 */
b3f167aa
JB
2095int set_state_failrec(struct extent_io_tree *tree, u64 start,
2096 struct io_failure_record *failrec)
d1310b2e
CM
2097{
2098 struct rb_node *node;
2099 struct extent_state *state;
2100 int ret = 0;
2101
cad321ad 2102 spin_lock(&tree->lock);
d1310b2e
CM
2103 /*
2104 * this search will find all the extents that end after
2105 * our range starts.
2106 */
80ea96b1 2107 node = tree_search(tree, start);
2b114d1d 2108 if (!node) {
d1310b2e
CM
2109 ret = -ENOENT;
2110 goto out;
2111 }
2112 state = rb_entry(node, struct extent_state, rb_node);
2113 if (state->start != start) {
2114 ret = -ENOENT;
2115 goto out;
2116 }
47dc196a 2117 state->failrec = failrec;
d1310b2e 2118out:
cad321ad 2119 spin_unlock(&tree->lock);
d1310b2e
CM
2120 return ret;
2121}
2122
2279a270 2123struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
d1310b2e
CM
2124{
2125 struct rb_node *node;
2126 struct extent_state *state;
2279a270 2127 struct io_failure_record *failrec;
d1310b2e 2128
cad321ad 2129 spin_lock(&tree->lock);
d1310b2e
CM
2130 /*
2131 * this search will find all the extents that end after
2132 * our range starts.
2133 */
80ea96b1 2134 node = tree_search(tree, start);
2b114d1d 2135 if (!node) {
2279a270 2136 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2137 goto out;
2138 }
2139 state = rb_entry(node, struct extent_state, rb_node);
2140 if (state->start != start) {
2279a270 2141 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2142 goto out;
2143 }
2279a270
NB
2144
2145 failrec = state->failrec;
d1310b2e 2146out:
cad321ad 2147 spin_unlock(&tree->lock);
2279a270 2148 return failrec;
d1310b2e
CM
2149}
2150
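/*
 * Illustrative sketch, not part of the original file: how the failrec lookup
 * above is typically consumed.  set_state_failrec() only succeeds when an
 * extent_state starts exactly at @start, and get_state_failrec() reports a
 * miss with ERR_PTR(-ENOENT) rather than NULL, so callers must test the
 * result with IS_ERR().
 */
static struct io_failure_record *example_lookup_failrec(
		struct extent_io_tree *failure_tree, u64 start)
{
	struct io_failure_record *failrec;

	failrec = get_state_failrec(failure_tree, start);
	if (IS_ERR(failrec))
		return NULL;	/* no record stored at @start */
	return failrec;
}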
2151/*
2152 * searches a range in the state tree for a given mask.
70dec807 2153 * If 'filled' == 1, this returns 1 only if every extent in the range
d1310b2e
CM
2154 * has the bits set. Otherwise, 1 is returned if any bit in the
2155 * range is found set.
2156 */
2157int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 2158 u32 bits, int filled, struct extent_state *cached)
d1310b2e
CM
2159{
2160 struct extent_state *state = NULL;
2161 struct rb_node *node;
2162 int bitset = 0;
d1310b2e 2163
cad321ad 2164 spin_lock(&tree->lock);
27a3507d 2165 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
df98b6e2 2166 cached->end > start)
9655d298
CM
2167 node = &cached->rb_node;
2168 else
2169 node = tree_search(tree, start);
d1310b2e
CM
2170 while (node && start <= end) {
2171 state = rb_entry(node, struct extent_state, rb_node);
2172
2173 if (filled && state->start > start) {
2174 bitset = 0;
2175 break;
2176 }
2177
2178 if (state->start > end)
2179 break;
2180
2181 if (state->state & bits) {
2182 bitset = 1;
2183 if (!filled)
2184 break;
2185 } else if (filled) {
2186 bitset = 0;
2187 break;
2188 }
46562cec
CM
2189
2190 if (state->end == (u64)-1)
2191 break;
2192
d1310b2e
CM
2193 start = state->end + 1;
2194 if (start > end)
2195 break;
2196 node = rb_next(node);
2197 if (!node) {
2198 if (filled)
2199 bitset = 0;
2200 break;
2201 }
2202 }
cad321ad 2203 spin_unlock(&tree->lock);
d1310b2e
CM
2204 return bitset;
2205}
d1310b2e
CM
2206
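/*
 * Illustrative sketch, not part of the original file: the two modes of
 * test_range_bit().  With filled == 1 the whole [start, end] range must be
 * covered by states carrying the bit (this is what check_page_uptodate()
 * below relies on); with filled == 0 a single state with the bit anywhere in
 * the range is enough.
 */
static bool example_range_fully_uptodate(struct extent_io_tree *tree,
					 u64 start, u64 end)
{
	return test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
}

static bool example_range_has_delalloc(struct extent_io_tree *tree,
				       u64 start, u64 end)
{
	return test_range_bit(tree, start, end, EXTENT_DELALLOC, 0, NULL);
}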
2207/*
2208 * helper function to set a given page up to date if all the
2209 * extents in the tree for that page are up to date
2210 */
143bede5 2211static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
d1310b2e 2212{
4eee4fa4 2213 u64 start = page_offset(page);
09cbfeaf 2214 u64 end = start + PAGE_SIZE - 1;
9655d298 2215 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
d1310b2e 2216 SetPageUptodate(page);
d1310b2e
CM
2217}
2218
7870d082
JB
2219int free_io_failure(struct extent_io_tree *failure_tree,
2220 struct extent_io_tree *io_tree,
2221 struct io_failure_record *rec)
4a54c8c1
JS
2222{
2223 int ret;
2224 int err = 0;
4a54c8c1 2225
47dc196a 2226 set_state_failrec(failure_tree, rec->start, NULL);
4a54c8c1
JS
2227 ret = clear_extent_bits(failure_tree, rec->start,
2228 rec->start + rec->len - 1,
91166212 2229 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1
JS
2230 if (ret)
2231 err = ret;
2232
7870d082 2233 ret = clear_extent_bits(io_tree, rec->start,
53b381b3 2234 rec->start + rec->len - 1,
91166212 2235 EXTENT_DAMAGED);
53b381b3
DW
2236 if (ret && !err)
2237 err = ret;
4a54c8c1
JS
2238
2239 kfree(rec);
2240 return err;
2241}
2242
4a54c8c1
JS
2243/*
2244 * this bypasses the standard btrfs submit functions deliberately, as
2245 * the standard behavior is to write all copies in a raid setup. here we only
2246 * want to write the one bad copy. so we do the mapping for ourselves and issue
2247 * submit_bio directly.
3ec706c8 2248 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
2249 * actually prevents the read that triggered the error from finishing.
2250 * currently, there can be no more than two copies of every data bit. thus,
2251 * exactly one rewrite is required.
2252 */
6ec656bc
JB
2253int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2254 u64 length, u64 logical, struct page *page,
2255 unsigned int pg_offset, int mirror_num)
4a54c8c1
JS
2256{
2257 struct bio *bio;
2258 struct btrfs_device *dev;
4a54c8c1
JS
2259 u64 map_length = 0;
2260 u64 sector;
2261 struct btrfs_bio *bbio = NULL;
2262 int ret;
2263
1751e8a6 2264 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
2265 BUG_ON(!mirror_num);
2266
f7ef5287
NA
2267 if (btrfs_is_zoned(fs_info))
2268 return btrfs_repair_one_zone(fs_info, logical);
2269
c5e4c3d7 2270 bio = btrfs_io_bio_alloc(1);
4f024f37 2271 bio->bi_iter.bi_size = 0;
4a54c8c1
JS
2272 map_length = length;
2273
b5de8d0d
FM
2274 /*
2275 * Avoid races with device replace and make sure our bbio has devices
2276 * associated to its stripes that don't go away while we are doing the
2277 * read repair operation.
2278 */
2279 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 2280 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
2281 /*
2282 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2283 * to update all raid stripes, but here we just want to correct
2284 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2285 * stripe's dev and sector.
2286 */
2287 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2288 &map_length, &bbio, 0);
2289 if (ret) {
2290 btrfs_bio_counter_dec(fs_info);
2291 bio_put(bio);
2292 return -EIO;
2293 }
2294 ASSERT(bbio->mirror_num == 1);
2295 } else {
2296 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2297 &map_length, &bbio, mirror_num);
2298 if (ret) {
2299 btrfs_bio_counter_dec(fs_info);
2300 bio_put(bio);
2301 return -EIO;
2302 }
2303 BUG_ON(mirror_num != bbio->mirror_num);
4a54c8c1 2304 }
c725328c
LB
2305
2306 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
4f024f37 2307 bio->bi_iter.bi_sector = sector;
c725328c 2308 dev = bbio->stripes[bbio->mirror_num - 1].dev;
6e9606d2 2309 btrfs_put_bbio(bbio);
ebbede42
AJ
2310 if (!dev || !dev->bdev ||
2311 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
b5de8d0d 2312 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2313 bio_put(bio);
2314 return -EIO;
2315 }
74d46992 2316 bio_set_dev(bio, dev->bdev);
70fd7614 2317 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ffdd2018 2318 bio_add_page(bio, page, length, pg_offset);
4a54c8c1 2319
4e49ea4a 2320 if (btrfsic_submit_bio_wait(bio)) {
4a54c8c1 2321 /* try to remap that extent elsewhere? */
b5de8d0d 2322 btrfs_bio_counter_dec(fs_info);
4a54c8c1 2323 bio_put(bio);
442a4f63 2324 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4a54c8c1
JS
2325 return -EIO;
2326 }
2327
b14af3b4
DS
2328 btrfs_info_rl_in_rcu(fs_info,
2329 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 2330 ino, start,
1203b681 2331 rcu_str_deref(dev->name), sector);
b5de8d0d 2332 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2333 bio_put(bio);
2334 return 0;
2335}
2336
2b48966a 2337int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
ea466794 2338{
20a1fbf9 2339 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 2340 u64 start = eb->start;
cc5e31a4 2341 int i, num_pages = num_extent_pages(eb);
d95603b2 2342 int ret = 0;
ea466794 2343
bc98a42c 2344 if (sb_rdonly(fs_info->sb))
908960c6
ID
2345 return -EROFS;
2346
ea466794 2347 for (i = 0; i < num_pages; i++) {
fb85fc9a 2348 struct page *p = eb->pages[i];
1203b681 2349
6ec656bc 2350 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 2351 start - page_offset(p), mirror_num);
ea466794
JB
2352 if (ret)
2353 break;
09cbfeaf 2354 start += PAGE_SIZE;
ea466794
JB
2355 }
2356
2357 return ret;
2358}
2359
4a54c8c1
JS
2360/*
2361 * each time an IO finishes, we do a fast check in the IO failure tree
2362 * to see if we need to process or clean up an io_failure_record
2363 */
7870d082
JB
2364int clean_io_failure(struct btrfs_fs_info *fs_info,
2365 struct extent_io_tree *failure_tree,
2366 struct extent_io_tree *io_tree, u64 start,
2367 struct page *page, u64 ino, unsigned int pg_offset)
4a54c8c1
JS
2368{
2369 u64 private;
4a54c8c1 2370 struct io_failure_record *failrec;
4a54c8c1
JS
2371 struct extent_state *state;
2372 int num_copies;
4a54c8c1 2373 int ret;
4a54c8c1
JS
2374
2375 private = 0;
7870d082
JB
2376 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2377 EXTENT_DIRTY, 0);
4a54c8c1
JS
2378 if (!ret)
2379 return 0;
2380
2279a270
NB
2381 failrec = get_state_failrec(failure_tree, start);
2382 if (IS_ERR(failrec))
4a54c8c1
JS
2383 return 0;
2384
4a54c8c1
JS
2385 BUG_ON(!failrec->this_mirror);
2386
bc98a42c 2387 if (sb_rdonly(fs_info->sb))
908960c6 2388 goto out;
4a54c8c1 2389
7870d082
JB
2390 spin_lock(&io_tree->lock);
2391 state = find_first_extent_bit_state(io_tree,
4a54c8c1
JS
2392 failrec->start,
2393 EXTENT_LOCKED);
7870d082 2394 spin_unlock(&io_tree->lock);
4a54c8c1 2395
883d0de4
MX
2396 if (state && state->start <= failrec->start &&
2397 state->end >= failrec->start + failrec->len - 1) {
3ec706c8
SB
2398 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2399 failrec->len);
4a54c8c1 2400 if (num_copies > 1) {
7870d082
JB
2401 repair_io_failure(fs_info, ino, start, failrec->len,
2402 failrec->logical, page, pg_offset,
2403 failrec->failed_mirror);
4a54c8c1
JS
2404 }
2405 }
2406
2407out:
7870d082 2408 free_io_failure(failure_tree, io_tree, failrec);
4a54c8c1 2409
454ff3de 2410 return 0;
4a54c8c1
JS
2411}
2412
f612496b
MX
2413/*
2414 * Can be called when:
2415 * - holding the extent lock
2416 * - under an ordered extent
2417 * - the inode is being freed
2418 */
7ab7956e 2419void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 2420{
7ab7956e 2421 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
f612496b
MX
2422 struct io_failure_record *failrec;
2423 struct extent_state *state, *next;
2424
2425 if (RB_EMPTY_ROOT(&failure_tree->state))
2426 return;
2427
2428 spin_lock(&failure_tree->lock);
2429 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2430 while (state) {
2431 if (state->start > end)
2432 break;
2433
2434 ASSERT(state->end <= end);
2435
2436 next = next_state(state);
2437
47dc196a 2438 failrec = state->failrec;
f612496b
MX
2439 free_extent_state(state);
2440 kfree(failrec);
2441
2442 state = next;
2443 }
2444 spin_unlock(&failure_tree->lock);
2445}
2446
3526302f 2447static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
150e4b05 2448 u64 start)
4a54c8c1 2449{
ab8d0fc4 2450 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2451 struct io_failure_record *failrec;
4a54c8c1 2452 struct extent_map *em;
4a54c8c1
JS
2453 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2454 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2455 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
150e4b05 2456 const u32 sectorsize = fs_info->sectorsize;
4a54c8c1 2457 int ret;
4a54c8c1
JS
2458 u64 logical;
2459
2279a270 2460 failrec = get_state_failrec(failure_tree, start);
3526302f 2461 if (!IS_ERR(failrec)) {
ab8d0fc4 2462 btrfs_debug(fs_info,
1245835d
QW
2463 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2464 failrec->logical, failrec->start, failrec->len);
4a54c8c1
JS
2465 /*
2466 * when data can be on disk more than twice, add to failrec here
2467 * (e.g. with a list for failed_mirror) to make
2468 * clean_io_failure() clean all those errors at once.
2469 */
3526302f
NB
2470
2471 return failrec;
4a54c8c1 2472 }
2fe6303e 2473
3526302f
NB
2474 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2475 if (!failrec)
2476 return ERR_PTR(-ENOMEM);
2fe6303e 2477
3526302f 2478 failrec->start = start;
150e4b05 2479 failrec->len = sectorsize;
3526302f
NB
2480 failrec->this_mirror = 0;
2481 failrec->bio_flags = 0;
3526302f
NB
2482
2483 read_lock(&em_tree->lock);
2484 em = lookup_extent_mapping(em_tree, start, failrec->len);
2485 if (!em) {
2486 read_unlock(&em_tree->lock);
2487 kfree(failrec);
2488 return ERR_PTR(-EIO);
2489 }
2490
2491 if (em->start > start || em->start + em->len <= start) {
2492 free_extent_map(em);
2493 em = NULL;
2494 }
2495 read_unlock(&em_tree->lock);
2496 if (!em) {
2497 kfree(failrec);
2498 return ERR_PTR(-EIO);
2499 }
2500
2501 logical = start - em->start;
2502 logical = em->block_start + logical;
2503 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2504 logical = em->block_start;
2505 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2506 extent_set_compress_type(&failrec->bio_flags, em->compress_type);
2507 }
2508
2509 btrfs_debug(fs_info,
2510 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2511 logical, start, failrec->len);
2512
2513 failrec->logical = logical;
2514 free_extent_map(em);
2515
2516 /* Set the bits in the private failure tree */
150e4b05 2517 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
3526302f
NB
2518 EXTENT_LOCKED | EXTENT_DIRTY);
2519 if (ret >= 0) {
2520 ret = set_state_failrec(failure_tree, start, failrec);
2521 /* Set the bits in the inode's tree */
150e4b05
QW
2522 ret = set_extent_bits(tree, start, start + sectorsize - 1,
2523 EXTENT_DAMAGED);
3526302f
NB
2524 } else if (ret < 0) {
2525 kfree(failrec);
2526 return ERR_PTR(ret);
2527 }
2528
2529 return failrec;
2fe6303e
MX
2530}
2531
1245835d 2532static bool btrfs_check_repairable(struct inode *inode,
ce06d3ec
OS
2533 struct io_failure_record *failrec,
2534 int failed_mirror)
2fe6303e 2535{
ab8d0fc4 2536 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2537 int num_copies;
2538
ab8d0fc4 2539 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
4a54c8c1
JS
2540 if (num_copies == 1) {
2541 /*
2542 * we only have a single copy of the data, so don't bother with
2543 * all the retry and error correction code that follows. no
2544 * matter what the error is, it is very likely to persist.
2545 */
ab8d0fc4
JM
2546 btrfs_debug(fs_info,
2547 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2548 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2549 return false;
4a54c8c1
JS
2550 }
2551
1245835d
QW
2552 /* The failure record should only contain one sector */
2553 ASSERT(failrec->len == fs_info->sectorsize);
2554
4a54c8c1 2555 /*
1245835d
QW
2556 * There are two premises:
2557 * a) deliver good data to the caller
2558 * b) correct the bad sectors on disk
2559 *
2560 * Since we're only doing repair for one sector, we only need to get
2561 * a good copy of the failed sector and if we succeed, we have setup
2562 * everything for repair_io_failure to do the rest for us.
4a54c8c1 2563 */
1245835d
QW
2564 failrec->failed_mirror = failed_mirror;
2565 failrec->this_mirror++;
2566 if (failrec->this_mirror == failed_mirror)
4a54c8c1 2567 failrec->this_mirror++;
4a54c8c1 2568
facc8a22 2569 if (failrec->this_mirror > num_copies) {
ab8d0fc4
JM
2570 btrfs_debug(fs_info,
2571 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2572 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2573 return false;
4a54c8c1
JS
2574 }
2575
c3cfb656 2576 return true;
2fe6303e
MX
2577}
2578
150e4b05
QW
2579int btrfs_repair_one_sector(struct inode *inode,
2580 struct bio *failed_bio, u32 bio_offset,
2581 struct page *page, unsigned int pgoff,
2582 u64 start, int failed_mirror,
2583 submit_bio_hook_t *submit_bio_hook)
2fe6303e
MX
2584{
2585 struct io_failure_record *failrec;
77d5d689 2586 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2587 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
7870d082 2588 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
77d5d689 2589 struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
7ffd27e3 2590 const int icsum = bio_offset >> fs_info->sectorsize_bits;
77d5d689
OS
2591 struct bio *repair_bio;
2592 struct btrfs_io_bio *repair_io_bio;
4e4cbee9 2593 blk_status_t status;
2fe6303e 2594
77d5d689
OS
2595 btrfs_debug(fs_info,
2596 "repair read error: read error at %llu", start);
2fe6303e 2597
1f7ad75b 2598 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e 2599
150e4b05 2600 failrec = btrfs_get_io_failure_record(inode, start);
3526302f 2601 if (IS_ERR(failrec))
150e4b05 2602 return PTR_ERR(failrec);
2fe6303e 2603
1245835d
QW
2604
2605 if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
7870d082 2606 free_io_failure(failure_tree, tree, failrec);
150e4b05 2607 return -EIO;
2fe6303e
MX
2608 }
2609
77d5d689
OS
2610 repair_bio = btrfs_io_bio_alloc(1);
2611 repair_io_bio = btrfs_io_bio(repair_bio);
2612 repair_bio->bi_opf = REQ_OP_READ;
77d5d689
OS
2613 repair_bio->bi_end_io = failed_bio->bi_end_io;
2614 repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2615 repair_bio->bi_private = failed_bio->bi_private;
2fe6303e 2616
77d5d689 2617 if (failed_io_bio->csum) {
223486c2 2618 const u32 csum_size = fs_info->csum_size;
77d5d689
OS
2619
2620 repair_io_bio->csum = repair_io_bio->csum_inline;
2621 memcpy(repair_io_bio->csum,
2622 failed_io_bio->csum + csum_size * icsum, csum_size);
2623 }
2fe6303e 2624
77d5d689
OS
2625 bio_add_page(repair_bio, page, failrec->len, pgoff);
2626 repair_io_bio->logical = failrec->start;
2627 repair_io_bio->iter = repair_bio->bi_iter;
4a54c8c1 2628
ab8d0fc4 2629 btrfs_debug(btrfs_sb(inode->i_sb),
1245835d
QW
2630 "repair read error: submitting new read to mirror %d",
2631 failrec->this_mirror);
4a54c8c1 2632
77d5d689
OS
2633 status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
2634 failrec->bio_flags);
4e4cbee9 2635 if (status) {
7870d082 2636 free_io_failure(failure_tree, tree, failrec);
77d5d689 2637 bio_put(repair_bio);
6c387ab2 2638 }
150e4b05
QW
2639 return blk_status_to_errno(status);
2640}
2641
2642static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2643{
2644 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2645
2646 ASSERT(page_offset(page) <= start &&
2647 start + len <= page_offset(page) + PAGE_SIZE);
2648
2649 /*
2650 * For the subpage metadata case, all btrfs_page_* helpers need page to
2651 * have page::private populated.
2652 * But we can have a rare case where the last eb in the page is only
2653 * referenced by the IO, and it gets released immediately after it's
2654 * read and verified.
2655 *
2656 * This can detach the page private completely.
2657 * In that case, we can just skip the page status update completely,
2658 * as the page has no eb anymore.
2659 */
2660 if (fs_info->sectorsize < PAGE_SIZE && unlikely(!PagePrivate(page))) {
2661 ASSERT(!is_data_inode(page->mapping->host));
2662 return;
2663 }
2664 if (uptodate) {
2665 btrfs_page_set_uptodate(fs_info, page, start, len);
2666 } else {
2667 btrfs_page_clear_uptodate(fs_info, page, start, len);
2668 btrfs_page_set_error(fs_info, page, start, len);
2669 }
2670
2671 if (fs_info->sectorsize == PAGE_SIZE)
2672 unlock_page(page);
2673 else if (is_data_inode(page->mapping->host))
2674 /*
2675 * For subpage data, unlock the page if we're the last reader.
2676 * For subpage metadata, page lock is not utilized for read.
2677 */
2678 btrfs_subpage_end_reader(fs_info, page, start, len);
2679}
2680
2681static blk_status_t submit_read_repair(struct inode *inode,
2682 struct bio *failed_bio, u32 bio_offset,
2683 struct page *page, unsigned int pgoff,
2684 u64 start, u64 end, int failed_mirror,
2685 unsigned int error_bitmap,
2686 submit_bio_hook_t *submit_bio_hook)
2687{
2688 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2689 const u32 sectorsize = fs_info->sectorsize;
2690 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2691 int error = 0;
2692 int i;
2693
2694 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2695
2696 /* We're here because we had some read errors or csum mismatch */
2697 ASSERT(error_bitmap);
2698
2699 /*
2700 * We only get called on buffered IO, thus page must be mapped and bio
2701 * must not be cloned.
2702 */
2703 ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
2704
2705 /* Iterate through all the sectors in the range */
2706 for (i = 0; i < nr_bits; i++) {
2707 const unsigned int offset = i * sectorsize;
2708 struct extent_state *cached = NULL;
2709 bool uptodate = false;
2710 int ret;
2711
2712 if (!(error_bitmap & (1U << i))) {
2713 /*
2714 * This sector has no error, just end the page read
2715 * and unlock the range.
2716 */
2717 uptodate = true;
2718 goto next;
2719 }
2720
2721 ret = btrfs_repair_one_sector(inode, failed_bio,
2722 bio_offset + offset,
2723 page, pgoff + offset, start + offset,
2724 failed_mirror, submit_bio_hook);
2725 if (!ret) {
2726 /*
2727 * We have submitted the read repair, the page release
2728 * will be handled by the endio function of the
2729 * submitted repair bio.
2730 * Thus we don't need to do any thing here.
2731 */
2732 continue;
2733 }
2734 /*
2735 * Repair failed, just record the error but still continue.
2736 * Or the remaining sectors will not be properly unlocked.
2737 */
2738 if (!error)
2739 error = ret;
2740next:
2741 end_page_read(page, uptodate, start + offset, sectorsize);
2742 if (uptodate)
2743 set_extent_uptodate(&BTRFS_I(inode)->io_tree,
2744 start + offset,
2745 start + offset + sectorsize - 1,
2746 &cached, GFP_ATOMIC);
2747 unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
2748 start + offset,
2749 start + offset + sectorsize - 1,
2750 &cached);
2751 }
2752 return errno_to_blk_status(error);
4a54c8c1
JS
2753}
2754
d1310b2e
CM
2755/* lots and lots of room for performance fixes in the end_bio funcs */
2756
b5227c07 2757void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0
JM
2758{
2759 int uptodate = (err == 0);
3e2426bd 2760 int ret = 0;
87826df0 2761
c629732d 2762 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
87826df0 2763
87826df0 2764 if (!uptodate) {
87826df0
JM
2765 ClearPageUptodate(page);
2766 SetPageError(page);
bff5baf8 2767 ret = err < 0 ? err : -EIO;
5dca6eea 2768 mapping_set_error(page->mapping, ret);
87826df0 2769 }
87826df0
JM
2770}
2771
d1310b2e
CM
2772/*
2773 * after a writepage IO is done, we need to:
2774 * clear the uptodate bits on error
2775 * clear the writeback bits in the extent tree for this IO
2776 * end_page_writeback if the page has no more pending IO
2777 *
2778 * Scheduling is not allowed, so the extent state tree is expected
2779 * to have one and only one object corresponding to this IO.
2780 */
4246a0b6 2781static void end_bio_extent_writepage(struct bio *bio)
d1310b2e 2782{
4e4cbee9 2783 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 2784 struct bio_vec *bvec;
d1310b2e
CM
2785 u64 start;
2786 u64 end;
6dc4f100 2787 struct bvec_iter_all iter_all;
d8e3fb10 2788 bool first_bvec = true;
d1310b2e 2789
c09abff8 2790 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2791 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2792 struct page *page = bvec->bv_page;
0b246afa
JM
2793 struct inode *inode = page->mapping->host;
2794 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
902b22f3 2795
17a5adcc
AO
2796 /* We always issue full-page reads, but if some block
2797 * in a page fails to read, blk_update_request() will
2798 * advance bv_offset and adjust bv_len to compensate.
2799 * Print a warning for nonzero offsets, and an error
2800 * if they don't add up to a full page. */
09cbfeaf
KS
2801 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2802 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
0b246afa 2803 btrfs_err(fs_info,
efe120a0
FH
2804 "partial page write in btrfs with offset %u and length %u",
2805 bvec->bv_offset, bvec->bv_len);
2806 else
0b246afa 2807 btrfs_info(fs_info,
5d163e0e 2808 "incomplete page write in btrfs with offset %u and length %u",
efe120a0
FH
2809 bvec->bv_offset, bvec->bv_len);
2810 }
d1310b2e 2811
17a5adcc
AO
2812 start = page_offset(page);
2813 end = start + bvec->bv_offset + bvec->bv_len - 1;
d1310b2e 2814
d8e3fb10
NA
2815 if (first_bvec) {
2816 btrfs_record_physical_zoned(inode, start, bio);
2817 first_bvec = false;
2818 }
2819
4e4cbee9 2820 end_extent_writepage(page, error, start, end);
17a5adcc 2821 end_page_writeback(page);
2c30c71b 2822 }
2b1f55b0 2823
d1310b2e 2824 bio_put(bio);
d1310b2e
CM
2825}
2826
94e8c95c
QW
2827/*
2828 * Record previously processed extent range
2829 *
2830 * This lets endio_readpage_release_extent() handle a full extent range,
2831 * reducing the number of extent io operations.
2832 */
2833struct processed_extent {
2834 struct btrfs_inode *inode;
2835 /* Start of the range in @inode */
2836 u64 start;
2e626e56 2837 /* End of the range in @inode */
94e8c95c
QW
2838 u64 end;
2839 bool uptodate;
2840};
2841
2842/*
2843 * Try to release processed extent range
2844 *
2845 * May not release the extent range right now if the current range is
2846 * contiguous to processed extent.
2847 *
2848 * Will release processed extent when any of @inode, @uptodate, the range is
2849 * no longer contiguous to the processed range.
2850 *
2851 * Passing @inode == NULL will force processed extent to be released.
2852 */
2853static void endio_readpage_release_extent(struct processed_extent *processed,
2854 struct btrfs_inode *inode, u64 start, u64 end,
2855 bool uptodate)
883d0de4
MX
2856{
2857 struct extent_state *cached = NULL;
94e8c95c
QW
2858 struct extent_io_tree *tree;
2859
2860 /* The first extent, initialize @processed */
2861 if (!processed->inode)
2862 goto update;
883d0de4 2863
94e8c95c
QW
2864 /*
2865 * Contiguous to processed extent, just uptodate the end.
2866 *
2867 * Several things to notice:
2868 *
2869 * - bio can be merged as long as on-disk bytenr is contiguous
2870 * This means we can have page belonging to other inodes, thus need to
2871 * check if the inode still matches.
2872 * - bvec can contain range beyond current page for multi-page bvec
2873 * Thus we need to do processed->end + 1 >= start check
2874 */
2875 if (processed->inode == inode && processed->uptodate == uptodate &&
2876 processed->end + 1 >= start && end >= processed->end) {
2877 processed->end = end;
2878 return;
2879 }
2880
2881 tree = &processed->inode->io_tree;
2882 /*
2883 * Now we don't have range contiguous to the processed range, release
2884 * the processed range now.
2885 */
2886 if (processed->uptodate && tree->track_uptodate)
2887 set_extent_uptodate(tree, processed->start, processed->end,
2888 &cached, GFP_ATOMIC);
2889 unlock_extent_cached_atomic(tree, processed->start, processed->end,
2890 &cached);
2891
2892update:
2893 /* Update processed to current range */
2894 processed->inode = inode;
2895 processed->start = start;
2896 processed->end = end;
2897 processed->uptodate = uptodate;
883d0de4
MX
2898}
2899
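/*
 * Illustrative sketch, not part of the original file: how the read endio loop
 * batches contiguous ranges through struct processed_extent.  Two adjacent
 * ranges of the same inode with the same uptodate status are merged into a
 * single unlock/uptodate operation, and the final call with a NULL inode
 * flushes whatever is still pending.  The offsets are arbitrary example
 * values.
 */
static void example_release_contiguous_ranges(struct btrfs_inode *inode)
{
	struct processed_extent processed = { 0 };

	/* First range: starts the pending batch */
	endio_readpage_release_extent(&processed, inode, 0, SZ_4K - 1, true);
	/* Second range is contiguous, so it only extends processed.end */
	endio_readpage_release_extent(&processed, inode, SZ_4K, SZ_8K - 1, true);
	/* Force the release of the merged [0, SZ_8K - 1] range */
	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
}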
92082d40
QW
2900static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2901{
2902 ASSERT(PageLocked(page));
2903 if (fs_info->sectorsize == PAGE_SIZE)
2904 return;
2905
2906 ASSERT(PagePrivate(page));
2907 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2908}
2909
d9bb77d5
QW
2910/*
2911 * Find the extent buffer for a given bytenr.
2912 *
2913 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2914 * in endio context.
2915 */
2916static struct extent_buffer *find_extent_buffer_readpage(
2917 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2918{
2919 struct extent_buffer *eb;
2920
2921 /*
2922 * For regular sectorsize, we can use page->private to grab extent
2923 * buffer
2924 */
2925 if (fs_info->sectorsize == PAGE_SIZE) {
2926 ASSERT(PagePrivate(page) && page->private);
2927 return (struct extent_buffer *)page->private;
2928 }
2929
2930 /* For subpage case, we need to lookup buffer radix tree */
2931 rcu_read_lock();
2932 eb = radix_tree_lookup(&fs_info->buffer_radix,
2933 bytenr >> fs_info->sectorsize_bits);
2934 rcu_read_unlock();
2935 ASSERT(eb);
2936 return eb;
2937}
2938
d1310b2e
CM
2939/*
2940 * after a readpage IO is done, we need to:
2941 * clear the uptodate bits on error
2942 * set the uptodate bits if things worked
2943 * set the page up to date if all extents in the tree are uptodate
2944 * clear the lock bit in the extent tree
2945 * unlock the page if there are no other extents locked for it
2946 *
2947 * Scheduling is not allowed, so the extent state tree is expected
2948 * to have one and only one object corresponding to this IO.
2949 */
4246a0b6 2950static void end_bio_extent_readpage(struct bio *bio)
d1310b2e 2951{
2c30c71b 2952 struct bio_vec *bvec;
facc8a22 2953 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7870d082 2954 struct extent_io_tree *tree, *failure_tree;
94e8c95c 2955 struct processed_extent processed = { 0 };
7ffd27e3
QW
2956 /*
2957 * The offset to the beginning of a bio, since one bio can never be
2958 * larger than UINT_MAX, u32 here is enough.
2959 */
2960 u32 bio_offset = 0;
5cf1ab56 2961 int mirror;
d1310b2e 2962 int ret;
6dc4f100 2963 struct bvec_iter_all iter_all;
d1310b2e 2964
c09abff8 2965 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2966 bio_for_each_segment_all(bvec, bio, iter_all) {
150e4b05 2967 bool uptodate = !bio->bi_status;
d1310b2e 2968 struct page *page = bvec->bv_page;
a71754fc 2969 struct inode *inode = page->mapping->host;
ab8d0fc4 2970 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7ffd27e3 2971 const u32 sectorsize = fs_info->sectorsize;
150e4b05 2972 unsigned int error_bitmap = (unsigned int)-1;
7ffd27e3
QW
2973 u64 start;
2974 u64 end;
2975 u32 len;
507903b8 2976
ab8d0fc4
JM
2977 btrfs_debug(fs_info,
2978 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
1201b58b 2979 bio->bi_iter.bi_sector, bio->bi_status,
ab8d0fc4 2980 io_bio->mirror_num);
a71754fc 2981 tree = &BTRFS_I(inode)->io_tree;
7870d082 2982 failure_tree = &BTRFS_I(inode)->io_failure_tree;
902b22f3 2983
8b8bbd46
QW
2984 /*
2985 * We always issue full-sector reads, but if some block in a
2986 * page fails to read, blk_update_request() will advance
2987 * bv_offset and adjust bv_len to compensate. Print a warning
2988 * for unaligned offsets, and an error if they don't add up to
2989 * a full sector.
2990 */
2991 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2992 btrfs_err(fs_info,
2993 "partial page read in btrfs with offset %u and length %u",
2994 bvec->bv_offset, bvec->bv_len);
2995 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
2996 sectorsize))
2997 btrfs_info(fs_info,
2998 "incomplete page read with offset %u and length %u",
2999 bvec->bv_offset, bvec->bv_len);
3000
3001 start = page_offset(page) + bvec->bv_offset;
3002 end = start + bvec->bv_len - 1;
facc8a22 3003 len = bvec->bv_len;
d1310b2e 3004
9be3395b 3005 mirror = io_bio->mirror_num;
78e62c02 3006 if (likely(uptodate)) {
150e4b05
QW
3007 if (is_data_inode(inode)) {
3008 error_bitmap = btrfs_verify_data_csum(io_bio,
5e295768 3009 bio_offset, page, start, end);
150e4b05
QW
3010 ret = error_bitmap;
3011 } else {
9a446d6a 3012 ret = btrfs_validate_metadata_buffer(io_bio,
8e1dc982 3013 page, start, end, mirror);
150e4b05 3014 }
5ee0844d 3015 if (ret)
150e4b05 3016 uptodate = false;
5ee0844d 3017 else
7870d082
JB
3018 clean_io_failure(BTRFS_I(inode)->root->fs_info,
3019 failure_tree, tree, start,
3020 page,
3021 btrfs_ino(BTRFS_I(inode)), 0);
d1310b2e 3022 }
ea466794 3023
f2a09da9
MX
3024 if (likely(uptodate))
3025 goto readpage_ok;
3026
be17b3af 3027 if (is_data_inode(inode)) {
f4a8e656 3028 /*
150e4b05
QW
3029 * btrfs_submit_read_repair() will handle all the good
3030 * and bad sectors, we just continue to the next bvec.
f4a8e656 3031 */
150e4b05
QW
3032 submit_read_repair(inode, bio, bio_offset, page,
3033 start - page_offset(page), start,
3034 end, mirror, error_bitmap,
3035 btrfs_submit_data_bio);
3036
3037 ASSERT(bio_offset + len > bio_offset);
3038 bio_offset += len;
3039 continue;
78e62c02
NB
3040 } else {
3041 struct extent_buffer *eb;
3042
d9bb77d5 3043 eb = find_extent_buffer_readpage(fs_info, page, start);
78e62c02
NB
3044 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3045 eb->read_mirror = mirror;
3046 atomic_dec(&eb->io_pages);
3047 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
3048 &eb->bflags))
3049 btree_readahead_hook(eb, -EIO);
7e38326f 3050 }
f2a09da9 3051readpage_ok:
883d0de4 3052 if (likely(uptodate)) {
a71754fc 3053 loff_t i_size = i_size_read(inode);
09cbfeaf 3054 pgoff_t end_index = i_size >> PAGE_SHIFT;
a71754fc 3055
c28ea613
QW
3056 /*
3057 * Zero out the remaining part if this range straddles
3058 * i_size.
3059 *
3060 * Here we should only zero the range inside the bvec,
3061 * not touch anything else.
3062 *
3063 * NOTE: i_size is exclusive while end is inclusive.
3064 */
3065 if (page->index == end_index && i_size <= end) {
3066 u32 zero_start = max(offset_in_page(i_size),
d2dcc8ed 3067 offset_in_page(start));
c28ea613
QW
3068
3069 zero_user_segment(page, zero_start,
3070 offset_in_page(end) + 1);
3071 }
70dec807 3072 }
7ffd27e3
QW
3073 ASSERT(bio_offset + len > bio_offset);
3074 bio_offset += len;
883d0de4 3075
e09caaf9 3076 /* Update page status and unlock */
92082d40 3077 end_page_read(page, uptodate, start, len);
94e8c95c
QW
3078 endio_readpage_release_extent(&processed, BTRFS_I(inode),
3079 start, end, uptodate);
2c30c71b 3080 }
94e8c95c
QW
3081 /* Release the last extent */
3082 endio_readpage_release_extent(&processed, NULL, 0, 0, false);
b3a0dd50 3083 btrfs_io_bio_free_csum(io_bio);
d1310b2e 3084 bio_put(bio);
d1310b2e
CM
3085}
3086
9be3395b 3087/*
184f999e
DS
3088 * Initialize the members up to but not including 'bio'. Use after allocating a
3089 * new bio with bio_alloc_bioset, which does not initialize the bytes outside of
3090 * 'bio' because __GFP_ZERO is not supported.
9be3395b 3091 */
184f999e 3092static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
d1310b2e 3093{
184f999e
DS
3094 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
3095}
d1310b2e 3096
9be3395b 3097/*
6e707bcd
DS
3098 * The following helpers allocate a bio. As it's backed by a bioset, it'll
3099 * never fail. We return a plain bio here, but the caller can use btrfs_io_bio
3100 * for the appropriate container_of magic.
9be3395b 3101 */
e749af44 3102struct bio *btrfs_bio_alloc(u64 first_byte)
d1310b2e
CM
3103{
3104 struct bio *bio;
d1310b2e 3105
a8affc03 3106 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
c821e7f3 3107 bio->bi_iter.bi_sector = first_byte >> 9;
184f999e 3108 btrfs_io_bio_init(btrfs_io_bio(bio));
d1310b2e
CM
3109 return bio;
3110}
3111
8b6c1d56 3112struct bio *btrfs_bio_clone(struct bio *bio)
9be3395b 3113{
23ea8e5a
MX
3114 struct btrfs_io_bio *btrfs_bio;
3115 struct bio *new;
9be3395b 3116
6e707bcd 3117 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 3118 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
6e707bcd 3119 btrfs_bio = btrfs_io_bio(new);
184f999e 3120 btrfs_io_bio_init(btrfs_bio);
6e707bcd 3121 btrfs_bio->iter = bio->bi_iter;
23ea8e5a
MX
3122 return new;
3123}
9be3395b 3124
c5e4c3d7 3125struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
9be3395b 3126{
facc8a22
MX
3127 struct bio *bio;
3128
6e707bcd 3129 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 3130 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
184f999e 3131 btrfs_io_bio_init(btrfs_io_bio(bio));
facc8a22 3132 return bio;
9be3395b
CM
3133}
3134
e477094f 3135struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2f8e9140
LB
3136{
3137 struct bio *bio;
3138 struct btrfs_io_bio *btrfs_bio;
3139
3140 /* this will never fail when it's backed by a bioset */
8ac9f7c1 3141 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2f8e9140
LB
3142 ASSERT(bio);
3143
3144 btrfs_bio = btrfs_io_bio(bio);
184f999e 3145 btrfs_io_bio_init(btrfs_bio);
2f8e9140
LB
3146
3147 bio_trim(bio, offset >> 9, size >> 9);
17347cec 3148 btrfs_bio->iter = bio->bi_iter;
2f8e9140
LB
3149 return bio;
3150}
9be3395b 3151
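/*
 * Illustrative sketch, not part of the original file: the usual pattern for
 * the allocation helpers above.  btrfs_bio_alloc() seeds bi_sector from a
 * logical byte offset, after which pages and the completion handler are
 * attached as usual; example_end_io is a hypothetical bio_end_io_t supplied
 * by the caller.
 */
static struct bio *example_build_read_bio(struct page *page, u64 disk_bytenr,
					  unsigned int len,
					  bio_end_io_t *example_end_io)
{
	struct bio *bio = btrfs_bio_alloc(disk_bytenr);

	bio->bi_opf = REQ_OP_READ;
	bio->bi_end_io = example_end_io;
	bio_add_page(bio, page, len, 0);
	return bio;
}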
953651eb
NA
3152/**
3153 * Attempt to add a page to bio
3154 *
3155 * @bio: destination bio
3156 * @page: page to add to the bio
3157 * @disk_bytenr: disk byte number of the data being added, used to check
3158 * whether it is contiguous with the previous range in the bio
3159 * @pg_offset: starting offset in the page
3160 * @size: portion of page that we want to write
3161 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
3162 * @bio_flags: flags of the current bio to see if we can merge them
3163 * @return: true if page was added, false otherwise
3164 *
3165 * Attempt to add a page to the bio, considering stripe alignment etc.
3166 *
3167 * Return true if the page was successfully added, false otherwise.
3168 */
390ed29b
QW
3169static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3170 struct page *page,
953651eb
NA
3171 u64 disk_bytenr, unsigned int size,
3172 unsigned int pg_offset,
953651eb
NA
3173 unsigned long bio_flags)
3174{
390ed29b
QW
3175 struct bio *bio = bio_ctrl->bio;
3176 u32 bio_size = bio->bi_iter.bi_size;
953651eb
NA
3177 const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3178 bool contig;
e1326f03 3179 int ret;
953651eb 3180
390ed29b
QW
3181 ASSERT(bio);
3182 /* The limit should be calculated when bio_ctrl->bio is allocated */
3183 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3184 if (bio_ctrl->bio_flags != bio_flags)
953651eb
NA
3185 return false;
3186
390ed29b 3187 if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
953651eb
NA
3188 contig = bio->bi_iter.bi_sector == sector;
3189 else
3190 contig = bio_end_sector(bio) == sector;
3191 if (!contig)
3192 return false;
3193
390ed29b
QW
3194 if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
3195 bio_size + size > bio_ctrl->len_to_stripe_boundary)
953651eb
NA
3196 return false;
3197
390ed29b 3198 if (bio_op(bio) == REQ_OP_ZONE_APPEND)
e1326f03 3199 ret = bio_add_zone_append_page(bio, page, size, pg_offset);
390ed29b 3200 else
e1326f03
NA
3201 ret = bio_add_page(bio, page, size, pg_offset);
3202
3203 return ret == size;
953651eb
NA
3204}
3205
390ed29b
QW
3206static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
3207 struct btrfs_inode *inode)
3208{
3209 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3210 struct btrfs_io_geometry geom;
3211 struct btrfs_ordered_extent *ordered;
3212 struct extent_map *em;
3213 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3214 int ret;
3215
3216 /*
3217 * Pages for compressed extent are never submitted to disk directly,
3218 * thus it has no real boundary, just set them to U32_MAX.
3219 *
3220 * The split happens for real compressed bio, which happens in
3221 * btrfs_submit_compressed_read/write().
3222 */
3223 if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
3224 bio_ctrl->len_to_oe_boundary = U32_MAX;
3225 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3226 return 0;
3227 }
3228 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3229 if (IS_ERR(em))
3230 return PTR_ERR(em);
3231 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3232 logical, &geom);
3233 free_extent_map(em);
3234 if (ret < 0) {
3235 return ret;
3236 }
3237 if (geom.len > U32_MAX)
3238 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3239 else
3240 bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3241
3242 if (!btrfs_is_zoned(fs_info) ||
3243 bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
3244 bio_ctrl->len_to_oe_boundary = U32_MAX;
3245 return 0;
3246 }
3247
3248 ASSERT(fs_info->max_zone_append_size > 0);
3249 /* Ordered extent not yet created, so we're good */
3250 ordered = btrfs_lookup_ordered_extent(inode, logical);
3251 if (!ordered) {
3252 bio_ctrl->len_to_oe_boundary = U32_MAX;
3253 return 0;
3254 }
3255
3256 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3257 ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3258 btrfs_put_ordered_extent(ordered);
3259 return 0;
3260}
3261
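/*
 * Illustrative sketch, not part of the original file: the boundary check that
 * btrfs_bio_add_page() applies against the limits computed by
 * calc_bio_boundaries().  A page is rejected (and the current bio submitted)
 * as soon as adding @size bytes would cross either the stripe boundary or the
 * ordered extent boundary used for zone append.
 */
static bool example_fits_in_bio(const struct btrfs_bio_ctrl *bio_ctrl, u32 size)
{
	u32 bio_size = bio_ctrl->bio->bi_iter.bi_size;

	return bio_size + size <= bio_ctrl->len_to_oe_boundary &&
	       bio_size + size <= bio_ctrl->len_to_stripe_boundary;
}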
4b81ba48
DS
3262/*
3263 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625
DS
3264 * @wbc: optional writeback control for io accounting
3265 * @page: page to add to the bio
0c64c33c
QW
3266 * @disk_bytenr: logical bytenr where the write will be
3267 * @size: portion of page that we want to write to
b8b3d625
DS
3268 * @pg_offset: offset within @page at which the new IO range starts; the page
3269 * is added to the bio at this offset
5c2b1fd7 3270 * @bio_ctrl: must be a valid pointer; bio_ctrl->bio is reused when set, otherwise a new bio is allocated and stored there
b8b3d625
DS
3271 * @end_io_func: end_io callback for new bio
3272 * @mirror_num: desired mirror to read/write
3273 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
3274 * @bio_flags: flags of the current bio to see if we can merge them
4b81ba48 3275 */
0ceb34bf 3276static int submit_extent_page(unsigned int opf,
da2f0f74 3277 struct writeback_control *wbc,
390ed29b 3278 struct btrfs_bio_ctrl *bio_ctrl,
0c64c33c 3279 struct page *page, u64 disk_bytenr,
6c5a4e2c 3280 size_t size, unsigned long pg_offset,
f188591e 3281 bio_end_io_t end_io_func,
c8b97818 3282 int mirror_num,
005efedf
FM
3283 unsigned long bio_flags,
3284 bool force_bio_submit)
d1310b2e
CM
3285{
3286 int ret = 0;
3287 struct bio *bio;
e940e9a7 3288 size_t io_size = min_t(size_t, size, PAGE_SIZE);
e1326f03
NA
3289 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3290 struct extent_io_tree *tree = &inode->io_tree;
3291 struct btrfs_fs_info *fs_info = inode->root->fs_info;
d1310b2e 3292
390ed29b 3293 ASSERT(bio_ctrl);
5c2b1fd7 3294
390ed29b
QW
3295 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3296 pg_offset + size <= PAGE_SIZE);
3297 if (bio_ctrl->bio) {
3298 bio = bio_ctrl->bio;
953651eb 3299 if (force_bio_submit ||
390ed29b
QW
3300 !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
3301 pg_offset, bio_flags)) {
3302 ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
3303 bio_ctrl->bio = NULL;
3304 if (ret < 0)
79787eaa 3305 return ret;
d1310b2e 3306 } else {
da2f0f74 3307 if (wbc)
e940e9a7 3308 wbc_account_cgroup_owner(wbc, page, io_size);
d1310b2e
CM
3309 return 0;
3310 }
3311 }
c8b97818 3312
0c64c33c 3313 bio = btrfs_bio_alloc(disk_bytenr);
e940e9a7 3314 bio_add_page(bio, page, io_size, pg_offset);
d1310b2e
CM
3315 bio->bi_end_io = end_io_func;
3316 bio->bi_private = tree;
e6959b93 3317 bio->bi_write_hint = page->mapping->host->i_write_hint;
4b81ba48 3318 bio->bi_opf = opf;
da2f0f74 3319 if (wbc) {
429aebc0
DS
3320 struct block_device *bdev;
3321
e1326f03 3322 bdev = fs_info->fs_devices->latest_bdev;
429aebc0 3323 bio_set_dev(bio, bdev);
da2f0f74 3324 wbc_init_bio(wbc, bio);
e940e9a7 3325 wbc_account_cgroup_owner(wbc, page, io_size);
da2f0f74 3326 }
e1326f03 3327 if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
e7ff9e6b 3328 struct btrfs_device *device;
e1326f03 3329
e7ff9e6b
JT
3330 device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
3331 if (IS_ERR(device))
3332 return PTR_ERR(device);
e1326f03 3333
e7ff9e6b 3334 btrfs_io_bio(bio)->device = device;
e1326f03 3335 }
70dec807 3336
390ed29b
QW
3337 bio_ctrl->bio = bio;
3338 bio_ctrl->bio_flags = bio_flags;
3339 ret = calc_bio_boundaries(bio_ctrl, inode);
d1310b2e
CM
3340
3341 return ret;
3342}
3343
760f991f
QW
3344static int attach_extent_buffer_page(struct extent_buffer *eb,
3345 struct page *page,
3346 struct btrfs_subpage *prealloc)
d1310b2e 3347{
760f991f
QW
3348 struct btrfs_fs_info *fs_info = eb->fs_info;
3349 int ret = 0;
3350
0d01e247
QW
3351 /*
3352 * If the page is mapped to btree inode, we should hold the private
3353 * lock to prevent race.
3354 * For cloned or dummy extent buffers, their pages are not mapped and
3355 * will not race with any other ebs.
3356 */
3357 if (page->mapping)
3358 lockdep_assert_held(&page->mapping->private_lock);
3359
760f991f
QW
3360 if (fs_info->sectorsize == PAGE_SIZE) {
3361 if (!PagePrivate(page))
3362 attach_page_private(page, eb);
3363 else
3364 WARN_ON(page->private != (unsigned long)eb);
3365 return 0;
3366 }
3367
3368 /* Already mapped, just free prealloc */
3369 if (PagePrivate(page)) {
3370 btrfs_free_subpage(prealloc);
3371 return 0;
3372 }
3373
3374 if (prealloc)
3375 /* Has preallocated memory for subpage */
3376 attach_page_private(page, prealloc);
d1b89bc0 3377 else
760f991f
QW
3378 /* Do new allocation to attach subpage */
3379 ret = btrfs_attach_subpage(fs_info, page,
3380 BTRFS_SUBPAGE_METADATA);
3381 return ret;
d1310b2e
CM
3382}
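/*
 * Illustrative sketch, not part of this file: the @prealloc argument above
 * lets the caller allocate the subpage structure outside of
 * mapping->private_lock and merely attach (or discard) it here.  The
 * stand-alone helper below shows that "allocate first, attach or free under
 * the lock" shape; all demo_* names and callbacks are hypothetical.
 */
static int demo_attach_private(void **slot, void *prealloc,
                               void *(*alloc)(void), void (*free_obj)(void *))
{
        if (*slot) {
                /* Someone attached first: our spare is no longer needed. */
                if (prealloc)
                        free_obj(prealloc);
                return 0;
        }
        if (prealloc) {
                *slot = prealloc;       /* use the caller's allocation */
                return 0;
        }
        *slot = alloc();                /* fall back to allocating here */
        return *slot ? 0 : -1;
}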
3383
32443de3 3384int set_page_extent_mapped(struct page *page)
d1310b2e 3385{
32443de3
QW
3386 struct btrfs_fs_info *fs_info;
3387
3388 ASSERT(page->mapping);
3389
3390 if (PagePrivate(page))
3391 return 0;
3392
3393 fs_info = btrfs_sb(page->mapping->host->i_sb);
3394
3395 if (fs_info->sectorsize < PAGE_SIZE)
3396 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3397
3398 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3399 return 0;
3400}
3401
3402void clear_page_extent_mapped(struct page *page)
3403{
3404 struct btrfs_fs_info *fs_info;
3405
3406 ASSERT(page->mapping);
3407
d1b89bc0 3408 if (!PagePrivate(page))
32443de3
QW
3409 return;
3410
3411 fs_info = btrfs_sb(page->mapping->host->i_sb);
3412 if (fs_info->sectorsize < PAGE_SIZE)
3413 return btrfs_detach_subpage(fs_info, page);
3414
3415 detach_page_private(page);
d1310b2e
CM
3416}
3417
125bac01
MX
3418static struct extent_map *
3419__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
1a5ee1e6 3420 u64 start, u64 len, struct extent_map **em_cached)
125bac01
MX
3421{
3422 struct extent_map *em;
3423
3424 if (em_cached && *em_cached) {
3425 em = *em_cached;
cbc0e928 3426 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 3427 start < extent_map_end(em)) {
490b54d6 3428 refcount_inc(&em->refs);
125bac01
MX
3429 return em;
3430 }
3431
3432 free_extent_map(em);
3433 *em_cached = NULL;
3434 }
3435
1a5ee1e6 3436 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
125bac01
MX
3437 if (em_cached && !IS_ERR_OR_NULL(em)) {
3438 BUG_ON(*em_cached);
490b54d6 3439 refcount_inc(&em->refs);
125bac01
MX
3440 *em_cached = em;
3441 }
3442 return em;
3443}
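/*
 * Illustrative sketch, not part of this file: __get_extent_map() above acts
 * as a single-slot, refcounted cache keyed by file offset.  The simplified
 * version below shows the same "hit -> grab an extra ref, miss -> drop the
 * cached entry and refill" flow; demo_em, demo_get_cached and the
 * get_extent callback are hypothetical names.
 */
struct demo_em {
        unsigned long long start;
        unsigned long long len;
        int refs;                       /* stands in for refcount_t */
};

static struct demo_em *demo_get_cached(unsigned long long offset,
                                       struct demo_em *(*get_extent)(unsigned long long),
                                       struct demo_em **cached)
{
        struct demo_em *em;

        if (*cached) {
                em = *cached;
                if (offset >= em->start && offset < em->start + em->len) {
                        em->refs++;     /* cache hit: extra ref for the caller */
                        return em;
                }
                em->refs--;             /* stale entry: drop the cache's ref */
                *cached = 0;
        }

        em = get_extent(offset);        /* miss: look it up for real */
        if (em) {
                em->refs++;             /* keep one reference in the cache */
                *cached = em;
        }
        return em;
}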
d1310b2e
CM
3444/*
3445 * Basic readpage implementation. Locked extent state structs are inserted
3446 * into the tree and removed when the IO is done (by the end_io
3447 * handlers).
79787eaa 3448 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 3449 * return 0 on success, otherwise return error
d1310b2e 3450 */
0f208812 3451int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
390ed29b 3452 struct btrfs_bio_ctrl *bio_ctrl,
0f208812 3453 unsigned int read_flags, u64 *prev_em_start)
d1310b2e
CM
3454{
3455 struct inode *inode = page->mapping->host;
92082d40 3456 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4eee4fa4 3457 u64 start = page_offset(page);
8eec8296 3458 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3459 u64 cur = start;
3460 u64 extent_offset;
3461 u64 last_byte = i_size_read(inode);
3462 u64 block_start;
3463 u64 cur_end;
d1310b2e 3464 struct extent_map *em;
baf863b9 3465 int ret = 0;
d1310b2e 3466 int nr = 0;
306e16ce 3467 size_t pg_offset = 0;
d1310b2e
CM
3468 size_t iosize;
3469 size_t blocksize = inode->i_sb->s_blocksize;
7f042a83 3470 unsigned long this_bio_flag = 0;
f657a31c 3471 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ae6957eb 3472
32443de3
QW
3473 ret = set_page_extent_mapped(page);
3474 if (ret < 0) {
3475 unlock_extent(tree, start, end);
92082d40
QW
3476 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3477 unlock_page(page);
32443de3
QW
3478 goto out;
3479 }
d1310b2e 3480
90a887c9
DM
3481 if (!PageUptodate(page)) {
3482 if (cleancache_get_page(page) == 0) {
3483 BUG_ON(blocksize != PAGE_SIZE);
9974090b 3484 unlock_extent(tree, start, end);
92082d40 3485 unlock_page(page);
90a887c9
DM
3486 goto out;
3487 }
3488 }
3489
09cbfeaf 3490 if (page->index == last_byte >> PAGE_SHIFT) {
7073017a 3491 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
3492
3493 if (zero_offset) {
09cbfeaf 3494 iosize = PAGE_SIZE - zero_offset;
d048b9c2 3495 memzero_page(page, zero_offset, iosize);
c8b97818 3496 flush_dcache_page(page);
c8b97818
CM
3497 }
3498 }
92082d40 3499 begin_page_read(fs_info, page);
d1310b2e 3500 while (cur <= end) {
005efedf 3501 bool force_bio_submit = false;
0c64c33c 3502 u64 disk_bytenr;
c8f2f24b 3503
d1310b2e 3504 if (cur >= last_byte) {
507903b8
AJ
3505 struct extent_state *cached = NULL;
3506
09cbfeaf 3507 iosize = PAGE_SIZE - pg_offset;
d048b9c2 3508 memzero_page(page, pg_offset, iosize);
d1310b2e 3509 flush_dcache_page(page);
d1310b2e 3510 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3511 &cached, GFP_NOFS);
7f042a83 3512 unlock_extent_cached(tree, cur,
e43bbe5e 3513 cur + iosize - 1, &cached);
92082d40 3514 end_page_read(page, true, cur, iosize);
d1310b2e
CM
3515 break;
3516 }
125bac01 3517 em = __get_extent_map(inode, page, pg_offset, cur,
1a5ee1e6 3518 end - cur + 1, em_cached);
c704005d 3519 if (IS_ERR_OR_NULL(em)) {
7f042a83 3520 unlock_extent(tree, cur, end);
92082d40 3521 end_page_read(page, false, cur, end + 1 - cur);
d1310b2e
CM
3522 break;
3523 }
d1310b2e
CM
3524 extent_offset = cur - em->start;
3525 BUG_ON(extent_map_end(em) <= cur);
3526 BUG_ON(end < cur);
3527
261507a0 3528 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4b384318 3529 this_bio_flag |= EXTENT_BIO_COMPRESSED;
261507a0
LZ
3530 extent_set_compress_type(&this_bio_flag,
3531 em->compress_type);
3532 }
c8b97818 3533
d1310b2e
CM
3534 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3535 cur_end = min(extent_map_end(em) - 1, end);
fda2832f 3536 iosize = ALIGN(iosize, blocksize);
949b3273 3537 if (this_bio_flag & EXTENT_BIO_COMPRESSED)
0c64c33c 3538 disk_bytenr = em->block_start;
949b3273 3539 else
0c64c33c 3540 disk_bytenr = em->block_start + extent_offset;
d1310b2e 3541 block_start = em->block_start;
d899e052
YZ
3542 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3543 block_start = EXTENT_MAP_HOLE;
005efedf
FM
3544
3545 /*
3546 * If we have a file range that points to a compressed extent
260db43c 3547 * and it's followed by a consecutive file range that points
005efedf
FM
3548 * to the same compressed extent (possibly with a different
3549 * offset and/or length, so it either points to the whole extent
3550 * or only part of it), we must make sure we do not submit a
3551 * single bio to populate the pages for the 2 ranges because
3552 * this makes the compressed extent read zero out the pages
3553 * belonging to the 2nd range. Imagine the following scenario:
3554 *
3555 * File layout
3556 * [0 - 8K] [8K - 24K]
3557 * | |
3558 * | |
3559 * points to extent X, points to extent X,
3560 * offset 4K, length of 8K offset 0, length 16K
3561 *
3562 * [extent X, compressed length = 4K uncompressed length = 16K]
3563 *
3564 * If the bio to read the compressed extent covers both ranges,
3565 * it will decompress extent X into the pages belonging to the
3566 * first range and then it will stop, zeroing out the remaining
3567 * pages that belong to the other range that points to extent X.
3568 * So here we make sure we submit 2 bios, one for the first
3569 * range and another one for the second range. Both will target
3570 * the same physical extent from disk, but we can't currently
3571 * make the compressed bio endio callback populate the pages
3572 * for both ranges because each compressed bio is tightly
3573 * coupled with a single extent map, and each range can have
3574 * an extent map with a different offset value relative to the
3575 * uncompressed data of our extent and different lengths. This
3576 * is a corner case so we prioritize correctness and accept the
3577 * non-optimal behavior (submitting 2 bios for the same extent).
3578 */
3579 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3580 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 3581 *prev_em_start != em->start)
005efedf
FM
3582 force_bio_submit = true;
3583
3584 if (prev_em_start)
8e928218 3585 *prev_em_start = em->start;
005efedf 3586
d1310b2e
CM
3587 free_extent_map(em);
3588 em = NULL;
3589
3590 /* we've found a hole, just zero and go on */
3591 if (block_start == EXTENT_MAP_HOLE) {
507903b8
AJ
3592 struct extent_state *cached = NULL;
3593
d048b9c2 3594 memzero_page(page, pg_offset, iosize);
d1310b2e 3595 flush_dcache_page(page);
d1310b2e
CM
3596
3597 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3598 &cached, GFP_NOFS);
7f042a83 3599 unlock_extent_cached(tree, cur,
e43bbe5e 3600 cur + iosize - 1, &cached);
92082d40 3601 end_page_read(page, true, cur, iosize);
d1310b2e 3602 cur = cur + iosize;
306e16ce 3603 pg_offset += iosize;
d1310b2e
CM
3604 continue;
3605 }
3606 /* the get_extent function already copied into the page */
9655d298
CM
3607 if (test_range_bit(tree, cur, cur_end,
3608 EXTENT_UPTODATE, 1, NULL)) {
a1b32a59 3609 check_page_uptodate(tree, page);
7f042a83 3610 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3611 end_page_read(page, true, cur, iosize);
d1310b2e 3612 cur = cur + iosize;
306e16ce 3613 pg_offset += iosize;
d1310b2e
CM
3614 continue;
3615 }
70dec807
CM
3616 /* We have an inline extent but it didn't get marked
3617 * uptodate. Error out.
3618 */
3619 if (block_start == EXTENT_MAP_INLINE) {
7f042a83 3620 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3621 end_page_read(page, false, cur, iosize);
70dec807 3622 cur = cur + iosize;
306e16ce 3623 pg_offset += iosize;
70dec807
CM
3624 continue;
3625 }
d1310b2e 3626
0ceb34bf 3627 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
390ed29b
QW
3628 bio_ctrl, page, disk_bytenr, iosize,
3629 pg_offset,
fd513000 3630 end_bio_extent_readpage, 0,
005efedf
FM
3631 this_bio_flag,
3632 force_bio_submit);
c8f2f24b
JB
3633 if (!ret) {
3634 nr++;
c8f2f24b 3635 } else {
7f042a83 3636 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3637 end_page_read(page, false, cur, iosize);
baf863b9 3638 goto out;
edd33c99 3639 }
d1310b2e 3640 cur = cur + iosize;
306e16ce 3641 pg_offset += iosize;
d1310b2e 3642 }
90a887c9 3643out:
baf863b9 3644 return ret;
d1310b2e
CM
3645}
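/*
 * Illustrative sketch, not part of this file: the read loop above makes one
 * decision per block-sized chunk of the page.  The helper below summarizes
 * those cases; the demo_* names are hypothetical and the flags stand for
 * the conditions tested in btrfs_do_readpage().
 */
enum demo_read_action {
        DEMO_READ_ZERO,         /* zero the range and mark it uptodate */
        DEMO_READ_SKIP,         /* already uptodate, nothing to read */
        DEMO_READ_ERROR,        /* inline extent that is not uptodate */
        DEMO_READ_SUBMIT,       /* queue a read bio for the range */
};

static enum demo_read_action demo_classify_read(int past_eof, int is_hole,
                                                int already_uptodate,
                                                int is_inline)
{
        if (past_eof || is_hole)
                return DEMO_READ_ZERO;
        if (already_uptodate)
                return DEMO_READ_SKIP;
        if (is_inline)
                return DEMO_READ_ERROR;
        return DEMO_READ_SUBMIT;
}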
3646
b6660e80 3647static inline void contiguous_readpages(struct page *pages[], int nr_pages,
390ed29b
QW
3648 u64 start, u64 end,
3649 struct extent_map **em_cached,
3650 struct btrfs_bio_ctrl *bio_ctrl,
3651 u64 *prev_em_start)
9974090b 3652{
23d31bd4 3653 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
9974090b
MX
3654 int index;
3655
b272ae22 3656 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
3657
3658 for (index = 0; index < nr_pages; index++) {
390ed29b 3659 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
0f208812 3660 REQ_RAHEAD, prev_em_start);
09cbfeaf 3661 put_page(pages[index]);
9974090b
MX
3662 }
3663}
3664
3d4b9496 3665static void update_nr_written(struct writeback_control *wbc,
a9132667 3666 unsigned long nr_written)
11c8349b
CM
3667{
3668 wbc->nr_to_write -= nr_written;
11c8349b
CM
3669}
3670
d1310b2e 3671/*
40f76580
CM
3672 * helper for __extent_writepage, doing all of the delayed allocation setup.
3673 *
5eaad97a 3674 * This returns 1 if the btrfs_run_delalloc_range function did all the work required
40f76580
CM
3675 * to write the page (copy into inline extent). In this case the IO has
3676 * been started and the page is already unlocked.
3677 *
3678 * This returns 0 if all went well (page still locked)
3679 * This returns < 0 if there were errors (page still locked)
d1310b2e 3680 */
cd4c0bf9 3681static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
8cc0237a
NB
3682 struct page *page, struct writeback_control *wbc,
3683 u64 delalloc_start, unsigned long *nr_written)
40f76580 3684{
09cbfeaf 3685 u64 page_end = delalloc_start + PAGE_SIZE - 1;
3522e903 3686 bool found;
40f76580
CM
3687 u64 delalloc_to_write = 0;
3688 u64 delalloc_end = 0;
3689 int ret;
3690 int page_started = 0;
3691
40f76580
CM
3692
3693 while (delalloc_end < page_end) {
cd4c0bf9 3694 found = find_lock_delalloc_range(&inode->vfs_inode, page,
40f76580 3695 &delalloc_start,
917aacec 3696 &delalloc_end);
3522e903 3697 if (!found) {
40f76580
CM
3698 delalloc_start = delalloc_end + 1;
3699 continue;
3700 }
cd4c0bf9 3701 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
5eaad97a 3702 delalloc_end, &page_started, nr_written, wbc);
40f76580
CM
3703 if (ret) {
3704 SetPageError(page);
5eaad97a
NB
3705 /*
3706 * btrfs_run_delalloc_range should return < 0 for error
3707 * but just in case, we use > 0 here meaning the IO is
3708 * started, so we don't want to return > 0 unless
3709 * things are going well.
40f76580 3710 */
b69d1ee9 3711 return ret < 0 ? ret : -EIO;
40f76580
CM
3712 }
3713 /*
ea1754a0
KS
3714 * delalloc_end is already one less than the total length, so
3715 * we don't subtract one from PAGE_SIZE
40f76580
CM
3716 */
3717 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 3718 PAGE_SIZE) >> PAGE_SHIFT;
40f76580
CM
3719 delalloc_start = delalloc_end + 1;
3720 }
3721 if (wbc->nr_to_write < delalloc_to_write) {
3722 int thresh = 8192;
3723
3724 if (delalloc_to_write < thresh * 2)
3725 thresh = delalloc_to_write;
3726 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3727 thresh);
3728 }
3729
3730 /* Did btrfs_run_delalloc_range() already unlock the page and
3731 * start the IO?
3732 */
3733 if (page_started) {
3734 /*
3735 * we've unlocked the page, so we can't update
3736 * the mapping's writeback index, just update
3737 * nr_to_write.
3738 */
3739 wbc->nr_to_write -= *nr_written;
3740 return 1;
3741 }
3742
b69d1ee9 3743 return 0;
40f76580
CM
3744}
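/*
 * Illustrative sketch, not part of this file: the wbc->nr_to_write bump in
 * writepage_delalloc() caps the writeback budget at 8192 pages unless the
 * delalloc range itself is smaller than twice that, in which case the full
 * delalloc size becomes the budget.  demo_bump_nr_to_write is a
 * hypothetical stand-alone version of that calculation.
 */
static long long demo_bump_nr_to_write(long long nr_to_write,
                                       long long delalloc_to_write)
{
        long long thresh = 8192;

        if (nr_to_write >= delalloc_to_write)
                return nr_to_write;             /* budget is already enough */
        if (delalloc_to_write < thresh * 2)
                thresh = delalloc_to_write;
        /* e.g. delalloc_to_write = 100000 pages -> new budget is 8192 */
        return delalloc_to_write < thresh ? delalloc_to_write : thresh;
}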
3745
3746/*
3747 * helper for __extent_writepage. This calls the writepage start hooks,
3748 * and does the loop to map the page into extents and bios.
3749 *
3750 * We return 1 if the IO is started and the page is unlocked,
3751 * 0 if all went well (page still locked)
3752 * < 0 if there were errors (page still locked)
3753 */
d4580fe2 3754static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
40f76580
CM
3755 struct page *page,
3756 struct writeback_control *wbc,
3757 struct extent_page_data *epd,
3758 loff_t i_size,
3759 unsigned long nr_written,
57e5ffeb 3760 int *nr_ret)
d1310b2e 3761{
6bc5636a 3762 struct btrfs_fs_info *fs_info = inode->root->fs_info;
d4580fe2 3763 struct extent_io_tree *tree = &inode->io_tree;
4eee4fa4 3764 u64 start = page_offset(page);
6bc5636a 3765 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3766 u64 cur = start;
3767 u64 extent_offset;
d1310b2e 3768 u64 block_start;
d1310b2e 3769 struct extent_map *em;
40f76580
CM
3770 int ret = 0;
3771 int nr = 0;
d8e3fb10 3772 u32 opf = REQ_OP_WRITE;
57e5ffeb 3773 const unsigned int write_flags = wbc_to_write_flags(wbc);
40f76580 3774 bool compressed;
c8b97818 3775
6bc5636a 3776 ret = btrfs_writepage_cow_fixup(page, start, end);
d75855b4
NB
3777 if (ret) {
3778 /* Fixup worker will requeue */
5ab58055 3779 redirty_page_for_writepage(wbc, page);
d75855b4
NB
3780 update_nr_written(wbc, nr_written);
3781 unlock_page(page);
3782 return 1;
247e743c
CM
3783 }
3784
11c8349b
CM
3785 /*
3786 * we don't want to touch the inode after unlocking the page,
3787 * so we update the mapping writeback index now
3788 */
3d4b9496 3789 update_nr_written(wbc, nr_written + 1);
771ed689 3790
d1310b2e 3791 while (cur <= end) {
0c64c33c 3792 u64 disk_bytenr;
40f76580 3793 u64 em_end;
6bc5636a 3794 u32 iosize;
58409edd 3795
40f76580 3796 if (cur >= i_size) {
6bc5636a 3797 btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
d1310b2e
CM
3798 break;
3799 }
d4580fe2 3800 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
c704005d 3801 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3802 SetPageError(page);
61391d56 3803 ret = PTR_ERR_OR_ZERO(em);
d1310b2e
CM
3804 break;
3805 }
3806
3807 extent_offset = cur - em->start;
40f76580 3808 em_end = extent_map_end(em);
6bc5636a
QW
3809 ASSERT(cur <= em_end);
3810 ASSERT(cur < end);
3811 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
3812 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
d1310b2e 3813 block_start = em->block_start;
c8b97818 3814 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6bc5636a
QW
3815 disk_bytenr = em->block_start + extent_offset;
3816
3817 /* Note that em_end from extent_map_end() is exclusive */
3818 iosize = min(em_end, end + 1) - cur;
d8e3fb10 3819
e380adfc 3820 if (btrfs_use_zone_append(inode, em->block_start))
d8e3fb10
NA
3821 opf = REQ_OP_ZONE_APPEND;
3822
d1310b2e
CM
3823 free_extent_map(em);
3824 em = NULL;
3825
c8b97818
CM
3826 /*
3827 * compressed and inline extents are written through other
3828 * paths in the FS
3829 */
3830 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 3831 block_start == EXTENT_MAP_INLINE) {
c8b04030 3832 if (compressed)
c8b97818 3833 nr++;
c8b04030
OS
3834 else
3835 btrfs_writepage_endio_finish_ordered(page, cur,
3836 cur + iosize - 1, 1);
c8b97818 3837 cur += iosize;
d1310b2e
CM
3838 continue;
3839 }
c8b97818 3840
5cdc84bf 3841 btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
58409edd 3842 if (!PageWriteback(page)) {
d4580fe2 3843 btrfs_err(inode->root->fs_info,
58409edd
DS
3844 "page %lu not writeback, cur %llu end %llu",
3845 page->index, cur, end);
d1310b2e 3846 }
7f3c74fb 3847
390ed29b
QW
3848 ret = submit_extent_page(opf | write_flags, wbc,
3849 &epd->bio_ctrl, page,
d8e3fb10 3850 disk_bytenr, iosize,
390ed29b 3851 cur - page_offset(page),
58409edd 3852 end_bio_extent_writepage,
390ed29b 3853 0, 0, false);
fe01aa65 3854 if (ret) {
58409edd 3855 SetPageError(page);
fe01aa65
TK
3856 if (PageWriteback(page))
3857 end_page_writeback(page);
3858 }
d1310b2e 3859
6bc5636a 3860 cur += iosize;
d1310b2e
CM
3861 nr++;
3862 }
40f76580 3863 *nr_ret = nr;
40f76580
CM
3864 return ret;
3865}
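/*
 * Illustrative sketch, not part of this file: __extent_writepage_io() above
 * also works block by block and only submits write bios for regular,
 * uncompressed extents; compressed data, holes and inline extents are
 * handled through other paths.  The demo_* names below are hypothetical.
 */
enum demo_write_action {
        DEMO_WRITE_DONE,                /* past i_size: finish the rest as ordered and stop */
        DEMO_WRITE_SKIP,                /* compressed: submitted by the compression path */
        DEMO_WRITE_FINISH_ORDERED,      /* hole or inline: just finish the ordered extent */
        DEMO_WRITE_SUBMIT,              /* submit a write bio for this range */
};

static enum demo_write_action demo_classify_write(int past_isize, int compressed,
                                                  int is_hole, int is_inline)
{
        if (past_isize)
                return DEMO_WRITE_DONE;
        if (compressed)
                return DEMO_WRITE_SKIP;
        if (is_hole || is_inline)
                return DEMO_WRITE_FINISH_ORDERED;
        return DEMO_WRITE_SUBMIT;
}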
3866
3867/*
3868 * the writepage semantics are similar to regular writepage. extent
3869 * records are inserted to lock ranges in the tree, and as dirty areas
3870 * are found, they are marked writeback. Then the lock bits are removed
3871 * and the end_io handler clears the writeback ranges
3065976b
QW
3872 *
3873 * Return 0 if everything goes well.
3874 * Return <0 for error.
40f76580
CM
3875 */
3876static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 3877 struct extent_page_data *epd)
40f76580
CM
3878{
3879 struct inode *inode = page->mapping->host;
40f76580 3880 u64 start = page_offset(page);
09cbfeaf 3881 u64 page_end = start + PAGE_SIZE - 1;
40f76580
CM
3882 int ret;
3883 int nr = 0;
eb70d222 3884 size_t pg_offset;
40f76580 3885 loff_t i_size = i_size_read(inode);
09cbfeaf 3886 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580
CM
3887 unsigned long nr_written = 0;
3888
40f76580
CM
3889 trace___extent_writepage(page, inode, wbc);
3890
3891 WARN_ON(!PageLocked(page));
3892
3893 ClearPageError(page);
3894
7073017a 3895 pg_offset = offset_in_page(i_size);
40f76580
CM
3896 if (page->index > end_index ||
3897 (page->index == end_index && !pg_offset)) {
09cbfeaf 3898 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
40f76580
CM
3899 unlock_page(page);
3900 return 0;
3901 }
3902
3903 if (page->index == end_index) {
d048b9c2 3904 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
40f76580
CM
3905 flush_dcache_page(page);
3906 }
3907
32443de3
QW
3908 ret = set_page_extent_mapped(page);
3909 if (ret < 0) {
3910 SetPageError(page);
3911 goto done;
3912 }
40f76580 3913
7789a55a 3914 if (!epd->extent_locked) {
cd4c0bf9
NB
3915 ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
3916 &nr_written);
7789a55a 3917 if (ret == 1)
169d2c87 3918 return 0;
7789a55a
NB
3919 if (ret)
3920 goto done;
3921 }
40f76580 3922
d4580fe2
NB
3923 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
3924 nr_written, &nr);
40f76580 3925 if (ret == 1)
169d2c87 3926 return 0;
40f76580 3927
d1310b2e
CM
3928done:
3929 if (nr == 0) {
3930 /* make sure the mapping tag for page dirty gets cleared */
3931 set_page_writeback(page);
3932 end_page_writeback(page);
3933 }
61391d56
FM
3934 if (PageError(page)) {
3935 ret = ret < 0 ? ret : -EIO;
3936 end_extent_writepage(page, ret, start, page_end);
3937 }
d1310b2e 3938 unlock_page(page);
3065976b 3939 ASSERT(ret <= 0);
40f76580 3940 return ret;
d1310b2e
CM
3941}
3942
fd8b2b61 3943void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 3944{
74316201
N
3945 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3946 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
3947}
3948
18dfa711
FM
3949static void end_extent_buffer_writeback(struct extent_buffer *eb)
3950{
3951 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3952 smp_mb__after_atomic();
3953 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3954}
3955
2e3c2513 3956/*
a3efb2f0 3957 * Lock extent buffer status and pages for writeback.
2e3c2513 3958 *
a3efb2f0
QW
3959 * May try to flush write bio if we can't get the lock.
3960 *
3961 * Return 0 if the extent buffer doesn't need to be submitted.
3962 * (E.g. the extent buffer is not dirty)
3963 * Return >0 if the extent buffer is submitted to bio.
3964 * Return <0 if something went wrong, no page is locked.
2e3c2513 3965 */
9df76fb5 3966static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 3967 struct extent_page_data *epd)
0b32f4bb 3968{
9df76fb5 3969 struct btrfs_fs_info *fs_info = eb->fs_info;
2e3c2513 3970 int i, num_pages, failed_page_nr;
0b32f4bb
JB
3971 int flush = 0;
3972 int ret = 0;
3973
3974 if (!btrfs_try_tree_write_lock(eb)) {
f4340622 3975 ret = flush_write_bio(epd);
2e3c2513
QW
3976 if (ret < 0)
3977 return ret;
3978 flush = 1;
0b32f4bb
JB
3979 btrfs_tree_lock(eb);
3980 }
3981
3982 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3983 btrfs_tree_unlock(eb);
3984 if (!epd->sync_io)
3985 return 0;
3986 if (!flush) {
f4340622 3987 ret = flush_write_bio(epd);
2e3c2513
QW
3988 if (ret < 0)
3989 return ret;
0b32f4bb
JB
3990 flush = 1;
3991 }
a098d8e8
CM
3992 while (1) {
3993 wait_on_extent_buffer_writeback(eb);
3994 btrfs_tree_lock(eb);
3995 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3996 break;
0b32f4bb 3997 btrfs_tree_unlock(eb);
0b32f4bb
JB
3998 }
3999 }
4000
51561ffe
JB
4001 /*
4002 * We need to do this to prevent races in people who check if the eb is
4003 * under IO since we can end up having no IO bits set for a short period
4004 * of time.
4005 */
4006 spin_lock(&eb->refs_lock);
0b32f4bb
JB
4007 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4008 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 4009 spin_unlock(&eb->refs_lock);
0b32f4bb 4010 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
4011 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4012 -eb->len,
4013 fs_info->dirty_metadata_batch);
0b32f4bb 4014 ret = 1;
51561ffe
JB
4015 } else {
4016 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
4017 }
4018
4019 btrfs_tree_unlock(eb);
4020
f3156df9
QW
4021 /*
4022 * Either we don't need to submit any tree block, or we're submitting
4023 * subpage eb.
4024 * Subpage metadata doesn't use page locking at all, so we can skip
4025 * the page locking.
4026 */
4027 if (!ret || fs_info->sectorsize < PAGE_SIZE)
0b32f4bb
JB
4028 return ret;
4029
65ad0104 4030 num_pages = num_extent_pages(eb);
0b32f4bb 4031 for (i = 0; i < num_pages; i++) {
fb85fc9a 4032 struct page *p = eb->pages[i];
0b32f4bb
JB
4033
4034 if (!trylock_page(p)) {
4035 if (!flush) {
18dfa711
FM
4036 int err;
4037
4038 err = flush_write_bio(epd);
4039 if (err < 0) {
4040 ret = err;
2e3c2513
QW
4041 failed_page_nr = i;
4042 goto err_unlock;
4043 }
0b32f4bb
JB
4044 flush = 1;
4045 }
4046 lock_page(p);
4047 }
4048 }
4049
4050 return ret;
2e3c2513
QW
4051err_unlock:
4052 /* Unlock already locked pages */
4053 for (i = 0; i < failed_page_nr; i++)
4054 unlock_page(eb->pages[i]);
18dfa711
FM
4055 /*
4056 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
4057 * Also set back EXTENT_BUFFER_DIRTY so future write attempts on this
4058 * eb can be made, and undo everything done before.
4059 */
4060 btrfs_tree_lock(eb);
4061 spin_lock(&eb->refs_lock);
4062 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4063 end_extent_buffer_writeback(eb);
4064 spin_unlock(&eb->refs_lock);
4065 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
4066 fs_info->dirty_metadata_batch);
4067 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4068 btrfs_tree_unlock(eb);
2e3c2513 4069 return ret;
0b32f4bb
JB
4070}
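/*
 * Illustrative sketch, not part of this file: lock_extent_buffer_for_io()
 * above uses a "try the lock without blocking; if that fails, flush our
 * pending write bio first and only then block" pattern.  Flushing first
 * matters because the lock holder may be waiting for IO that is still
 * sitting, unsubmitted, in our own bio.  The demo_* helper below shows the
 * shape of that pattern; all names and callbacks are hypothetical.
 */
static int demo_lock_for_io(void *obj, void *bio_ctx,
                            int (*trylock)(void *), void (*lock)(void *),
                            int (*flush_bio)(void *))
{
        int ret;

        if (trylock(obj))
                return 0;               /* fast path: got the lock at once */

        ret = flush_bio(bio_ctx);       /* don't block while holding IO back */
        if (ret < 0)
                return ret;
        lock(obj);                      /* now it is safe to block */
        return 0;
}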
4071
5a2c6075 4072static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
656f30db 4073{
5a2c6075 4074 struct btrfs_fs_info *fs_info = eb->fs_info;
656f30db 4075
5a2c6075 4076 btrfs_page_set_error(fs_info, page, eb->start, eb->len);
656f30db
FM
4077 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4078 return;
4079
eb5b64f1
DZ
4080 /*
4081 * If we error out, we should add back the dirty_metadata_bytes
4082 * to make it consistent.
4083 */
eb5b64f1
DZ
4084 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4085 eb->len, fs_info->dirty_metadata_batch);
4086
656f30db
FM
4087 /*
4088 * If writeback for a btree extent that doesn't belong to a log tree
4089 * failed, increment the counter transaction->eb_write_errors.
4090 * We do this because while the transaction is running and before it's
4091 * committing (when we call filemap_fdata[write|wait]_range against
4092 * the btree inode), we might have
4093 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4094 * returns an error or an error happens during writeback, when we're
4095 * committing the transaction we wouldn't know about it, since the pages
4096 * can be no longer dirty nor marked anymore for writeback (if a
4097 * subsequent modification to the extent buffer didn't happen before the
4098 * transaction commit), which makes filemap_fdata[write|wait]_range not
4099 * able to find the pages tagged with SetPageError at transaction
4100 * commit time. So if this happens we must abort the transaction,
4101 * otherwise we commit a super block with btree roots that point to
4102 * btree nodes/leafs whose content on disk is invalid - either garbage
4103 * or the content of some node/leaf from a past generation that got
4104 * cowed or deleted and is no longer valid.
4105 *
4106 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4107 * not be enough - we need to distinguish between log tree extents vs
4108 * non-log tree extents, and the next filemap_fdatawait_range() call
4109 * will catch and clear such errors in the mapping - and that call might
4110 * be from a log sync and not from a transaction commit. Also, checking
4111 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4112 * not done and would not be reliable - the eb might have been released
4113 * from memory and reading it back again means that flag would not be
4114 * set (since it's a runtime flag, not persisted on disk).
4115 *
4116 * Using the flags below in the btree inode also makes us achieve the
4117 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4118 * writeback for all dirty pages and before filemap_fdatawait_range()
4119 * is called, the writeback for all dirty pages had already finished
4120 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4121 * filemap_fdatawait_range() would return success, as it could not know
4122 * that writeback errors happened (the pages were no longer tagged for
4123 * writeback).
4124 */
4125 switch (eb->log_index) {
4126 case -1:
5a2c6075 4127 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
656f30db
FM
4128 break;
4129 case 0:
5a2c6075 4130 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
656f30db
FM
4131 break;
4132 case 1:
5a2c6075 4133 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
656f30db
FM
4134 break;
4135 default:
4136 BUG(); /* unexpected, logic error */
4137 }
4138}
4139
2f3186d8
QW
4140/*
4141 * The endio specific version which won't touch any unsafe spinlock in endio
4142 * context.
4143 */
4144static struct extent_buffer *find_extent_buffer_nolock(
4145 struct btrfs_fs_info *fs_info, u64 start)
4146{
4147 struct extent_buffer *eb;
4148
4149 rcu_read_lock();
4150 eb = radix_tree_lookup(&fs_info->buffer_radix,
4151 start >> fs_info->sectorsize_bits);
4152 if (eb && atomic_inc_not_zero(&eb->refs)) {
4153 rcu_read_unlock();
4154 return eb;
4155 }
4156 rcu_read_unlock();
4157 return NULL;
4158}
4159
4160/*
4161 * The endio function for subpage extent buffer write.
4162 *
4163 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
4164 * after all extent buffers in the page have finished their writeback.
4165 */
fa04c165 4166static void end_bio_subpage_eb_writepage(struct bio *bio)
2f3186d8 4167{
fa04c165 4168 struct btrfs_fs_info *fs_info;
2f3186d8
QW
4169 struct bio_vec *bvec;
4170 struct bvec_iter_all iter_all;
4171
fa04c165
QW
4172 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
4173 ASSERT(fs_info->sectorsize < PAGE_SIZE);
4174
2f3186d8
QW
4175 ASSERT(!bio_flagged(bio, BIO_CLONED));
4176 bio_for_each_segment_all(bvec, bio, iter_all) {
4177 struct page *page = bvec->bv_page;
4178 u64 bvec_start = page_offset(page) + bvec->bv_offset;
4179 u64 bvec_end = bvec_start + bvec->bv_len - 1;
4180 u64 cur_bytenr = bvec_start;
4181
4182 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4183
4184 /* Iterate through all extent buffers in the range */
4185 while (cur_bytenr <= bvec_end) {
4186 struct extent_buffer *eb;
4187 int done;
4188
4189 /*
4190 * Here we can't use find_extent_buffer(), as it may
4191 * try to lock eb->refs_lock, which is not safe in endio
4192 * context.
4193 */
4194 eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4195 ASSERT(eb);
4196
4197 cur_bytenr = eb->start + eb->len;
4198
4199 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4200 done = atomic_dec_and_test(&eb->io_pages);
4201 ASSERT(done);
4202
4203 if (bio->bi_status ||
4204 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4205 ClearPageUptodate(page);
4206 set_btree_ioerr(page, eb);
4207 }
4208
4209 btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4210 eb->len);
4211 end_extent_buffer_writeback(eb);
4212 /*
4213 * free_extent_buffer() will grab spinlock which is not
4214 * safe in endio context. Thus here we manually dec
4215 * the ref.
4216 */
4217 atomic_dec(&eb->refs);
4218 }
4219 }
4220 bio_put(bio);
4221}
4222
4246a0b6 4223static void end_bio_extent_buffer_writepage(struct bio *bio)
0b32f4bb 4224{
2c30c71b 4225 struct bio_vec *bvec;
0b32f4bb 4226 struct extent_buffer *eb;
2b070cfe 4227 int done;
6dc4f100 4228 struct bvec_iter_all iter_all;
0b32f4bb 4229
c09abff8 4230 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 4231 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
4232 struct page *page = bvec->bv_page;
4233
0b32f4bb
JB
4234 eb = (struct extent_buffer *)page->private;
4235 BUG_ON(!eb);
4236 done = atomic_dec_and_test(&eb->io_pages);
4237
4e4cbee9 4238 if (bio->bi_status ||
4246a0b6 4239 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 4240 ClearPageUptodate(page);
5a2c6075 4241 set_btree_ioerr(page, eb);
0b32f4bb
JB
4242 }
4243
4244 end_page_writeback(page);
4245
4246 if (!done)
4247 continue;
4248
4249 end_extent_buffer_writeback(eb);
2c30c71b 4250 }
0b32f4bb
JB
4251
4252 bio_put(bio);
0b32f4bb
JB
4253}
4254
fa04c165
QW
4255static void prepare_eb_write(struct extent_buffer *eb)
4256{
4257 u32 nritems;
4258 unsigned long start;
4259 unsigned long end;
4260
4261 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4262 atomic_set(&eb->io_pages, num_extent_pages(eb));
4263
4264 /* Set btree blocks beyond nritems with 0 to avoid stale content */
4265 nritems = btrfs_header_nritems(eb);
4266 if (btrfs_header_level(eb) > 0) {
4267 end = btrfs_node_key_ptr_offset(nritems);
4268 memzero_extent_buffer(eb, end, eb->len - end);
4269 } else {
4270 /*
4271 * Leaf:
4272 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4273 */
4274 start = btrfs_item_nr_offset(nritems);
4275 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4276 memzero_extent_buffer(eb, start, end - start);
4277 }
4278}
4279
35b6ddfa
QW
4280/*
4281 * Unlike the work in write_one_eb(), we rely completely on extent locking.
4282 * Page locking is only utilized at minimum to keep the VMM code happy.
35b6ddfa
QW
4283 */
4284static int write_one_subpage_eb(struct extent_buffer *eb,
4285 struct writeback_control *wbc,
4286 struct extent_page_data *epd)
4287{
4288 struct btrfs_fs_info *fs_info = eb->fs_info;
4289 struct page *page = eb->pages[0];
4290 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
4291 bool no_dirty_ebs = false;
4292 int ret;
4293
fa04c165
QW
4294 prepare_eb_write(eb);
4295
35b6ddfa
QW
4296 /* clear_page_dirty_for_io() in subpage helper needs page locked */
4297 lock_page(page);
4298 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4299
4300 /* Check if this is the last dirty bit to update nr_written */
4301 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4302 eb->start, eb->len);
4303 if (no_dirty_ebs)
4304 clear_page_dirty_for_io(page);
4305
390ed29b
QW
4306 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4307 &epd->bio_ctrl, page, eb->start, eb->len,
4308 eb->start - page_offset(page),
fa04c165 4309 end_bio_subpage_eb_writepage, 0, 0, false);
35b6ddfa
QW
4310 if (ret) {
4311 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4312 set_btree_ioerr(page, eb);
4313 unlock_page(page);
4314
4315 if (atomic_dec_and_test(&eb->io_pages))
4316 end_extent_buffer_writeback(eb);
4317 return -EIO;
4318 }
4319 unlock_page(page);
4320 /*
4321 * Submission finished without problem. If no range of the page is
4322 * dirty anymore, we have submitted a page, so update nr_written in wbc.
4323 */
4324 if (no_dirty_ebs)
4325 update_nr_written(wbc, 1);
4326 return ret;
4327}
4328
0e378df1 4329static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
4330 struct writeback_control *wbc,
4331 struct extent_page_data *epd)
4332{
0c64c33c 4333 u64 disk_bytenr = eb->start;
cc5e31a4 4334 int i, num_pages;
ff40adf7 4335 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
d7dbe9e7 4336 int ret = 0;
0b32f4bb 4337
fa04c165 4338 prepare_eb_write(eb);
35b6ddfa 4339
fa04c165 4340 num_pages = num_extent_pages(eb);
0b32f4bb 4341 for (i = 0; i < num_pages; i++) {
fb85fc9a 4342 struct page *p = eb->pages[i];
0b32f4bb
JB
4343
4344 clear_page_dirty_for_io(p);
4345 set_page_writeback(p);
0ceb34bf 4346 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
390ed29b
QW
4347 &epd->bio_ctrl, p, disk_bytenr,
4348 PAGE_SIZE, 0,
1f7ad75b 4349 end_bio_extent_buffer_writepage,
390ed29b 4350 0, 0, false);
0b32f4bb 4351 if (ret) {
5a2c6075 4352 set_btree_ioerr(p, eb);
fe01aa65
TK
4353 if (PageWriteback(p))
4354 end_page_writeback(p);
0b32f4bb
JB
4355 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4356 end_extent_buffer_writeback(eb);
4357 ret = -EIO;
4358 break;
4359 }
0c64c33c 4360 disk_bytenr += PAGE_SIZE;
3d4b9496 4361 update_nr_written(wbc, 1);
0b32f4bb
JB
4362 unlock_page(p);
4363 }
4364
4365 if (unlikely(ret)) {
4366 for (; i < num_pages; i++) {
bbf65cf0 4367 struct page *p = eb->pages[i];
81465028 4368 clear_page_dirty_for_io(p);
0b32f4bb
JB
4369 unlock_page(p);
4370 }
4371 }
4372
4373 return ret;
4374}
4375
c4aec299
QW
4376/*
4377 * Submit one subpage btree page.
4378 *
4379 * The main differences to submit_eb_page() are:
4380 * - Page locking
4381 * For subpage, we don't rely on page locking at all.
4382 *
4383 * - Flush write bio
4384 * We only flush bio if we may be unable to fit current extent buffers into
4385 * current bio.
4386 *
4387 * Return >=0 for the number of submitted extent buffers.
4388 * Return <0 for fatal error.
4389 */
4390static int submit_eb_subpage(struct page *page,
4391 struct writeback_control *wbc,
4392 struct extent_page_data *epd)
4393{
4394 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4395 int submitted = 0;
4396 u64 page_start = page_offset(page);
4397 int bit_start = 0;
4398 const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
4399 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4400 int ret;
4401
4402 /* Lock and write each dirty extent buffer in the range */
4403 while (bit_start < nbits) {
4404 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4405 struct extent_buffer *eb;
4406 unsigned long flags;
4407 u64 start;
4408
4409 /*
4410 * Take private lock to ensure the subpage won't be detached
4411 * in the meantime.
4412 */
4413 spin_lock(&page->mapping->private_lock);
4414 if (!PagePrivate(page)) {
4415 spin_unlock(&page->mapping->private_lock);
4416 break;
4417 }
4418 spin_lock_irqsave(&subpage->lock, flags);
4419 if (!((1 << bit_start) & subpage->dirty_bitmap)) {
4420 spin_unlock_irqrestore(&subpage->lock, flags);
4421 spin_unlock(&page->mapping->private_lock);
4422 bit_start++;
4423 continue;
4424 }
4425
4426 start = page_start + bit_start * fs_info->sectorsize;
4427 bit_start += sectors_per_node;
4428
4429 /*
4430 * Here we just want to grab the eb without touching extra
4431 * spin locks, so call find_extent_buffer_nolock().
4432 */
4433 eb = find_extent_buffer_nolock(fs_info, start);
4434 spin_unlock_irqrestore(&subpage->lock, flags);
4435 spin_unlock(&page->mapping->private_lock);
4436
4437 /*
4438 * The eb has already reached 0 refs thus find_extent_buffer()
4439 * doesn't return it. We don't need to write back such eb
4440 * anyway.
4441 */
4442 if (!eb)
4443 continue;
4444
4445 ret = lock_extent_buffer_for_io(eb, epd);
4446 if (ret == 0) {
4447 free_extent_buffer(eb);
4448 continue;
4449 }
4450 if (ret < 0) {
4451 free_extent_buffer(eb);
4452 goto cleanup;
4453 }
fa04c165 4454 ret = write_one_subpage_eb(eb, wbc, epd);
c4aec299
QW
4455 free_extent_buffer(eb);
4456 if (ret < 0)
4457 goto cleanup;
4458 submitted++;
4459 }
4460 return submitted;
4461
4462cleanup:
4463 /* We hit error, end bio for the submitted extent buffers */
4464 end_write_bio(epd, ret);
4465 return ret;
4466}
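/*
 * Illustrative sketch, not part of this file: submit_eb_subpage() above
 * walks the per-page dirty bitmap one tree block at a time, advancing by
 * sectors_per_node bits whenever it finds a dirty bit.  The stand-alone
 * loop below shows that stride-based walk; demo_count_dirty_tree_blocks is
 * a hypothetical name.
 */
static int demo_count_dirty_tree_blocks(unsigned long dirty_bitmap,
                                        int total_bits, int sectors_per_node)
{
        int bit = 0;
        int found = 0;

        while (bit < total_bits) {
                if (!(dirty_bitmap & (1UL << bit))) {
                        bit++;                  /* this sector is clean, keep scanning */
                        continue;
                }
                found++;                        /* first sector of a dirty tree block */
                bit += sectors_per_node;        /* skip the rest of that block */
        }
        return found;
}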
4467
f91e0d0c
QW
4468/*
4469 * Submit all page(s) of one extent buffer.
4470 *
4471 * @page: the page of one extent buffer
4472 * @eb_context: to determine if we need to submit this page, if current page
4473 * belongs to this eb, we don't need to submit
4474 *
4475 * The caller should pass each page in their bytenr order, and here we use
4476 * @eb_context to determine if we have submitted pages of one extent buffer.
4477 *
4478 * If we have, we just skip until we hit a new page that doesn't belong to
4479 * current @eb_context.
4480 *
4481 * If not, we submit all the page(s) of the extent buffer.
4482 *
4483 * Return >0 if we have submitted the extent buffer successfully.
4484 * Return 0 if we don't need to submit the page, as it's already submitted by
4485 * previous call.
4486 * Return <0 for fatal error.
4487 */
4488static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4489 struct extent_page_data *epd,
4490 struct extent_buffer **eb_context)
4491{
4492 struct address_space *mapping = page->mapping;
0bc09ca1 4493 struct btrfs_block_group *cache = NULL;
f91e0d0c
QW
4494 struct extent_buffer *eb;
4495 int ret;
4496
4497 if (!PagePrivate(page))
4498 return 0;
4499
c4aec299
QW
4500 if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
4501 return submit_eb_subpage(page, wbc, epd);
4502
f91e0d0c
QW
4503 spin_lock(&mapping->private_lock);
4504 if (!PagePrivate(page)) {
4505 spin_unlock(&mapping->private_lock);
4506 return 0;
4507 }
4508
4509 eb = (struct extent_buffer *)page->private;
4510
4511 /*
4512 * Shouldn't happen and normally this would be a BUG_ON but no point
4513 * crashing the machine for something we can survive anyway.
4514 */
4515 if (WARN_ON(!eb)) {
4516 spin_unlock(&mapping->private_lock);
4517 return 0;
4518 }
4519
4520 if (eb == *eb_context) {
4521 spin_unlock(&mapping->private_lock);
4522 return 0;
4523 }
4524 ret = atomic_inc_not_zero(&eb->refs);
4525 spin_unlock(&mapping->private_lock);
4526 if (!ret)
4527 return 0;
4528
0bc09ca1
NA
4529 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4530 /*
4531 * If for_sync, this hole will be filled by a
4532 * transaction commit.
4533 */
4534 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4535 ret = -EAGAIN;
4536 else
4537 ret = 0;
4538 free_extent_buffer(eb);
4539 return ret;
4540 }
4541
f91e0d0c
QW
4542 *eb_context = eb;
4543
4544 ret = lock_extent_buffer_for_io(eb, epd);
4545 if (ret <= 0) {
0bc09ca1
NA
4546 btrfs_revert_meta_write_pointer(cache, eb);
4547 if (cache)
4548 btrfs_put_block_group(cache);
f91e0d0c
QW
4549 free_extent_buffer(eb);
4550 return ret;
4551 }
0bc09ca1
NA
4552 if (cache)
4553 btrfs_put_block_group(cache);
f91e0d0c
QW
4554 ret = write_one_eb(eb, wbc, epd);
4555 free_extent_buffer(eb);
4556 if (ret < 0)
4557 return ret;
4558 return 1;
4559}
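/*
 * Illustrative sketch, not part of this file: @eb_context above is a
 * one-entry memo so that, when the pages of a multi-page extent buffer are
 * visited in bytenr order, only the first page triggers a submission.  The
 * demo_* helper below shows that dedup step on its own.
 */
static int demo_need_submit(const void *eb, const void **eb_context)
{
        if (eb == *eb_context)
                return 0;       /* already handled via an earlier page of this eb */
        *eb_context = eb;       /* remember it so later pages are skipped */
        return 1;
}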
4560
0b32f4bb
JB
4561int btree_write_cache_pages(struct address_space *mapping,
4562 struct writeback_control *wbc)
4563{
f91e0d0c 4564 struct extent_buffer *eb_context = NULL;
0b32f4bb 4565 struct extent_page_data epd = {
390ed29b 4566 .bio_ctrl = { 0 },
0b32f4bb
JB
4567 .extent_locked = 0,
4568 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4569 };
b3ff8f1d 4570 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
0b32f4bb
JB
4571 int ret = 0;
4572 int done = 0;
4573 int nr_to_write_done = 0;
4574 struct pagevec pvec;
4575 int nr_pages;
4576 pgoff_t index;
4577 pgoff_t end; /* Inclusive */
4578 int scanned = 0;
10bbd235 4579 xa_mark_t tag;
0b32f4bb 4580
86679820 4581 pagevec_init(&pvec);
0b32f4bb
JB
4582 if (wbc->range_cyclic) {
4583 index = mapping->writeback_index; /* Start from prev offset */
4584 end = -1;
556755a8
JB
4585 /*
4586 * Start from the beginning does not need to cycle over the
4587 * range, mark it as scanned.
4588 */
4589 scanned = (index == 0);
0b32f4bb 4590 } else {
09cbfeaf
KS
4591 index = wbc->range_start >> PAGE_SHIFT;
4592 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
4593 scanned = 1;
4594 }
4595 if (wbc->sync_mode == WB_SYNC_ALL)
4596 tag = PAGECACHE_TAG_TOWRITE;
4597 else
4598 tag = PAGECACHE_TAG_DIRTY;
0bc09ca1 4599 btrfs_zoned_meta_io_lock(fs_info);
0b32f4bb
JB
4600retry:
4601 if (wbc->sync_mode == WB_SYNC_ALL)
4602 tag_pages_for_writeback(mapping, index, end);
4603 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 4604 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 4605 tag))) {
0b32f4bb
JB
4606 unsigned i;
4607
0b32f4bb
JB
4608 for (i = 0; i < nr_pages; i++) {
4609 struct page *page = pvec.pages[i];
4610
f91e0d0c
QW
4611 ret = submit_eb_page(page, wbc, &epd, &eb_context);
4612 if (ret == 0)
0b32f4bb 4613 continue;
f91e0d0c 4614 if (ret < 0) {
0b32f4bb 4615 done = 1;
0b32f4bb
JB
4616 break;
4617 }
0b32f4bb
JB
4618
4619 /*
4620 * the filesystem may choose to bump up nr_to_write.
4621 * We have to make sure to honor the new nr_to_write
4622 * at any time
4623 */
4624 nr_to_write_done = wbc->nr_to_write <= 0;
4625 }
4626 pagevec_release(&pvec);
4627 cond_resched();
4628 }
4629 if (!scanned && !done) {
4630 /*
4631 * We hit the last page and there is more work to be done: wrap
4632 * back to the start of the file
4633 */
4634 scanned = 1;
4635 index = 0;
4636 goto retry;
4637 }
2b952eea
QW
4638 if (ret < 0) {
4639 end_write_bio(&epd, ret);
0bc09ca1 4640 goto out;
2b952eea 4641 }
b3ff8f1d
QW
4642 /*
4643 * If something went wrong, don't allow any metadata write bio to be
4644 * submitted.
4645 *
4646 * This would prevent use-after-free if we had dirty pages not
4647 * cleaned up, which can still happen by fuzzed images.
4648 *
4649 * - Bad extent tree
4650 * Allowing existing tree block to be allocated for other trees.
4651 *
4652 * - Log tree operations
4653 * Exiting tree blocks get allocated to log tree, bumps its
4654 * generation, then get cleaned in tree re-balance.
4655 * Such tree block will not be written back, since it's clean,
4656 * thus no WRITTEN flag set.
4657 * And after log writes back, this tree block is not traced by
4658 * any dirty extent_io_tree.
4659 *
4660 * - Offending tree block gets re-dirtied from its original owner
4661 * Since it has bumped generation, no WRITTEN flag, it can be
4662 * reused without COWing. This tree block will not be traced
4663 * by btrfs_transaction::dirty_pages.
4664 *
4665 * Now such dirty tree block will not be cleaned by any dirty
4666 * extent io tree. Thus we don't want to submit such wild eb
4667 * if the fs already has error.
4668 */
4669 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4670 ret = flush_write_bio(&epd);
4671 } else {
fbabd4a3 4672 ret = -EROFS;
b3ff8f1d
QW
4673 end_write_bio(&epd, ret);
4674 }
0bc09ca1
NA
4675out:
4676 btrfs_zoned_meta_io_unlock(fs_info);
0b32f4bb
JB
4677 return ret;
4678}
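/*
 * Illustrative sketch, not part of this file: the retry label above
 * implements the usual range_cyclic writeback shape: start scanning at the
 * remembered writeback_index, and if that first pass began past index 0,
 * wrap around once and rescan from the start (error handling omitted).
 * Pages written by the first pass are not rewritten because they are no
 * longer tagged dirty.  The demo_* helper below shows the wrap-once flow.
 */
static void demo_cyclic_writeback(unsigned long start_index,
                                  void (*write_dirty_range)(unsigned long from))
{
        int scanned = (start_index == 0);

        write_dirty_range(start_index);         /* first pass: from prev offset to EOF */
        if (!scanned)
                write_dirty_range(0);           /* wrap once to cover the skipped head */
}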
4679
d1310b2e 4680/**
3bed2da1
NB
4681 * Walk the list of dirty pages of the given address space and write all of them.
4682 *
d1310b2e 4683 * @mapping: address space structure to write
3bed2da1
NB
4684 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4685 * @epd: holds context for the write, namely the bio
d1310b2e
CM
4686 *
4687 * If a page is already under I/O, write_cache_pages() skips it, even
4688 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4689 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4690 * and msync() need to guarantee that all the data which was dirty at the time
4691 * the call was made get new I/O started against them. If wbc->sync_mode is
4692 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4693 * existing IO to complete.
4694 */
4242b64a 4695static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 4696 struct writeback_control *wbc,
aab6e9ed 4697 struct extent_page_data *epd)
d1310b2e 4698{
7fd1a3f7 4699 struct inode *inode = mapping->host;
d1310b2e
CM
4700 int ret = 0;
4701 int done = 0;
f85d7d6c 4702 int nr_to_write_done = 0;
d1310b2e
CM
4703 struct pagevec pvec;
4704 int nr_pages;
4705 pgoff_t index;
4706 pgoff_t end; /* Inclusive */
a9132667
LB
4707 pgoff_t done_index;
4708 int range_whole = 0;
d1310b2e 4709 int scanned = 0;
10bbd235 4710 xa_mark_t tag;
d1310b2e 4711
7fd1a3f7
JB
4712 /*
4713 * We have to hold onto the inode so that ordered extents can do their
4714 * work when the IO finishes. The alternative to this is failing to add
4715 * an ordered extent if the igrab() fails there and that is a huge pain
4716 * to deal with, so instead just hold onto the inode throughout the
4717 * writepages operation. If it fails here we are freeing up the inode
4718 * anyway and we'd rather not waste our time writing out stuff that is
4719 * going to be truncated anyway.
4720 */
4721 if (!igrab(inode))
4722 return 0;
4723
86679820 4724 pagevec_init(&pvec);
d1310b2e
CM
4725 if (wbc->range_cyclic) {
4726 index = mapping->writeback_index; /* Start from prev offset */
4727 end = -1;
556755a8
JB
4728 /*
4729 * Start from the beginning does not need to cycle over the
4730 * range, mark it as scanned.
4731 */
4732 scanned = (index == 0);
d1310b2e 4733 } else {
09cbfeaf
KS
4734 index = wbc->range_start >> PAGE_SHIFT;
4735 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
4736 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4737 range_whole = 1;
d1310b2e
CM
4738 scanned = 1;
4739 }
3cd24c69
EL
4740
4741 /*
4742 * We do the tagged writepage as long as the snapshot flush bit is set
4743 * and we are the first one to do the filemap_flush() on this inode.
4744 *
4745 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4746 * not race in and drop the bit.
4747 */
4748 if (range_whole && wbc->nr_to_write == LONG_MAX &&
4749 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4750 &BTRFS_I(inode)->runtime_flags))
4751 wbc->tagged_writepages = 1;
4752
4753 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
4754 tag = PAGECACHE_TAG_TOWRITE;
4755 else
4756 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 4757retry:
3cd24c69 4758 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 4759 tag_pages_for_writeback(mapping, index, end);
a9132667 4760 done_index = index;
f85d7d6c 4761 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
4762 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4763 &index, end, tag))) {
d1310b2e
CM
4764 unsigned i;
4765
d1310b2e
CM
4766 for (i = 0; i < nr_pages; i++) {
4767 struct page *page = pvec.pages[i];
4768
f7bddf1e 4769 done_index = page->index + 1;
d1310b2e 4770 /*
b93b0163
MW
4771 * At this point we hold neither the i_pages lock nor
4772 * the page lock: the page may be truncated or
4773 * invalidated (changing page->mapping to NULL),
4774 * or even swizzled back from swapper_space to
4775 * tmpfs file mapping
d1310b2e 4776 */
c8f2f24b 4777 if (!trylock_page(page)) {
f4340622
QW
4778 ret = flush_write_bio(epd);
4779 BUG_ON(ret < 0);
c8f2f24b 4780 lock_page(page);
01d658f2 4781 }
d1310b2e
CM
4782
4783 if (unlikely(page->mapping != mapping)) {
4784 unlock_page(page);
4785 continue;
4786 }
4787
d2c3f4f6 4788 if (wbc->sync_mode != WB_SYNC_NONE) {
f4340622
QW
4789 if (PageWriteback(page)) {
4790 ret = flush_write_bio(epd);
4791 BUG_ON(ret < 0);
4792 }
d1310b2e 4793 wait_on_page_writeback(page);
d2c3f4f6 4794 }
d1310b2e
CM
4795
4796 if (PageWriteback(page) ||
4797 !clear_page_dirty_for_io(page)) {
4798 unlock_page(page);
4799 continue;
4800 }
4801
aab6e9ed 4802 ret = __extent_writepage(page, wbc, epd);
a9132667 4803 if (ret < 0) {
a9132667
LB
4804 done = 1;
4805 break;
4806 }
f85d7d6c
CM
4807
4808 /*
4809 * the filesystem may choose to bump up nr_to_write.
4810 * We have to make sure to honor the new nr_to_write
4811 * at any time
4812 */
4813 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
4814 }
4815 pagevec_release(&pvec);
4816 cond_resched();
4817 }
894b36e3 4818 if (!scanned && !done) {
d1310b2e
CM
4819 /*
4820 * We hit the last page and there is more work to be done: wrap
4821 * back to the start of the file
4822 */
4823 scanned = 1;
4824 index = 0;
42ffb0bf
JB
4825
4826 /*
4827 * If we're looping we could run into a page that is locked by a
4828 * writer and that writer could be waiting on writeback for a
4829 * page in our current bio, and thus deadlock, so flush the
4830 * write bio here.
4831 */
4832 ret = flush_write_bio(epd);
4833 if (!ret)
4834 goto retry;
d1310b2e 4835 }
a9132667
LB
4836
4837 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4838 mapping->writeback_index = done_index;
4839
7fd1a3f7 4840 btrfs_add_delayed_iput(inode);
894b36e3 4841 return ret;
d1310b2e 4842}
d1310b2e 4843
0a9b0e53 4844int extent_write_full_page(struct page *page, struct writeback_control *wbc)
d1310b2e
CM
4845{
4846 int ret;
d1310b2e 4847 struct extent_page_data epd = {
390ed29b 4848 .bio_ctrl = { 0 },
771ed689 4849 .extent_locked = 0,
ffbd517d 4850 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e 4851 };
d1310b2e 4852
d1310b2e 4853 ret = __extent_writepage(page, wbc, &epd);
3065976b
QW
4854 ASSERT(ret <= 0);
4855 if (ret < 0) {
4856 end_write_bio(&epd, ret);
4857 return ret;
4858 }
d1310b2e 4859
3065976b
QW
4860 ret = flush_write_bio(&epd);
4861 ASSERT(ret <= 0);
d1310b2e
CM
4862 return ret;
4863}
d1310b2e 4864
5e3ee236 4865int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
771ed689
CM
4866 int mode)
4867{
4868 int ret = 0;
4869 struct address_space *mapping = inode->i_mapping;
4870 struct page *page;
09cbfeaf
KS
4871 unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4872 PAGE_SHIFT;
771ed689
CM
4873
4874 struct extent_page_data epd = {
390ed29b 4875 .bio_ctrl = { 0 },
771ed689 4876 .extent_locked = 1,
ffbd517d 4877 .sync_io = mode == WB_SYNC_ALL,
771ed689
CM
4878 };
4879 struct writeback_control wbc_writepages = {
771ed689 4880 .sync_mode = mode,
771ed689
CM
4881 .nr_to_write = nr_pages * 2,
4882 .range_start = start,
4883 .range_end = end + 1,
ec39f769
CM
4884 /* We're called from an async helper function */
4885 .punt_to_cgroup = 1,
4886 .no_cgroup_owner = 1,
771ed689
CM
4887 };
4888
dbb70bec 4889 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
d397712b 4890 while (start <= end) {
09cbfeaf 4891 page = find_get_page(mapping, start >> PAGE_SHIFT);
771ed689
CM
4892 if (clear_page_dirty_for_io(page))
4893 ret = __extent_writepage(page, &wbc_writepages, &epd);
4894 else {
7087a9d8 4895 btrfs_writepage_endio_finish_ordered(page, start,
c629732d 4896 start + PAGE_SIZE - 1, 1);
771ed689
CM
4897 unlock_page(page);
4898 }
09cbfeaf
KS
4899 put_page(page);
4900 start += PAGE_SIZE;
771ed689
CM
4901 }
4902
02c6db4f 4903 ASSERT(ret <= 0);
dbb70bec
CM
4904 if (ret == 0)
4905 ret = flush_write_bio(&epd);
4906 else
02c6db4f 4907 end_write_bio(&epd, ret);
dbb70bec
CM
4908
4909 wbc_detach_inode(&wbc_writepages);
771ed689
CM
4910 return ret;
4911}
d1310b2e 4912
8ae225a8 4913int extent_writepages(struct address_space *mapping,
d1310b2e
CM
4914 struct writeback_control *wbc)
4915{
4916 int ret = 0;
4917 struct extent_page_data epd = {
390ed29b 4918 .bio_ctrl = { 0 },
771ed689 4919 .extent_locked = 0,
ffbd517d 4920 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
4921 };
4922
935db853 4923 ret = extent_write_cache_pages(mapping, wbc, &epd);
a2a72fbd
QW
4924 ASSERT(ret <= 0);
4925 if (ret < 0) {
4926 end_write_bio(&epd, ret);
4927 return ret;
4928 }
4929 ret = flush_write_bio(&epd);
d1310b2e
CM
4930 return ret;
4931}
d1310b2e 4932
ba206a02 4933void extent_readahead(struct readahead_control *rac)
d1310b2e 4934{
390ed29b 4935 struct btrfs_bio_ctrl bio_ctrl = { 0 };
67c9684f 4936 struct page *pagepool[16];
125bac01 4937 struct extent_map *em_cached = NULL;
808f80b4 4938 u64 prev_em_start = (u64)-1;
ba206a02 4939 int nr;
d1310b2e 4940
ba206a02 4941 while ((nr = readahead_page_batch(rac, pagepool))) {
32c0a6bc
MWO
4942 u64 contig_start = readahead_pos(rac);
4943 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
e65ef21e 4944
ba206a02 4945 contiguous_readpages(pagepool, nr, contig_start, contig_end,
390ed29b 4946 &em_cached, &bio_ctrl, &prev_em_start);
d1310b2e 4947 }
67c9684f 4948
125bac01
MX
4949 if (em_cached)
4950 free_extent_map(em_cached);
4951
390ed29b
QW
4952 if (bio_ctrl.bio) {
4953 if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
ba206a02
MWO
4954 return;
4955 }
d1310b2e 4956}
d1310b2e
CM
4957
4958/*
4959 * Basic invalidatepage code. This waits on any locked or writeback
4960 * ranges corresponding to the page, and then deletes any extent state
4961 * records from the tree.
4962 */
4963int extent_invalidatepage(struct extent_io_tree *tree,
4964 struct page *page, unsigned long offset)
4965{
2ac55d41 4966 struct extent_state *cached_state = NULL;
4eee4fa4 4967 u64 start = page_offset(page);
09cbfeaf 4968 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
4969 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4970
829ddec9
QW
4971 /* This function is only called for the btree inode */
4972 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
4973
fda2832f 4974 start += ALIGN(offset, blocksize);
d1310b2e
CM
4975 if (start > end)
4976 return 0;
4977
ff13db41 4978 lock_extent_bits(tree, start, end, &cached_state);
1edbb734 4979 wait_on_page_writeback(page);
829ddec9
QW
4980
4981 /*
4982 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
4983 * so here we only need to unlock the extent range to free any
4984 * existing extent state.
4985 */
4986 unlock_extent_cached(tree, start, end, &cached_state);
d1310b2e
CM
4987 return 0;
4988}
d1310b2e 4989
7b13b7b1
CM
4990/*
4991 * a helper for releasepage, this tests for areas of the page that
4992 * are locked or under IO and drops the related state bits if it is safe
4993 * to drop the page.
4994 */
29c68b2d 4995static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 4996 struct page *page, gfp_t mask)
7b13b7b1 4997{
4eee4fa4 4998 u64 start = page_offset(page);
09cbfeaf 4999 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
5000 int ret = 1;
5001
8882679e 5002 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 5003 ret = 0;
8882679e 5004 } else {
11ef160f 5005 /*
2766ff61
FM
5006 * At this point we can safely clear everything except the
5007 * locked bit, the nodatasum bit and the delalloc new bit.
5008 * The delalloc new bit will be cleared by ordered extent
5009 * completion.
11ef160f 5010 */
66b0c887 5011 ret = __clear_extent_bit(tree, start, end,
2766ff61
FM
5012 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5013 0, 0, NULL, mask, NULL);
e3f24cc5
CM
5014
5015 /* if clear_extent_bit failed for enomem reasons,
5016 * we can't allow the release to continue.
5017 */
5018 if (ret < 0)
5019 ret = 0;
5020 else
5021 ret = 1;
7b13b7b1
CM
5022 }
5023 return ret;
5024}
7b13b7b1 5025
d1310b2e
CM
5026/*
5027 * a helper for releasepage. As long as there are no locked extents
5028 * in the range corresponding to the page, both state records and extent
5029 * map records are removed
5030 */
477a30ba 5031int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
5032{
5033 struct extent_map *em;
4eee4fa4 5034 u64 start = page_offset(page);
09cbfeaf 5035 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
5036 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5037 struct extent_io_tree *tree = &btrfs_inode->io_tree;
5038 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 5039
d0164adc 5040 if (gfpflags_allow_blocking(mask) &&
ee22184b 5041 page->mapping->host->i_size > SZ_16M) {
39b5637f 5042 u64 len;
70dec807 5043 while (start <= end) {
fbc2bd7e
FM
5044 struct btrfs_fs_info *fs_info;
5045 u64 cur_gen;
5046
39b5637f 5047 len = end - start + 1;
890871be 5048 write_lock(&map->lock);
39b5637f 5049 em = lookup_extent_mapping(map, start, len);
285190d9 5050 if (!em) {
890871be 5051 write_unlock(&map->lock);
70dec807
CM
5052 break;
5053 }
7f3c74fb
CM
5054 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5055 em->start != start) {
890871be 5056 write_unlock(&map->lock);
70dec807
CM
5057 free_extent_map(em);
5058 break;
5059 }
3d6448e6
FM
5060 if (test_range_bit(tree, em->start,
5061 extent_map_end(em) - 1,
5062 EXTENT_LOCKED, 0, NULL))
5063 goto next;
5064 /*
5065 * If it's not in the list of modified extents, used
5066 * by a fast fsync, we can remove it. If it's being
5067 * logged we can safely remove it since fsync took an
5068 * extra reference on the em.
5069 */
5070 if (list_empty(&em->list) ||
fbc2bd7e
FM
5071 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5072 goto remove_em;
5073 /*
5074 * If it's in the list of modified extents, remove it
5075 * only if its generation is older than the current one,
5076 * in which case we don't need it for a fast fsync.
5077 * Otherwise don't remove it, we could be racing with an
5078 * ongoing fast fsync that could miss the new extent.
5079 */
5080 fs_info = btrfs_inode->root->fs_info;
5081 spin_lock(&fs_info->trans_lock);
5082 cur_gen = fs_info->generation;
5083 spin_unlock(&fs_info->trans_lock);
5084 if (em->generation >= cur_gen)
5085 goto next;
5086remove_em:
5e548b32
FM
5087 /*
5088 * We only remove extent maps that are not in the list of
5089 * modified extents or that are in the list but with a
5090 * generation lower than the current generation, so there
5091 * is no need to set the full fsync flag on the inode (it
5092 * hurts the fsync performance for workloads with a data
5093 * size that exceeds or is close to the system's memory).
5094 */
fbc2bd7e
FM
5095 remove_extent_mapping(map, em);
5096 /* once for the rb tree */
5097 free_extent_map(em);
3d6448e6 5098next:
70dec807 5099 start = extent_map_end(em);
890871be 5100 write_unlock(&map->lock);
70dec807
CM
5101
5102 /* once for us */
d1310b2e 5103 free_extent_map(em);
9f47eb54
PM
5104
5105 cond_resched(); /* Allow large-extent preemption. */
d1310b2e 5106 }
d1310b2e 5107 }
29c68b2d 5108 return try_release_extent_state(tree, page, mask);
d1310b2e 5109}
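
/*
 * A minimal sketch (not part of the original file) of the per-map decision
 * made in the loop above: an extent map may only be dropped when it is not
 * pinned and either is not in the list of modified extents (or is being
 * logged), or its generation is older than the current transaction
 * generation.  The EXTENT_LOCKED range test and the em->start == start
 * check from the loop are omitted here; em_is_droppable is a hypothetical
 * name used only for illustration.
 */
static bool em_is_droppable(const struct extent_map *em, u64 cur_gen)
{
	if (test_bit(EXTENT_FLAG_PINNED, &em->flags))
		return false;
	if (list_empty(&em->list) ||
	    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
		return true;
	return em->generation < cur_gen;
}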
d1310b2e 5110
ec29ed5b
CM
5111/*
5112 * helper function for fiemap, which doesn't want to see any holes.
5113 * This maps until we find something past 'last'
5114 */
f1bbde8d 5115static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
e3350e16 5116 u64 offset, u64 last)
ec29ed5b 5117{
f1bbde8d 5118 u64 sectorsize = btrfs_inode_sectorsize(inode);
ec29ed5b
CM
5119 struct extent_map *em;
5120 u64 len;
5121
5122 if (offset >= last)
5123 return NULL;
5124
67871254 5125 while (1) {
ec29ed5b
CM
5126 len = last - offset;
5127 if (len == 0)
5128 break;
fda2832f 5129 len = ALIGN(len, sectorsize);
f1bbde8d 5130 em = btrfs_get_extent_fiemap(inode, offset, len);
c704005d 5131 if (IS_ERR_OR_NULL(em))
ec29ed5b
CM
5132 return em;
5133
5134 /* if this isn't a hole return it */
4a2d25cd 5135 if (em->block_start != EXTENT_MAP_HOLE)
ec29ed5b 5136 return em;
ec29ed5b
CM
5137
5138 /* this is a hole, advance to the next extent */
5139 offset = extent_map_end(em);
5140 free_extent_map(em);
5141 if (offset >= last)
5142 break;
5143 }
5144 return NULL;
5145}
5146
4751832d
QW
5147/*
5148 * Cache for the previous fiemap extent
5149 *
5150 * Used when merging fiemap extents
5151 */
5152struct fiemap_cache {
5153 u64 offset;
5154 u64 phys;
5155 u64 len;
5156 u32 flags;
5157 bool cached;
5158};
5159
5160/*
5161 * Helper to submit fiemap extent.
5162 *
5163 * Will try to merge the current fiemap extent, specified by @offset,
5164 * @phys, @len and @flags, with the cached one.
5165 * Only when we fail to merge will the cached one be submitted as a
5166 * fiemap extent.
5167 *
5168 * Return value is the same as fiemap_fill_next_extent().
5169 */
5170static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5171 struct fiemap_cache *cache,
5172 u64 offset, u64 phys, u64 len, u32 flags)
5173{
5174 int ret = 0;
5175
5176 if (!cache->cached)
5177 goto assign;
5178
5179 /*
5180 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 5181 * fiemap extent won't overlap with cached one.
4751832d
QW
5182 * Not recoverable.
5183 *
5184 * NOTE: Physical address can overlap, due to compression
5185 */
5186 if (cache->offset + cache->len > offset) {
5187 WARN_ON(1);
5188 return -EINVAL;
5189 }
5190
5191 /*
5192 * Only merges fiemap extents if
5193 * 1) Their logical addresses are contiguous
5194 *
5195 * 2) Their physical addresses are contiguous
5196 * So truly compressed (physical size smaller than logical size)
5197 * extents won't get merged with each other
5198 *
5199 * 3) Share same flags except FIEMAP_EXTENT_LAST
5200 * So regular extent won't get merged with prealloc extent
5201 */
5202 if (cache->offset + cache->len == offset &&
5203 cache->phys + cache->len == phys &&
5204 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5205 (flags & ~FIEMAP_EXTENT_LAST)) {
5206 cache->len += len;
5207 cache->flags |= flags;
5208 goto try_submit_last;
5209 }
5210
5211 /* Not mergeable, need to submit cached one */
5212 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5213 cache->len, cache->flags);
5214 cache->cached = false;
5215 if (ret)
5216 return ret;
5217assign:
5218 cache->cached = true;
5219 cache->offset = offset;
5220 cache->phys = phys;
5221 cache->len = len;
5222 cache->flags = flags;
5223try_submit_last:
5224 if (cache->flags & FIEMAP_EXTENT_LAST) {
5225 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5226 cache->phys, cache->len, cache->flags);
5227 cache->cached = false;
5228 }
5229 return ret;
5230}
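
/*
 * Illustrative sketch only (hypothetical caller, not part of the original
 * file): two extents that are contiguous in both logical and physical
 * space and share the same flags are folded into one cached entry, so
 * nothing reaches fiemap_fill_next_extent() until a non-mergeable extent
 * or FIEMAP_EXTENT_LAST shows up.
 */
static int fiemap_merge_example(struct fiemap_extent_info *fieinfo,
				struct fiemap_cache *cache)
{
	int ret;

	/* Logical 0, physical 64K, length 4K, no flags: only cached. */
	ret = emit_fiemap_extent(fieinfo, cache, 0, SZ_64K, SZ_4K, 0);
	if (ret)
		return ret;

	/* Contiguous in both spaces with identical flags: merged, not emitted. */
	return emit_fiemap_extent(fieinfo, cache, SZ_4K, SZ_64K + SZ_4K, SZ_4K, 0);
}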
5231
5232/*
848c23b7 5233 * Emit last fiemap cache
4751832d 5234 *
848c23b7
QW
5235 * The last fiemap cache may still be cached in the following case:
5236 * 0 4k 8k
5237 * |<- Fiemap range ->|
5238 * |<------------ First extent ----------->|
5239 *
5240 * In this case, the first extent range will be cached but not emitted.
5241 * So we must emit it before ending extent_fiemap().
4751832d 5242 */
5c5aff98 5243static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 5244 struct fiemap_cache *cache)
4751832d
QW
5245{
5246 int ret;
5247
5248 if (!cache->cached)
5249 return 0;
5250
4751832d
QW
5251 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5252 cache->len, cache->flags);
5253 cache->cached = false;
5254 if (ret > 0)
5255 ret = 0;
5256 return ret;
5257}
5258
facee0a0 5259int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
bab16e21 5260 u64 start, u64 len)
1506fcc8 5261{
975f84fe 5262 int ret = 0;
15c7745c 5263 u64 off;
1506fcc8
YS
5264 u64 max = start + len;
5265 u32 flags = 0;
975f84fe
JB
5266 u32 found_type;
5267 u64 last;
ec29ed5b 5268 u64 last_for_get_extent = 0;
1506fcc8 5269 u64 disko = 0;
facee0a0 5270 u64 isize = i_size_read(&inode->vfs_inode);
975f84fe 5271 struct btrfs_key found_key;
1506fcc8 5272 struct extent_map *em = NULL;
2ac55d41 5273 struct extent_state *cached_state = NULL;
975f84fe 5274 struct btrfs_path *path;
facee0a0 5275 struct btrfs_root *root = inode->root;
4751832d 5276 struct fiemap_cache cache = { 0 };
5911c8fe
DS
5277 struct ulist *roots;
5278 struct ulist *tmp_ulist;
1506fcc8 5279 int end = 0;
ec29ed5b
CM
5280 u64 em_start = 0;
5281 u64 em_len = 0;
5282 u64 em_end = 0;
1506fcc8
YS
5283
5284 if (len == 0)
5285 return -EINVAL;
5286
975f84fe
JB
5287 path = btrfs_alloc_path();
5288 if (!path)
5289 return -ENOMEM;
975f84fe 5290
5911c8fe
DS
5291 roots = ulist_alloc(GFP_KERNEL);
5292 tmp_ulist = ulist_alloc(GFP_KERNEL);
5293 if (!roots || !tmp_ulist) {
5294 ret = -ENOMEM;
5295 goto out_free_ulist;
5296 }
5297
15c7745c
BB
5298 /*
5299 * We can't initialize that to 'start' as this could miss extents due
5300 * to extent item merging
5301 */
5302 off = 0;
facee0a0
NB
5303 start = round_down(start, btrfs_inode_sectorsize(inode));
5304 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4d479cf0 5305
ec29ed5b
CM
5306 /*
5307 * lookup the last file extent. We're not using i_size here
5308 * because there might be preallocation past i_size
5309 */
facee0a0
NB
5310 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5311 0);
975f84fe 5312 if (ret < 0) {
5911c8fe 5313 goto out_free_ulist;
2d324f59
LB
5314 } else {
5315 WARN_ON(!ret);
5316 if (ret == 1)
5317 ret = 0;
975f84fe 5318 }
2d324f59 5319
975f84fe 5320 path->slots[0]--;
975f84fe 5321 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
962a298f 5322 found_type = found_key.type;
975f84fe 5323
ec29ed5b 5324 /* No extents, but there might be delalloc bits */
facee0a0 5325 if (found_key.objectid != btrfs_ino(inode) ||
975f84fe 5326 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
5327 /* have to trust i_size as the end */
5328 last = (u64)-1;
5329 last_for_get_extent = isize;
5330 } else {
5331 /*
5332 * remember the start of the last extent. There are a
5333 * bunch of different factors that go into the length of the
5334 * extent, so it's much less complex to remember where it started
5335 */
5336 last = found_key.offset;
5337 last_for_get_extent = last + 1;
975f84fe 5338 }
fe09e16c 5339 btrfs_release_path(path);
975f84fe 5340
ec29ed5b
CM
5341 /*
5342 * we might have some extents allocated but more delalloc past those
5343 * extents. so, we trust isize unless the start of the last extent is
5344 * beyond isize
5345 */
5346 if (last < isize) {
5347 last = (u64)-1;
5348 last_for_get_extent = isize;
5349 }
5350
facee0a0 5351 lock_extent_bits(&inode->io_tree, start, start + len - 1,
d0082371 5352 &cached_state);
ec29ed5b 5353
facee0a0 5354 em = get_extent_skip_holes(inode, start, last_for_get_extent);
1506fcc8
YS
5355 if (!em)
5356 goto out;
5357 if (IS_ERR(em)) {
5358 ret = PTR_ERR(em);
5359 goto out;
5360 }
975f84fe 5361
1506fcc8 5362 while (!end) {
b76bb701 5363 u64 offset_in_extent = 0;
ea8efc74
CM
5364
5365 /* break if the extent we found is outside the range */
5366 if (em->start >= max || extent_map_end(em) < off)
5367 break;
5368
5369 /*
5370 * get_extent may return an extent that starts before our
5371 * requested range. We have to make sure the ranges
5372 * we return to fiemap always move forward and don't
5373 * overlap, so adjust the offsets here
5374 */
5375 em_start = max(em->start, off);
1506fcc8 5376
ea8efc74
CM
5377 /*
5378 * record the offset from the start of the extent
b76bb701
JB
5379 * for adjusting the disk offset below. Only do this if the
5380 * extent isn't compressed since our in ram offset may be past
5381 * what we have actually allocated on disk.
ea8efc74 5382 */
b76bb701
JB
5383 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5384 offset_in_extent = em_start - em->start;
ec29ed5b 5385 em_end = extent_map_end(em);
ea8efc74 5386 em_len = em_end - em_start;
1506fcc8 5387 flags = 0;
f0986318
FM
5388 if (em->block_start < EXTENT_MAP_LAST_BYTE)
5389 disko = em->block_start + offset_in_extent;
5390 else
5391 disko = 0;
1506fcc8 5392
ea8efc74
CM
5393 /*
5394 * bump off for our next call to get_extent
5395 */
5396 off = extent_map_end(em);
5397 if (off >= max)
5398 end = 1;
5399
93dbfad7 5400 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
5401 end = 1;
5402 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 5403 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
5404 flags |= (FIEMAP_EXTENT_DATA_INLINE |
5405 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 5406 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
5407 flags |= (FIEMAP_EXTENT_DELALLOC |
5408 FIEMAP_EXTENT_UNKNOWN);
dc046b10
JB
5409 } else if (fieinfo->fi_extents_max) {
5410 u64 bytenr = em->block_start -
5411 (em->start - em->orig_start);
fe09e16c 5412
fe09e16c
LB
5413 /*
5414 * As btrfs supports shared space, this information
5415 * can be exported to userspace tools via
dc046b10
JB
5416 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
5417 * then we're just getting a count and we can skip the
5418 * lookup stuff.
fe09e16c 5419 */
facee0a0 5420 ret = btrfs_check_shared(root, btrfs_ino(inode),
5911c8fe 5421 bytenr, roots, tmp_ulist);
dc046b10 5422 if (ret < 0)
fe09e16c 5423 goto out_free;
dc046b10 5424 if (ret)
fe09e16c 5425 flags |= FIEMAP_EXTENT_SHARED;
dc046b10 5426 ret = 0;
1506fcc8
YS
5427 }
5428 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5429 flags |= FIEMAP_EXTENT_ENCODED;
0d2b2372
JB
5430 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5431 flags |= FIEMAP_EXTENT_UNWRITTEN;
1506fcc8 5432
1506fcc8
YS
5433 free_extent_map(em);
5434 em = NULL;
ec29ed5b
CM
5435 if ((em_start >= last) || em_len == (u64)-1 ||
5436 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
5437 flags |= FIEMAP_EXTENT_LAST;
5438 end = 1;
5439 }
5440
ec29ed5b 5441 /* now scan forward to see if this is really the last extent. */
facee0a0 5442 em = get_extent_skip_holes(inode, off, last_for_get_extent);
ec29ed5b
CM
5443 if (IS_ERR(em)) {
5444 ret = PTR_ERR(em);
5445 goto out;
5446 }
5447 if (!em) {
975f84fe
JB
5448 flags |= FIEMAP_EXTENT_LAST;
5449 end = 1;
5450 }
4751832d
QW
5451 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5452 em_len, flags);
26e726af
CS
5453 if (ret) {
5454 if (ret == 1)
5455 ret = 0;
ec29ed5b 5456 goto out_free;
26e726af 5457 }
1506fcc8
YS
5458 }
5459out_free:
4751832d 5460 if (!ret)
5c5aff98 5461 ret = emit_last_fiemap_cache(fieinfo, &cache);
1506fcc8
YS
5462 free_extent_map(em);
5463out:
facee0a0 5464 unlock_extent_cached(&inode->io_tree, start, start + len - 1,
e43bbe5e 5465 &cached_state);
5911c8fe
DS
5466
5467out_free_ulist:
e02d48ea 5468 btrfs_free_path(path);
5911c8fe
DS
5469 ulist_free(roots);
5470 ulist_free(tmp_ulist);
1506fcc8
YS
5471 return ret;
5472}
5473
727011e0
CM
5474static void __free_extent_buffer(struct extent_buffer *eb)
5475{
727011e0
CM
5476 kmem_cache_free(extent_buffer_cache, eb);
5477}
5478
2b48966a 5479int extent_buffer_under_io(const struct extent_buffer *eb)
db7f3436
JB
5480{
5481 return (atomic_read(&eb->io_pages) ||
5482 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5483 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5484}
5485
8ff8466d 5486static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
db7f3436 5487{
8ff8466d 5488 struct btrfs_subpage *subpage;
db7f3436 5489
8ff8466d 5490 lockdep_assert_held(&page->mapping->private_lock);
db7f3436 5491
8ff8466d
QW
5492 if (PagePrivate(page)) {
5493 subpage = (struct btrfs_subpage *)page->private;
5494 if (atomic_read(&subpage->eb_refs))
5495 return true;
5496 }
5497 return false;
5498}
db7f3436 5499
8ff8466d
QW
5500static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5501{
5502 struct btrfs_fs_info *fs_info = eb->fs_info;
5503 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5504
5505 /*
5506 * For mapped eb, we're going to change the page private, which should
5507 * be done under the private_lock.
5508 */
5509 if (mapped)
5510 spin_lock(&page->mapping->private_lock);
5511
5512 if (!PagePrivate(page)) {
5d2361db 5513 if (mapped)
8ff8466d
QW
5514 spin_unlock(&page->mapping->private_lock);
5515 return;
5516 }
5517
5518 if (fs_info->sectorsize == PAGE_SIZE) {
5d2361db
FL
5519 /*
5520 * We do this since we'll remove the pages after we've
5521 * removed the eb from the radix tree, so we could race
5522 * and have this page now attached to the new eb. So
5523 * only clear page_private if it's still connected to
5524 * this eb.
5525 */
5526 if (PagePrivate(page) &&
5527 page->private == (unsigned long)eb) {
5528 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5529 BUG_ON(PageDirty(page));
5530 BUG_ON(PageWriteback(page));
db7f3436 5531 /*
5d2361db
FL
5532 * We need to make sure we haven't been attached
5533 * to a new eb.
db7f3436 5534 */
d1b89bc0 5535 detach_page_private(page);
db7f3436 5536 }
5d2361db
FL
5537 if (mapped)
5538 spin_unlock(&page->mapping->private_lock);
8ff8466d
QW
5539 return;
5540 }
5541
5542 /*
5543 * For subpage, we can have dummy eb with page private. In this case,
5544 * we can directly detach the private, as such a page is only attached to
5545 * one dummy eb, no sharing.
5546 */
5547 if (!mapped) {
5548 btrfs_detach_subpage(fs_info, page);
5549 return;
5550 }
5551
5552 btrfs_page_dec_eb_refs(fs_info, page);
5553
5554 /*
5555 * We can only detach the page private if there are no other ebs in the
5556 * page range.
5557 */
5558 if (!page_range_has_eb(fs_info, page))
5559 btrfs_detach_subpage(fs_info, page);
5560
5561 spin_unlock(&page->mapping->private_lock);
5562}
5563
5564/* Release all pages attached to the extent buffer */
5565static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5566{
5567 int i;
5568 int num_pages;
5569
5570 ASSERT(!extent_buffer_under_io(eb));
5571
5572 num_pages = num_extent_pages(eb);
5573 for (i = 0; i < num_pages; i++) {
5574 struct page *page = eb->pages[i];
5575
5576 if (!page)
5577 continue;
5578
5579 detach_extent_buffer_page(eb, page);
5d2361db 5580
01327610 5581 /* One for when we allocated the page */
09cbfeaf 5582 put_page(page);
d64766fd 5583 }
db7f3436
JB
5584}
5585
5586/*
5587 * Helper for releasing the extent buffer.
5588 */
5589static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5590{
55ac0139 5591 btrfs_release_extent_buffer_pages(eb);
8c38938c 5592 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
db7f3436
JB
5593 __free_extent_buffer(eb);
5594}
5595
f28491e0
JB
5596static struct extent_buffer *
5597__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 5598 unsigned long len)
d1310b2e
CM
5599{
5600 struct extent_buffer *eb = NULL;
5601
d1b5c567 5602 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
5603 eb->start = start;
5604 eb->len = len;
f28491e0 5605 eb->fs_info = fs_info;
815a51c7 5606 eb->bflags = 0;
196d59ab 5607 init_rwsem(&eb->lock);
b4ce94de 5608
3fd63727
JB
5609 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5610 &fs_info->allocated_ebs);
d3575156 5611 INIT_LIST_HEAD(&eb->release_list);
6d49ba1b 5612
3083ee2e 5613 spin_lock_init(&eb->refs_lock);
d1310b2e 5614 atomic_set(&eb->refs, 1);
0b32f4bb 5615 atomic_set(&eb->io_pages, 0);
727011e0 5616
deb67895 5617 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
d1310b2e
CM
5618
5619 return eb;
5620}
5621
2b48966a 5622struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
815a51c7 5623{
cc5e31a4 5624 int i;
815a51c7
JS
5625 struct page *p;
5626 struct extent_buffer *new;
cc5e31a4 5627 int num_pages = num_extent_pages(src);
815a51c7 5628
3f556f78 5629 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
5630 if (new == NULL)
5631 return NULL;
5632
62c053fb
QW
5633 /*
5634 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5635 * btrfs_release_extent_buffer() has different behavior for
5636 * UNMAPPED subpage extent buffer.
5637 */
5638 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5639
815a51c7 5640 for (i = 0; i < num_pages; i++) {
760f991f
QW
5641 int ret;
5642
9ec72677 5643 p = alloc_page(GFP_NOFS);
db7f3436
JB
5644 if (!p) {
5645 btrfs_release_extent_buffer(new);
5646 return NULL;
5647 }
760f991f
QW
5648 ret = attach_extent_buffer_page(new, p, NULL);
5649 if (ret < 0) {
5650 put_page(p);
5651 btrfs_release_extent_buffer(new);
5652 return NULL;
5653 }
815a51c7 5654 WARN_ON(PageDirty(p));
815a51c7 5655 new->pages[i] = p;
fba1acf9 5656 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7 5657 }
92d83e94 5658 set_extent_buffer_uptodate(new);
815a51c7
JS
5659
5660 return new;
5661}
5662
0f331229
OS
5663struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5664 u64 start, unsigned long len)
815a51c7
JS
5665{
5666 struct extent_buffer *eb;
cc5e31a4
DS
5667 int num_pages;
5668 int i;
815a51c7 5669
3f556f78 5670 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
5671 if (!eb)
5672 return NULL;
5673
65ad0104 5674 num_pages = num_extent_pages(eb);
815a51c7 5675 for (i = 0; i < num_pages; i++) {
09bc1f0f
QW
5676 int ret;
5677
9ec72677 5678 eb->pages[i] = alloc_page(GFP_NOFS);
815a51c7
JS
5679 if (!eb->pages[i])
5680 goto err;
09bc1f0f
QW
5681 ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
5682 if (ret < 0)
5683 goto err;
815a51c7
JS
5684 }
5685 set_extent_buffer_uptodate(eb);
5686 btrfs_set_header_nritems(eb, 0);
b0132a3b 5687 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
5688
5689 return eb;
5690err:
09bc1f0f
QW
5691 for (; i > 0; i--) {
5692 detach_extent_buffer_page(eb, eb->pages[i - 1]);
84167d19 5693 __free_page(eb->pages[i - 1]);
09bc1f0f 5694 }
815a51c7
JS
5695 __free_extent_buffer(eb);
5696 return NULL;
5697}
5698
0f331229 5699struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5700 u64 start)
0f331229 5701{
da17066c 5702 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
5703}
5704
0b32f4bb
JB
5705static void check_buffer_tree_ref(struct extent_buffer *eb)
5706{
242e18c7 5707 int refs;
6bf9cd2e
BB
5708 /*
5709 * The TREE_REF bit is first set when the extent_buffer is added
5710 * to the radix tree. It is also reset, if unset, when a new reference
5711 * is created by find_extent_buffer.
0b32f4bb 5712 *
6bf9cd2e
BB
5713 * It is only cleared in two cases: freeing the last non-tree
5714 * reference to the extent_buffer when its STALE bit is set or
5715 * calling releasepage when the tree reference is the only reference.
0b32f4bb 5716 *
6bf9cd2e
BB
5717 * In both cases, care is taken to ensure that the extent_buffer's
5718 * pages are not under io. However, releasepage can be concurrently
5719 * called with creating new references, which is prone to race
5720 * conditions between the calls to check_buffer_tree_ref in those
5721 * codepaths and clearing TREE_REF in try_release_extent_buffer.
0b32f4bb 5722 *
6bf9cd2e
BB
5723 * The actual lifetime of the extent_buffer in the radix tree is
5724 * adequately protected by the refcount, but the TREE_REF bit and
5725 * its corresponding reference are not. To protect against this
5726 * class of races, we call check_buffer_tree_ref from the codepaths
5727 * which trigger io after they set eb->io_pages. Note that once io is
5728 * initiated, TREE_REF can no longer be cleared, so that is the
5729 * moment at which any such race is best fixed.
0b32f4bb 5730 */
242e18c7
CM
5731 refs = atomic_read(&eb->refs);
5732 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5733 return;
5734
594831c4
JB
5735 spin_lock(&eb->refs_lock);
5736 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 5737 atomic_inc(&eb->refs);
594831c4 5738 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
5739}
5740
2457aec6
MG
5741static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5742 struct page *accessed)
5df4235e 5743{
cc5e31a4 5744 int num_pages, i;
5df4235e 5745
0b32f4bb
JB
5746 check_buffer_tree_ref(eb);
5747
65ad0104 5748 num_pages = num_extent_pages(eb);
5df4235e 5749 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
5750 struct page *p = eb->pages[i];
5751
2457aec6
MG
5752 if (p != accessed)
5753 mark_page_accessed(p);
5df4235e
JB
5754 }
5755}
5756
f28491e0
JB
5757struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5758 u64 start)
452c75c3
CS
5759{
5760 struct extent_buffer *eb;
5761
2f3186d8
QW
5762 eb = find_extent_buffer_nolock(fs_info, start);
5763 if (!eb)
5764 return NULL;
5765 /*
5766 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
5767 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
5768 * another task running free_extent_buffer() might have seen that flag
5769 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
5770 * writeback flags not set) and it's still in the tree (flag
5771 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
5772 * decrementing the extent buffer's reference count twice. So here we
5773 * could race and increment the eb's reference count, clear its stale
5774 * flag, mark it as dirty and drop our reference before the other task
5775 * finishes executing free_extent_buffer, which would later result in
5776 * an attempt to free an extent buffer that is dirty.
5777 */
5778 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5779 spin_lock(&eb->refs_lock);
5780 spin_unlock(&eb->refs_lock);
452c75c3 5781 }
2f3186d8
QW
5782 mark_extent_buffer_accessed(eb, NULL);
5783 return eb;
452c75c3
CS
5784}
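
/*
 * Usage sketch (hypothetical caller, not part of the original file):
 * find_extent_buffer() returns the eb with an extra reference taken, so
 * every successful lookup must be paired with free_extent_buffer().
 */
static void find_eb_example(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct extent_buffer *eb;

	eb = find_extent_buffer(fs_info, bytenr);
	if (!eb)
		return;		/* Not cached; a caller would allocate/read it. */

	/* ... use the buffer here ... */

	free_extent_buffer(eb);
}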
5785
faa2dbf0
JB
5786#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5787struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5788 u64 start)
faa2dbf0
JB
5789{
5790 struct extent_buffer *eb, *exists = NULL;
5791 int ret;
5792
5793 eb = find_extent_buffer(fs_info, start);
5794 if (eb)
5795 return eb;
da17066c 5796 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 5797 if (!eb)
b6293c82 5798 return ERR_PTR(-ENOMEM);
faa2dbf0
JB
5799 eb->fs_info = fs_info;
5800again:
e1860a77 5801 ret = radix_tree_preload(GFP_NOFS);
b6293c82
DC
5802 if (ret) {
5803 exists = ERR_PTR(ret);
faa2dbf0 5804 goto free_eb;
b6293c82 5805 }
faa2dbf0
JB
5806 spin_lock(&fs_info->buffer_lock);
5807 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 5808 start >> fs_info->sectorsize_bits, eb);
faa2dbf0
JB
5809 spin_unlock(&fs_info->buffer_lock);
5810 radix_tree_preload_end();
5811 if (ret == -EEXIST) {
5812 exists = find_extent_buffer(fs_info, start);
5813 if (exists)
5814 goto free_eb;
5815 else
5816 goto again;
5817 }
5818 check_buffer_tree_ref(eb);
5819 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5820
faa2dbf0
JB
5821 return eb;
5822free_eb:
5823 btrfs_release_extent_buffer(eb);
5824 return exists;
5825}
5826#endif
5827
81982210
QW
5828static struct extent_buffer *grab_extent_buffer(
5829 struct btrfs_fs_info *fs_info, struct page *page)
c0f0a9e7
QW
5830{
5831 struct extent_buffer *exists;
5832
81982210
QW
5833 /*
5834 * For subpage case, we completely rely on radix tree to ensure we
5835 * don't try to insert two ebs for the same bytenr. So here we always
5836 * return NULL and just continue.
5837 */
5838 if (fs_info->sectorsize < PAGE_SIZE)
5839 return NULL;
5840
c0f0a9e7
QW
5841 /* Page not yet attached to an extent buffer */
5842 if (!PagePrivate(page))
5843 return NULL;
5844
5845 /*
5846 * We could have already allocated an eb for this page and attached one,
5847 * so let's see if we can get a ref on the existing eb. If we can, we
5848 * know it's good and we can just return that one; otherwise we know we
5849 * can just overwrite page->private.
5850 */
5851 exists = (struct extent_buffer *)page->private;
5852 if (atomic_inc_not_zero(&exists->refs))
5853 return exists;
5854
5855 WARN_ON(PageDirty(page));
5856 detach_page_private(page);
5857 return NULL;
5858}
5859
f28491e0 5860struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3fbaf258 5861 u64 start, u64 owner_root, int level)
d1310b2e 5862{
da17066c 5863 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
5864 int num_pages;
5865 int i;
09cbfeaf 5866 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 5867 struct extent_buffer *eb;
6af118ce 5868 struct extent_buffer *exists = NULL;
d1310b2e 5869 struct page *p;
f28491e0 5870 struct address_space *mapping = fs_info->btree_inode->i_mapping;
d1310b2e 5871 int uptodate = 1;
19fe0a8b 5872 int ret;
d1310b2e 5873
da17066c 5874 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
c871b0f2
LB
5875 btrfs_err(fs_info, "bad tree block start %llu", start);
5876 return ERR_PTR(-EINVAL);
5877 }
5878
e9306ad4
QW
5879#if BITS_PER_LONG == 32
5880 if (start >= MAX_LFS_FILESIZE) {
5881 btrfs_err_rl(fs_info,
5882 "extent buffer %llu is beyond 32bit page cache limit", start);
5883 btrfs_err_32bit_limit(fs_info);
5884 return ERR_PTR(-EOVERFLOW);
5885 }
5886 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
5887 btrfs_warn_32bit_limit(fs_info);
5888#endif
5889
1aaac38c
QW
5890 if (fs_info->sectorsize < PAGE_SIZE &&
5891 offset_in_page(start) + len > PAGE_SIZE) {
5892 btrfs_err(fs_info,
5893 "tree block crosses page boundary, start %llu nodesize %lu",
5894 start, len);
5895 return ERR_PTR(-EINVAL);
5896 }
5897
f28491e0 5898 eb = find_extent_buffer(fs_info, start);
452c75c3 5899 if (eb)
6af118ce 5900 return eb;
6af118ce 5901
23d79d81 5902 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 5903 if (!eb)
c871b0f2 5904 return ERR_PTR(-ENOMEM);
e114c545 5905 btrfs_set_buffer_lockdep_class(owner_root, eb, level);
d1310b2e 5906
65ad0104 5907 num_pages = num_extent_pages(eb);
727011e0 5908 for (i = 0; i < num_pages; i++, index++) {
760f991f
QW
5909 struct btrfs_subpage *prealloc = NULL;
5910
d1b5c567 5911 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
5912 if (!p) {
5913 exists = ERR_PTR(-ENOMEM);
6af118ce 5914 goto free_eb;
c871b0f2 5915 }
4f2de97a 5916
760f991f
QW
5917 /*
5918 * Preallocate page->private for subpage case, so that we won't
5919 * allocate memory with private_lock held. The memory will be
5920 * freed by attach_extent_buffer_page() or freed manually if
5921 * we exit earlier.
5922 *
5923 * Although we have ensured one subpage eb can only have one
5924 * page, this may change in the future for 16K page size
5925 * support, so we still preallocate the memory in the loop.
5926 */
5927 ret = btrfs_alloc_subpage(fs_info, &prealloc,
5928 BTRFS_SUBPAGE_METADATA);
5929 if (ret < 0) {
5930 unlock_page(p);
5931 put_page(p);
5932 exists = ERR_PTR(ret);
5933 goto free_eb;
5934 }
5935
4f2de97a 5936 spin_lock(&mapping->private_lock);
81982210 5937 exists = grab_extent_buffer(fs_info, p);
c0f0a9e7
QW
5938 if (exists) {
5939 spin_unlock(&mapping->private_lock);
5940 unlock_page(p);
5941 put_page(p);
5942 mark_extent_buffer_accessed(exists, p);
760f991f 5943 btrfs_free_subpage(prealloc);
c0f0a9e7 5944 goto free_eb;
d1310b2e 5945 }
760f991f
QW
5946 /* Should not fail, as we have preallocated the memory */
5947 ret = attach_extent_buffer_page(eb, p, prealloc);
5948 ASSERT(!ret);
8ff8466d
QW
5949 /*
5950 * Inform that we have an extra eb under allocation, so that
5951 * detach_extent_buffer_page() won't release the page private
5952 * when the eb hasn't yet been inserted into radix tree.
5953 *
5954 * The ref will be decreased when the eb released the page, in
5955 * detach_extent_buffer_page().
5956 * Thus needs no special handling in error path.
5957 */
5958 btrfs_page_inc_eb_refs(fs_info, p);
4f2de97a 5959 spin_unlock(&mapping->private_lock);
760f991f 5960
1e5eb3d6 5961 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
727011e0 5962 eb->pages[i] = p;
d1310b2e
CM
5963 if (!PageUptodate(p))
5964 uptodate = 0;
eb14ab8e
CM
5965
5966 /*
b16d011e
NB
5967 * We can't unlock the pages just yet since the extent buffer
5968 * hasn't been properly inserted in the radix tree; this
5969 * opens a race with btree_releasepage which can free a page
5970 * while we are still filling in all pages for the buffer and
5971 * we could crash.
eb14ab8e 5972 */
d1310b2e
CM
5973 }
5974 if (uptodate)
b4ce94de 5975 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
115391d2 5976again:
e1860a77 5977 ret = radix_tree_preload(GFP_NOFS);
c871b0f2
LB
5978 if (ret) {
5979 exists = ERR_PTR(ret);
19fe0a8b 5980 goto free_eb;
c871b0f2 5981 }
19fe0a8b 5982
f28491e0
JB
5983 spin_lock(&fs_info->buffer_lock);
5984 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 5985 start >> fs_info->sectorsize_bits, eb);
f28491e0 5986 spin_unlock(&fs_info->buffer_lock);
452c75c3 5987 radix_tree_preload_end();
19fe0a8b 5988 if (ret == -EEXIST) {
f28491e0 5989 exists = find_extent_buffer(fs_info, start);
452c75c3
CS
5990 if (exists)
5991 goto free_eb;
5992 else
115391d2 5993 goto again;
6af118ce 5994 }
6af118ce 5995 /* add one reference for the tree */
0b32f4bb 5996 check_buffer_tree_ref(eb);
34b41ace 5997 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
5998
5999 /*
b16d011e
NB
6000 * Now it's safe to unlock the pages because any calls to
6001 * btree_releasepage will correctly detect that a page belongs to a
6002 * live buffer and won't free them prematurely.
eb14ab8e 6003 */
28187ae5
NB
6004 for (i = 0; i < num_pages; i++)
6005 unlock_page(eb->pages[i]);
d1310b2e
CM
6006 return eb;
6007
6af118ce 6008free_eb:
5ca64f45 6009 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
6010 for (i = 0; i < num_pages; i++) {
6011 if (eb->pages[i])
6012 unlock_page(eb->pages[i]);
6013 }
eb14ab8e 6014
897ca6e9 6015 btrfs_release_extent_buffer(eb);
6af118ce 6016 return exists;
d1310b2e 6017}
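
/*
 * Usage sketch (hypothetical caller, not part of the original file): unlike
 * find_extent_buffer(), alloc_extent_buffer() never returns NULL; it hands
 * back either a referenced eb or an ERR_PTR() (-EINVAL for a misaligned or
 * page-crossing start, -ENOMEM, or -EOVERFLOW on 32 bit systems).
 */
static struct extent_buffer *grab_tree_block_example(struct btrfs_fs_info *fs_info,
						     u64 bytenr, u64 owner_root,
						     int level)
{
	struct extent_buffer *eb;

	eb = alloc_extent_buffer(fs_info, bytenr, owner_root, level);
	if (IS_ERR(eb))
		return eb;

	/* The eb is now in the radix tree and referenced by the caller. */
	return eb;
}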
d1310b2e 6018
3083ee2e
JB
6019static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6020{
6021 struct extent_buffer *eb =
6022 container_of(head, struct extent_buffer, rcu_head);
6023
6024 __free_extent_buffer(eb);
6025}
6026
f7a52a40 6027static int release_extent_buffer(struct extent_buffer *eb)
5ce48d0f 6028 __releases(&eb->refs_lock)
3083ee2e 6029{
07e21c4d
NB
6030 lockdep_assert_held(&eb->refs_lock);
6031
3083ee2e
JB
6032 WARN_ON(atomic_read(&eb->refs) == 0);
6033 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 6034 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 6035 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 6036
815a51c7 6037 spin_unlock(&eb->refs_lock);
3083ee2e 6038
f28491e0
JB
6039 spin_lock(&fs_info->buffer_lock);
6040 radix_tree_delete(&fs_info->buffer_radix,
478ef886 6041 eb->start >> fs_info->sectorsize_bits);
f28491e0 6042 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
6043 } else {
6044 spin_unlock(&eb->refs_lock);
815a51c7 6045 }
3083ee2e 6046
8c38938c 6047 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
3083ee2e 6048 /* Should be safe to release our pages at this point */
55ac0139 6049 btrfs_release_extent_buffer_pages(eb);
bcb7e449 6050#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 6051 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
6052 __free_extent_buffer(eb);
6053 return 1;
6054 }
6055#endif
3083ee2e 6056 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 6057 return 1;
3083ee2e
JB
6058 }
6059 spin_unlock(&eb->refs_lock);
e64860aa
JB
6060
6061 return 0;
3083ee2e
JB
6062}
6063
d1310b2e
CM
6064void free_extent_buffer(struct extent_buffer *eb)
6065{
242e18c7
CM
6066 int refs;
6067 int old;
d1310b2e
CM
6068 if (!eb)
6069 return;
6070
242e18c7
CM
6071 while (1) {
6072 refs = atomic_read(&eb->refs);
46cc775e
NB
6073 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
6074 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
6075 refs == 1))
242e18c7
CM
6076 break;
6077 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
6078 if (old == refs)
6079 return;
6080 }
6081
3083ee2e
JB
6082 spin_lock(&eb->refs_lock);
6083 if (atomic_read(&eb->refs) == 2 &&
6084 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 6085 !extent_buffer_under_io(eb) &&
3083ee2e
JB
6086 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6087 atomic_dec(&eb->refs);
6088
6089 /*
6090 * I know this is terrible, but it's temporary until we stop tracking
6091 * the uptodate bits and such for the extent buffers.
6092 */
f7a52a40 6093 release_extent_buffer(eb);
3083ee2e
JB
6094}
6095
6096void free_extent_buffer_stale(struct extent_buffer *eb)
6097{
6098 if (!eb)
d1310b2e
CM
6099 return;
6100
3083ee2e
JB
6101 spin_lock(&eb->refs_lock);
6102 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6103
0b32f4bb 6104 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
6105 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6106 atomic_dec(&eb->refs);
f7a52a40 6107 release_extent_buffer(eb);
d1310b2e 6108}
d1310b2e 6109
0d27797e
QW
6110static void btree_clear_page_dirty(struct page *page)
6111{
6112 ASSERT(PageDirty(page));
6113 ASSERT(PageLocked(page));
6114 clear_page_dirty_for_io(page);
6115 xa_lock_irq(&page->mapping->i_pages);
6116 if (!PageDirty(page))
6117 __xa_clear_mark(&page->mapping->i_pages,
6118 page_index(page), PAGECACHE_TAG_DIRTY);
6119 xa_unlock_irq(&page->mapping->i_pages);
6120}
6121
6122static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6123{
6124 struct btrfs_fs_info *fs_info = eb->fs_info;
6125 struct page *page = eb->pages[0];
6126 bool last;
6127
6128 /* btree_clear_page_dirty() needs page locked */
6129 lock_page(page);
6130 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6131 eb->len);
6132 if (last)
6133 btree_clear_page_dirty(page);
6134 unlock_page(page);
6135 WARN_ON(atomic_read(&eb->refs) == 0);
6136}
6137
2b48966a 6138void clear_extent_buffer_dirty(const struct extent_buffer *eb)
d1310b2e 6139{
cc5e31a4
DS
6140 int i;
6141 int num_pages;
d1310b2e
CM
6142 struct page *page;
6143
0d27797e
QW
6144 if (eb->fs_info->sectorsize < PAGE_SIZE)
6145 return clear_subpage_extent_buffer_dirty(eb);
6146
65ad0104 6147 num_pages = num_extent_pages(eb);
d1310b2e
CM
6148
6149 for (i = 0; i < num_pages; i++) {
fb85fc9a 6150 page = eb->pages[i];
b9473439 6151 if (!PageDirty(page))
d2c3f4f6 6152 continue;
a61e6f29 6153 lock_page(page);
0d27797e 6154 btree_clear_page_dirty(page);
bf0da8c1 6155 ClearPageError(page);
a61e6f29 6156 unlock_page(page);
d1310b2e 6157 }
0b32f4bb 6158 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 6159}
d1310b2e 6160
abb57ef3 6161bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 6162{
cc5e31a4
DS
6163 int i;
6164 int num_pages;
abb57ef3 6165 bool was_dirty;
d1310b2e 6166
0b32f4bb
JB
6167 check_buffer_tree_ref(eb);
6168
b9473439 6169 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 6170
65ad0104 6171 num_pages = num_extent_pages(eb);
3083ee2e 6172 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
6173 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6174
0d27797e
QW
6175 if (!was_dirty) {
6176 bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
51995c39 6177
0d27797e
QW
6178 /*
6179 * For subpage case, we can have other extent buffers in the
6180 * same page, and in clear_subpage_extent_buffer_dirty() we
6181 * have to clear page dirty without subpage lock held.
6182 * This can cause a race where our page dirty bit gets cleared right
6183 * after we set it.
6184 *
6185 * Thankfully, clear_subpage_extent_buffer_dirty() has locked
6186 * its page for other reasons, so we can use the page lock to prevent
6187 * the above race.
6188 */
6189 if (subpage)
6190 lock_page(eb->pages[0]);
6191 for (i = 0; i < num_pages; i++)
6192 btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6193 eb->start, eb->len);
6194 if (subpage)
6195 unlock_page(eb->pages[0]);
6196 }
51995c39
LB
6197#ifdef CONFIG_BTRFS_DEBUG
6198 for (i = 0; i < num_pages; i++)
6199 ASSERT(PageDirty(eb->pages[i]));
6200#endif
6201
b9473439 6202 return was_dirty;
d1310b2e 6203}
d1310b2e 6204
69ba3927 6205void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 6206{
251f2acc 6207 struct btrfs_fs_info *fs_info = eb->fs_info;
1259ab75 6208 struct page *page;
cc5e31a4 6209 int num_pages;
251f2acc 6210 int i;
1259ab75 6211
b4ce94de 6212 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 6213 num_pages = num_extent_pages(eb);
1259ab75 6214 for (i = 0; i < num_pages; i++) {
fb85fc9a 6215 page = eb->pages[i];
33958dc6 6216 if (page)
251f2acc
QW
6217 btrfs_page_clear_uptodate(fs_info, page,
6218 eb->start, eb->len);
1259ab75 6219 }
1259ab75
CM
6220}
6221
09c25a8c 6222void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 6223{
251f2acc 6224 struct btrfs_fs_info *fs_info = eb->fs_info;
d1310b2e 6225 struct page *page;
cc5e31a4 6226 int num_pages;
251f2acc 6227 int i;
d1310b2e 6228
0b32f4bb 6229 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 6230 num_pages = num_extent_pages(eb);
d1310b2e 6231 for (i = 0; i < num_pages; i++) {
fb85fc9a 6232 page = eb->pages[i];
251f2acc 6233 btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
d1310b2e 6234 }
d1310b2e 6235}
d1310b2e 6236
4012daf7
QW
6237static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
6238 int mirror_num)
6239{
6240 struct btrfs_fs_info *fs_info = eb->fs_info;
6241 struct extent_io_tree *io_tree;
6242 struct page *page = eb->pages[0];
390ed29b 6243 struct btrfs_bio_ctrl bio_ctrl = { 0 };
4012daf7
QW
6244 int ret = 0;
6245
6246 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
6247 ASSERT(PagePrivate(page));
6248 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
6249
6250 if (wait == WAIT_NONE) {
dc56219f
GR
6251 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
6252 return -EAGAIN;
4012daf7
QW
6253 } else {
6254 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6255 if (ret < 0)
6256 return ret;
6257 }
6258
6259 ret = 0;
6260 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
6261 PageUptodate(page) ||
6262 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
6263 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6264 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6265 return ret;
6266 }
6267
6268 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6269 eb->read_mirror = 0;
6270 atomic_set(&eb->io_pages, 1);
6271 check_buffer_tree_ref(eb);
6272 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
6273
390ed29b
QW
6274 ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
6275 page, eb->start, eb->len,
6276 eb->start - page_offset(page),
6277 end_bio_extent_readpage, mirror_num, 0,
4012daf7
QW
6278 true);
6279 if (ret) {
6280 /*
6281 * In the endio function, if we hit something wrong we will
6282 * increase the io_pages, so here we need to decrease it for
6283 * error path.
6284 */
6285 atomic_dec(&eb->io_pages);
6286 }
390ed29b 6287 if (bio_ctrl.bio) {
4012daf7
QW
6288 int tmp;
6289
390ed29b
QW
6290 tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
6291 bio_ctrl.bio = NULL;
4012daf7
QW
6292 if (tmp < 0)
6293 return tmp;
6294 }
6295 if (ret || wait != WAIT_COMPLETE)
6296 return ret;
6297
6298 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
6299 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6300 ret = -EIO;
6301 return ret;
6302}
6303
c2ccfbc6 6304int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 6305{
cc5e31a4 6306 int i;
d1310b2e
CM
6307 struct page *page;
6308 int err;
6309 int ret = 0;
ce9adaa5
CM
6310 int locked_pages = 0;
6311 int all_uptodate = 1;
cc5e31a4 6312 int num_pages;
727011e0 6313 unsigned long num_reads = 0;
390ed29b 6314 struct btrfs_bio_ctrl bio_ctrl = { 0 };
a86c12c7 6315
b4ce94de 6316 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
6317 return 0;
6318
4012daf7
QW
6319 if (eb->fs_info->sectorsize < PAGE_SIZE)
6320 return read_extent_buffer_subpage(eb, wait, mirror_num);
6321
65ad0104 6322 num_pages = num_extent_pages(eb);
8436ea91 6323 for (i = 0; i < num_pages; i++) {
fb85fc9a 6324 page = eb->pages[i];
bb82ab88 6325 if (wait == WAIT_NONE) {
2c4d8cb7
QW
6326 /*
6327 * WAIT_NONE is only utilized by readahead. If we can't
6328 * acquire the lock atomically it means either the eb
6329 * is being read out or under modification.
6330 * Either way the eb will be or has been cached,
6331 * so readahead can exit safely.
6332 */
2db04966 6333 if (!trylock_page(page))
ce9adaa5 6334 goto unlock_exit;
d1310b2e
CM
6335 } else {
6336 lock_page(page);
6337 }
ce9adaa5 6338 locked_pages++;
2571e739
LB
6339 }
6340 /*
6341 * We need to lock all pages first to make sure that
6342 * the uptodate bit of our pages won't be affected by
6343 * clear_extent_buffer_uptodate().
6344 */
8436ea91 6345 for (i = 0; i < num_pages; i++) {
2571e739 6346 page = eb->pages[i];
727011e0
CM
6347 if (!PageUptodate(page)) {
6348 num_reads++;
ce9adaa5 6349 all_uptodate = 0;
727011e0 6350 }
ce9adaa5 6351 }
2571e739 6352
ce9adaa5 6353 if (all_uptodate) {
8436ea91 6354 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
6355 goto unlock_exit;
6356 }
6357
656f30db 6358 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 6359 eb->read_mirror = 0;
0b32f4bb 6360 atomic_set(&eb->io_pages, num_reads);
6bf9cd2e
BB
6361 /*
6362 * It is possible for releasepage to clear the TREE_REF bit before we
6363 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
6364 */
6365 check_buffer_tree_ref(eb);
8436ea91 6366 for (i = 0; i < num_pages; i++) {
fb85fc9a 6367 page = eb->pages[i];
baf863b9 6368
ce9adaa5 6369 if (!PageUptodate(page)) {
baf863b9
LB
6370 if (ret) {
6371 atomic_dec(&eb->io_pages);
6372 unlock_page(page);
6373 continue;
6374 }
6375
f188591e 6376 ClearPageError(page);
0420177c 6377 err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
390ed29b
QW
6378 &bio_ctrl, page, page_offset(page),
6379 PAGE_SIZE, 0, end_bio_extent_readpage,
6380 mirror_num, 0, false);
baf863b9 6381 if (err) {
baf863b9 6382 /*
0420177c
NB
6383 * We failed to submit the bio so it's the
6384 * caller's responsibility to perform cleanup
6385 * i.e. unlock the page / set the error bit.
baf863b9 6386 */
0420177c
NB
6387 ret = err;
6388 SetPageError(page);
6389 unlock_page(page);
baf863b9
LB
6390 atomic_dec(&eb->io_pages);
6391 }
d1310b2e
CM
6392 } else {
6393 unlock_page(page);
6394 }
6395 }
6396
390ed29b
QW
6397 if (bio_ctrl.bio) {
6398 err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
6399 bio_ctrl.bio = NULL;
79787eaa
JM
6400 if (err)
6401 return err;
355808c2 6402 }
a86c12c7 6403
bb82ab88 6404 if (ret || wait != WAIT_COMPLETE)
d1310b2e 6405 return ret;
d397712b 6406
8436ea91 6407 for (i = 0; i < num_pages; i++) {
fb85fc9a 6408 page = eb->pages[i];
d1310b2e 6409 wait_on_page_locked(page);
d397712b 6410 if (!PageUptodate(page))
d1310b2e 6411 ret = -EIO;
d1310b2e 6412 }
d397712b 6413
d1310b2e 6414 return ret;
ce9adaa5
CM
6415
6416unlock_exit:
d397712b 6417 while (locked_pages > 0) {
ce9adaa5 6418 locked_pages--;
8436ea91
JB
6419 page = eb->pages[locked_pages];
6420 unlock_page(page);
ce9adaa5
CM
6421 }
6422 return ret;
d1310b2e 6423}
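
/*
 * Minimal sketch (hypothetical caller, not part of the original file): a
 * synchronous metadata read.  WAIT_COMPLETE makes the call block until all
 * pages have been read (or have failed), so the return value already
 * reflects whether the buffer ended up uptodate; mirror_num 0 picks no
 * specific mirror.
 */
static int read_eb_sync_example(struct extent_buffer *eb)
{
	return read_extent_buffer_pages(eb, WAIT_COMPLETE, 0);
}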
d1310b2e 6424
f98b6215
QW
6425static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6426 unsigned long len)
6427{
6428 btrfs_warn(eb->fs_info,
6429 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
6430 eb->start, eb->len, start, len);
6431 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6432
6433 return true;
6434}
6435
6436/*
6437 * Check if the [start, start + len) range is valid before reading/writing
6438 * the eb.
6439 * NOTE: @start and @len are offsets inside the eb, not logical addresses.
6440 *
6441 * Caller should not touch the dst/src memory if this function returns an error.
6442 */
6443static inline int check_eb_range(const struct extent_buffer *eb,
6444 unsigned long start, unsigned long len)
6445{
6446 unsigned long offset;
6447
6448 /* start, start + len should not go beyond eb->len nor overflow */
6449 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6450 return report_eb_range(eb, start, len);
6451
6452 return false;
6453}
6454
1cbb1f45
JM
6455void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6456 unsigned long start, unsigned long len)
d1310b2e
CM
6457{
6458 size_t cur;
6459 size_t offset;
6460 struct page *page;
6461 char *kaddr;
6462 char *dst = (char *)dstv;
884b07d0 6463 unsigned long i = get_eb_page_index(start);
d1310b2e 6464
f98b6215 6465 if (check_eb_range(eb, start, len))
f716abd5 6466 return;
d1310b2e 6467
884b07d0 6468 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6469
d397712b 6470 while (len > 0) {
fb85fc9a 6471 page = eb->pages[i];
d1310b2e 6472
09cbfeaf 6473 cur = min(len, (PAGE_SIZE - offset));
a6591715 6474 kaddr = page_address(page);
d1310b2e 6475 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
6476
6477 dst += cur;
6478 len -= cur;
6479 offset = 0;
6480 i++;
6481 }
6482}
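
/*
 * Minimal usage sketch (hypothetical helper, not part of the original
 * file): copy the fsid out of a tree block header.  Bounds are validated
 * by check_eb_range() inside read_extent_buffer(), and the copy is split
 * across pages transparently.
 */
static void read_header_fsid_example(const struct extent_buffer *eb, u8 *fsid)
{
	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE);
}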
d1310b2e 6483
a48b73ec
JB
6484int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6485 void __user *dstv,
6486 unsigned long start, unsigned long len)
550ac1d8
GH
6487{
6488 size_t cur;
6489 size_t offset;
6490 struct page *page;
6491 char *kaddr;
6492 char __user *dst = (char __user *)dstv;
884b07d0 6493 unsigned long i = get_eb_page_index(start);
550ac1d8
GH
6494 int ret = 0;
6495
6496 WARN_ON(start > eb->len);
6497 WARN_ON(start + len > eb->start + eb->len);
6498
884b07d0 6499 offset = get_eb_offset_in_page(eb, start);
550ac1d8
GH
6500
6501 while (len > 0) {
fb85fc9a 6502 page = eb->pages[i];
550ac1d8 6503
09cbfeaf 6504 cur = min(len, (PAGE_SIZE - offset));
550ac1d8 6505 kaddr = page_address(page);
a48b73ec 6506 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
550ac1d8
GH
6507 ret = -EFAULT;
6508 break;
6509 }
6510
6511 dst += cur;
6512 len -= cur;
6513 offset = 0;
6514 i++;
6515 }
6516
6517 return ret;
6518}
6519
1cbb1f45
JM
6520int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6521 unsigned long start, unsigned long len)
d1310b2e
CM
6522{
6523 size_t cur;
6524 size_t offset;
6525 struct page *page;
6526 char *kaddr;
6527 char *ptr = (char *)ptrv;
884b07d0 6528 unsigned long i = get_eb_page_index(start);
d1310b2e
CM
6529 int ret = 0;
6530
f98b6215
QW
6531 if (check_eb_range(eb, start, len))
6532 return -EINVAL;
d1310b2e 6533
884b07d0 6534 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6535
d397712b 6536 while (len > 0) {
fb85fc9a 6537 page = eb->pages[i];
d1310b2e 6538
09cbfeaf 6539 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 6540
a6591715 6541 kaddr = page_address(page);
d1310b2e 6542 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
6543 if (ret)
6544 break;
6545
6546 ptr += cur;
6547 len -= cur;
6548 offset = 0;
6549 i++;
6550 }
6551 return ret;
6552}
d1310b2e 6553
b8f95771
QW
6554/*
6555 * Check that the extent buffer is uptodate.
6556 *
6557 * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
6558 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6559 */
6560static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6561 struct page *page)
6562{
6563 struct btrfs_fs_info *fs_info = eb->fs_info;
6564
6565 if (fs_info->sectorsize < PAGE_SIZE) {
6566 bool uptodate;
6567
6568 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6569 eb->start, eb->len);
6570 WARN_ON(!uptodate);
6571 } else {
6572 WARN_ON(!PageUptodate(page));
6573 }
6574}
6575
2b48966a 6576void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
f157bf76
DS
6577 const void *srcv)
6578{
6579 char *kaddr;
6580
b8f95771 6581 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
6582 kaddr = page_address(eb->pages[0]) +
6583 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6584 chunk_tree_uuid));
6585 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
6586}
6587
2b48966a 6588void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
f157bf76
DS
6589{
6590 char *kaddr;
6591
b8f95771 6592 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
6593 kaddr = page_address(eb->pages[0]) +
6594 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6595 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
6596}
6597
2b48966a 6598void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
d1310b2e
CM
6599 unsigned long start, unsigned long len)
6600{
6601 size_t cur;
6602 size_t offset;
6603 struct page *page;
6604 char *kaddr;
6605 char *src = (char *)srcv;
884b07d0 6606 unsigned long i = get_eb_page_index(start);
d1310b2e 6607
d3575156
NA
6608 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6609
f98b6215
QW
6610 if (check_eb_range(eb, start, len))
6611 return;
d1310b2e 6612
884b07d0 6613 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6614
d397712b 6615 while (len > 0) {
fb85fc9a 6616 page = eb->pages[i];
b8f95771 6617 assert_eb_page_uptodate(eb, page);
d1310b2e 6618
09cbfeaf 6619 cur = min(len, PAGE_SIZE - offset);
a6591715 6620 kaddr = page_address(page);
d1310b2e 6621 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
6622
6623 src += cur;
6624 len -= cur;
6625 offset = 0;
6626 i++;
6627 }
6628}
d1310b2e 6629
2b48966a 6630void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
b159fa28 6631 unsigned long len)
d1310b2e
CM
6632{
6633 size_t cur;
6634 size_t offset;
6635 struct page *page;
6636 char *kaddr;
884b07d0 6637 unsigned long i = get_eb_page_index(start);
d1310b2e 6638
f98b6215
QW
6639 if (check_eb_range(eb, start, len))
6640 return;
d1310b2e 6641
884b07d0 6642 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6643
d397712b 6644 while (len > 0) {
fb85fc9a 6645 page = eb->pages[i];
b8f95771 6646 assert_eb_page_uptodate(eb, page);
d1310b2e 6647
09cbfeaf 6648 cur = min(len, PAGE_SIZE - offset);
a6591715 6649 kaddr = page_address(page);
b159fa28 6650 memset(kaddr + offset, 0, cur);
d1310b2e
CM
6651
6652 len -= cur;
6653 offset = 0;
6654 i++;
6655 }
6656}
d1310b2e 6657
2b48966a
DS
6658void copy_extent_buffer_full(const struct extent_buffer *dst,
6659 const struct extent_buffer *src)
58e8012c
DS
6660{
6661 int i;
cc5e31a4 6662 int num_pages;
58e8012c
DS
6663
6664 ASSERT(dst->len == src->len);
6665
884b07d0
QW
6666 if (dst->fs_info->sectorsize == PAGE_SIZE) {
6667 num_pages = num_extent_pages(dst);
6668 for (i = 0; i < num_pages; i++)
6669 copy_page(page_address(dst->pages[i]),
6670 page_address(src->pages[i]));
6671 } else {
6672 size_t src_offset = get_eb_offset_in_page(src, 0);
6673 size_t dst_offset = get_eb_offset_in_page(dst, 0);
6674
6675 ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
6676 memcpy(page_address(dst->pages[0]) + dst_offset,
6677 page_address(src->pages[0]) + src_offset,
6678 src->len);
6679 }
58e8012c
DS
6680}
6681
2b48966a
DS
6682void copy_extent_buffer(const struct extent_buffer *dst,
6683 const struct extent_buffer *src,
d1310b2e
CM
6684 unsigned long dst_offset, unsigned long src_offset,
6685 unsigned long len)
6686{
6687 u64 dst_len = dst->len;
6688 size_t cur;
6689 size_t offset;
6690 struct page *page;
6691 char *kaddr;
884b07d0 6692 unsigned long i = get_eb_page_index(dst_offset);
d1310b2e 6693
f98b6215
QW
6694 if (check_eb_range(dst, dst_offset, len) ||
6695 check_eb_range(src, src_offset, len))
6696 return;
6697
d1310b2e
CM
6698 WARN_ON(src->len != dst_len);
6699
884b07d0 6700 offset = get_eb_offset_in_page(dst, dst_offset);
d1310b2e 6701
d397712b 6702 while (len > 0) {
fb85fc9a 6703 page = dst->pages[i];
b8f95771 6704 assert_eb_page_uptodate(dst, page);
d1310b2e 6705
09cbfeaf 6706 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 6707
a6591715 6708 kaddr = page_address(page);
d1310b2e 6709 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
6710
6711 src_offset += cur;
6712 len -= cur;
6713 offset = 0;
6714 i++;
6715 }
6716}
d1310b2e 6717
3e1e8bb7
OS
6718/*
6719 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
6720 * given bit number
6721 * @eb: the extent buffer
6722 * @start: offset of the bitmap item in the extent buffer
6723 * @nr: bit number
6724 * @page_index: return index of the page in the extent buffer that contains the
6725 * given bit number
6726 * @page_offset: return offset into the page given by page_index
6727 *
6728 * This helper hides the ugliness of finding the byte in an extent buffer which
6729 * contains a given bit.
6730 */
2b48966a 6731static inline void eb_bitmap_offset(const struct extent_buffer *eb,
3e1e8bb7
OS
6732 unsigned long start, unsigned long nr,
6733 unsigned long *page_index,
6734 size_t *page_offset)
6735{
3e1e8bb7
OS
6736 size_t byte_offset = BIT_BYTE(nr);
6737 size_t offset;
6738
6739 /*
6740 * The byte we want is the offset of the extent buffer + the offset of
6741 * the bitmap item in the extent buffer + the offset of the byte in the
6742 * bitmap item.
6743 */
884b07d0 6744 offset = start + offset_in_page(eb->start) + byte_offset;
3e1e8bb7 6745
09cbfeaf 6746 *page_index = offset >> PAGE_SHIFT;
7073017a 6747 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
6748}
6749
6750/**
6751 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
6752 * @eb: the extent buffer
6753 * @start: offset of the bitmap item in the extent buffer
6754 * @nr: bit number to test
6755 */
2b48966a 6756int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
6757 unsigned long nr)
6758{
2fe1d551 6759 u8 *kaddr;
3e1e8bb7
OS
6760 struct page *page;
6761 unsigned long i;
6762 size_t offset;
6763
6764 eb_bitmap_offset(eb, start, nr, &i, &offset);
6765 page = eb->pages[i];
b8f95771 6766 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
6767 kaddr = page_address(page);
6768 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
6769}
6770
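/*
 * Minimal usage sketch for extent_buffer_test_bit() (hypothetical helper,
 * not taken from this file): count how many of the first nr_bits bits of
 * the bitmap item at byte offset bitmap_start inside eb are set.
 */
static unsigned long count_set_bits(const struct extent_buffer *eb,
				    unsigned long bitmap_start,
				    unsigned long nr_bits)
{
	unsigned long i;
	unsigned long set = 0;

	for (i = 0; i < nr_bits; i++)
		if (extent_buffer_test_bit(eb, bitmap_start, i))
			set++;
	return set;
}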
6771/**
6772 * extent_buffer_bitmap_set - set an area of a bitmap
6773 * @eb: the extent buffer
6774 * @start: offset of the bitmap item in the extent buffer
6775 * @pos: bit number of the first bit
6776 * @len: number of bits to set
6777 */
2b48966a 6778void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
6779 unsigned long pos, unsigned long len)
6780{
2fe1d551 6781 u8 *kaddr;
3e1e8bb7
OS
6782 struct page *page;
6783 unsigned long i;
6784 size_t offset;
6785 const unsigned int size = pos + len;
6786 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 6787 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
6788
6789 eb_bitmap_offset(eb, start, pos, &i, &offset);
6790 page = eb->pages[i];
b8f95771 6791 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
6792 kaddr = page_address(page);
6793
6794 while (len >= bits_to_set) {
6795 kaddr[offset] |= mask_to_set;
6796 len -= bits_to_set;
6797 bits_to_set = BITS_PER_BYTE;
9c894696 6798 mask_to_set = ~0;
09cbfeaf 6799 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
6800 offset = 0;
6801 page = eb->pages[++i];
b8f95771 6802 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
6803 kaddr = page_address(page);
6804 }
6805 }
6806 if (len) {
6807 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
6808 kaddr[offset] |= mask_to_set;
6809 }
6810}
6811
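/*
 * Worked example for the loop above (illustrative numbers only): with
 * pos == 5 and len == 10, bits_to_set starts as 3 and mask_to_set covers
 * bits 5-7 of the first byte; after that pass len == 7, less than a full
 * byte, so the tail mask BITMAP_LAST_BYTE_MASK(15) ORs in bits 0-6 of the
 * next byte.  Net effect: bits 5..14 of the bitmap item are set.
 */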
6812
6813/**
6814 * extent_buffer_bitmap_clear - clear an area of a bitmap
6815 * @eb: the extent buffer
6816 * @start: offset of the bitmap item in the extent buffer
6817 * @pos: bit number of the first bit
6818 * @len: number of bits to clear
6819 */
2b48966a
DS
6820void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
6821 unsigned long start, unsigned long pos,
6822 unsigned long len)
3e1e8bb7 6823{
2fe1d551 6824 u8 *kaddr;
3e1e8bb7
OS
6825 struct page *page;
6826 unsigned long i;
6827 size_t offset;
6828 const unsigned int size = pos + len;
6829 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 6830 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
6831
6832 eb_bitmap_offset(eb, start, pos, &i, &offset);
6833 page = eb->pages[i];
b8f95771 6834 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
6835 kaddr = page_address(page);
6836
6837 while (len >= bits_to_clear) {
6838 kaddr[offset] &= ~mask_to_clear;
6839 len -= bits_to_clear;
6840 bits_to_clear = BITS_PER_BYTE;
9c894696 6841 mask_to_clear = ~0;
09cbfeaf 6842 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
6843 offset = 0;
6844 page = eb->pages[++i];
b8f95771 6845 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
6846 kaddr = page_address(page);
6847 }
6848 }
6849 if (len) {
6850 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
6851 kaddr[offset] &= ~mask_to_clear;
6852 }
6853}
6854
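/*
 * Minimal round-trip sketch for the three bitmap helpers (hypothetical
 * helper and offsets, not taken from this file): set a range of bits,
 * check one in the middle, then clear the range again.
 */
static void bitmap_helpers_round_trip(const struct extent_buffer *eb,
				      unsigned long bitmap_start)
{
	extent_buffer_bitmap_set(eb, bitmap_start, 5, 10);
	ASSERT(extent_buffer_test_bit(eb, bitmap_start, 12));
	extent_buffer_bitmap_clear(eb, bitmap_start, 5, 10);
	ASSERT(!extent_buffer_test_bit(eb, bitmap_start, 12));
}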
3387206f
ST
6855static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
6856{
6857 unsigned long distance = (src > dst) ? src - dst : dst - src;
6858 return distance < len;
6859}
6860
d1310b2e
CM
6861static void copy_pages(struct page *dst_page, struct page *src_page,
6862 unsigned long dst_off, unsigned long src_off,
6863 unsigned long len)
6864{
a6591715 6865 char *dst_kaddr = page_address(dst_page);
d1310b2e 6866 char *src_kaddr;
727011e0 6867 int must_memmove = 0;
d1310b2e 6868
3387206f 6869 if (dst_page != src_page) {
a6591715 6870 src_kaddr = page_address(src_page);
3387206f 6871 } else {
d1310b2e 6872 src_kaddr = dst_kaddr;
727011e0
CM
6873 if (areas_overlap(src_off, dst_off, len))
6874 must_memmove = 1;
3387206f 6875 }
d1310b2e 6876
727011e0
CM
6877 if (must_memmove)
6878 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
6879 else
6880 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
6881}
6882
2b48966a
DS
6883void memcpy_extent_buffer(const struct extent_buffer *dst,
6884 unsigned long dst_offset, unsigned long src_offset,
6885 unsigned long len)
d1310b2e
CM
6886{
6887 size_t cur;
6888 size_t dst_off_in_page;
6889 size_t src_off_in_page;
d1310b2e
CM
6890 unsigned long dst_i;
6891 unsigned long src_i;
6892
f98b6215
QW
6893 if (check_eb_range(dst, dst_offset, len) ||
6894 check_eb_range(dst, src_offset, len))
6895 return;
d1310b2e 6896
d397712b 6897 while (len > 0) {
884b07d0
QW
6898 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
6899 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
d1310b2e 6900
884b07d0
QW
6901 dst_i = get_eb_page_index(dst_offset);
6902 src_i = get_eb_page_index(src_offset);
d1310b2e 6903
09cbfeaf 6904 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
6905 src_off_in_page));
6906 cur = min_t(unsigned long, cur,
09cbfeaf 6907 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 6908
fb85fc9a 6909 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6910 dst_off_in_page, src_off_in_page, cur);
6911
6912 src_offset += cur;
6913 dst_offset += cur;
6914 len -= cur;
6915 }
6916}
d1310b2e 6917
2b48966a
DS
6918void memmove_extent_buffer(const struct extent_buffer *dst,
6919 unsigned long dst_offset, unsigned long src_offset,
6920 unsigned long len)
d1310b2e
CM
6921{
6922 size_t cur;
6923 size_t dst_off_in_page;
6924 size_t src_off_in_page;
6925 unsigned long dst_end = dst_offset + len - 1;
6926 unsigned long src_end = src_offset + len - 1;
d1310b2e
CM
6927 unsigned long dst_i;
6928 unsigned long src_i;
6929
f98b6215
QW
6930 if (check_eb_range(dst, dst_offset, len) ||
6931 check_eb_range(dst, src_offset, len))
6932 return;
727011e0 6933 if (dst_offset < src_offset) {
d1310b2e
CM
6934 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6935 return;
6936 }
d397712b 6937 while (len > 0) {
884b07d0
QW
6938 dst_i = get_eb_page_index(dst_end);
6939 src_i = get_eb_page_index(src_end);
d1310b2e 6940
884b07d0
QW
6941 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
6942 src_off_in_page = get_eb_offset_in_page(dst, src_end);
d1310b2e
CM
6943
6944 cur = min_t(unsigned long, len, src_off_in_page + 1);
6945 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 6946 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6947 dst_off_in_page - cur + 1,
6948 src_off_in_page - cur + 1, cur);
6949
6950 dst_end -= cur;
6951 src_end -= cur;
6952 len -= cur;
6953 }
6954}
6af118ce 6955
d1e86e3f
QW
6956static struct extent_buffer *get_next_extent_buffer(
6957 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
6958{
6959 struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
6960 struct extent_buffer *found = NULL;
6961 u64 page_start = page_offset(page);
6962 int ret;
6963 int i;
6964
6965 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
6966 ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
6967 lockdep_assert_held(&fs_info->buffer_lock);
6968
6969 ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
6970 bytenr >> fs_info->sectorsize_bits,
6971 PAGE_SIZE / fs_info->nodesize);
6972 for (i = 0; i < ret; i++) {
6973 /* Already beyond page end */
6974 if (gang[i]->start >= page_start + PAGE_SIZE)
6975 break;
6976 /* Found one */
6977 if (gang[i]->start >= bytenr) {
6978 found = gang[i];
6979 break;
6980 }
6981 }
6982 return found;
6983}
6984
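/*
 * Note on lock ordering in the helper below: fs_info->buffer_lock is taken
 * first for a stable view of the radix tree, eb->refs_lock is nested inside
 * it to pin the eb, buffer_lock is dropped before the eb is released, and
 * release_extent_buffer() is what drops refs_lock.
 */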
6985static int try_release_subpage_extent_buffer(struct page *page)
6986{
6987 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
6988 u64 cur = page_offset(page);
6989 const u64 end = page_offset(page) + PAGE_SIZE;
6990 int ret;
6991
6992 while (cur < end) {
6993 struct extent_buffer *eb = NULL;
6994
6995 /*
6996 * Unlike try_release_extent_buffer(), which uses page->private
6997 * to grab the buffer, in the subpage case we rely on the radix
6998 * tree, so we need to ensure radix tree consistency.
6999 *
7000 * We also want an atomic snapshot of the radix tree, so we go
7001 * with a spinlock rather than RCU.
7002 */
7003 spin_lock(&fs_info->buffer_lock);
7004 eb = get_next_extent_buffer(fs_info, page, cur);
7005 if (!eb) {
7006 /* No more ebs in the page range at or after cur */
7007 spin_unlock(&fs_info->buffer_lock);
7008 break;
7009 }
7010 cur = eb->start + eb->len;
7011
7012 /*
7013 * The same as try_release_extent_buffer(), to ensure the eb
7014 * won't disappear out from under us.
7015 */
7016 spin_lock(&eb->refs_lock);
7017 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7018 spin_unlock(&eb->refs_lock);
7019 spin_unlock(&fs_info->buffer_lock);
7020 break;
7021 }
7022 spin_unlock(&fs_info->buffer_lock);
7023
7024 /*
7025 * If the tree ref isn't set then we know the ref on this eb is a
7026 * real ref, so just return; this eb will likely be freed soon
7027 * anyway.
7028 */
7029 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7030 spin_unlock(&eb->refs_lock);
7031 break;
7032 }
7033
7034 /*
7035 * We don't care about the return value here; we always
7036 * check the page private at the end, and
7037 * release_extent_buffer() will release the refs_lock.
7038 */
7039 release_extent_buffer(eb);
7040 }
7041 /*
7042 * Finally, check whether page private has been cleared: if we have
7043 * released all ebs in the page, page private should be cleared by now.
7044 */
7045 spin_lock(&page->mapping->private_lock);
7046 if (!PagePrivate(page))
7047 ret = 1;
7048 else
7049 ret = 0;
7050 spin_unlock(&page->mapping->private_lock);
7051 return ret;
7052
7053}
7054
f7a52a40 7055int try_release_extent_buffer(struct page *page)
19fe0a8b 7056{
6af118ce 7057 struct extent_buffer *eb;
6af118ce 7058
d1e86e3f
QW
7059 if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
7060 return try_release_subpage_extent_buffer(page);
7061
3083ee2e 7062 /*
d1e86e3f
QW
7063 * We need to make sure nobody is changing page->private, as we rely on
7064 * page->private as the pointer to the extent buffer.
3083ee2e
JB
7065 */
7066 spin_lock(&page->mapping->private_lock);
7067 if (!PagePrivate(page)) {
7068 spin_unlock(&page->mapping->private_lock);
4f2de97a 7069 return 1;
45f49bce 7070 }
6af118ce 7071
3083ee2e
JB
7072 eb = (struct extent_buffer *)page->private;
7073 BUG_ON(!eb);
19fe0a8b
MX
7074
7075 /*
3083ee2e
JB
7076 * This is a little awful but should be OK: we need to make sure that
7077 * the eb doesn't disappear out from under us while we're looking at
7078 * this page.
19fe0a8b 7079 */
3083ee2e 7080 spin_lock(&eb->refs_lock);
0b32f4bb 7081 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
7082 spin_unlock(&eb->refs_lock);
7083 spin_unlock(&page->mapping->private_lock);
7084 return 0;
b9473439 7085 }
3083ee2e 7086 spin_unlock(&page->mapping->private_lock);
897ca6e9 7087
19fe0a8b 7088 /*
3083ee2e
JB
7089 * If the tree ref isn't set then we know the ref on this eb is a real ref,
7090 * so just return; this page will likely be freed soon anyway.
19fe0a8b 7091 */
3083ee2e
JB
7092 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7093 spin_unlock(&eb->refs_lock);
7094 return 0;
b9473439 7095 }
19fe0a8b 7096
f7a52a40 7097 return release_extent_buffer(eb);
6af118ce 7098}
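/*
 * Minimal sketch of the caller side (illustrative; in the real tree the
 * btree address space's release hook does the equivalent): the page cache
 * asks whether a metadata page can be dropped, and we only agree once every
 * eb backed by the page has been released.
 */
static int example_btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	/* Dirty or writeback metadata pages must not be released. */
	if (PageWriteback(page) || PageDirty(page))
		return 0;
	return try_release_extent_buffer(page);
}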
bfb484d9
JB
7099
7100/*
7101 * btrfs_readahead_tree_block - attempt to readahead a child block
7102 * @fs_info: the fs_info
7103 * @bytenr: bytenr to read
3fbaf258 7104 * @owner_root: objectid of the root that owns this eb
bfb484d9 7105 * @gen: generation for the uptodate check, can be 0
3fbaf258 7106 * @level: level for the eb
bfb484d9
JB
7107 *
7108 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
7109 * normal uptodate check of the eb, without checking the generation. If we have
7110 * to read the block, we will not block on anything.
7111 */
7112void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
3fbaf258 7113 u64 bytenr, u64 owner_root, u64 gen, int level)
bfb484d9
JB
7114{
7115 struct extent_buffer *eb;
7116 int ret;
7117
3fbaf258 7118 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
bfb484d9
JB
7119 if (IS_ERR(eb))
7120 return;
7121
7122 if (btrfs_buffer_uptodate(eb, gen, 1)) {
7123 free_extent_buffer(eb);
7124 return;
7125 }
7126
7127 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7128 if (ret < 0)
7129 free_extent_buffer_stale(eb);
7130 else
7131 free_extent_buffer(eb);
7132}
7133
7134/*
7135 * btrfs_readahead_node_child - readahead a node's child block
7136 * @node: parent node we're reading from
7137 * @slot: slot in the parent node for the child we want to read
7138 *
7139 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
7140 * the slot in the node provided.
7141 */
7142void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7143{
7144 btrfs_readahead_tree_block(node->fs_info,
7145 btrfs_node_blockptr(node, slot),
3fbaf258
JB
7146 btrfs_header_owner(node),
7147 btrfs_node_ptr_generation(node, slot),
7148 btrfs_header_level(node) - 1);
bfb484d9 7149}
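/*
 * Minimal usage sketch (hypothetical helper, not taken from this file):
 * kick off readahead for every child of a node before walking into it.
 */
static void example_readahead_all_children(struct extent_buffer *node)
{
	int nritems = btrfs_header_nritems(node);
	int slot;

	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}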