// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "dev-replace.h"
#include "super.h"
#include "transaction.h"

static struct kmem_cache *extent_buffer_cache;

#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	unsigned long flags;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	list_add(&eb->leak_list, &fs_info->allocated_ebs);
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	unsigned long flags;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	list_del(&eb->leak_list);
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	unsigned long flags;

	/*
	 * If we didn't get into open_ctree our allocated_ebs will not be
	 * initialized, so just skip this.
	 */
	if (!fs_info->allocated_ebs.next)
		return;

	WARN_ON(!list_empty(&fs_info->allocated_ebs));
	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	while (!list_empty(&fs_info->allocated_ebs)) {
		eb = list_first_entry(&fs_info->allocated_ebs,
				      struct extent_buffer, leak_list);
		pr_err(
	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
		       btrfs_header_owner(eb));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
#else
#define btrfs_leak_debug_add_eb(eb)			do {} while (0)
#define btrfs_leak_debug_del_eb(eb)			do {} while (0)
#endif

/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
	struct btrfs_bio *bbio;
	enum btrfs_compression_type compress_type;
	u32 len_to_oe_boundary;
	blk_opf_t opf;
	btrfs_bio_end_io_t end_io_func;
	struct writeback_control *wbc;
};

static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
	struct btrfs_bio *bbio = bio_ctrl->bbio;

	if (!bbio)
		return;

	/* Caller should ensure the bio has at least some range added */
	ASSERT(bbio->bio.bi_iter.bi_size);

	if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
	    bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
		btrfs_submit_compressed_read(bbio);
	else
		btrfs_submit_bio(bbio, 0);

	/* The bbio is owned by the end_io handler now */
	bio_ctrl->bbio = NULL;
}

/*
 * Submit or fail the current bio in the bio_ctrl structure.
 */
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
	struct btrfs_bio *bbio = bio_ctrl->bbio;

	if (!bbio)
		return;

	if (ret) {
		ASSERT(ret < 0);
		btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
		/* The bio is owned by the end_io handler now */
		bio_ctrl->bbio = NULL;
	} else {
		submit_one_bio(bio_ctrl);
	}
}

int __init extent_buffer_init_cachep(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
						sizeof(struct extent_buffer), 0,
						SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	return 0;
}

void __cold extent_buffer_free_cachep(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
}

void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	struct address_space *mapping = inode->i_mapping;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct folio *folio;

	while (index <= end_index) {
		folio = filemap_get_folio(mapping, index);
		filemap_dirty_folio(mapping, folio);
		folio_account_redirty(folio);
		index += folio_nr_pages(folio);
		folio_put(folio);
	}
}

static void process_one_page(struct btrfs_fs_info *fs_info,
			     struct page *page, struct page *locked_page,
			     unsigned long page_ops, u64 start, u64 end)
{
	u32 len;

	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
	len = end + 1 - start;

	if (page_ops & PAGE_SET_ORDERED)
		btrfs_page_clamp_set_ordered(fs_info, page, start, len);
	if (page_ops & PAGE_START_WRITEBACK) {
		btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
		btrfs_page_clamp_set_writeback(fs_info, page, start, len);
	}
	if (page_ops & PAGE_END_WRITEBACK)
		btrfs_page_clamp_clear_writeback(fs_info, page, start, len);

	if (page != locked_page && (page_ops & PAGE_UNLOCK))
		btrfs_page_end_writer_lock(fs_info, page, start, len);
}

static void __process_pages_contig(struct address_space *mapping,
				   struct page *locked_page, u64 start, u64 end,
				   unsigned long page_ops)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	pgoff_t index = start_index;
	struct folio_batch fbatch;
	int i;

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		int found_folios;

		found_folios = filemap_get_folios_contig(mapping, &index,
				end_index, &fbatch);
		for (i = 0; i < found_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			process_one_page(fs_info, &folio->page, locked_page,
					 page_ops, start, end);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}
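
/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to drive __process_pages_contig().  The page_ops bitmask selects
 * which state transitions process_one_page() applies to every page that
 * overlaps [start, end], while @locked_page is skipped for PAGE_UNLOCK so
 * the caller keeps ownership of it.  The flag names are the ones used
 * elsewhere in this file; the particular combination below is only an
 * example:
 *
 *	// Finish writeback and drop our page locks for a delalloc range,
 *	// keeping locked_page locked for the caller.
 *	__process_pages_contig(inode->i_mapping, locked_page, start, end,
 *			       PAGE_END_WRITEBACK | PAGE_UNLOCK);
 */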

static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;

	ASSERT(locked_page);
	if (index == locked_page->index && end_index == index)
		return;

	__process_pages_contig(inode->i_mapping, locked_page, start, end,
			       PAGE_UNLOCK);
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 start,
					u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	pgoff_t index = start_index;
	u64 processed_end = start;
	struct folio_batch fbatch;

	if (index == locked_page->index && index == end_index)
		return 0;

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		unsigned int found_folios, i;

		found_folios = filemap_get_folios_contig(mapping, &index,
				end_index, &fbatch);
		if (found_folios == 0)
			goto out;

		for (i = 0; i < found_folios; i++) {
			struct page *page = &fbatch.folios[i]->page;
			u32 len = end + 1 - start;

			if (page == locked_page)
				continue;

			if (btrfs_page_start_writer_lock(fs_info, page, start,
							 len))
				goto out;

			if (!PageDirty(page) || page->mapping != mapping) {
				btrfs_page_end_writer_lock(fs_info, page, start,
							   len);
				goto out;
			}

			processed_end = page_offset(page) + PAGE_SIZE - 1;
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	return 0;
out:
	folio_batch_release(&fbatch);
	if (processed_end > start)
		__unlock_for_delalloc(inode, locked_page, start, processed_end);
	return -EAGAIN;
}

/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than @max_bytes.
 *
 * @start:	The original start bytenr to search.
 *		Will store the extent range start bytenr.
 * @end:	The original end bytenr of the search range
 *		Will store the extent range end bytenr.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
						 struct page *locked_page, u64 *start,
						 u64 *end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	const u64 orig_start = *start;
	const u64 orig_end = *end;
	/* The sanity tests may not set a valid fs_info. */
	u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
	u64 delalloc_start;
	u64 delalloc_end;
	bool found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

	/* Caller should pass a valid @end to indicate the search range end */
	ASSERT(orig_end > orig_start);

	/* The range should at least cover part of the page */
	ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
		 orig_end <= page_offset(locked_page)));
again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
					  max_bytes, &cached_state);
	if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
		*start = delalloc_start;

		/* @delalloc_end can be -1, never go beyond @orig_end */
		*end = min(delalloc_end, orig_end);
		free_extent_state(cached_state);
		return false;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	ASSERT(!ret || ret == -EAGAIN);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, let's avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			max_bytes = PAGE_SIZE;
			loops = 1;
			goto again;
		} else {
			found = false;
			goto out_failed;
		}
	}

	/* step three, lock the state bits for the whole range */
	lock_extent(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent(tree, delalloc_start, delalloc_end,
			      &cached_state);
		__unlock_for_delalloc(inode, locked_page,
				      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}
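
/*
 * Illustrative sketch (not part of the original file): how
 * writepage_delalloc() below consumes find_lock_delalloc_range().  On a
 * false return the helper still stores the non-delalloc gap back into
 * @start/@end, so the caller simply skips past it and keeps scanning until
 * the whole page range has been covered:
 *
 *	while (delalloc_start < page_end) {
 *		u64 delalloc_end = page_end;
 *
 *		if (!find_lock_delalloc_range(inode, locked_page,
 *					      &delalloc_start, &delalloc_end)) {
 *			delalloc_start = delalloc_end + 1;
 *			continue;
 *		}
 *		// run delalloc (COW/compression/inline) for the locked range
 *		delalloc_start = delalloc_end + 1;
 *	}
 */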

void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
				  struct page *locked_page,
				  u32 clear_bits, unsigned long page_ops)
{
	clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);

	__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
			       start, end, page_ops);
}

static bool btrfs_verify_page(struct page *page, u64 start)
{
	if (!fsverity_active(page->mapping->host) ||
	    PageUptodate(page) ||
	    start >= i_size_read(page->mapping->host))
		return true;
	return fsverity_verify_page(page);
}

static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);

	ASSERT(page_offset(page) <= start &&
	       start + len <= page_offset(page) + PAGE_SIZE);

	if (uptodate && btrfs_verify_page(page, start))
		btrfs_page_set_uptodate(fs_info, page, start, len);
	else
		btrfs_page_clear_uptodate(fs_info, page, start, len);

	if (!btrfs_is_subpage(fs_info, page))
		unlock_page(page);
	else
		btrfs_subpage_end_reader(fs_info, page, start, len);
}

/* lots and lots of room for performance fixes in the end_bio funcs */

void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
	struct btrfs_inode *inode;
	const bool uptodate = (err == 0);
	int ret = 0;

	ASSERT(page && page->mapping);
	inode = BTRFS_I(page->mapping->host);
	btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);

	if (!uptodate) {
		const struct btrfs_fs_info *fs_info = inode->root->fs_info;
		u32 len;

		ASSERT(end + 1 - start <= U32_MAX);
		len = end + 1 - start;

		btrfs_page_clear_uptodate(fs_info, page, start, len);
		ret = err < 0 ? err : -EIO;
		mapping_set_error(page->mapping, ret);
	}
}

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_writepage(struct btrfs_bio *bbio)
{
	struct bio *bio = &bbio->bio;
	int error = blk_status_to_errno(bio->bi_status);
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;
		struct inode *inode = page->mapping->host;
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
		const u32 sectorsize = fs_info->sectorsize;
		u64 start = page_offset(page) + bvec->bv_offset;
		u32 len = bvec->bv_len;

		/* Our read/write should always be sector aligned. */
		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
			btrfs_err(fs_info,
		"partial page write in btrfs with offset %u and length %u",
				  bvec->bv_offset, bvec->bv_len);
		else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
			btrfs_info(fs_info,
		"incomplete page write with offset %u and length %u",
				   bvec->bv_offset, bvec->bv_len);

		btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
		if (error) {
			btrfs_page_clear_uptodate(fs_info, page, start, len);
			mapping_set_error(page->mapping, error);
		}
		btrfs_page_clear_writeback(fs_info, page, start, len);
	}

	bio_put(bio);
}

/*
 * Record previously processed extent range
 *
 * For endio_readpage_release_extent() to handle a full extent range, reducing
 * the extent io operations.
 */
struct processed_extent {
	struct btrfs_inode *inode;
	/* Start of the range in @inode */
	u64 start;
	/* End of the range in @inode */
	u64 end;
	bool uptodate;
};

/*
 * Try to release processed extent range
 *
 * May not release the extent range right now if the current range is
 * contiguous to processed extent.
 *
 * Will release processed extent when any of @inode, @uptodate, the range is
 * no longer contiguous to the processed range.
 *
 * Passing @inode == NULL will force processed extent to be released.
 */
static void endio_readpage_release_extent(struct processed_extent *processed,
					  struct btrfs_inode *inode, u64 start, u64 end,
					  bool uptodate)
{
	struct extent_state *cached = NULL;
	struct extent_io_tree *tree;

	/* The first extent, initialize @processed */
	if (!processed->inode)
		goto update;

	/*
	 * Contiguous to processed extent, just uptodate the end.
	 *
	 * Several things to notice:
	 *
	 * - bio can be merged as long as on-disk bytenr is contiguous
	 *   This means we can have page belonging to other inodes, thus need to
	 *   check if the inode still matches.
	 * - bvec can contain range beyond current page for multi-page bvec
	 *   Thus we need to do processed->end + 1 >= start check
	 */
	if (processed->inode == inode && processed->uptodate == uptodate &&
	    processed->end + 1 >= start && end >= processed->end) {
		processed->end = end;
		return;
	}

	tree = &processed->inode->io_tree;
	/*
	 * Now we don't have range contiguous to the processed range, release
	 * the processed range now.
	 */
	unlock_extent(tree, processed->start, processed->end, &cached);

update:
	/* Update processed to current range */
	processed->inode = inode;
	processed->start = start;
	processed->end = end;
	processed->uptodate = uptodate;
}
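
/*
 * Worked example (illustrative, not part of the original file): with a 4K
 * sector size, a read bio completing ranges [0, 4095] and [4096, 8191] of
 * the same inode with the same uptodate status results in a single
 * unlock_extent() call covering [0, 8191].  The first call only primes
 * @processed, the second extends processed->end, and the final flush with
 * inode == NULL releases the merged range:
 *
 *	struct processed_extent processed = { 0 };
 *
 *	endio_readpage_release_extent(&processed, inode, 0, 4095, true);
 *	endio_readpage_release_extent(&processed, inode, 4096, 8191, true);
 *	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
 */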

static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
	ASSERT(PageLocked(page));
	if (!btrfs_is_subpage(fs_info, page))
		return;

	ASSERT(PagePrivate(page));
	btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_readpage(struct btrfs_bio *bbio)
{
	struct bio *bio = &bbio->bio;
	struct bio_vec *bvec;
	struct processed_extent processed = { 0 };
	/*
	 * The offset to the beginning of a bio, since one bio can never be
	 * larger than UINT_MAX, u32 here is enough.
	 */
	u32 bio_offset = 0;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		bool uptodate = !bio->bi_status;
		struct page *page = bvec->bv_page;
		struct inode *inode = page->mapping->host;
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
		const u32 sectorsize = fs_info->sectorsize;
		u64 start;
		u64 end;
		u32 len;

		btrfs_debug(fs_info,
			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
			bio->bi_iter.bi_sector, bio->bi_status,
			bbio->mirror_num);

		/*
		 * We always issue full-sector reads, but if some block in a
		 * page fails to read, blk_update_request() will advance
		 * bv_offset and adjust bv_len to compensate.  Print a warning
		 * for unaligned offsets, and an error if they don't add up to
		 * a full sector.
		 */
		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
			btrfs_err(fs_info,
		"partial page read in btrfs with offset %u and length %u",
				  bvec->bv_offset, bvec->bv_len);
		else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
				     sectorsize))
			btrfs_info(fs_info,
		"incomplete page read with offset %u and length %u",
				   bvec->bv_offset, bvec->bv_len);

		start = page_offset(page) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;
		len = bvec->bv_len;

		if (likely(uptodate)) {
			loff_t i_size = i_size_read(inode);
			pgoff_t end_index = i_size >> PAGE_SHIFT;

			/*
			 * Zero out the remaining part if this range straddles
			 * i_size.
			 *
			 * Here we should only zero the range inside the bvec,
			 * not touch anything else.
			 *
			 * NOTE: i_size is exclusive while end is inclusive.
			 */
			if (page->index == end_index && i_size <= end) {
				u32 zero_start = max(offset_in_page(i_size),
						     offset_in_page(start));

				zero_user_segment(page, zero_start,
						  offset_in_page(end) + 1);
			}
		}

		/* Update page status and unlock. */
		end_page_read(page, uptodate, start, len);
		endio_readpage_release_extent(&processed, BTRFS_I(inode),
					      start, end, uptodate);

		ASSERT(bio_offset + len > bio_offset);
		bio_offset += len;

	}
	/* Release the last extent */
	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
	bio_put(bio);
}

/*
 * Populate every free slot in a provided array with pages.
 *
 * @nr_pages:   number of pages to allocate
 * @page_array: the array to fill with pages; any existing non-null entries in
 *		the array will be skipped
 *
 * Return: 0        if all pages were able to be allocated;
 *         -ENOMEM  otherwise, and the caller is responsible for freeing all
 *                  non-null page pointers in the array.
 */
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
{
	unsigned int allocated;

	for (allocated = 0; allocated < nr_pages;) {
		unsigned int last = allocated;

		allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);

		if (allocated == nr_pages)
			return 0;

		/*
		 * During this iteration, no page could be allocated, even
		 * though alloc_pages_bulk_array() falls back to alloc_page()
		 * if it could not bulk-allocate. So we must be out of memory.
		 */
		if (allocated == last)
			return -ENOMEM;

		memalloc_retry_wait(GFP_NOFS);
	}
	return 0;
}
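
/*
 * Illustrative caller sketch (not part of the original file): per the
 * comment above, on -ENOMEM the array may be partially populated and the
 * caller owns whatever was allocated, so a typical user zero-initializes
 * the array and releases the non-NULL slots on failure:
 *
 *	struct page *pages[16] = { 0 };
 *	int i;
 *
 *	if (btrfs_alloc_page_array(ARRAY_SIZE(pages), pages)) {
 *		for (i = 0; i < ARRAY_SIZE(pages); i++)
 *			if (pages[i])
 *				__free_page(pages[i]);
 *		return -ENOMEM;
 *	}
 */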

static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
				struct page *page, u64 disk_bytenr,
				unsigned int pg_offset)
{
	struct bio *bio = &bio_ctrl->bbio->bio;
	struct bio_vec *bvec = bio_last_bvec_all(bio);
	const sector_t sector = disk_bytenr >> SECTOR_SHIFT;

	if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
		/*
		 * For compression, all IO should have its logical bytenr set
		 * to the starting bytenr of the compressed extent.
		 */
		return bio->bi_iter.bi_sector == sector;
	}

	/*
	 * The contig check requires the following conditions to be met:
	 *
	 * 1) The pages are belonging to the same inode
	 *    This is implied by the call chain.
	 *
	 * 2) The range has adjacent logical bytenr
	 *
	 * 3) The range has adjacent file offset
	 *    This is required for the usage of btrfs_bio->file_offset.
	 */
	return bio_end_sector(bio) == sector &&
		page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len ==
		page_offset(page) + pg_offset;
}

static void alloc_new_bio(struct btrfs_inode *inode,
			  struct btrfs_bio_ctrl *bio_ctrl,
			  u64 disk_bytenr, u64 file_offset)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
			       bio_ctrl->end_io_func, NULL);
	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
	bbio->inode = inode;
	bbio->file_offset = file_offset;
	bio_ctrl->bbio = bbio;
	bio_ctrl->len_to_oe_boundary = U32_MAX;

	/* Limit data write bios to the ordered boundary. */
	if (bio_ctrl->wbc) {
		struct btrfs_ordered_extent *ordered;

		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
		if (ordered) {
			bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
					ordered->file_offset +
					ordered->disk_num_bytes - file_offset);
			bbio->ordered = ordered;
		}

		/*
		 * Pick the last added device to support cgroup writeback.  For
		 * multi-device file systems this means blk-cgroup policies have
		 * to always be set on the last added/replaced device.
		 * This is a bit odd but has been like that for a long time.
		 */
		bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
		wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
	}
}

/*
 * @disk_bytenr: logical bytenr where the write will be
 * @page:	page to add to the bio
 * @size:	portion of page that we want to write to
 * @pg_offset:	offset of the new bio or to check whether we are adding
 *		a contiguous page to the previous one
 *
 * This will either add the page into the existing @bio_ctrl->bbio, or allocate
 * a new one in @bio_ctrl->bbio.
 * The mirror number for this IO should already be initialized in
 * @bio_ctrl->mirror_num.
 */
static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
			       u64 disk_bytenr, struct page *page,
			       size_t size, unsigned long pg_offset)
{
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);

	ASSERT(pg_offset + size <= PAGE_SIZE);
	ASSERT(bio_ctrl->end_io_func);

	if (bio_ctrl->bbio &&
	    !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
		submit_one_bio(bio_ctrl);

	do {
		u32 len = size;

		/* Allocate new bio if needed */
		if (!bio_ctrl->bbio) {
			alloc_new_bio(inode, bio_ctrl, disk_bytenr,
				      page_offset(page) + pg_offset);
		}

		/* Cap to the current ordered extent boundary if there is one. */
		if (len > bio_ctrl->len_to_oe_boundary) {
			ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
			ASSERT(is_data_inode(&inode->vfs_inode));
			len = bio_ctrl->len_to_oe_boundary;
		}

		if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) {
			/* bio full: move on to a new one */
			submit_one_bio(bio_ctrl);
			continue;
		}

		if (bio_ctrl->wbc)
			wbc_account_cgroup_owner(bio_ctrl->wbc, page, len);

		size -= len;
		pg_offset += len;
		disk_bytenr += len;

		/*
		 * len_to_oe_boundary defaults to U32_MAX, which isn't page or
		 * sector aligned.  alloc_new_bio() then sets it to the end of
		 * our ordered extent for writes into zoned devices.
		 *
		 * When len_to_oe_boundary is tracking an ordered extent, we
		 * trust the ordered extent code to align things properly, and
		 * the check above to cap our write to the ordered extent
		 * boundary is correct.
		 *
		 * When len_to_oe_boundary is U32_MAX, the cap above would
		 * result in a 4095 byte IO for the last page right before
		 * we hit the bio limit of UINT_MAX.  bio_add_page() has all
		 * the checks required to make sure we don't overflow the bio,
		 * and we should just ignore len_to_oe_boundary completely
		 * unless we're using it to track an ordered extent.
		 *
		 * It's pretty hard to make a bio sized U32_MAX, but it can
		 * happen when the page cache is able to feed us contiguous
		 * pages for large extents.
		 */
		if (bio_ctrl->len_to_oe_boundary != U32_MAX)
			bio_ctrl->len_to_oe_boundary -= len;

		/* Ordered extent boundary: move on to a new bio. */
		if (bio_ctrl->len_to_oe_boundary == 0)
			submit_one_bio(bio_ctrl);
	} while (size);
}
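
/*
 * Illustrative sketch (not part of the original file) of the bio_ctrl
 * lifecycle that submit_extent_page() is part of, as used by the read and
 * write paths later in this file: the caller sets the opf and end_io_func,
 * adds one or more contiguous ranges, and finally flushes whatever bio is
 * still being assembled:
 *
 *	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
 *
 *	bio_ctrl.end_io_func = end_bio_extent_readpage;
 *	submit_extent_page(&bio_ctrl, disk_bytenr, page, len, pg_offset);
 *	...
 *	submit_one_bio(&bio_ctrl);
 */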

static int attach_extent_buffer_page(struct extent_buffer *eb,
				     struct page *page,
				     struct btrfs_subpage *prealloc)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int ret = 0;

	/*
	 * If the page is mapped to btree inode, we should hold the private
	 * lock to prevent race.
	 * For cloned or dummy extent buffers, their pages are not mapped and
	 * will not race with any other ebs.
	 */
	if (page->mapping)
		lockdep_assert_held(&page->mapping->private_lock);

	if (fs_info->nodesize >= PAGE_SIZE) {
		if (!PagePrivate(page))
			attach_page_private(page, eb);
		else
			WARN_ON(page->private != (unsigned long)eb);
		return 0;
	}

	/* Already mapped, just free prealloc */
	if (PagePrivate(page)) {
		btrfs_free_subpage(prealloc);
		return 0;
	}

	if (prealloc)
		/* Has preallocated memory for subpage */
		attach_page_private(page, prealloc);
	else
		/* Do new allocation to attach subpage */
		ret = btrfs_attach_subpage(fs_info, page,
					   BTRFS_SUBPAGE_METADATA);
	return ret;
}

int set_page_extent_mapped(struct page *page)
{
	struct btrfs_fs_info *fs_info;

	ASSERT(page->mapping);

	if (PagePrivate(page))
		return 0;

	fs_info = btrfs_sb(page->mapping->host->i_sb);

	if (btrfs_is_subpage(fs_info, page))
		return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);

	attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
	return 0;
}

void clear_page_extent_mapped(struct page *page)
{
	struct btrfs_fs_info *fs_info;

	ASSERT(page->mapping);

	if (!PagePrivate(page))
		return;

	fs_info = btrfs_sb(page->mapping->host->i_sb);
	if (btrfs_is_subpage(fs_info, page))
		return btrfs_detach_subpage(fs_info, page);

	detach_page_private(page);
}

static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
		 u64 start, u64 len, struct extent_map **em_cached)
{
	struct extent_map *em;

	if (em_cached && *em_cached) {
		em = *em_cached;
		if (extent_map_in_tree(em) && start >= em->start &&
		    start < extent_map_end(em)) {
			refcount_inc(&em->refs);
			return em;
		}

		free_extent_map(em);
		*em_cached = NULL;
	}

	em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
	if (em_cached && !IS_ERR(em)) {
		BUG_ON(*em_cached);
		refcount_inc(&em->refs);
		*em_cached = em;
	}
	return em;
}
/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 * XXX JDM: This needs looking at to ensure proper page locking
 * return 0 on success, otherwise return error
 */
static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
		      struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 start = page_offset(page);
	const u64 end = start + PAGE_SIZE - 1;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	struct extent_map *em;
	int ret = 0;
	size_t pg_offset = 0;
	size_t iosize;
	size_t blocksize = inode->i_sb->s_blocksize;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_extent(tree, start, end, NULL);
		unlock_page(page);
		return ret;
	}

	if (page->index == last_byte >> PAGE_SHIFT) {
		size_t zero_offset = offset_in_page(last_byte);

		if (zero_offset) {
			iosize = PAGE_SIZE - zero_offset;
			memzero_page(page, zero_offset, iosize);
		}
	}
	bio_ctrl->end_io_func = end_bio_extent_readpage;
	begin_page_read(fs_info, page);
	while (cur <= end) {
		enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
		bool force_bio_submit = false;
		u64 disk_bytenr;

		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
		if (cur >= last_byte) {
			iosize = PAGE_SIZE - pg_offset;
			memzero_page(page, pg_offset, iosize);
			unlock_extent(tree, cur, cur + iosize - 1, NULL);
			end_page_read(page, true, cur, iosize);
			break;
		}
		em = __get_extent_map(inode, page, pg_offset, cur,
				      end - cur + 1, em_cached);
		if (IS_ERR(em)) {
			unlock_extent(tree, cur, end, NULL);
			end_page_read(page, false, cur, end + 1 - cur);
			return PTR_ERR(em);
		}
		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			compress_type = em->compress_type;

		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		iosize = ALIGN(iosize, blocksize);
		if (compress_type != BTRFS_COMPRESS_NONE)
			disk_bytenr = em->block_start;
		else
			disk_bytenr = em->block_start + extent_offset;
		block_start = em->block_start;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;

		/*
		 * If we have a file range that points to a compressed extent
		 * and it's followed by a consecutive file range that points
		 * to the same compressed extent (possibly with a different
		 * offset and/or length, so it either points to the whole extent
		 * or only part of it), we must make sure we do not submit a
		 * single bio to populate the pages for the 2 ranges because
		 * this makes the compressed extent read zero out the pages
		 * belonging to the 2nd range. Imagine the following scenario:
		 *
		 *  File layout
		 *  [0 - 8K]                     [8K - 24K]
		 *    |                               |
		 *    |                               |
		 * points to extent X,         points to extent X,
		 * offset 4K, length of 8K     offset 0, length 16K
		 *
		 * [extent X, compressed length = 4K uncompressed length = 16K]
		 *
		 * If the bio to read the compressed extent covers both ranges,
		 * it will decompress extent X into the pages belonging to the
		 * first range and then it will stop, zeroing out the remaining
		 * pages that belong to the other range that points to extent X.
		 * So here we make sure we submit 2 bios, one for the first
		 * range and another one for the third range. Both will target
		 * the same physical extent from disk, but we can't currently
		 * make the compressed bio endio callback populate the pages
		 * for both ranges because each compressed bio is tightly
		 * coupled with a single extent map, and each range can have
		 * an extent map with a different offset value relative to the
		 * uncompressed data of our extent and different lengths. This
		 * is a corner case so we prioritize correctness over
		 * non-optimal behavior (submitting 2 bios for the same extent).
		 */
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
		    prev_em_start && *prev_em_start != (u64)-1 &&
		    *prev_em_start != em->start)
			force_bio_submit = true;

		if (prev_em_start)
			*prev_em_start = em->start;

		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			memzero_page(page, pg_offset, iosize);

			unlock_extent(tree, cur, cur + iosize - 1, NULL);
			end_page_read(page, true, cur, iosize);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (block_start == EXTENT_MAP_INLINE) {
			unlock_extent(tree, cur, cur + iosize - 1, NULL);
			end_page_read(page, true, cur, iosize);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}

		if (bio_ctrl->compress_type != compress_type) {
			submit_one_bio(bio_ctrl);
			bio_ctrl->compress_type = compress_type;
		}

		if (force_bio_submit)
			submit_one_bio(bio_ctrl);
		submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
				   pg_offset);
		cur = cur + iosize;
		pg_offset += iosize;
	}

	return 0;
}

int btrfs_read_folio(struct file *file, struct folio *folio)
{
	struct page *page = &folio->page;
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
	int ret;

	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);

	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL);
	/*
	 * If btrfs_do_readpage() failed we will want to submit the assembled
	 * bio to do the cleanup.
	 */
	submit_one_bio(&bio_ctrl);
	return ret;
}

static inline void contiguous_readpages(struct page *pages[], int nr_pages,
					u64 start, u64 end,
					struct extent_map **em_cached,
					struct btrfs_bio_ctrl *bio_ctrl,
					u64 *prev_em_start)
{
	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
	int index;

	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);

	for (index = 0; index < nr_pages; index++) {
		btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
				  prev_em_start);
		put_page(pages[index]);
	}
}

/*
 * helper for __extent_writepage, doing all of the delayed allocation setup.
 *
 * This returns 1 if the btrfs_run_delalloc_range function did all the work
 * required to write the page (copy into inline extent).  In this case the IO
 * has been started and the page is already unlocked.
 *
 * This returns 0 if all went well (page still locked)
 * This returns < 0 if there were errors (page still locked)
 */
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
		struct page *page, struct writeback_control *wbc)
{
	const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
	u64 delalloc_start = page_offset(page);
	u64 delalloc_to_write = 0;
	/* How many pages are started by btrfs_run_delalloc_range() */
	unsigned long nr_written = 0;
	int ret;
	int page_started = 0;

	while (delalloc_start < page_end) {
		u64 delalloc_end = page_end;
		bool found;

		found = find_lock_delalloc_range(&inode->vfs_inode, page,
						 &delalloc_start,
						 &delalloc_end);
		if (!found) {
			delalloc_start = delalloc_end + 1;
			continue;
		}
		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
				delalloc_end, &page_started, &nr_written, wbc);
		if (ret)
			return ret;

		/*
		 * delalloc_end is already one less than the total length, so
		 * we don't subtract one from PAGE_SIZE
		 */
		delalloc_to_write += (delalloc_end - delalloc_start +
				      PAGE_SIZE) >> PAGE_SHIFT;
		delalloc_start = delalloc_end + 1;
	}
	if (wbc->nr_to_write < delalloc_to_write) {
		int thresh = 8192;

		if (delalloc_to_write < thresh * 2)
			thresh = delalloc_to_write;
		wbc->nr_to_write = min_t(u64, delalloc_to_write,
					 thresh);
	}

	/* Did btrfs_run_delalloc_range() already unlock and start the IO? */
	if (page_started) {
		/*
		 * We've unlocked the page, so we can't update the mapping's
		 * writeback index, just update nr_to_write.
		 */
		wbc->nr_to_write -= nr_written;
		return 1;
	}

	return 0;
}

/*
 * Find the first byte we need to write.
 *
 * For subpage, one page can contain several sectors, and
 * __extent_writepage_io() will just grab all extent maps in the page
 * range and try to submit all non-inline/non-compressed extents.
 *
 * This is a big problem for subpage, we shouldn't re-submit already written
 * data at all.
 * This function will lookup subpage dirty bit to find which range we really
 * need to submit.
 *
 * Return the next dirty range in [@start, @end).
 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
 */
static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
				 struct page *page, u64 *start, u64 *end)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	struct btrfs_subpage_info *spi = fs_info->subpage_info;
	u64 orig_start = *start;
	/* Declare as unsigned long so we can use bitmap ops */
	unsigned long flags;
	int range_start_bit;
	int range_end_bit;

	/*
	 * For regular sector size == page size case, since one page only
	 * contains one sector, we return the page offset directly.
	 */
	if (!btrfs_is_subpage(fs_info, page)) {
		*start = page_offset(page);
		*end = page_offset(page) + PAGE_SIZE;
		return;
	}

	range_start_bit = spi->dirty_offset +
			  (offset_in_page(orig_start) >> fs_info->sectorsize_bits);

	/* We should have the page locked, but just in case */
	spin_lock_irqsave(&subpage->lock, flags);
	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
			       spi->dirty_offset + spi->bitmap_nr_bits);
	spin_unlock_irqrestore(&subpage->lock, flags);

	range_start_bit -= spi->dirty_offset;
	range_end_bit -= spi->dirty_offset;

	*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
	*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
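
/*
 * Worked example (illustrative, not part of the original file): on a 64K
 * page / 4K sector subpage setup there are 16 dirty bits per page.  If only
 * sectors 3-5 are dirty and @*start points at the page start, the bitmap
 * walk above returns range_start_bit = 3 and range_end_bit = 6 (exclusive),
 * so the function reports the dirty byte range
 * [page_offset + 12K, page_offset + 24K) and the writeback loop below skips
 * the clean sectors before and after it.
 */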
1318
40f76580
CM
1319/*
1320 * helper for __extent_writepage. This calls the writepage start hooks,
1321 * and does the loop to map the page into extents and bios.
1322 *
1323 * We return 1 if the IO is started and the page is unlocked,
1324 * 0 if all went well (page still locked)
1325 * < 0 if there were errors (page still locked)
1326 */
d4580fe2 1327static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
40f76580 1328 struct page *page,
ee5f017d 1329 struct btrfs_bio_ctrl *bio_ctrl,
40f76580 1330 loff_t i_size,
57e5ffeb 1331 int *nr_ret)
d1310b2e 1332{
6bc5636a 1333 struct btrfs_fs_info *fs_info = inode->root->fs_info;
a129ffb8
QW
1334 u64 cur = page_offset(page);
1335 u64 end = cur + PAGE_SIZE - 1;
d1310b2e 1336 u64 extent_offset;
d1310b2e 1337 u64 block_start;
d1310b2e 1338 struct extent_map *em;
40f76580
CM
1339 int ret = 0;
1340 int nr = 0;
c8b97818 1341
a129ffb8 1342 ret = btrfs_writepage_cow_fixup(page);
d75855b4
NB
1343 if (ret) {
1344 /* Fixup worker will requeue */
72b505dc 1345 redirty_page_for_writepage(bio_ctrl->wbc, page);
d75855b4
NB
1346 unlock_page(page);
1347 return 1;
247e743c
CM
1348 }
1349
ee5f017d 1350 bio_ctrl->end_io_func = end_bio_extent_writepage;
d1310b2e 1351 while (cur <= end) {
0c64c33c 1352 u64 disk_bytenr;
40f76580 1353 u64 em_end;
c5ef5c6c
QW
1354 u64 dirty_range_start = cur;
1355 u64 dirty_range_end;
6bc5636a 1356 u32 iosize;
58409edd 1357
40f76580 1358 if (cur >= i_size) {
38a39ac7 1359 btrfs_writepage_endio_finish_ordered(inode, page, cur,
25c1252a 1360 end, true);
cc1d0d93
QW
1361 /*
1362 * This range is beyond i_size, thus we don't need to
1363 * bother writing back.
1364 * But we still need to clear the dirty subpage bit, or
1365 * the next time the page gets dirtied, we will try to
1366 * writeback the sectors with subpage dirty bits,
1367 * causing writeback without ordered extent.
1368 */
1369 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
d1310b2e
CM
1370 break;
1371 }
c5ef5c6c
QW
1372
1373 find_next_dirty_byte(fs_info, page, &dirty_range_start,
1374 &dirty_range_end);
1375 if (cur < dirty_range_start) {
1376 cur = dirty_range_start;
1377 continue;
1378 }
1379
d4580fe2 1380 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
c0347550 1381 if (IS_ERR(em)) {
61391d56 1382 ret = PTR_ERR_OR_ZERO(em);
5380311f 1383 goto out_error;
d1310b2e
CM
1384 }
1385
1386 extent_offset = cur - em->start;
40f76580 1387 em_end = extent_map_end(em);
6bc5636a
QW
1388 ASSERT(cur <= em_end);
1389 ASSERT(cur < end);
1390 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
1391 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
f22b5dcb 1392
d1310b2e 1393 block_start = em->block_start;
6bc5636a
QW
1394 disk_bytenr = em->block_start + extent_offset;
1395
f22b5dcb
CH
1396 ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
1397 ASSERT(block_start != EXTENT_MAP_HOLE);
1398 ASSERT(block_start != EXTENT_MAP_INLINE);
1399
c5ef5c6c
QW
1400 /*
1401 * Note that em_end from extent_map_end() and dirty_range_end from
1402 * find_next_dirty_byte() are all exclusive
1403 */
1404 iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
d1310b2e
CM
1405 free_extent_map(em);
1406 em = NULL;
1407
d2a91064 1408 btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
58409edd 1409 if (!PageWriteback(page)) {
d4580fe2 1410 btrfs_err(inode->root->fs_info,
58409edd
DS
1411 "page %lu not writeback, cur %llu end %llu",
1412 page->index, cur, end);
d1310b2e 1413 }
7f3c74fb 1414
c5ef5c6c
QW
1415 /*
1416 * Although the PageDirty bit is cleared before entering this
1417 * function, subpage dirty bit is not cleared.
1418 * So clear subpage dirty bit here so next time we won't submit
1419 * page for range already written to disk.
1420 */
1421 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
1422
55173337
CH
1423 submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
1424 cur - page_offset(page));
6bc5636a 1425 cur += iosize;
d1310b2e
CM
1426 nr++;
1427 }
5380311f
CH
1428
1429 btrfs_page_assert_not_dirty(fs_info, page);
1430 *nr_ret = nr;
1431 return 0;
1432
1433out_error:
cc1d0d93
QW
1434 /*
1435 * If we finish without problem, we should not only clear page dirty,
1436 * but also empty subpage dirty bits
1437 */
40f76580 1438 *nr_ret = nr;
40f76580
CM
1439 return ret;
1440}
1441
1442/*
1443 * the writepage semantics are similar to regular writepage. extent
1444 * records are inserted to lock ranges in the tree, and as dirty areas
1445 * are found, they are marked writeback. Then the lock bits are removed
1446 * and the end_io handler clears the writeback ranges
3065976b
QW
1447 *
1448 * Return 0 if everything goes well.
1449 * Return <0 for error.
40f76580 1450 */
72b505dc 1451static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
40f76580 1452{
8e1dec8e 1453 struct folio *folio = page_folio(page);
40f76580 1454 struct inode *inode = page->mapping->host;
cf3075fb
QW
1455 const u64 page_start = page_offset(page);
1456 const u64 page_end = page_start + PAGE_SIZE - 1;
40f76580
CM
1457 int ret;
1458 int nr = 0;
eb70d222 1459 size_t pg_offset;
40f76580 1460 loff_t i_size = i_size_read(inode);
09cbfeaf 1461 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580 1462
72b505dc 1463 trace___extent_writepage(page, inode, bio_ctrl->wbc);
40f76580
CM
1464
1465 WARN_ON(!PageLocked(page));
1466
7073017a 1467 pg_offset = offset_in_page(i_size);
40f76580
CM
1468 if (page->index > end_index ||
1469 (page->index == end_index && !pg_offset)) {
8e1dec8e
MWO
1470 folio_invalidate(folio, 0, folio_size(folio));
1471 folio_unlock(folio);
40f76580
CM
1472 return 0;
1473 }
1474
21a8935e 1475 if (page->index == end_index)
d048b9c2 1476 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
40f76580 1477
32443de3 1478 ret = set_page_extent_mapped(page);
2b2553f1 1479 if (ret < 0)
32443de3 1480 goto done;
40f76580 1481
eb34dcea
CH
1482 ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
1483 if (ret == 1)
1484 return 0;
1485 if (ret)
1486 goto done;
40f76580 1487
72b505dc 1488 ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
40f76580 1489 if (ret == 1)
169d2c87 1490 return 0;
40f76580 1491
9ecdbee8
CH
1492 bio_ctrl->wbc->nr_to_write--;
1493
d1310b2e
CM
1494done:
1495 if (nr == 0) {
1496 /* make sure the mapping tag for page dirty gets cleared */
1497 set_page_writeback(page);
1498 end_page_writeback(page);
1499 }
3e92499e 1500 if (ret)
cf3075fb 1501 end_extent_writepage(page, ret, page_start, page_end);
eb34dcea 1502 unlock_page(page);
3065976b 1503 ASSERT(ret <= 0);
40f76580 1504 return ret;
d1310b2e
CM
1505}
1506
fd8b2b61 1507void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 1508{
74316201
N
1509 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
1510 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
1511}
1512
2e3c2513 1513/*
a3efb2f0 1514 * Lock extent buffer status and pages for writeback.
2e3c2513 1515 *
9fdd1601
CH
1516 * Return %false if the extent buffer doesn't need to be submitted (e.g. the
1517 * extent buffer is not dirty)
1518 * Return %true is the extent buffer is submitted to bio.
2e3c2513 1519 */
9fdd1601 1520static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb,
50b21d7a 1521 struct writeback_control *wbc)
0b32f4bb 1522{
9df76fb5 1523 struct btrfs_fs_info *fs_info = eb->fs_info;
9fdd1601 1524 bool ret = false;
0b32f4bb 1525
50b21d7a
CH
1526 btrfs_tree_lock(eb);
1527 while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
0b32f4bb 1528 btrfs_tree_unlock(eb);
50b21d7a 1529 if (wbc->sync_mode != WB_SYNC_ALL)
9fdd1601 1530 return false;
50b21d7a
CH
1531 wait_on_extent_buffer_writeback(eb);
1532 btrfs_tree_lock(eb);
0b32f4bb
JB
1533 }
1534
51561ffe
JB
1535 /*
1536 * We need to do this to prevent races in people who check if the eb is
1537 * under IO since we can end up having no IO bits set for a short period
1538 * of time.
1539 */
1540 spin_lock(&eb->refs_lock);
0b32f4bb
JB
1541 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
1542 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 1543 spin_unlock(&eb->refs_lock);
0b32f4bb 1544 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
1545 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
1546 -eb->len,
1547 fs_info->dirty_metadata_batch);
9fdd1601 1548 ret = true;
51561ffe
JB
1549 } else {
1550 spin_unlock(&eb->refs_lock);
0b32f4bb 1551 }
0b32f4bb 1552 btrfs_tree_unlock(eb);
2e3c2513 1553 return ret;
0b32f4bb
JB
1554}
1555
cd88a4fd 1556static void set_btree_ioerr(struct extent_buffer *eb)
656f30db 1557{
5a2c6075 1558 struct btrfs_fs_info *fs_info = eb->fs_info;
656f30db 1559
cd88a4fd 1560 set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
656f30db 1561
c2e39305
JB
1562 /*
1563 * A read may stumble upon this buffer later, make sure that it gets an
1564 * error and knows there was an error.
1565 */
1566 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
1567
68b85589
JB
1568 /*
 1569 * We also need to set the mapping error, because a write error will flip
 1570 * the file system read-only, and syncfs() would then return 0 (since we
 1571 * are read-only) if we didn't bump the error sequence for the
 1572 * superblock.
1573 */
cd88a4fd 1574 mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO);
68b85589 1575
656f30db
FM
1576 /*
 1577 * If writeback for a btree extent that doesn't belong to a log tree
 1578 * failed, set the BTRFS_FS_BTREE_ERR flag in fs_info (see the switch below).
1579 * We do this because while the transaction is running and before it's
1580 * committing (when we call filemap_fdata[write|wait]_range against
1581 * the btree inode), we might have
1582 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
1583 * returns an error or an error happens during writeback, when we're
1584 * committing the transaction we wouldn't know about it, since the pages
 1585 * may no longer be dirty nor marked for writeback (if a
1586 * subsequent modification to the extent buffer didn't happen before the
1587 * transaction commit), which makes filemap_fdata[write|wait]_range not
1588 * able to find the pages tagged with SetPageError at transaction
1589 * commit time. So if this happens we must abort the transaction,
1590 * otherwise we commit a super block with btree roots that point to
1591 * btree nodes/leafs whose content on disk is invalid - either garbage
1592 * or the content of some node/leaf from a past generation that got
1593 * cowed or deleted and is no longer valid.
1594 *
1595 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
1596 * not be enough - we need to distinguish between log tree extents vs
1597 * non-log tree extents, and the next filemap_fdatawait_range() call
1598 * will catch and clear such errors in the mapping - and that call might
1599 * be from a log sync and not from a transaction commit. Also, checking
1600 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
1601 * not done and would not be reliable - the eb might have been released
1602 * from memory and reading it back again means that flag would not be
1603 * set (since it's a runtime flag, not persisted on disk).
1604 *
 1605 * Using the flags below in the btree inode also lets us achieve what
 1606 * AS_EIO/AS_ENOSPC would give us in the case where writepages() returned
 1607 * success and started writeback for all dirty pages, but before
 1608 * filemap_fdatawait_range() is called that writeback had already finished
 1609 * with errors - because we were not using AS_EIO/AS_ENOSPC,
 1610 * filemap_fdatawait_range() would return success, as it could not know
 1611 * that writeback errors happened (the pages were no longer tagged for
 1612 * writeback).
1613 */
1614 switch (eb->log_index) {
1615 case -1:
5a2c6075 1616 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
656f30db
FM
1617 break;
1618 case 0:
5a2c6075 1619 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
656f30db
FM
1620 break;
1621 case 1:
5a2c6075 1622 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
656f30db
FM
1623 break;
1624 default:
1625 BUG(); /* unexpected, logic error */
1626 }
1627}
1628
2f3186d8
QW
1629/*
1630 * The endio specific version which won't touch any unsafe spinlock in endio
1631 * context.
1632 */
1633static struct extent_buffer *find_extent_buffer_nolock(
1634 struct btrfs_fs_info *fs_info, u64 start)
1635{
1636 struct extent_buffer *eb;
1637
1638 rcu_read_lock();
01cd3909
DS
1639 eb = radix_tree_lookup(&fs_info->buffer_radix,
1640 start >> fs_info->sectorsize_bits);
2f3186d8
QW
1641 if (eb && atomic_inc_not_zero(&eb->refs)) {
1642 rcu_read_unlock();
1643 return eb;
1644 }
1645 rcu_read_unlock();
1646 return NULL;
1647}
1648
cd88a4fd 1649static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
2f3186d8 1650{
cd88a4fd
CH
1651 struct extent_buffer *eb = bbio->private;
1652 struct btrfs_fs_info *fs_info = eb->fs_info;
1653 bool uptodate = !bbio->bio.bi_status;
2f3186d8 1654 struct bvec_iter_all iter_all;
cd88a4fd
CH
1655 struct bio_vec *bvec;
1656 u32 bio_offset = 0;
2f3186d8 1657
cd88a4fd
CH
1658 if (!uptodate)
1659 set_btree_ioerr(eb);
fa04c165 1660
cd88a4fd
CH
1661 bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
1662 u64 start = eb->start + bio_offset;
2f3186d8 1663 struct page *page = bvec->bv_page;
cd88a4fd 1664 u32 len = bvec->bv_len;
2f3186d8 1665
011134f4 1666 if (!uptodate)
cd88a4fd 1667 btrfs_page_clear_uptodate(fs_info, page, start, len);
cd88a4fd
CH
1668 btrfs_page_clear_writeback(fs_info, page, start, len);
1669 bio_offset += len;
2f3186d8 1670 }
0b32f4bb 1671
cd88a4fd
CH
1672 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
1673 smp_mb__after_atomic();
1674 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
0b32f4bb 1675
cd88a4fd 1676 bio_put(&bbio->bio);
0b32f4bb
JB
1677}
1678
fa04c165
QW
1679static void prepare_eb_write(struct extent_buffer *eb)
1680{
1681 u32 nritems;
1682 unsigned long start;
1683 unsigned long end;
1684
1685 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
fa04c165
QW
1686
1687 /* Set btree blocks beyond nritems with 0 to avoid stale content */
1688 nritems = btrfs_header_nritems(eb);
1689 if (btrfs_header_level(eb) > 0) {
e23efd8e 1690 end = btrfs_node_key_ptr_offset(eb, nritems);
fa04c165
QW
1691 memzero_extent_buffer(eb, end, eb->len - end);
1692 } else {
1693 /*
1694 * Leaf:
1695 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
1696 */
42c9419a 1697 start = btrfs_item_nr_offset(eb, nritems);
8009adf3 1698 end = btrfs_item_nr_offset(eb, 0);
3a3178c7
JB
1699 if (nritems == 0)
1700 end += BTRFS_LEAF_DATA_SIZE(eb->fs_info);
1701 else
1702 end += btrfs_item_offset(eb, nritems - 1);
fa04c165
QW
1703 memzero_extent_buffer(eb, start, end - start);
1704 }
1705}
1706
55173337 1707static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
50b21d7a 1708 struct writeback_control *wbc)
0b32f4bb 1709{
46672a44 1710 struct btrfs_fs_info *fs_info = eb->fs_info;
b51e6b4b 1711 struct btrfs_bio *bbio;
0b32f4bb 1712
fa04c165 1713 prepare_eb_write(eb);
35b6ddfa 1714
b51e6b4b
CH
1715 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
1716 REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
cd88a4fd 1717 eb->fs_info, extent_buffer_write_end_io, eb);
b51e6b4b 1718 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
46672a44 1719 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
b51e6b4b
CH
1720 wbc_init_bio(wbc, &bbio->bio);
1721 bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
1722 bbio->file_offset = eb->start;
46672a44
CH
1723 if (fs_info->nodesize < PAGE_SIZE) {
1724 struct page *p = eb->pages[0];
0b32f4bb 1725
81a79b6a 1726 lock_page(p);
46672a44
CH
1727 btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
1728 if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
1729 eb->len)) {
1730 clear_page_dirty_for_io(p);
1731 wbc->nr_to_write--;
1732 }
1733 __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
1734 wbc_account_cgroup_owner(wbc, p, eb->len);
0b32f4bb 1735 unlock_page(p);
46672a44
CH
1736 } else {
1737 for (int i = 0; i < num_extent_pages(eb); i++) {
1738 struct page *p = eb->pages[i];
1739
1740 lock_page(p);
1741 clear_page_dirty_for_io(p);
1742 set_page_writeback(p);
1743 __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
1744 wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
1745 wbc->nr_to_write--;
1746 unlock_page(p);
1747 }
0b32f4bb 1748 }
b51e6b4b 1749 btrfs_submit_bio(bbio, 0);
0b32f4bb
JB
1750}
1751
c4aec299
QW
1752/*
1753 * Submit one subpage btree page.
1754 *
 1755 * The main differences from submit_eb_page() are:
1756 * - Page locking
1757 * For subpage, we don't rely on page locking at all.
1758 *
1759 * - Flush write bio
 1760 *   We only flush the bio if we may be unable to fit the current extent
 1761 *   buffers into the current bio.
1762 *
1763 * Return >=0 for the number of submitted extent buffers.
1764 * Return <0 for fatal error.
1765 */
50b21d7a 1766static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
c4aec299
QW
1767{
1768 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
1769 int submitted = 0;
1770 u64 page_start = page_offset(page);
1771 int bit_start = 0;
c4aec299 1772 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
c4aec299
QW
1773
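	/*
	 * Illustrative numbers, not part of the original source (assuming a
	 * 4K sector size, 16K node size and 64K page size): the subpage
	 * dirty bitmap then has 16 bits and sectors_per_node is 4, so the
	 * loop below scans the bitmap one sector at a time and jumps ahead
	 * by four bits whenever it finds the start of a dirty extent buffer.
	 */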
 1774	/* Lock and write each dirty extent buffer in the range */
72a69cd0 1775 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
c4aec299
QW
1776 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
1777 struct extent_buffer *eb;
1778 unsigned long flags;
1779 u64 start;
1780
1781 /*
1782 * Take private lock to ensure the subpage won't be detached
1783 * in the meantime.
1784 */
1785 spin_lock(&page->mapping->private_lock);
1786 if (!PagePrivate(page)) {
1787 spin_unlock(&page->mapping->private_lock);
1788 break;
1789 }
1790 spin_lock_irqsave(&subpage->lock, flags);
72a69cd0
QW
1791 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
1792 subpage->bitmaps)) {
c4aec299
QW
1793 spin_unlock_irqrestore(&subpage->lock, flags);
1794 spin_unlock(&page->mapping->private_lock);
1795 bit_start++;
1796 continue;
1797 }
1798
1799 start = page_start + bit_start * fs_info->sectorsize;
1800 bit_start += sectors_per_node;
1801
1802 /*
1803 * Here we just want to grab the eb without touching extra
1804 * spin locks, so call find_extent_buffer_nolock().
1805 */
1806 eb = find_extent_buffer_nolock(fs_info, start);
1807 spin_unlock_irqrestore(&subpage->lock, flags);
1808 spin_unlock(&page->mapping->private_lock);
1809
1810 /*
1811 * The eb has already reached 0 refs thus find_extent_buffer()
1812 * doesn't return it. We don't need to write back such eb
1813 * anyway.
1814 */
1815 if (!eb)
1816 continue;
1817
50b21d7a 1818 if (lock_extent_buffer_for_io(eb, wbc)) {
46672a44 1819 write_one_eb(eb, wbc);
9fdd1601 1820 submitted++;
c4aec299 1821 }
c4aec299 1822 free_extent_buffer(eb);
c4aec299
QW
1823 }
1824 return submitted;
c4aec299
QW
1825}
1826
f91e0d0c
QW
1827/*
1828 * Submit all page(s) of one extent buffer.
1829 *
1830 * @page: the page of one extent buffer
 1831 * @eb_context: used to determine if we need to submit this page; if the
 1832 *              current page belongs to this eb, we don't need to submit it
1833 *
 1834 * The caller should pass each page in bytenr order, and here we use
1835 * @eb_context to determine if we have submitted pages of one extent buffer.
1836 *
1837 * If we have, we just skip until we hit a new page that doesn't belong to
1838 * current @eb_context.
1839 *
1840 * If not, we submit all the page(s) of the extent buffer.
1841 *
1842 * Return >0 if we have submitted the extent buffer successfully.
1843 * Return 0 if we don't need to submit the page, as it's already submitted by
1844 * previous call.
1845 * Return <0 for fatal error.
1846 */
50b21d7a 1847static int submit_eb_page(struct page *page, struct writeback_control *wbc,
f91e0d0c
QW
1848 struct extent_buffer **eb_context)
1849{
1850 struct address_space *mapping = page->mapping;
0bc09ca1 1851 struct btrfs_block_group *cache = NULL;
f91e0d0c
QW
1852 struct extent_buffer *eb;
1853 int ret;
1854
1855 if (!PagePrivate(page))
1856 return 0;
1857
fbca46eb 1858 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
50b21d7a 1859 return submit_eb_subpage(page, wbc);
c4aec299 1860
f91e0d0c
QW
1861 spin_lock(&mapping->private_lock);
1862 if (!PagePrivate(page)) {
1863 spin_unlock(&mapping->private_lock);
1864 return 0;
1865 }
1866
1867 eb = (struct extent_buffer *)page->private;
1868
1869 /*
1870 * Shouldn't happen and normally this would be a BUG_ON but no point
1871 * crashing the machine for something we can survive anyway.
1872 */
1873 if (WARN_ON(!eb)) {
1874 spin_unlock(&mapping->private_lock);
1875 return 0;
1876 }
1877
1878 if (eb == *eb_context) {
1879 spin_unlock(&mapping->private_lock);
1880 return 0;
1881 }
1882 ret = atomic_inc_not_zero(&eb->refs);
1883 spin_unlock(&mapping->private_lock);
1884 if (!ret)
1885 return 0;
1886
0bc09ca1
NA
1887 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
1888 /*
 1889		 * If for_sync, this hole will be filled by a
 1890		 * transaction commit.
1891 */
50b21d7a 1892 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
0bc09ca1
NA
1893 ret = -EAGAIN;
1894 else
1895 ret = 0;
1896 free_extent_buffer(eb);
1897 return ret;
1898 }
1899
f91e0d0c
QW
1900 *eb_context = eb;
1901
50b21d7a 1902 if (!lock_extent_buffer_for_io(eb, wbc)) {
0bc09ca1
NA
1903 btrfs_revert_meta_write_pointer(cache, eb);
1904 if (cache)
1905 btrfs_put_block_group(cache);
f91e0d0c 1906 free_extent_buffer(eb);
50b21d7a 1907 return 0;
f91e0d0c 1908 }
be1a1d7a 1909 if (cache) {
d3e29967
NB
1910 /*
1911 * Implies write in zoned mode. Mark the last eb in a block group.
1912 */
56fbb0a4 1913 btrfs_schedule_zone_finish_bg(cache, eb);
d3e29967 1914 btrfs_put_block_group(cache);
be1a1d7a 1915 }
50b21d7a 1916 write_one_eb(eb, wbc);
f91e0d0c 1917 free_extent_buffer(eb);
f91e0d0c
QW
1918 return 1;
1919}
1920
0b32f4bb
JB
1921int btree_write_cache_pages(struct address_space *mapping,
1922 struct writeback_control *wbc)
1923{
f91e0d0c 1924 struct extent_buffer *eb_context = NULL;
b3ff8f1d 1925 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
0b32f4bb
JB
1926 int ret = 0;
1927 int done = 0;
1928 int nr_to_write_done = 0;
51c5cd3b
VMO
1929 struct folio_batch fbatch;
1930 unsigned int nr_folios;
0b32f4bb
JB
1931 pgoff_t index;
1932 pgoff_t end; /* Inclusive */
1933 int scanned = 0;
10bbd235 1934 xa_mark_t tag;
0b32f4bb 1935
51c5cd3b 1936 folio_batch_init(&fbatch);
0b32f4bb
JB
1937 if (wbc->range_cyclic) {
1938 index = mapping->writeback_index; /* Start from prev offset */
1939 end = -1;
556755a8
JB
1940 /*
1941 * Start from the beginning does not need to cycle over the
1942 * range, mark it as scanned.
1943 */
1944 scanned = (index == 0);
0b32f4bb 1945 } else {
09cbfeaf
KS
1946 index = wbc->range_start >> PAGE_SHIFT;
1947 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
1948 scanned = 1;
1949 }
1950 if (wbc->sync_mode == WB_SYNC_ALL)
1951 tag = PAGECACHE_TAG_TOWRITE;
1952 else
1953 tag = PAGECACHE_TAG_DIRTY;
0bc09ca1 1954 btrfs_zoned_meta_io_lock(fs_info);
0b32f4bb
JB
1955retry:
1956 if (wbc->sync_mode == WB_SYNC_ALL)
1957 tag_pages_for_writeback(mapping, index, end);
1958 while (!done && !nr_to_write_done && (index <= end) &&
51c5cd3b
VMO
1959 (nr_folios = filemap_get_folios_tag(mapping, &index, end,
1960 tag, &fbatch))) {
0b32f4bb
JB
1961 unsigned i;
1962
51c5cd3b
VMO
1963 for (i = 0; i < nr_folios; i++) {
1964 struct folio *folio = fbatch.folios[i];
0b32f4bb 1965
50b21d7a 1966 ret = submit_eb_page(&folio->page, wbc, &eb_context);
f91e0d0c 1967 if (ret == 0)
0b32f4bb 1968 continue;
f91e0d0c 1969 if (ret < 0) {
0b32f4bb 1970 done = 1;
0b32f4bb
JB
1971 break;
1972 }
0b32f4bb
JB
1973
1974 /*
1975 * the filesystem may choose to bump up nr_to_write.
1976 * We have to make sure to honor the new nr_to_write
1977 * at any time
1978 */
1979 nr_to_write_done = wbc->nr_to_write <= 0;
1980 }
51c5cd3b 1981 folio_batch_release(&fbatch);
0b32f4bb
JB
1982 cond_resched();
1983 }
1984 if (!scanned && !done) {
1985 /*
1986 * We hit the last page and there is more work to be done: wrap
1987 * back to the start of the file
1988 */
1989 scanned = 1;
1990 index = 0;
1991 goto retry;
1992 }
b3ff8f1d
QW
1993 /*
1994 * If something went wrong, don't allow any metadata write bio to be
1995 * submitted.
1996 *
1997 * This would prevent use-after-free if we had dirty pages not
 1998	 * cleaned up, which can still happen with fuzzed images.
1999 *
2000 * - Bad extent tree
2001 * Allowing existing tree block to be allocated for other trees.
2002 *
2003 * - Log tree operations
 2004	 *   Existing tree blocks get allocated to the log tree, which bumps
 2005	 *   their generation, then they get cleaned in tree re-balance.
2006 * Such tree block will not be written back, since it's clean,
2007 * thus no WRITTEN flag set.
2008 * And after log writes back, this tree block is not traced by
2009 * any dirty extent_io_tree.
2010 *
2011 * - Offending tree block gets re-dirtied from its original owner
2012 * Since it has bumped generation, no WRITTEN flag, it can be
2013 * reused without COWing. This tree block will not be traced
2014 * by btrfs_transaction::dirty_pages.
2015 *
2016 * Now such dirty tree block will not be cleaned by any dirty
2017 * extent io tree. Thus we don't want to submit such wild eb
2018 * if the fs already has error.
9845e5dd 2019 *
c9583ada
QW
 2020	 * We can get ret > 0 from submit_eb_page() indicating how many ebs
2021 * were submitted. Reset it to 0 to avoid false alerts for the caller.
2022 */
2023 if (ret > 0)
2024 ret = 0;
9845e5dd
CH
2025 if (!ret && BTRFS_FS_ERROR(fs_info))
2026 ret = -EROFS;
9845e5dd 2027 btrfs_zoned_meta_io_unlock(fs_info);
0b32f4bb
JB
2028 return ret;
2029}
2030
43dd529a 2031/*
3bed2da1
NB
2032 * Walk the list of dirty pages of the given address space and write all of them.
2033 *
ee5f017d
DS
2034 * @mapping: address space structure to write
2035 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2036 * @bio_ctrl: holds context for the write, namely the bio
d1310b2e
CM
2037 *
2038 * If a page is already under I/O, write_cache_pages() skips it, even
2039 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2040 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2041 * and msync() need to guarantee that all the data which was dirty at the time
2042 * the call was made get new I/O started against them. If wbc->sync_mode is
2043 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2044 * existing IO to complete.
2045 */
4242b64a 2046static int extent_write_cache_pages(struct address_space *mapping,
ee5f017d 2047 struct btrfs_bio_ctrl *bio_ctrl)
d1310b2e 2048{
72b505dc 2049 struct writeback_control *wbc = bio_ctrl->wbc;
7fd1a3f7 2050 struct inode *inode = mapping->host;
d1310b2e
CM
2051 int ret = 0;
2052 int done = 0;
f85d7d6c 2053 int nr_to_write_done = 0;
9f50fd2e
VMO
2054 struct folio_batch fbatch;
2055 unsigned int nr_folios;
d1310b2e
CM
2056 pgoff_t index;
2057 pgoff_t end; /* Inclusive */
a9132667
LB
2058 pgoff_t done_index;
2059 int range_whole = 0;
d1310b2e 2060 int scanned = 0;
10bbd235 2061 xa_mark_t tag;
d1310b2e 2062
7fd1a3f7
JB
2063 /*
2064 * We have to hold onto the inode so that ordered extents can do their
2065 * work when the IO finishes. The alternative to this is failing to add
2066 * an ordered extent if the igrab() fails there and that is a huge pain
2067 * to deal with, so instead just hold onto the inode throughout the
2068 * writepages operation. If it fails here we are freeing up the inode
2069 * anyway and we'd rather not waste our time writing out stuff that is
2070 * going to be truncated anyway.
2071 */
2072 if (!igrab(inode))
2073 return 0;
2074
9f50fd2e 2075 folio_batch_init(&fbatch);
d1310b2e
CM
2076 if (wbc->range_cyclic) {
2077 index = mapping->writeback_index; /* Start from prev offset */
2078 end = -1;
556755a8
JB
2079 /*
2080 * Start from the beginning does not need to cycle over the
2081 * range, mark it as scanned.
2082 */
2083 scanned = (index == 0);
d1310b2e 2084 } else {
09cbfeaf
KS
2085 index = wbc->range_start >> PAGE_SHIFT;
2086 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
2087 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2088 range_whole = 1;
d1310b2e
CM
2089 scanned = 1;
2090 }
3cd24c69
EL
2091
2092 /*
2093 * We do the tagged writepage as long as the snapshot flush bit is set
2094 * and we are the first one who do the filemap_flush() on this inode.
2095 *
2096 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
2097 * not race in and drop the bit.
2098 */
2099 if (range_whole && wbc->nr_to_write == LONG_MAX &&
2100 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
2101 &BTRFS_I(inode)->runtime_flags))
2102 wbc->tagged_writepages = 1;
2103
2104 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
2105 tag = PAGECACHE_TAG_TOWRITE;
2106 else
2107 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 2108retry:
3cd24c69 2109 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 2110 tag_pages_for_writeback(mapping, index, end);
a9132667 2111 done_index = index;
f85d7d6c 2112 while (!done && !nr_to_write_done && (index <= end) &&
9f50fd2e
VMO
2113 (nr_folios = filemap_get_folios_tag(mapping, &index,
2114 end, tag, &fbatch))) {
d1310b2e
CM
2115 unsigned i;
2116
9f50fd2e
VMO
2117 for (i = 0; i < nr_folios; i++) {
2118 struct folio *folio = fbatch.folios[i];
d1310b2e 2119
7b365a2a 2120 done_index = folio_next_index(folio);
d1310b2e 2121 /*
b93b0163
MW
2122 * At this point we hold neither the i_pages lock nor
2123 * the page lock: the page may be truncated or
2124 * invalidated (changing page->mapping to NULL),
2125 * or even swizzled back from swapper_space to
2126 * tmpfs file mapping
d1310b2e 2127 */
9f50fd2e 2128 if (!folio_trylock(folio)) {
ee5f017d 2129 submit_write_bio(bio_ctrl, 0);
9f50fd2e 2130 folio_lock(folio);
01d658f2 2131 }
d1310b2e 2132
9f50fd2e
VMO
2133 if (unlikely(folio->mapping != mapping)) {
2134 folio_unlock(folio);
d1310b2e
CM
2135 continue;
2136 }
2137
5c256998
CH
2138 if (!folio_test_dirty(folio)) {
2139 /* Someone wrote it for us. */
2140 folio_unlock(folio);
2141 continue;
2142 }
2143
d2c3f4f6 2144 if (wbc->sync_mode != WB_SYNC_NONE) {
9f50fd2e 2145 if (folio_test_writeback(folio))
ee5f017d 2146 submit_write_bio(bio_ctrl, 0);
9f50fd2e 2147 folio_wait_writeback(folio);
d2c3f4f6 2148 }
d1310b2e 2149
9f50fd2e
VMO
2150 if (folio_test_writeback(folio) ||
2151 !folio_clear_dirty_for_io(folio)) {
2152 folio_unlock(folio);
d1310b2e
CM
2153 continue;
2154 }
2155
72b505dc 2156 ret = __extent_writepage(&folio->page, bio_ctrl);
a9132667 2157 if (ret < 0) {
a9132667
LB
2158 done = 1;
2159 break;
2160 }
f85d7d6c
CM
2161
2162 /*
effa24f6 2163 * The filesystem may choose to bump up nr_to_write.
f85d7d6c 2164 * We have to make sure to honor the new nr_to_write
effa24f6 2165 * at any time.
f85d7d6c 2166 */
effa24f6
CH
2167 nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
2168 wbc->nr_to_write <= 0);
d1310b2e 2169 }
9f50fd2e 2170 folio_batch_release(&fbatch);
d1310b2e
CM
2171 cond_resched();
2172 }
894b36e3 2173 if (!scanned && !done) {
d1310b2e
CM
2174 /*
2175 * We hit the last page and there is more work to be done: wrap
2176 * back to the start of the file
2177 */
2178 scanned = 1;
2179 index = 0;
42ffb0bf
JB
2180
2181 /*
2182 * If we're looping we could run into a page that is locked by a
2183 * writer and that writer could be waiting on writeback for a
2184 * page in our current bio, and thus deadlock, so flush the
2185 * write bio here.
2186 */
ee5f017d 2187 submit_write_bio(bio_ctrl, 0);
c9583ada 2188 goto retry;
d1310b2e 2189 }
a9132667
LB
2190
2191 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
2192 mapping->writeback_index = done_index;
2193
e55cf7ca 2194 btrfs_add_delayed_iput(BTRFS_I(inode));
894b36e3 2195 return ret;
d1310b2e 2196}
d1310b2e 2197
2bd0fc93
QW
2198/*
 2199 * Submit the pages in the range to the bio for call sites whose delalloc range
 2200 * has already been run (i.e. an ordered extent was inserted) and all pages are still
2201 * locked.
2202 */
7027f871
CH
2203int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
2204 struct writeback_control *wbc)
771ed689 2205{
2bd0fc93
QW
2206 bool found_error = false;
2207 int first_error = 0;
771ed689
CM
2208 int ret = 0;
2209 struct address_space *mapping = inode->i_mapping;
eb34dcea
CH
2210 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2211 const u32 sectorsize = fs_info->sectorsize;
2212 loff_t i_size = i_size_read(inode);
2bd0fc93 2213 u64 cur = start;
c000bc04 2214 struct btrfs_bio_ctrl bio_ctrl = {
7027f871
CH
2215 .wbc = wbc,
2216 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
c000bc04 2217 };
771ed689 2218
7027f871
CH
2219 if (wbc->no_cgroup_owner)
2220 bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT;
2221
66448b9d 2222 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
66448b9d 2223
2bd0fc93 2224 while (cur <= end) {
66448b9d 2225 u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
eb34dcea
CH
2226 struct page *page;
2227 int nr = 0;
66448b9d 2228
2bd0fc93
QW
2229 page = find_get_page(mapping, cur >> PAGE_SHIFT);
2230 /*
2231 * All pages in the range are locked since
2232 * btrfs_run_delalloc_range(), thus there is no way to clear
2233 * the page dirty flag.
2234 */
66448b9d 2235 ASSERT(PageLocked(page));
2bd0fc93
QW
2236 ASSERT(PageDirty(page));
2237 clear_page_dirty_for_io(page);
eb34dcea
CH
2238
2239 ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
2240 i_size, &nr);
2241 if (ret == 1)
2242 goto next_page;
2243
2244 /* Make sure the mapping tag for page dirty gets cleared. */
2245 if (nr == 0) {
2246 set_page_writeback(page);
2247 end_page_writeback(page);
2248 }
2249 if (ret)
2250 end_extent_writepage(page, ret, cur, cur_end);
2251 btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur);
2bd0fc93
QW
2252 if (ret < 0) {
2253 found_error = true;
2254 first_error = ret;
771ed689 2255 }
eb34dcea 2256next_page:
09cbfeaf 2257 put_page(page);
66448b9d 2258 cur = cur_end + 1;
771ed689
CM
2259 }
2260
ee5f017d 2261 submit_write_bio(&bio_ctrl, found_error ? ret : 0);
dbb70bec 2262
2bd0fc93
QW
2263 if (found_error)
2264 return first_error;
771ed689
CM
2265 return ret;
2266}
d1310b2e 2267
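/*
 * Illustrative sketch, not part of the original source: how a caller that
 * already ran delalloc for [start, end] and still holds the page locks
 * might drive extent_write_locked_range().  The function name and the wbc
 * values here are made up for the example.
 */
static int __maybe_unused example_write_locked_range(struct inode *inode,
						     u64 start, u64 end)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.nr_to_write	= LONG_MAX,
		.range_start	= start,
		.range_end	= end,
	};

	/* Pages in [start, end] must already be locked and dirty. */
	return extent_write_locked_range(inode, start, end, &wbc);
}
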
8ae225a8 2268int extent_writepages(struct address_space *mapping,
d1310b2e
CM
2269 struct writeback_control *wbc)
2270{
35156d85 2271 struct inode *inode = mapping->host;
d1310b2e 2272 int ret = 0;
ee5f017d 2273 struct btrfs_bio_ctrl bio_ctrl = {
72b505dc 2274 .wbc = wbc,
c000bc04 2275 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
d1310b2e
CM
2276 };
2277
35156d85
JT
2278 /*
2279 * Allow only a single thread to do the reloc work in zoned mode to
2280 * protect the write pointer updates.
2281 */
869f4cdc 2282 btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
72b505dc 2283 ret = extent_write_cache_pages(mapping, &bio_ctrl);
ee5f017d 2284 submit_write_bio(&bio_ctrl, ret);
19ab78ca 2285 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
d1310b2e
CM
2286 return ret;
2287}
d1310b2e 2288
ba206a02 2289void extent_readahead(struct readahead_control *rac)
d1310b2e 2290{
c000bc04 2291 struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
67c9684f 2292 struct page *pagepool[16];
125bac01 2293 struct extent_map *em_cached = NULL;
808f80b4 2294 u64 prev_em_start = (u64)-1;
ba206a02 2295 int nr;
d1310b2e 2296
ba206a02 2297 while ((nr = readahead_page_batch(rac, pagepool))) {
32c0a6bc
MWO
2298 u64 contig_start = readahead_pos(rac);
2299 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
e65ef21e 2300
ba206a02 2301 contiguous_readpages(pagepool, nr, contig_start, contig_end,
390ed29b 2302 &em_cached, &bio_ctrl, &prev_em_start);
d1310b2e 2303 }
67c9684f 2304
125bac01
MX
2305 if (em_cached)
2306 free_extent_map(em_cached);
722c82ac 2307 submit_one_bio(&bio_ctrl);
d1310b2e 2308}
d1310b2e
CM
2309
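/*
 * Illustrative sketch, not part of the original source: extent_writepages()
 * and extent_readahead() match the generic address_space_operations
 * signatures, so they could be wired into an aops table like this.  The real
 * btrfs table lives elsewhere (inode.c); this one is only an example.
 */
static const struct address_space_operations example_extent_aops __maybe_unused = {
	.writepages	= extent_writepages,
	.readahead	= extent_readahead,
};
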
2310/*
895586eb
MWO
2311 * basic invalidate_folio code, this waits on any locked or writeback
2312 * ranges corresponding to the folio, and then deletes any extent state
d1310b2e
CM
2313 * records from the tree
2314 */
895586eb
MWO
2315int extent_invalidate_folio(struct extent_io_tree *tree,
2316 struct folio *folio, size_t offset)
d1310b2e 2317{
2ac55d41 2318 struct extent_state *cached_state = NULL;
895586eb
MWO
2319 u64 start = folio_pos(folio);
2320 u64 end = start + folio_size(folio) - 1;
2321 size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
d1310b2e 2322
829ddec9
QW
2323 /* This function is only called for the btree inode */
2324 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
2325
fda2832f 2326 start += ALIGN(offset, blocksize);
d1310b2e
CM
2327 if (start > end)
2328 return 0;
2329
570eb97b 2330 lock_extent(tree, start, end, &cached_state);
895586eb 2331 folio_wait_writeback(folio);
829ddec9
QW
2332
2333 /*
2334 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
2335 * so here we only need to unlock the extent range to free any
2336 * existing extent state.
2337 */
570eb97b 2338 unlock_extent(tree, start, end, &cached_state);
d1310b2e
CM
2339 return 0;
2340}
d1310b2e 2341
7b13b7b1 2342/*
f913cff3 2343 * a helper for release_folio, this tests for areas of the page that
7b13b7b1
CM
2344 * are locked or under IO and drops the related state bits if it is safe
2345 * to drop the page.
2346 */
29c68b2d 2347static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 2348 struct page *page, gfp_t mask)
7b13b7b1 2349{
4eee4fa4 2350 u64 start = page_offset(page);
09cbfeaf 2351 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
2352 int ret = 1;
2353
8882679e 2354 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 2355 ret = 0;
8882679e 2356 } else {
b71fb16b
JB
2357 u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
2358 EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
2359
11ef160f 2360 /*
2766ff61
FM
2361 * At this point we can safely clear everything except the
2362 * locked bit, the nodatasum bit and the delalloc new bit.
2363 * The delalloc new bit will be cleared by ordered extent
2364 * completion.
11ef160f 2365 */
1d126800 2366 ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
e3f24cc5
CM
2367
2368 /* if clear_extent_bit failed for enomem reasons,
2369 * we can't allow the release to continue.
2370 */
2371 if (ret < 0)
2372 ret = 0;
2373 else
2374 ret = 1;
7b13b7b1
CM
2375 }
2376 return ret;
2377}
7b13b7b1 2378
d1310b2e 2379/*
f913cff3 2380 * a helper for release_folio. As long as there are no locked extents
d1310b2e
CM
2381 * in the range corresponding to the page, both state records and extent
2382 * map records are removed
2383 */
477a30ba 2384int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
2385{
2386 struct extent_map *em;
4eee4fa4 2387 u64 start = page_offset(page);
09cbfeaf 2388 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
2389 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
2390 struct extent_io_tree *tree = &btrfs_inode->io_tree;
2391 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 2392
d0164adc 2393 if (gfpflags_allow_blocking(mask) &&
ee22184b 2394 page->mapping->host->i_size > SZ_16M) {
39b5637f 2395 u64 len;
70dec807 2396 while (start <= end) {
fbc2bd7e
FM
2397 struct btrfs_fs_info *fs_info;
2398 u64 cur_gen;
2399
39b5637f 2400 len = end - start + 1;
890871be 2401 write_lock(&map->lock);
39b5637f 2402 em = lookup_extent_mapping(map, start, len);
285190d9 2403 if (!em) {
890871be 2404 write_unlock(&map->lock);
70dec807
CM
2405 break;
2406 }
7f3c74fb
CM
2407 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2408 em->start != start) {
890871be 2409 write_unlock(&map->lock);
70dec807
CM
2410 free_extent_map(em);
2411 break;
2412 }
3d6448e6
FM
2413 if (test_range_bit(tree, em->start,
2414 extent_map_end(em) - 1,
2415 EXTENT_LOCKED, 0, NULL))
2416 goto next;
2417 /*
 2418			 * If it's not in the list of modified extents, which is used
 2419			 * by a fast fsync, we can remove it. If it's being
2420 * logged we can safely remove it since fsync took an
2421 * extra reference on the em.
2422 */
2423 if (list_empty(&em->list) ||
fbc2bd7e
FM
2424 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
2425 goto remove_em;
2426 /*
2427 * If it's in the list of modified extents, remove it
 2428			 * only if its generation is older than the current one,
2429 * in which case we don't need it for a fast fsync.
2430 * Otherwise don't remove it, we could be racing with an
2431 * ongoing fast fsync that could miss the new extent.
2432 */
2433 fs_info = btrfs_inode->root->fs_info;
2434 spin_lock(&fs_info->trans_lock);
2435 cur_gen = fs_info->generation;
2436 spin_unlock(&fs_info->trans_lock);
2437 if (em->generation >= cur_gen)
2438 goto next;
2439remove_em:
5e548b32
FM
2440 /*
2441 * We only remove extent maps that are not in the list of
2442 * modified extents or that are in the list but with a
 2443			 * generation lower than the current generation, so there
2444 * is no need to set the full fsync flag on the inode (it
2445 * hurts the fsync performance for workloads with a data
2446 * size that exceeds or is close to the system's memory).
2447 */
fbc2bd7e
FM
2448 remove_extent_mapping(map, em);
2449 /* once for the rb tree */
2450 free_extent_map(em);
3d6448e6 2451next:
70dec807 2452 start = extent_map_end(em);
890871be 2453 write_unlock(&map->lock);
70dec807
CM
2454
2455 /* once for us */
d1310b2e 2456 free_extent_map(em);
9f47eb54
PM
2457
2458 cond_resched(); /* Allow large-extent preemption. */
d1310b2e 2459 }
d1310b2e 2460 }
29c68b2d 2461 return try_release_extent_state(tree, page, mask);
d1310b2e 2462}
d1310b2e 2463
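/*
 * Illustrative sketch, not part of the original source: a release_folio
 * callback could forward to try_release_extent_mapping() like this.  The
 * actual btrfs wrapper lives elsewhere; the name here is made up.
 */
static bool __maybe_unused example_release_folio(struct folio *folio, gfp_t gfp)
{
	/* A non-zero return means the extent state and maps were dropped. */
	return try_release_extent_mapping(&folio->page, gfp) != 0;
}
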
4751832d
QW
2464/*
2465 * To cache previous fiemap extent
2466 *
2467 * Will be used for merging fiemap extent
2468 */
2469struct fiemap_cache {
2470 u64 offset;
2471 u64 phys;
2472 u64 len;
2473 u32 flags;
2474 bool cached;
2475};
2476
2477/*
2478 * Helper to submit fiemap extent.
2479 *
 2480 * Will try to merge the current fiemap extent, specified by @offset, @phys,
 2481 * @len and @flags, with the cached one.
 2482 * Only when we fail to merge is the cached one submitted as a
 2483 * fiemap extent.
2484 *
2485 * Return value is the same as fiemap_fill_next_extent().
2486 */
2487static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
2488 struct fiemap_cache *cache,
2489 u64 offset, u64 phys, u64 len, u32 flags)
2490{
2491 int ret = 0;
2492
ac3c0d36
FM
2493 /* Set at the end of extent_fiemap(). */
2494 ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
2495
4751832d
QW
2496 if (!cache->cached)
2497 goto assign;
2498
2499 /*
2500 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 2501 * fiemap extent won't overlap with cached one.
4751832d
QW
2502 * Not recoverable.
2503 *
2504 * NOTE: Physical address can overlap, due to compression
2505 */
2506 if (cache->offset + cache->len > offset) {
2507 WARN_ON(1);
2508 return -EINVAL;
2509 }
2510
2511 /*
2512 * Only merges fiemap extents if
2513 * 1) Their logical addresses are continuous
2514 *
2515 * 2) Their physical addresses are continuous
2516 * So truly compressed (physical size smaller than logical size)
2517 * extents won't get merged with each other
2518 *
ac3c0d36 2519 * 3) Share same flags
4751832d
QW
2520 */
2521 if (cache->offset + cache->len == offset &&
2522 cache->phys + cache->len == phys &&
ac3c0d36 2523 cache->flags == flags) {
4751832d 2524 cache->len += len;
ac3c0d36 2525 return 0;
4751832d
QW
2526 }
2527
2528 /* Not mergeable, need to submit cached one */
2529 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
2530 cache->len, cache->flags);
2531 cache->cached = false;
2532 if (ret)
2533 return ret;
2534assign:
2535 cache->cached = true;
2536 cache->offset = offset;
2537 cache->phys = phys;
2538 cache->len = len;
2539 cache->flags = flags;
ac3c0d36
FM
2540
2541 return 0;
4751832d
QW
2542}
2543
2544/*
848c23b7 2545 * Emit last fiemap cache
4751832d 2546 *
848c23b7
QW
2547 * The last fiemap cache may still be cached in the following case:
2548 * 0 4k 8k
2549 * |<- Fiemap range ->|
2550 * |<------------ First extent ----------->|
2551 *
2552 * In this case, the first extent range will be cached but not emitted.
2553 * So we must emit it before ending extent_fiemap().
4751832d 2554 */
5c5aff98 2555static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 2556 struct fiemap_cache *cache)
4751832d
QW
2557{
2558 int ret;
2559
2560 if (!cache->cached)
2561 return 0;
2562
4751832d
QW
2563 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
2564 cache->len, cache->flags);
2565 cache->cached = false;
2566 if (ret > 0)
2567 ret = 0;
2568 return ret;
2569}
2570
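/*
 * Illustrative sketch, not part of the original source: how two file extents
 * that are contiguous in both logical and physical space collapse into a
 * single fiemap entry via the cache above.  The offsets and sizes are made
 * up for the example.
 */
static int __maybe_unused example_fiemap_cache_merge(struct fiemap_extent_info *fieinfo)
{
	struct fiemap_cache cache = { 0 };
	int ret;

	/* First 4K extent at file offset 0, physical 1M: cached, not emitted. */
	ret = emit_fiemap_extent(fieinfo, &cache, 0, SZ_1M, SZ_4K, 0);
	if (ret)
		return ret;

	/*
	 * Contiguous in logical and physical space with equal flags, so this
	 * only grows cache.len to 8K instead of emitting a second entry.
	 */
	ret = emit_fiemap_extent(fieinfo, &cache, SZ_4K, SZ_1M + SZ_4K, SZ_4K, 0);
	if (ret)
		return ret;

	/* Nothing was handed to user space yet; flush the cached entry now. */
	return emit_last_fiemap_cache(fieinfo, &cache);
}
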
ac3c0d36 2571static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
1506fcc8 2572{
ac3c0d36
FM
2573 struct extent_buffer *clone;
2574 struct btrfs_key key;
2575 int slot;
2576 int ret;
2577
2578 path->slots[0]++;
2579 if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
2580 return 0;
2581
2582 ret = btrfs_next_leaf(inode->root, path);
2583 if (ret != 0)
2584 return ret;
2585
2586 /*
2587 * Don't bother with cloning if there are no more file extent items for
2588 * our inode.
2589 */
2590 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2591 if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
2592 return 1;
2593
2594 /* See the comment at fiemap_search_slot() about why we clone. */
2595 clone = btrfs_clone_extent_buffer(path->nodes[0]);
2596 if (!clone)
2597 return -ENOMEM;
2598
2599 slot = path->slots[0];
2600 btrfs_release_path(path);
2601 path->nodes[0] = clone;
2602 path->slots[0] = slot;
2603
2604 return 0;
2605}
2606
2607/*
2608 * Search for the first file extent item that starts at a given file offset or
2609 * the one that starts immediately before that offset.
2610 * Returns: 0 on success, < 0 on error, 1 if not found.
2611 */
2612static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
2613 u64 file_offset)
2614{
2615 const u64 ino = btrfs_ino(inode);
facee0a0 2616 struct btrfs_root *root = inode->root;
ac3c0d36
FM
2617 struct extent_buffer *clone;
2618 struct btrfs_key key;
2619 int slot;
2620 int ret;
1506fcc8 2621
ac3c0d36
FM
2622 key.objectid = ino;
2623 key.type = BTRFS_EXTENT_DATA_KEY;
2624 key.offset = file_offset;
2625
2626 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2627 if (ret < 0)
2628 return ret;
2629
2630 if (ret > 0 && path->slots[0] > 0) {
2631 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
2632 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
2633 path->slots[0]--;
2634 }
2635
2636 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2637 ret = btrfs_next_leaf(root, path);
2638 if (ret != 0)
2639 return ret;
2640
2641 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2642 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
2643 return 1;
5911c8fe
DS
2644 }
2645
15c7745c 2646 /*
ac3c0d36
FM
2647 * We clone the leaf and use it during fiemap. This is because while
2648 * using the leaf we do expensive things like checking if an extent is
2649 * shared, which can take a long time. In order to prevent blocking
2650 * other tasks for too long, we use a clone of the leaf. We have locked
2651 * the file range in the inode's io tree, so we know none of our file
2652 * extent items can change. This way we avoid blocking other tasks that
2653 * want to insert items for other inodes in the same leaf or b+tree
2654 * rebalance operations (triggered for example when someone is trying
2655 * to push items into this leaf when trying to insert an item in a
2656 * neighbour leaf).
2657 * We also need the private clone because holding a read lock on an
2658 * extent buffer of the subvolume's b+tree will make lockdep unhappy
2659 * when we call fiemap_fill_next_extent(), because that may cause a page
2660 * fault when filling the user space buffer with fiemap data.
15c7745c 2661 */
ac3c0d36
FM
2662 clone = btrfs_clone_extent_buffer(path->nodes[0]);
2663 if (!clone)
2664 return -ENOMEM;
2665
2666 slot = path->slots[0];
2667 btrfs_release_path(path);
2668 path->nodes[0] = clone;
2669 path->slots[0] = slot;
2670
2671 return 0;
2672}
2673
2674/*
2675 * Process a range which is a hole or a prealloc extent in the inode's subvolume
2676 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
2677 * extent. The end offset (@end) is inclusive.
2678 */
2679static int fiemap_process_hole(struct btrfs_inode *inode,
2680 struct fiemap_extent_info *fieinfo,
2681 struct fiemap_cache *cache,
b3e744fe 2682 struct extent_state **delalloc_cached_state,
61dbb952 2683 struct btrfs_backref_share_check_ctx *backref_ctx,
ac3c0d36
FM
2684 u64 disk_bytenr, u64 extent_offset,
2685 u64 extent_gen,
ac3c0d36
FM
2686 u64 start, u64 end)
2687{
2688 const u64 i_size = i_size_read(&inode->vfs_inode);
ac3c0d36
FM
2689 u64 cur_offset = start;
2690 u64 last_delalloc_end = 0;
2691 u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
2692 bool checked_extent_shared = false;
2693 int ret;
4d479cf0 2694
ec29ed5b 2695 /*
ac3c0d36
FM
2696 * There can be no delalloc past i_size, so don't waste time looking for
2697 * it beyond i_size.
ec29ed5b 2698 */
ac3c0d36
FM
2699 while (cur_offset < end && cur_offset < i_size) {
2700 u64 delalloc_start;
2701 u64 delalloc_end;
2702 u64 prealloc_start;
2703 u64 prealloc_len = 0;
2704 bool delalloc;
2705
2706 delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
b3e744fe 2707 delalloc_cached_state,
ac3c0d36
FM
2708 &delalloc_start,
2709 &delalloc_end);
2710 if (!delalloc)
2711 break;
2d324f59 2712
ec29ed5b 2713 /*
ac3c0d36
FM
2714 * If this is a prealloc extent we have to report every section
2715 * of it that has no delalloc.
ec29ed5b 2716 */
ac3c0d36
FM
2717 if (disk_bytenr != 0) {
2718 if (last_delalloc_end == 0) {
2719 prealloc_start = start;
2720 prealloc_len = delalloc_start - start;
2721 } else {
2722 prealloc_start = last_delalloc_end + 1;
2723 prealloc_len = delalloc_start - prealloc_start;
2724 }
2725 }
2726
2727 if (prealloc_len > 0) {
2728 if (!checked_extent_shared && fieinfo->fi_extents_max) {
ceb707da 2729 ret = btrfs_is_data_extent_shared(inode,
84a7949d
FM
2730 disk_bytenr,
2731 extent_gen,
2732 backref_ctx);
ac3c0d36
FM
2733 if (ret < 0)
2734 return ret;
2735 else if (ret > 0)
2736 prealloc_flags |= FIEMAP_EXTENT_SHARED;
2737
2738 checked_extent_shared = true;
2739 }
2740 ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
2741 disk_bytenr + extent_offset,
2742 prealloc_len, prealloc_flags);
2743 if (ret)
2744 return ret;
2745 extent_offset += prealloc_len;
2746 }
2747
2748 ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
2749 delalloc_end + 1 - delalloc_start,
2750 FIEMAP_EXTENT_DELALLOC |
2751 FIEMAP_EXTENT_UNKNOWN);
2752 if (ret)
2753 return ret;
2754
2755 last_delalloc_end = delalloc_end;
2756 cur_offset = delalloc_end + 1;
2757 extent_offset += cur_offset - delalloc_start;
2758 cond_resched();
2759 }
2760
2761 /*
2762 * Either we found no delalloc for the whole prealloc extent or we have
2763 * a prealloc extent that spans i_size or starts at or after i_size.
2764 */
2765 if (disk_bytenr != 0 && last_delalloc_end < end) {
2766 u64 prealloc_start;
2767 u64 prealloc_len;
2768
2769 if (last_delalloc_end == 0) {
2770 prealloc_start = start;
2771 prealloc_len = end + 1 - start;
2772 } else {
2773 prealloc_start = last_delalloc_end + 1;
2774 prealloc_len = end + 1 - prealloc_start;
2775 }
2776
2777 if (!checked_extent_shared && fieinfo->fi_extents_max) {
ceb707da
FM
2778 ret = btrfs_is_data_extent_shared(inode,
2779 disk_bytenr,
84a7949d 2780 extent_gen,
61dbb952 2781 backref_ctx);
ac3c0d36
FM
2782 if (ret < 0)
2783 return ret;
2784 else if (ret > 0)
2785 prealloc_flags |= FIEMAP_EXTENT_SHARED;
2786 }
2787 ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
2788 disk_bytenr + extent_offset,
2789 prealloc_len, prealloc_flags);
2790 if (ret)
2791 return ret;
2792 }
2793
2794 return 0;
2795}
2796
2797static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
2798 struct btrfs_path *path,
2799 u64 *last_extent_end_ret)
2800{
2801 const u64 ino = btrfs_ino(inode);
2802 struct btrfs_root *root = inode->root;
2803 struct extent_buffer *leaf;
2804 struct btrfs_file_extent_item *ei;
2805 struct btrfs_key key;
2806 u64 disk_bytenr;
2807 int ret;
2808
2809 /*
2810 * Lookup the last file extent. We're not using i_size here because
2811 * there might be preallocation past i_size.
2812 */
2813 ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
2814 /* There can't be a file extent item at offset (u64)-1 */
2815 ASSERT(ret != 0);
2816 if (ret < 0)
2817 return ret;
2818
2819 /*
2820 * For a non-existing key, btrfs_search_slot() always leaves us at a
2821 * slot > 0, except if the btree is empty, which is impossible because
2822 * at least it has the inode item for this inode and all the items for
2823 * the root inode 256.
2824 */
2825 ASSERT(path->slots[0] > 0);
2826 path->slots[0]--;
2827 leaf = path->nodes[0];
2828 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2829 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
2830 /* No file extent items in the subvolume tree. */
2831 *last_extent_end_ret = 0;
2832 return 0;
975f84fe 2833 }
975f84fe 2834
ec29ed5b 2835 /*
ac3c0d36
FM
2836 * For an inline extent, the disk_bytenr is where inline data starts at,
2837 * so first check if we have an inline extent item before checking if we
2838 * have an implicit hole (disk_bytenr == 0).
ec29ed5b 2839 */
ac3c0d36
FM
2840 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
2841 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
2842 *last_extent_end_ret = btrfs_file_extent_end(path);
2843 return 0;
ec29ed5b
CM
2844 }
2845
ac3c0d36
FM
2846 /*
2847 * Find the last file extent item that is not a hole (when NO_HOLES is
2848 * not enabled). This should take at most 2 iterations in the worst
2849 * case: we have one hole file extent item at slot 0 of a leaf and
2850 * another hole file extent item as the last item in the previous leaf.
2851 * This is because we merge file extent items that represent holes.
2852 */
2853 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
2854 while (disk_bytenr == 0) {
2855 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
2856 if (ret < 0) {
2857 return ret;
2858 } else if (ret > 0) {
2859 /* No file extent items that are not holes. */
2860 *last_extent_end_ret = 0;
2861 return 0;
2862 }
2863 leaf = path->nodes[0];
2864 ei = btrfs_item_ptr(leaf, path->slots[0],
2865 struct btrfs_file_extent_item);
2866 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
2867 }
ec29ed5b 2868
ac3c0d36
FM
2869 *last_extent_end_ret = btrfs_file_extent_end(path);
2870 return 0;
2871}
2872
2873int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
2874 u64 start, u64 len)
2875{
2876 const u64 ino = btrfs_ino(inode);
2877 struct extent_state *cached_state = NULL;
b3e744fe 2878 struct extent_state *delalloc_cached_state = NULL;
ac3c0d36 2879 struct btrfs_path *path;
ac3c0d36 2880 struct fiemap_cache cache = { 0 };
61dbb952 2881 struct btrfs_backref_share_check_ctx *backref_ctx;
ac3c0d36
FM
2882 u64 last_extent_end;
2883 u64 prev_extent_end;
2884 u64 lockstart;
2885 u64 lockend;
2886 bool stopped = false;
2887 int ret;
2888
84a7949d 2889 backref_ctx = btrfs_alloc_backref_share_check_ctx();
ac3c0d36 2890 path = btrfs_alloc_path();
84a7949d 2891 if (!backref_ctx || !path) {
ac3c0d36 2892 ret = -ENOMEM;
1506fcc8
YS
2893 goto out;
2894 }
975f84fe 2895
ceb707da
FM
2896 lockstart = round_down(start, inode->root->fs_info->sectorsize);
2897 lockend = round_up(start + len, inode->root->fs_info->sectorsize);
ac3c0d36 2898 prev_extent_end = lockstart;
ea8efc74 2899
519b7e13 2900 btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
570eb97b 2901 lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ea8efc74 2902
ac3c0d36
FM
2903 ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
2904 if (ret < 0)
2905 goto out_unlock;
2906 btrfs_release_path(path);
1506fcc8 2907
ac3c0d36
FM
2908 path->reada = READA_FORWARD;
2909 ret = fiemap_search_slot(inode, path, lockstart);
2910 if (ret < 0) {
2911 goto out_unlock;
2912 } else if (ret > 0) {
ea8efc74 2913 /*
ac3c0d36
FM
2914 * No file extent item found, but we may have delalloc between
2915 * the current offset and i_size. So check for that.
ea8efc74 2916 */
ac3c0d36
FM
2917 ret = 0;
2918 goto check_eof_delalloc;
2919 }
2920
2921 while (prev_extent_end < lockend) {
2922 struct extent_buffer *leaf = path->nodes[0];
2923 struct btrfs_file_extent_item *ei;
2924 struct btrfs_key key;
2925 u64 extent_end;
2926 u64 extent_len;
2927 u64 extent_offset = 0;
2928 u64 extent_gen;
2929 u64 disk_bytenr = 0;
2930 u64 flags = 0;
2931 int extent_type;
2932 u8 compression;
2933
2934 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2935 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
2936 break;
2937
2938 extent_end = btrfs_file_extent_end(path);
1506fcc8 2939
ea8efc74 2940 /*
ac3c0d36
FM
2941 * The first iteration can leave us at an extent item that ends
2942 * before our range's start. Move to the next item.
ea8efc74 2943 */
ac3c0d36
FM
2944 if (extent_end <= lockstart)
2945 goto next_item;
fe09e16c 2946
877c1476
FM
2947 backref_ctx->curr_leaf_bytenr = leaf->start;
2948
ac3c0d36
FM
 2949		/* We have an implicit hole (NO_HOLES feature enabled). */
2950 if (prev_extent_end < key.offset) {
2951 const u64 range_end = min(key.offset, lockend) - 1;
b8f164e3 2952
ac3c0d36 2953 ret = fiemap_process_hole(inode, fieinfo, &cache,
b3e744fe 2954 &delalloc_cached_state,
61dbb952 2955 backref_ctx, 0, 0, 0,
ac3c0d36
FM
2956 prev_extent_end, range_end);
2957 if (ret < 0) {
2958 goto out_unlock;
2959 } else if (ret > 0) {
2960 /* fiemap_fill_next_extent() told us to stop. */
2961 stopped = true;
2962 break;
2963 }
1506fcc8 2964
ac3c0d36
FM
2965 /* We've reached the end of the fiemap range, stop. */
2966 if (key.offset >= lockend) {
2967 stopped = true;
2968 break;
2969 }
1506fcc8
YS
2970 }
2971
ac3c0d36
FM
2972 extent_len = extent_end - key.offset;
2973 ei = btrfs_item_ptr(leaf, path->slots[0],
2974 struct btrfs_file_extent_item);
2975 compression = btrfs_file_extent_compression(leaf, ei);
2976 extent_type = btrfs_file_extent_type(leaf, ei);
2977 extent_gen = btrfs_file_extent_generation(leaf, ei);
2978
2979 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2980 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
2981 if (compression == BTRFS_COMPRESS_NONE)
2982 extent_offset = btrfs_file_extent_offset(leaf, ei);
ec29ed5b 2983 }
ac3c0d36
FM
2984
2985 if (compression != BTRFS_COMPRESS_NONE)
2986 flags |= FIEMAP_EXTENT_ENCODED;
2987
2988 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2989 flags |= FIEMAP_EXTENT_DATA_INLINE;
2990 flags |= FIEMAP_EXTENT_NOT_ALIGNED;
2991 ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
2992 extent_len, flags);
2993 } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
2994 ret = fiemap_process_hole(inode, fieinfo, &cache,
b3e744fe 2995 &delalloc_cached_state,
61dbb952 2996 backref_ctx,
ac3c0d36 2997 disk_bytenr, extent_offset,
84a7949d
FM
2998 extent_gen, key.offset,
2999 extent_end - 1);
ac3c0d36
FM
3000 } else if (disk_bytenr == 0) {
3001 /* We have an explicit hole. */
3002 ret = fiemap_process_hole(inode, fieinfo, &cache,
b3e744fe 3003 &delalloc_cached_state,
61dbb952 3004 backref_ctx, 0, 0, 0,
ac3c0d36
FM
3005 key.offset, extent_end - 1);
3006 } else {
3007 /* We have a regular extent. */
3008 if (fieinfo->fi_extents_max) {
ceb707da 3009 ret = btrfs_is_data_extent_shared(inode,
ac3c0d36
FM
3010 disk_bytenr,
3011 extent_gen,
61dbb952 3012 backref_ctx);
ac3c0d36
FM
3013 if (ret < 0)
3014 goto out_unlock;
3015 else if (ret > 0)
3016 flags |= FIEMAP_EXTENT_SHARED;
3017 }
3018
3019 ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
3020 disk_bytenr + extent_offset,
3021 extent_len, flags);
975f84fe 3022 }
ac3c0d36
FM
3023
3024 if (ret < 0) {
3025 goto out_unlock;
3026 } else if (ret > 0) {
3027 /* fiemap_fill_next_extent() told us to stop. */
3028 stopped = true;
3029 break;
26e726af 3030 }
09fbc1c8 3031
ac3c0d36
FM
3032 prev_extent_end = extent_end;
3033next_item:
09fbc1c8
FM
3034 if (fatal_signal_pending(current)) {
3035 ret = -EINTR;
ac3c0d36 3036 goto out_unlock;
09fbc1c8 3037 }
ac3c0d36
FM
3038
3039 ret = fiemap_next_leaf_item(inode, path);
3040 if (ret < 0) {
3041 goto out_unlock;
3042 } else if (ret > 0) {
3043 /* No more file extent items for this inode. */
3044 break;
3045 }
3046 cond_resched();
1506fcc8 3047 }
5911c8fe 3048
ac3c0d36
FM
3049check_eof_delalloc:
3050 /*
3051 * Release (and free) the path before emitting any final entries to
3052 * fiemap_fill_next_extent() to keep lockdep happy. This is because
3053 * once we find no more file extent items exist, we may have a
3054 * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
3055 * faults when copying data to the user space buffer.
3056 */
3057 btrfs_free_path(path);
3058 path = NULL;
3059
3060 if (!stopped && prev_extent_end < lockend) {
b3e744fe
FM
3061 ret = fiemap_process_hole(inode, fieinfo, &cache,
3062 &delalloc_cached_state, backref_ctx,
84a7949d 3063 0, 0, 0, prev_extent_end, lockend - 1);
ac3c0d36
FM
3064 if (ret < 0)
3065 goto out_unlock;
3066 prev_extent_end = lockend;
3067 }
3068
3069 if (cache.cached && cache.offset + cache.len >= last_extent_end) {
3070 const u64 i_size = i_size_read(&inode->vfs_inode);
3071
3072 if (prev_extent_end < i_size) {
3073 u64 delalloc_start;
3074 u64 delalloc_end;
3075 bool delalloc;
3076
3077 delalloc = btrfs_find_delalloc_in_range(inode,
3078 prev_extent_end,
3079 i_size - 1,
b3e744fe 3080 &delalloc_cached_state,
ac3c0d36
FM
3081 &delalloc_start,
3082 &delalloc_end);
3083 if (!delalloc)
3084 cache.flags |= FIEMAP_EXTENT_LAST;
3085 } else {
3086 cache.flags |= FIEMAP_EXTENT_LAST;
3087 }
3088 }
3089
3090 ret = emit_last_fiemap_cache(fieinfo, &cache);
3091
3092out_unlock:
570eb97b 3093 unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
519b7e13 3094 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
ac3c0d36 3095out:
b3e744fe 3096 free_extent_state(delalloc_cached_state);
84a7949d 3097 btrfs_free_backref_share_ctx(backref_ctx);
e02d48ea 3098 btrfs_free_path(path);
1506fcc8
YS
3099 return ret;
3100}
3101
727011e0
CM
3102static void __free_extent_buffer(struct extent_buffer *eb)
3103{
727011e0
CM
3104 kmem_cache_free(extent_buffer_cache, eb);
3105}
3106
7f26fb1c 3107static int extent_buffer_under_io(const struct extent_buffer *eb)
db7f3436 3108{
113fa05c 3109 return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
db7f3436
JB
3110 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
3111}
3112
8ff8466d 3113static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
db7f3436 3114{
8ff8466d 3115 struct btrfs_subpage *subpage;
db7f3436 3116
8ff8466d 3117 lockdep_assert_held(&page->mapping->private_lock);
db7f3436 3118
8ff8466d
QW
3119 if (PagePrivate(page)) {
3120 subpage = (struct btrfs_subpage *)page->private;
3121 if (atomic_read(&subpage->eb_refs))
3122 return true;
3d078efa
QW
3123 /*
 3124		 * Even if there are no eb refs here, we may still have an
 3125		 * end_page_read() call relying on page::private.
3126 */
3127 if (atomic_read(&subpage->readers))
3128 return true;
8ff8466d
QW
3129 }
3130 return false;
3131}
db7f3436 3132
8ff8466d
QW
3133static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
3134{
3135 struct btrfs_fs_info *fs_info = eb->fs_info;
3136 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
3137
3138 /*
3139 * For mapped eb, we're going to change the page private, which should
3140 * be done under the private_lock.
3141 */
3142 if (mapped)
3143 spin_lock(&page->mapping->private_lock);
3144
3145 if (!PagePrivate(page)) {
5d2361db 3146 if (mapped)
8ff8466d
QW
3147 spin_unlock(&page->mapping->private_lock);
3148 return;
3149 }
3150
fbca46eb 3151 if (fs_info->nodesize >= PAGE_SIZE) {
5d2361db
FL
3152 /*
3153 * We do this since we'll remove the pages after we've
3154 * removed the eb from the radix tree, so we could race
3155 * and have this page now attached to the new eb. So
3156 * only clear page_private if it's still connected to
3157 * this eb.
3158 */
3159 if (PagePrivate(page) &&
3160 page->private == (unsigned long)eb) {
3161 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
3162 BUG_ON(PageDirty(page));
3163 BUG_ON(PageWriteback(page));
db7f3436 3164 /*
5d2361db
FL
 3165			 * We need to make sure we haven't been attached
3166 * to a new eb.
db7f3436 3167 */
d1b89bc0 3168 detach_page_private(page);
db7f3436 3169 }
5d2361db
FL
3170 if (mapped)
3171 spin_unlock(&page->mapping->private_lock);
8ff8466d
QW
3172 return;
3173 }
3174
3175 /*
3176 * For subpage, we can have dummy eb with page private. In this case,
3177 * we can directly detach the private as such page is only attached to
3178 * one dummy eb, no sharing.
3179 */
3180 if (!mapped) {
3181 btrfs_detach_subpage(fs_info, page);
3182 return;
3183 }
3184
3185 btrfs_page_dec_eb_refs(fs_info, page);
3186
3187 /*
3188 * We can only detach the page private if there are no other ebs in the
3d078efa 3189 * page range and no unfinished IO.
8ff8466d
QW
3190 */
3191 if (!page_range_has_eb(fs_info, page))
3192 btrfs_detach_subpage(fs_info, page);
3193
3194 spin_unlock(&page->mapping->private_lock);
3195}
3196
3197/* Release all pages attached to the extent buffer */
3198static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
3199{
3200 int i;
3201 int num_pages;
3202
3203 ASSERT(!extent_buffer_under_io(eb));
3204
3205 num_pages = num_extent_pages(eb);
3206 for (i = 0; i < num_pages; i++) {
3207 struct page *page = eb->pages[i];
3208
3209 if (!page)
3210 continue;
3211
3212 detach_extent_buffer_page(eb, page);
5d2361db 3213
01327610 3214 /* One for when we allocated the page */
09cbfeaf 3215 put_page(page);
d64766fd 3216 }
db7f3436
JB
3217}
3218
3219/*
3220 * Helper for releasing the extent buffer.
3221 */
3222static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3223{
55ac0139 3224 btrfs_release_extent_buffer_pages(eb);
a40246e8 3225 btrfs_leak_debug_del_eb(eb);
db7f3436
JB
3226 __free_extent_buffer(eb);
3227}
3228
f28491e0
JB
3229static struct extent_buffer *
3230__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 3231 unsigned long len)
d1310b2e
CM
3232{
3233 struct extent_buffer *eb = NULL;
3234
d1b5c567 3235 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
3236 eb->start = start;
3237 eb->len = len;
f28491e0 3238 eb->fs_info = fs_info;
196d59ab 3239 init_rwsem(&eb->lock);
b4ce94de 3240
a40246e8 3241 btrfs_leak_debug_add_eb(eb);
6d49ba1b 3242
3083ee2e 3243 spin_lock_init(&eb->refs_lock);
d1310b2e 3244 atomic_set(&eb->refs, 1);
727011e0 3245
deb67895 3246 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
d1310b2e
CM
3247
3248 return eb;
3249}
3250
2b48966a 3251struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
815a51c7 3252{
cc5e31a4 3253 int i;
815a51c7 3254 struct extent_buffer *new;
cc5e31a4 3255 int num_pages = num_extent_pages(src);
dd137dd1 3256 int ret;
815a51c7 3257
3f556f78 3258 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
3259 if (new == NULL)
3260 return NULL;
3261
	/*
	 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
	 * btrfs_release_extent_buffer() has different behavior for an
	 * UNMAPPED subpage extent buffer.
	 */
3267 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
3268
dd137dd1
STD
3269 ret = btrfs_alloc_page_array(num_pages, new->pages);
3270 if (ret) {
3271 btrfs_release_extent_buffer(new);
3272 return NULL;
3273 }
3274
815a51c7 3275 for (i = 0; i < num_pages; i++) {
760f991f 3276 int ret;
dd137dd1 3277 struct page *p = new->pages[i];
760f991f 3278
760f991f
QW
3279 ret = attach_extent_buffer_page(new, p, NULL);
3280 if (ret < 0) {
760f991f
QW
3281 btrfs_release_extent_buffer(new);
3282 return NULL;
3283 }
815a51c7 3284 WARN_ON(PageDirty(p));
fba1acf9 3285 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7 3286 }
92d83e94 3287 set_extent_buffer_uptodate(new);
815a51c7
JS
3288
3289 return new;
3290}
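
To make the cloning semantics concrete, here is a minimal sketch (not part of this file) of how a caller might take a private, lock-free snapshot of a buffer it already holds a reference on; the helper name is invented for illustration:

/* Hypothetical helper: snapshot an eb so it can be read without eb locks. */
static struct extent_buffer *snapshot_eb_for_scan(struct extent_buffer *src)
{
	struct extent_buffer *copy;

	copy = btrfs_clone_extent_buffer(src);
	if (!copy)
		return ERR_PTR(-ENOMEM);

	/* The clone is UNMAPPED and private to us, no further locking needed. */
	return copy;
}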
3291
0f331229
OS
3292struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
3293 u64 start, unsigned long len)
815a51c7
JS
3294{
3295 struct extent_buffer *eb;
cc5e31a4
DS
3296 int num_pages;
3297 int i;
dd137dd1 3298 int ret;
815a51c7 3299
3f556f78 3300 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
3301 if (!eb)
3302 return NULL;
3303
65ad0104 3304 num_pages = num_extent_pages(eb);
dd137dd1
STD
3305 ret = btrfs_alloc_page_array(num_pages, eb->pages);
3306 if (ret)
3307 goto err;
3308
815a51c7 3309 for (i = 0; i < num_pages; i++) {
dd137dd1 3310 struct page *p = eb->pages[i];
09bc1f0f 3311
dd137dd1 3312 ret = attach_extent_buffer_page(eb, p, NULL);
09bc1f0f
QW
3313 if (ret < 0)
3314 goto err;
815a51c7 3315 }
dd137dd1 3316
815a51c7
JS
3317 set_extent_buffer_uptodate(eb);
3318 btrfs_set_header_nritems(eb, 0);
b0132a3b 3319 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
3320
3321 return eb;
3322err:
dd137dd1
STD
3323 for (i = 0; i < num_pages; i++) {
3324 if (eb->pages[i]) {
3325 detach_extent_buffer_page(eb, eb->pages[i]);
3326 __free_page(eb->pages[i]);
3327 }
09bc1f0f 3328 }
815a51c7
JS
3329 __free_extent_buffer(eb);
3330 return NULL;
3331}
3332
0f331229 3333struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 3334 u64 start)
0f331229 3335{
da17066c 3336 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
3337}
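
A short sketch of how a self-test might exercise the dummy-buffer path, assuming a valid fs_info; the function name is illustrative only:

/* Illustrative only: exercise the dummy (unmapped) extent buffer path. */
static int demo_dummy_eb(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct extent_buffer *eb;

	eb = alloc_dummy_extent_buffer(fs_info, bytenr);
	if (!eb)
		return -ENOMEM;

	/* Dummy ebs start uptodate with zero items in the header. */
	WARN_ON(btrfs_header_nritems(eb) != 0);
	free_extent_buffer(eb);
	return 0;
}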
3338
0b32f4bb
JB
3339static void check_buffer_tree_ref(struct extent_buffer *eb)
3340{
242e18c7 3341 int refs;
	/*
	 * The TREE_REF bit is first set when the extent_buffer is added
	 * to the radix tree. It is also reset, if unset, when a new reference
	 * is created by find_extent_buffer.
	 *
	 * It is only cleared in two cases: freeing the last non-tree
	 * reference to the extent_buffer when its STALE bit is set, or
	 * calling release_folio when the tree reference is the only reference.
	 *
	 * In both cases, care is taken to ensure that the extent_buffer's
	 * pages are not under IO. However, release_folio can be called
	 * concurrently with the creation of new references, which is prone
	 * to race conditions between the calls to check_buffer_tree_ref in
	 * those codepaths and clearing TREE_REF in try_release_extent_buffer.
	 *
	 * The actual lifetime of the extent_buffer in the radix tree is
	 * adequately protected by the refcount, but the TREE_REF bit and
	 * its corresponding reference are not. To protect against this
	 * class of races, we call check_buffer_tree_ref from the codepaths
	 * which trigger IO. Note that once IO is initiated, TREE_REF can no
	 * longer be cleared, so that is the moment at which any such race is
	 * best fixed.
	 */
242e18c7
CM
3365 refs = atomic_read(&eb->refs);
3366 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
3367 return;
3368
594831c4
JB
3369 spin_lock(&eb->refs_lock);
3370 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 3371 atomic_inc(&eb->refs);
594831c4 3372 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
3373}
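
As a hedged illustration of the invariant the helper above maintains, the sketch below (not called anywhere in the tree, name invented) spells out what a buffer about to do IO is expected to hold: the tree reference plus at least one caller reference.

/*
 * Illustrative only: the state check_buffer_tree_ref() is meant to
 * guarantee before IO starts.
 */
static void demo_assert_eb_ready_for_io(const struct extent_buffer *eb)
{
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
	WARN_ON(atomic_read(&eb->refs) < 2);
}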
3374
2457aec6
MG
3375static void mark_extent_buffer_accessed(struct extent_buffer *eb,
3376 struct page *accessed)
5df4235e 3377{
cc5e31a4 3378 int num_pages, i;
5df4235e 3379
0b32f4bb
JB
3380 check_buffer_tree_ref(eb);
3381
65ad0104 3382 num_pages = num_extent_pages(eb);
5df4235e 3383 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
3384 struct page *p = eb->pages[i];
3385
2457aec6
MG
3386 if (p != accessed)
3387 mark_page_accessed(p);
5df4235e
JB
3388 }
3389}
3390
f28491e0
JB
3391struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
3392 u64 start)
452c75c3
CS
3393{
3394 struct extent_buffer *eb;
3395
2f3186d8
QW
3396 eb = find_extent_buffer_nolock(fs_info, start);
3397 if (!eb)
3398 return NULL;
3399 /*
3400 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
3401 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
3402 * another task running free_extent_buffer() might have seen that flag
3403 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
3404 * writeback flags not set) and it's still in the tree (flag
3405 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
3406 * decrementing the extent buffer's reference count twice. So here we
3407 * could race and increment the eb's reference count, clear its stale
3408 * flag, mark it as dirty and drop our reference before the other task
3409 * finishes executing free_extent_buffer, which would later result in
3410 * an attempt to free an extent buffer that is dirty.
3411 */
3412 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
3413 spin_lock(&eb->refs_lock);
3414 spin_unlock(&eb->refs_lock);
452c75c3 3415 }
2f3186d8
QW
3416 mark_extent_buffer_accessed(eb, NULL);
3417 return eb;
452c75c3
CS
3418}
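
A minimal usage sketch, assuming a valid fs_info; it only checks whether a block is currently cached and immediately drops the reference that find_extent_buffer() took (the helper name is made up):

/* Hypothetical helper: check whether a tree block is currently cached. */
static bool demo_eb_is_cached(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct extent_buffer *eb;

	eb = find_extent_buffer(fs_info, bytenr);
	if (!eb)
		return false;

	/* find_extent_buffer() took a reference for us, drop it again. */
	free_extent_buffer(eb);
	return true;
}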
3419
faa2dbf0
JB
3420#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3421struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 3422 u64 start)
faa2dbf0
JB
3423{
3424 struct extent_buffer *eb, *exists = NULL;
3425 int ret;
3426
3427 eb = find_extent_buffer(fs_info, start);
3428 if (eb)
3429 return eb;
da17066c 3430 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 3431 if (!eb)
b6293c82 3432 return ERR_PTR(-ENOMEM);
faa2dbf0 3433 eb->fs_info = fs_info;
01cd3909
DS
3434again:
3435 ret = radix_tree_preload(GFP_NOFS);
3436 if (ret) {
3437 exists = ERR_PTR(ret);
3438 goto free_eb;
3439 }
3440 spin_lock(&fs_info->buffer_lock);
3441 ret = radix_tree_insert(&fs_info->buffer_radix,
3442 start >> fs_info->sectorsize_bits, eb);
3443 spin_unlock(&fs_info->buffer_lock);
3444 radix_tree_preload_end();
3445 if (ret == -EEXIST) {
3446 exists = find_extent_buffer(fs_info, start);
3447 if (exists)
faa2dbf0 3448 goto free_eb;
01cd3909
DS
3449 else
3450 goto again;
3451 }
faa2dbf0
JB
3452 check_buffer_tree_ref(eb);
3453 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
3454
faa2dbf0
JB
3455 return eb;
3456free_eb:
3457 btrfs_release_extent_buffer(eb);
3458 return exists;
3459}
3460#endif
3461
81982210
QW
3462static struct extent_buffer *grab_extent_buffer(
3463 struct btrfs_fs_info *fs_info, struct page *page)
c0f0a9e7
QW
3464{
3465 struct extent_buffer *exists;
3466
	/*
	 * For the subpage case, we completely rely on the radix tree to
	 * ensure we don't try to insert two ebs for the same bytenr. So
	 * here we always return NULL and just continue.
	 */
	if (fs_info->nodesize < PAGE_SIZE)
		return NULL;
3474
c0f0a9e7
QW
3475 /* Page not yet attached to an extent buffer */
3476 if (!PagePrivate(page))
3477 return NULL;
3478
	/*
	 * We could have already allocated an eb for this page and attached
	 * one, so let's see if we can get a ref on the existing eb. If we
	 * can, we know it's good and we can just return that one; otherwise
	 * we know we can just overwrite page->private.
	 */
3485 exists = (struct extent_buffer *)page->private;
3486 if (atomic_inc_not_zero(&exists->refs))
3487 return exists;
3488
3489 WARN_ON(PageDirty(page));
3490 detach_page_private(page);
3491 return NULL;
3492}
3493
fbca46eb
QW
3494static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
3495{
3496 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
3497 btrfs_err(fs_info, "bad tree block start %llu", start);
3498 return -EINVAL;
3499 }
3500
3501 if (fs_info->nodesize < PAGE_SIZE &&
3502 offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
3503 btrfs_err(fs_info,
3504 "tree block crosses page boundary, start %llu nodesize %u",
3505 start, fs_info->nodesize);
3506 return -EINVAL;
3507 }
3508 if (fs_info->nodesize >= PAGE_SIZE &&
1280d2d1 3509 !PAGE_ALIGNED(start)) {
fbca46eb
QW
3510 btrfs_err(fs_info,
3511 "tree block is not page aligned, start %llu nodesize %u",
3512 start, fs_info->nodesize);
3513 return -EINVAL;
3514 }
3515 return 0;
3516}
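
The subpage rule above is easiest to see with numbers. The sketch below mirrors the boundary check with example values (illustrative only, assuming a 64K page, 4K nodesize configuration):

/*
 * Illustrative only: mirrors the subpage boundary rule with example
 * numbers. With 64K pages and a 4K nodesize, start == 61440 fits
 * (61440 + 4096 == 65536), while start == 63488 would cross the page
 * boundary (63488 + 4096 > 65536) and is rejected above.
 */
static bool demo_subpage_block_fits(u64 start, u32 nodesize)
{
	return offset_in_page(start) + nodesize <= PAGE_SIZE;
}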
3517
f28491e0 3518struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3fbaf258 3519 u64 start, u64 owner_root, int level)
d1310b2e 3520{
da17066c 3521 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
3522 int num_pages;
3523 int i;
09cbfeaf 3524 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 3525 struct extent_buffer *eb;
6af118ce 3526 struct extent_buffer *exists = NULL;
d1310b2e 3527 struct page *p;
f28491e0 3528 struct address_space *mapping = fs_info->btree_inode->i_mapping;
52ea5bfb 3529 struct btrfs_subpage *prealloc = NULL;
b40130b2 3530 u64 lockdep_owner = owner_root;
d1310b2e 3531 int uptodate = 1;
19fe0a8b 3532 int ret;
d1310b2e 3533
fbca46eb 3534 if (check_eb_alignment(fs_info, start))
c871b0f2 3535 return ERR_PTR(-EINVAL);
c871b0f2 3536
e9306ad4
QW
3537#if BITS_PER_LONG == 32
3538 if (start >= MAX_LFS_FILESIZE) {
3539 btrfs_err_rl(fs_info,
3540 "extent buffer %llu is beyond 32bit page cache limit", start);
3541 btrfs_err_32bit_limit(fs_info);
3542 return ERR_PTR(-EOVERFLOW);
3543 }
3544 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
3545 btrfs_warn_32bit_limit(fs_info);
3546#endif
3547
f28491e0 3548 eb = find_extent_buffer(fs_info, start);
452c75c3 3549 if (eb)
6af118ce 3550 return eb;
6af118ce 3551
23d79d81 3552 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 3553 if (!eb)
c871b0f2 3554 return ERR_PTR(-ENOMEM);
b40130b2
JB
3555
3556 /*
3557 * The reloc trees are just snapshots, so we need them to appear to be
3558 * just like any other fs tree WRT lockdep.
3559 */
3560 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
3561 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
3562
3563 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
d1310b2e 3564
65ad0104 3565 num_pages = num_extent_pages(eb);
760f991f 3566
	/*
	 * Preallocate page->private for the subpage case, so that we won't
	 * allocate memory with private_lock nor page lock held.
	 *
	 * The memory will be freed by attach_extent_buffer_page() or freed
	 * manually if we exit earlier.
	 */
3574 if (fs_info->nodesize < PAGE_SIZE) {
3575 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
3576 if (IS_ERR(prealloc)) {
3577 exists = ERR_CAST(prealloc);
3578 goto free_eb;
3579 }
3580 }
3581
3582 for (i = 0; i < num_pages; i++, index++) {
d1b5c567 3583 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
3584 if (!p) {
3585 exists = ERR_PTR(-ENOMEM);
52ea5bfb 3586 btrfs_free_subpage(prealloc);
6af118ce 3587 goto free_eb;
c871b0f2 3588 }
4f2de97a
JB
3589
3590 spin_lock(&mapping->private_lock);
81982210 3591 exists = grab_extent_buffer(fs_info, p);
c0f0a9e7
QW
3592 if (exists) {
3593 spin_unlock(&mapping->private_lock);
3594 unlock_page(p);
3595 put_page(p);
3596 mark_extent_buffer_accessed(exists, p);
760f991f 3597 btrfs_free_subpage(prealloc);
c0f0a9e7 3598 goto free_eb;
d1310b2e 3599 }
760f991f
QW
3600 /* Should not fail, as we have preallocated the memory */
3601 ret = attach_extent_buffer_page(eb, p, prealloc);
3602 ASSERT(!ret);
		/*
		 * To inform that we have an extra eb under allocation, so
		 * that detach_extent_buffer_page() won't release the page
		 * private when the eb hasn't yet been inserted into the
		 * radix tree.
		 *
		 * The ref will be decreased when the eb releases the page,
		 * in detach_extent_buffer_page(), so it needs no special
		 * handling in the error path.
		 */
3612 btrfs_page_inc_eb_refs(fs_info, p);
4f2de97a 3613 spin_unlock(&mapping->private_lock);
760f991f 3614
1e5eb3d6 3615 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
727011e0 3616 eb->pages[i] = p;
5a963419 3617 if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
d1310b2e 3618 uptodate = 0;

		/*
		 * We can't unlock the pages just yet since the extent buffer
		 * hasn't been properly inserted into the radix tree; that
		 * would open a race with btree_release_folio, which can free
		 * a page while we are still filling in all pages for the
		 * buffer and we could crash.
		 */
d1310b2e
CM
3627 }
3628 if (uptodate)
b4ce94de 3629 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
01cd3909
DS
3630again:
3631 ret = radix_tree_preload(GFP_NOFS);
3632 if (ret) {
3633 exists = ERR_PTR(ret);
3634 goto free_eb;
3635 }
3636
3637 spin_lock(&fs_info->buffer_lock);
3638 ret = radix_tree_insert(&fs_info->buffer_radix,
3639 start >> fs_info->sectorsize_bits, eb);
3640 spin_unlock(&fs_info->buffer_lock);
3641 radix_tree_preload_end();
3642 if (ret == -EEXIST) {
3643 exists = find_extent_buffer(fs_info, start);
3644 if (exists)
452c75c3 3645 goto free_eb;
01cd3909
DS
3646 else
3647 goto again;
3648 }
6af118ce 3649 /* add one reference for the tree */
0b32f4bb 3650 check_buffer_tree_ref(eb);
34b41ace 3651 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
3652
3653 /*
b16d011e 3654 * Now it's safe to unlock the pages because any calls to
f913cff3 3655 * btree_release_folio will correctly detect that a page belongs to a
b16d011e 3656 * live buffer and won't free them prematurely.
eb14ab8e 3657 */
28187ae5
NB
3658 for (i = 0; i < num_pages; i++)
3659 unlock_page(eb->pages[i]);
d1310b2e
CM
3660 return eb;
3661
6af118ce 3662free_eb:
5ca64f45 3663 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
3664 for (i = 0; i < num_pages; i++) {
3665 if (eb->pages[i])
3666 unlock_page(eb->pages[i]);
3667 }
eb14ab8e 3668
897ca6e9 3669 btrfs_release_extent_buffer(eb);
6af118ce 3670 return exists;
d1310b2e 3671}
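
Because the radix tree acts as a cache, calling alloc_extent_buffer() twice for the same bytenr is expected to return the same buffer with an extra reference. A hedged sketch of that property (illustrative helper, not part of this file):

/* Illustrative only: the second call hits the buffer_radix cache. */
static void demo_eb_cache_hit(struct btrfs_fs_info *fs_info, u64 bytenr,
			      u64 owner_root, int level)
{
	struct extent_buffer *a, *b;

	a = alloc_extent_buffer(fs_info, bytenr, owner_root, level);
	if (IS_ERR(a))
		return;
	b = alloc_extent_buffer(fs_info, bytenr, owner_root, level);
	if (!IS_ERR(b)) {
		WARN_ON(a != b);	/* same bytenr => same cached eb */
		free_extent_buffer(b);
	}
	free_extent_buffer(a);
}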
d1310b2e 3672
3083ee2e
JB
3673static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3674{
3675 struct extent_buffer *eb =
3676 container_of(head, struct extent_buffer, rcu_head);
3677
3678 __free_extent_buffer(eb);
3679}
3680
f7a52a40 3681static int release_extent_buffer(struct extent_buffer *eb)
5ce48d0f 3682 __releases(&eb->refs_lock)
3083ee2e 3683{
07e21c4d
NB
3684 lockdep_assert_held(&eb->refs_lock);
3685
3083ee2e
JB
3686 WARN_ON(atomic_read(&eb->refs) == 0);
3687 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 3688 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 3689 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 3690
815a51c7 3691 spin_unlock(&eb->refs_lock);
3083ee2e 3692
01cd3909
DS
3693 spin_lock(&fs_info->buffer_lock);
3694 radix_tree_delete(&fs_info->buffer_radix,
3695 eb->start >> fs_info->sectorsize_bits);
3696 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
3697 } else {
3698 spin_unlock(&eb->refs_lock);
815a51c7 3699 }
3083ee2e 3700
a40246e8 3701 btrfs_leak_debug_del_eb(eb);
3083ee2e 3702 /* Should be safe to release our pages at this point */
55ac0139 3703 btrfs_release_extent_buffer_pages(eb);
bcb7e449 3704#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 3705 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
3706 __free_extent_buffer(eb);
3707 return 1;
3708 }
3709#endif
3083ee2e 3710 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 3711 return 1;
3083ee2e
JB
3712 }
3713 spin_unlock(&eb->refs_lock);
e64860aa
JB
3714
3715 return 0;
3083ee2e
JB
3716}
3717
d1310b2e
CM
3718void free_extent_buffer(struct extent_buffer *eb)
3719{
242e18c7 3720 int refs;
d1310b2e
CM
3721 if (!eb)
3722 return;
3723
e5677f05 3724 refs = atomic_read(&eb->refs);
242e18c7 3725 while (1) {
46cc775e
NB
3726 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
3727 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
3728 refs == 1))
242e18c7 3729 break;
e5677f05 3730 if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
242e18c7
CM
3731 return;
3732 }
3733
3083ee2e
JB
3734 spin_lock(&eb->refs_lock);
3735 if (atomic_read(&eb->refs) == 2 &&
3736 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 3737 !extent_buffer_under_io(eb) &&
3083ee2e
JB
3738 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
3739 atomic_dec(&eb->refs);
3740
3741 /*
3742 * I know this is terrible, but it's temporary until we stop tracking
3743 * the uptodate bits and such for the extent buffers.
3744 */
f7a52a40 3745 release_extent_buffer(eb);
3083ee2e
JB
3746}
3747
3748void free_extent_buffer_stale(struct extent_buffer *eb)
3749{
3750 if (!eb)
d1310b2e
CM
3751 return;
3752
3083ee2e
JB
3753 spin_lock(&eb->refs_lock);
3754 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
3755
0b32f4bb 3756 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
3757 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
3758 atomic_dec(&eb->refs);
f7a52a40 3759 release_extent_buffer(eb);
d1310b2e 3760}
d1310b2e 3761
0d27797e
QW
3762static void btree_clear_page_dirty(struct page *page)
3763{
3764 ASSERT(PageDirty(page));
3765 ASSERT(PageLocked(page));
3766 clear_page_dirty_for_io(page);
3767 xa_lock_irq(&page->mapping->i_pages);
3768 if (!PageDirty(page))
3769 __xa_clear_mark(&page->mapping->i_pages,
3770 page_index(page), PAGECACHE_TAG_DIRTY);
3771 xa_unlock_irq(&page->mapping->i_pages);
3772}
3773
3774static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
3775{
3776 struct btrfs_fs_info *fs_info = eb->fs_info;
3777 struct page *page = eb->pages[0];
3778 bool last;
3779
3780 /* btree_clear_page_dirty() needs page locked */
3781 lock_page(page);
3782 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
3783 eb->len);
3784 if (last)
3785 btree_clear_page_dirty(page);
3786 unlock_page(page);
3787 WARN_ON(atomic_read(&eb->refs) == 0);
3788}
3789
98c8d683
JB
3790void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
3791 struct extent_buffer *eb)
d1310b2e 3792{
98c8d683 3793 struct btrfs_fs_info *fs_info = eb->fs_info;
cc5e31a4
DS
3794 int i;
3795 int num_pages;
d1310b2e
CM
3796 struct page *page;
3797
98c8d683
JB
3798 btrfs_assert_tree_write_locked(eb);
3799
3800 if (trans && btrfs_header_generation(eb) != trans->transid)
3801 return;
3802
3803 if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
3804 return;
3805
3806 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
3807 fs_info->dirty_metadata_batch);
3808
fbca46eb 3809 if (eb->fs_info->nodesize < PAGE_SIZE)
0d27797e
QW
3810 return clear_subpage_extent_buffer_dirty(eb);
3811
65ad0104 3812 num_pages = num_extent_pages(eb);
d1310b2e
CM
3813
3814 for (i = 0; i < num_pages; i++) {
fb85fc9a 3815 page = eb->pages[i];
b9473439 3816 if (!PageDirty(page))
d2c3f4f6 3817 continue;
a61e6f29 3818 lock_page(page);
0d27797e 3819 btree_clear_page_dirty(page);
a61e6f29 3820 unlock_page(page);
d1310b2e 3821 }
0b32f4bb 3822 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 3823}
d1310b2e 3824
f18cc978 3825void set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 3826{
cc5e31a4
DS
3827 int i;
3828 int num_pages;
abb57ef3 3829 bool was_dirty;
d1310b2e 3830
0b32f4bb
JB
3831 check_buffer_tree_ref(eb);
3832
b9473439 3833 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 3834
65ad0104 3835 num_pages = num_extent_pages(eb);
3083ee2e 3836 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
3837 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
3838
0d27797e 3839 if (!was_dirty) {
fbca46eb 3840 bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
51995c39 3841
		/*
		 * For the subpage case, we can have other extent buffers in
		 * the same page, and in clear_subpage_extent_buffer_dirty()
		 * we have to clear the page dirty bit without the subpage
		 * lock held. This can cause a race where our page gets its
		 * dirty bit cleared right after we set it.
		 *
		 * Thankfully, clear_subpage_extent_buffer_dirty() locks its
		 * page for other reasons, so we can use the page lock to
		 * prevent the above race.
		 */
3853 if (subpage)
3854 lock_page(eb->pages[0]);
3855 for (i = 0; i < num_pages; i++)
3856 btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
3857 eb->start, eb->len);
3858 if (subpage)
3859 unlock_page(eb->pages[0]);
f18cc978
CH
3860 percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
3861 eb->len,
3862 eb->fs_info->dirty_metadata_batch);
0d27797e 3863 }
51995c39
LB
3864#ifdef CONFIG_BTRFS_DEBUG
3865 for (i = 0; i < num_pages; i++)
3866 ASSERT(PageDirty(eb->pages[i]));
3867#endif
d1310b2e 3868}
d1310b2e 3869
69ba3927 3870void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 3871{
251f2acc 3872 struct btrfs_fs_info *fs_info = eb->fs_info;
1259ab75 3873 struct page *page;
cc5e31a4 3874 int num_pages;
251f2acc 3875 int i;
1259ab75 3876
b4ce94de 3877 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 3878 num_pages = num_extent_pages(eb);
1259ab75 3879 for (i = 0; i < num_pages; i++) {
fb85fc9a 3880 page = eb->pages[i];
fbca46eb
QW
3881 if (!page)
3882 continue;
3883
3884 /*
3885 * This is special handling for metadata subpage, as regular
3886 * btrfs_is_subpage() can not handle cloned/dummy metadata.
3887 */
3888 if (fs_info->nodesize >= PAGE_SIZE)
3889 ClearPageUptodate(page);
3890 else
3891 btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
3892 eb->len);
1259ab75 3893 }
1259ab75
CM
3894}
3895
09c25a8c 3896void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 3897{
251f2acc 3898 struct btrfs_fs_info *fs_info = eb->fs_info;
d1310b2e 3899 struct page *page;
cc5e31a4 3900 int num_pages;
251f2acc 3901 int i;
d1310b2e 3902
0b32f4bb 3903 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 3904 num_pages = num_extent_pages(eb);
d1310b2e 3905 for (i = 0; i < num_pages; i++) {
fb85fc9a 3906 page = eb->pages[i];
fbca46eb
QW
3907
3908 /*
3909 * This is special handling for metadata subpage, as regular
3910 * btrfs_is_subpage() can not handle cloned/dummy metadata.
3911 */
3912 if (fs_info->nodesize >= PAGE_SIZE)
3913 SetPageUptodate(page);
3914 else
3915 btrfs_subpage_set_uptodate(fs_info, page, eb->start,
3916 eb->len);
d1310b2e 3917 }
d1310b2e 3918}
d1310b2e 3919
046b562b
CH
3920static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
3921{
3922 struct extent_buffer *eb = bbio->private;
d7172f52 3923 struct btrfs_fs_info *fs_info = eb->fs_info;
046b562b
CH
3924 bool uptodate = !bbio->bio.bi_status;
3925 struct bvec_iter_all iter_all;
3926 struct bio_vec *bvec;
3927 u32 bio_offset = 0;
3928
046b562b
CH
3929 eb->read_mirror = bbio->mirror_num;
3930
3931 if (uptodate &&
3932 btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0)
3933 uptodate = false;
3934
3935 if (uptodate) {
3936 set_extent_buffer_uptodate(eb);
3937 } else {
3938 clear_extent_buffer_uptodate(eb);
3939 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3940 }
3941
3942 bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
d7172f52
CH
3943 u64 start = eb->start + bio_offset;
3944 struct page *page = bvec->bv_page;
3945 u32 len = bvec->bv_len;
046b562b 3946
d7172f52
CH
3947 if (uptodate)
3948 btrfs_page_set_uptodate(fs_info, page, start, len);
3949 else
3950 btrfs_page_clear_uptodate(fs_info, page, start, len);
3951
3952 bio_offset += len;
3d66b4b2 3953 }
d7172f52
CH
3954
3955 clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
3956 smp_mb__after_atomic();
3957 wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
046b562b
CH
3958 free_extent_buffer(eb);
3959
3960 bio_put(&bbio->bio);
3961}
3962
d7172f52
CH
3963int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
3964 struct btrfs_tree_parent_check *check)
b78b98e0
CH
3965{
3966 int num_pages = num_extent_pages(eb), i;
3967 struct btrfs_bio *bbio;
3968
d7172f52
CH
3969 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3970 return 0;
3971
3972 /*
3973 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
3974 * operation, which could potentially still be in flight. In this case
3975 * we simply want to return an error.
3976 */
3977 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
3978 return -EIO;
3979
3980 /* Someone else is already reading the buffer, just wait for it. */
3981 if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
3982 goto done;
3983
b78b98e0
CH
3984 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3985 eb->read_mirror = 0;
b78b98e0 3986 check_buffer_tree_ref(eb);
113fa05c 3987 atomic_inc(&eb->refs);
b78b98e0
CH
3988
3989 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
3990 REQ_OP_READ | REQ_META, eb->fs_info,
046b562b 3991 extent_buffer_read_end_io, eb);
b78b98e0
CH
3992 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
3993 bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
3994 bbio->file_offset = eb->start;
3995 memcpy(&bbio->parent_check, check, sizeof(*check));
3996 if (eb->fs_info->nodesize < PAGE_SIZE) {
3997 __bio_add_page(&bbio->bio, eb->pages[0], eb->len,
3998 eb->start - page_offset(eb->pages[0]));
3999 } else {
011134f4 4000 for (i = 0; i < num_pages; i++)
b78b98e0 4001 __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
b78b98e0
CH
4002 }
4003 btrfs_submit_bio(bbio, mirror_num);
b78b98e0 4004
d7172f52
CH
4005done:
4006 if (wait == WAIT_COMPLETE) {
4007 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
4008 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
55173337 4009 return -EIO;
d1310b2e 4010 }
d397712b 4011
55173337 4012 return 0;
d1310b2e 4013}
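
A minimal sketch of a synchronous read through this interface, assuming the caller already holds a referenced eb and knows the expected transid and level; the helper name is hypothetical:

/* Hypothetical helper: read an eb and block until the IO completes. */
static int demo_read_eb_sync(struct extent_buffer *eb, u64 expected_transid,
			     int level)
{
	struct btrfs_tree_parent_check check = {
		.has_first_key = 0,
		.level = level,
		.transid = expected_transid,
	};

	/* WAIT_COMPLETE blocks until the read finishes (or fails). */
	return read_extent_buffer_pages(eb, WAIT_COMPLETE, 0, &check);
}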
d1310b2e 4014
f98b6215
QW
4015static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
4016 unsigned long len)
4017{
4018 btrfs_warn(eb->fs_info,
4019 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
4020 eb->start, eb->len, start, len);
4021 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4022
4023 return true;
4024}
4025
4026/*
4027 * Check if the [start, start + len) range is valid before reading/writing
4028 * the eb.
4029 * NOTE: @start and @len are offset inside the eb, not logical address.
4030 *
4031 * Caller should not touch the dst/src memory if this function returns error.
4032 */
4033static inline int check_eb_range(const struct extent_buffer *eb,
4034 unsigned long start, unsigned long len)
4035{
4036 unsigned long offset;
4037
4038 /* start, start + len should not go beyond eb->len nor overflow */
4039 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
4040 return report_eb_range(eb, start, len);
4041
4042 return false;
4043}
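
To illustrate which ranges pass the check, a small sketch (illustrative only, name invented) reads the last valid byte and then one byte past the end, which only triggers the warning path above:

/* Illustrative only: which offsets the range check above accepts. */
static void demo_eb_range_rules(const struct extent_buffer *eb)
{
	u8 byte;

	/* In range: the very last byte of the buffer. */
	read_extent_buffer(eb, &byte, eb->len - 1, 1);

	/* Out of range: only warns and returns, 'byte' is left untouched. */
	read_extent_buffer(eb, &byte, eb->len, 1);
}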
4044
1cbb1f45
JM
4045void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
4046 unsigned long start, unsigned long len)
d1310b2e
CM
4047{
4048 size_t cur;
4049 size_t offset;
4050 struct page *page;
4051 char *kaddr;
4052 char *dst = (char *)dstv;
884b07d0 4053 unsigned long i = get_eb_page_index(start);
d1310b2e 4054
f98b6215 4055 if (check_eb_range(eb, start, len))
f716abd5 4056 return;
d1310b2e 4057
884b07d0 4058 offset = get_eb_offset_in_page(eb, start);
d1310b2e 4059
d397712b 4060 while (len > 0) {
fb85fc9a 4061 page = eb->pages[i];
d1310b2e 4062
09cbfeaf 4063 cur = min(len, (PAGE_SIZE - offset));
a6591715 4064 kaddr = page_address(page);
d1310b2e 4065 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
4066
4067 dst += cur;
4068 len -= cur;
4069 offset = 0;
4070 i++;
4071 }
4072}
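
A sketch of reading a fixed header field through this byte-offset interface; in practice the btrfs_header_generation() accessor is used instead, so this is purely illustrative:

/* Illustrative only: real code uses the btrfs_header_generation() accessor. */
static u64 demo_read_header_generation(const struct extent_buffer *eb)
{
	__le64 gen;

	read_extent_buffer(eb, &gen, offsetof(struct btrfs_header, generation),
			   sizeof(gen));
	return le64_to_cpu(gen);
}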
d1310b2e 4073
a48b73ec
JB
4074int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
4075 void __user *dstv,
4076 unsigned long start, unsigned long len)
550ac1d8
GH
4077{
4078 size_t cur;
4079 size_t offset;
4080 struct page *page;
4081 char *kaddr;
4082 char __user *dst = (char __user *)dstv;
884b07d0 4083 unsigned long i = get_eb_page_index(start);
550ac1d8
GH
4084 int ret = 0;
4085
4086 WARN_ON(start > eb->len);
4087 WARN_ON(start + len > eb->start + eb->len);
4088
884b07d0 4089 offset = get_eb_offset_in_page(eb, start);
550ac1d8
GH
4090
4091 while (len > 0) {
fb85fc9a 4092 page = eb->pages[i];
550ac1d8 4093
09cbfeaf 4094 cur = min(len, (PAGE_SIZE - offset));
550ac1d8 4095 kaddr = page_address(page);
a48b73ec 4096 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
550ac1d8
GH
4097 ret = -EFAULT;
4098 break;
4099 }
4100
4101 dst += cur;
4102 len -= cur;
4103 offset = 0;
4104 i++;
4105 }
4106
4107 return ret;
4108}
4109
1cbb1f45
JM
4110int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
4111 unsigned long start, unsigned long len)
d1310b2e
CM
4112{
4113 size_t cur;
4114 size_t offset;
4115 struct page *page;
4116 char *kaddr;
4117 char *ptr = (char *)ptrv;
884b07d0 4118 unsigned long i = get_eb_page_index(start);
d1310b2e
CM
4119 int ret = 0;
4120
f98b6215
QW
4121 if (check_eb_range(eb, start, len))
4122 return -EINVAL;
d1310b2e 4123
884b07d0 4124 offset = get_eb_offset_in_page(eb, start);
d1310b2e 4125
d397712b 4126 while (len > 0) {
fb85fc9a 4127 page = eb->pages[i];
d1310b2e 4128
09cbfeaf 4129 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 4130
a6591715 4131 kaddr = page_address(page);
d1310b2e 4132 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
4133 if (ret)
4134 break;
4135
4136 ptr += cur;
4137 len -= cur;
4138 offset = 0;
4139 i++;
4140 }
4141 return ret;
4142}
d1310b2e 4143
b8f95771
QW
4144/*
4145 * Check that the extent buffer is uptodate.
4146 *
4147 * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
4148 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
4149 */
4150static void assert_eb_page_uptodate(const struct extent_buffer *eb,
4151 struct page *page)
4152{
4153 struct btrfs_fs_info *fs_info = eb->fs_info;
4154
a50e1fcb
JB
4155 /*
4156 * If we are using the commit root we could potentially clear a page
4157 * Uptodate while we're using the extent buffer that we've previously
4158 * looked up. We don't want to complain in this case, as the page was
4159 * valid before, we just didn't write it out. Instead we want to catch
4160 * the case where we didn't actually read the block properly, which
011134f4 4161 * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR.
a50e1fcb 4162 */
011134f4
CH
4163 if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4164 return;
b8f95771 4165
011134f4 4166 if (fs_info->nodesize < PAGE_SIZE) {
75258f20
QW
4167 if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
4168 eb->start, eb->len)))
4169 btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
b8f95771 4170 } else {
011134f4 4171 WARN_ON(!PageUptodate(page));
b8f95771
QW
4172 }
4173}
4174
2b48966a 4175void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
f157bf76
DS
4176 const void *srcv)
4177{
4178 char *kaddr;
4179
b8f95771 4180 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
4181 kaddr = page_address(eb->pages[0]) +
4182 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
4183 chunk_tree_uuid));
4184 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
4185}
4186
2b48966a 4187void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
f157bf76
DS
4188{
4189 char *kaddr;
4190
b8f95771 4191 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
4192 kaddr = page_address(eb->pages[0]) +
4193 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
4194 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
4195}
4196
2b48966a 4197void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
d1310b2e
CM
4198 unsigned long start, unsigned long len)
4199{
4200 size_t cur;
4201 size_t offset;
4202 struct page *page;
4203 char *kaddr;
4204 char *src = (char *)srcv;
884b07d0 4205 unsigned long i = get_eb_page_index(start);
d1310b2e 4206
d3575156
NA
4207 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
4208
f98b6215
QW
4209 if (check_eb_range(eb, start, len))
4210 return;
d1310b2e 4211
884b07d0 4212 offset = get_eb_offset_in_page(eb, start);
d1310b2e 4213
d397712b 4214 while (len > 0) {
fb85fc9a 4215 page = eb->pages[i];
b8f95771 4216 assert_eb_page_uptodate(eb, page);
d1310b2e 4217
09cbfeaf 4218 cur = min(len, PAGE_SIZE - offset);
a6591715 4219 kaddr = page_address(page);
d1310b2e 4220 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
4221
4222 src += cur;
4223 len -= cur;
4224 offset = 0;
4225 i++;
4226 }
4227}
d1310b2e 4228
2b48966a 4229void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
b159fa28 4230 unsigned long len)
d1310b2e
CM
4231{
4232 size_t cur;
4233 size_t offset;
4234 struct page *page;
4235 char *kaddr;
884b07d0 4236 unsigned long i = get_eb_page_index(start);
d1310b2e 4237
f98b6215
QW
4238 if (check_eb_range(eb, start, len))
4239 return;
d1310b2e 4240
884b07d0 4241 offset = get_eb_offset_in_page(eb, start);
d1310b2e 4242
d397712b 4243 while (len > 0) {
fb85fc9a 4244 page = eb->pages[i];
b8f95771 4245 assert_eb_page_uptodate(eb, page);
d1310b2e 4246
09cbfeaf 4247 cur = min(len, PAGE_SIZE - offset);
a6591715 4248 kaddr = page_address(page);
b159fa28 4249 memset(kaddr + offset, 0, cur);
d1310b2e
CM
4250
4251 len -= cur;
4252 offset = 0;
4253 i++;
4254 }
4255}
d1310b2e 4256
2b48966a
DS
4257void copy_extent_buffer_full(const struct extent_buffer *dst,
4258 const struct extent_buffer *src)
58e8012c
DS
4259{
4260 int i;
cc5e31a4 4261 int num_pages;
58e8012c
DS
4262
4263 ASSERT(dst->len == src->len);
4264
fbca46eb 4265 if (dst->fs_info->nodesize >= PAGE_SIZE) {
884b07d0
QW
4266 num_pages = num_extent_pages(dst);
4267 for (i = 0; i < num_pages; i++)
4268 copy_page(page_address(dst->pages[i]),
4269 page_address(src->pages[i]));
4270 } else {
4271 size_t src_offset = get_eb_offset_in_page(src, 0);
4272 size_t dst_offset = get_eb_offset_in_page(dst, 0);
4273
fbca46eb 4274 ASSERT(src->fs_info->nodesize < PAGE_SIZE);
884b07d0
QW
4275 memcpy(page_address(dst->pages[0]) + dst_offset,
4276 page_address(src->pages[0]) + src_offset,
4277 src->len);
4278 }
58e8012c
DS
4279}
4280
2b48966a
DS
4281void copy_extent_buffer(const struct extent_buffer *dst,
4282 const struct extent_buffer *src,
d1310b2e
CM
4283 unsigned long dst_offset, unsigned long src_offset,
4284 unsigned long len)
4285{
4286 u64 dst_len = dst->len;
4287 size_t cur;
4288 size_t offset;
4289 struct page *page;
4290 char *kaddr;
884b07d0 4291 unsigned long i = get_eb_page_index(dst_offset);
d1310b2e 4292
f98b6215
QW
4293 if (check_eb_range(dst, dst_offset, len) ||
4294 check_eb_range(src, src_offset, len))
4295 return;
4296
d1310b2e
CM
4297 WARN_ON(src->len != dst_len);
4298
884b07d0 4299 offset = get_eb_offset_in_page(dst, dst_offset);
d1310b2e 4300
d397712b 4301 while (len > 0) {
fb85fc9a 4302 page = dst->pages[i];
b8f95771 4303 assert_eb_page_uptodate(dst, page);
d1310b2e 4304
09cbfeaf 4305 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 4306
a6591715 4307 kaddr = page_address(page);
d1310b2e 4308 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
4309
4310 src_offset += cur;
4311 len -= cur;
4312 offset = 0;
4313 i++;
4314 }
4315}
d1310b2e 4316
3e1e8bb7
OS
4317/*
4318 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
4319 * given bit number
4320 * @eb: the extent buffer
4321 * @start: offset of the bitmap item in the extent buffer
4322 * @nr: bit number
4323 * @page_index: return index of the page in the extent buffer that contains the
4324 * given bit number
4325 * @page_offset: return offset into the page given by page_index
4326 *
4327 * This helper hides the ugliness of finding the byte in an extent buffer which
4328 * contains a given bit.
4329 */
2b48966a 4330static inline void eb_bitmap_offset(const struct extent_buffer *eb,
3e1e8bb7
OS
4331 unsigned long start, unsigned long nr,
4332 unsigned long *page_index,
4333 size_t *page_offset)
4334{
3e1e8bb7
OS
4335 size_t byte_offset = BIT_BYTE(nr);
4336 size_t offset;
4337
4338 /*
4339 * The byte we want is the offset of the extent buffer + the offset of
4340 * the bitmap item in the extent buffer + the offset of the byte in the
4341 * bitmap item.
4342 */
884b07d0 4343 offset = start + offset_in_page(eb->start) + byte_offset;
3e1e8bb7 4344
09cbfeaf 4345 *page_index = offset >> PAGE_SHIFT;
7073017a 4346 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
4347}
4348
43dd529a
DS
4349/*
4350 * Determine whether a bit in a bitmap item is set.
4351 *
4352 * @eb: the extent buffer
4353 * @start: offset of the bitmap item in the extent buffer
4354 * @nr: bit number to test
3e1e8bb7 4355 */
2b48966a 4356int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
4357 unsigned long nr)
4358{
2fe1d551 4359 u8 *kaddr;
3e1e8bb7
OS
4360 struct page *page;
4361 unsigned long i;
4362 size_t offset;
4363
4364 eb_bitmap_offset(eb, start, nr, &i, &offset);
4365 page = eb->pages[i];
b8f95771 4366 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
4367 kaddr = page_address(page);
4368 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
4369}
4370
43dd529a
DS
4371/*
4372 * Set an area of a bitmap to 1.
4373 *
4374 * @eb: the extent buffer
4375 * @start: offset of the bitmap item in the extent buffer
4376 * @pos: bit number of the first bit
4377 * @len: number of bits to set
3e1e8bb7 4378 */
2b48966a 4379void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
4380 unsigned long pos, unsigned long len)
4381{
2fe1d551 4382 u8 *kaddr;
3e1e8bb7
OS
4383 struct page *page;
4384 unsigned long i;
4385 size_t offset;
4386 const unsigned int size = pos + len;
4387 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 4388 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
4389
4390 eb_bitmap_offset(eb, start, pos, &i, &offset);
4391 page = eb->pages[i];
b8f95771 4392 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
4393 kaddr = page_address(page);
4394
4395 while (len >= bits_to_set) {
4396 kaddr[offset] |= mask_to_set;
4397 len -= bits_to_set;
4398 bits_to_set = BITS_PER_BYTE;
9c894696 4399 mask_to_set = ~0;
09cbfeaf 4400 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
4401 offset = 0;
4402 page = eb->pages[++i];
b8f95771 4403 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
4404 kaddr = page_address(page);
4405 }
4406 }
4407 if (len) {
4408 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
4409 kaddr[offset] |= mask_to_set;
4410 }
4411}
4412
4413
43dd529a
DS
4414/*
4415 * Clear an area of a bitmap.
4416 *
4417 * @eb: the extent buffer
4418 * @start: offset of the bitmap item in the extent buffer
4419 * @pos: bit number of the first bit
4420 * @len: number of bits to clear
3e1e8bb7 4421 */
2b48966a
DS
4422void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
4423 unsigned long start, unsigned long pos,
4424 unsigned long len)
3e1e8bb7 4425{
2fe1d551 4426 u8 *kaddr;
3e1e8bb7
OS
4427 struct page *page;
4428 unsigned long i;
4429 size_t offset;
4430 const unsigned int size = pos + len;
4431 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 4432 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
4433
4434 eb_bitmap_offset(eb, start, pos, &i, &offset);
4435 page = eb->pages[i];
b8f95771 4436 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
4437 kaddr = page_address(page);
4438
4439 while (len >= bits_to_clear) {
4440 kaddr[offset] &= ~mask_to_clear;
4441 len -= bits_to_clear;
4442 bits_to_clear = BITS_PER_BYTE;
9c894696 4443 mask_to_clear = ~0;
09cbfeaf 4444 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
4445 offset = 0;
4446 page = eb->pages[++i];
b8f95771 4447 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
4448 kaddr = page_address(page);
4449 }
4450 }
4451 if (len) {
4452 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
4453 kaddr[offset] &= ~mask_to_clear;
4454 }
4455}
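
A short usage sketch of the bitmap helpers, assuming an uptodate, writable buffer that holds a bitmap item at @bitmap_offset (for example a free space tree leaf); the values are illustrative:

/* Illustrative only: set, test and clear a run of bits in a bitmap item. */
static void demo_eb_bitmap(const struct extent_buffer *eb,
			   unsigned long bitmap_offset)
{
	/* Set bits [8, 24) of the bitmap stored at @bitmap_offset. */
	extent_buffer_bitmap_set(eb, bitmap_offset, 8, 16);
	WARN_ON(extent_buffer_test_bit(eb, bitmap_offset, 10) != 1);
	extent_buffer_bitmap_clear(eb, bitmap_offset, 8, 16);
	WARN_ON(extent_buffer_test_bit(eb, bitmap_offset, 10) != 0);
}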
4456
3387206f
ST
4457static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4458{
4459 unsigned long distance = (src > dst) ? src - dst : dst - src;
4460 return distance < len;
4461}
4462
d1310b2e
CM
4463static void copy_pages(struct page *dst_page, struct page *src_page,
4464 unsigned long dst_off, unsigned long src_off,
4465 unsigned long len)
4466{
a6591715 4467 char *dst_kaddr = page_address(dst_page);
d1310b2e 4468 char *src_kaddr;
727011e0 4469 int must_memmove = 0;
d1310b2e 4470
3387206f 4471 if (dst_page != src_page) {
a6591715 4472 src_kaddr = page_address(src_page);
3387206f 4473 } else {
d1310b2e 4474 src_kaddr = dst_kaddr;
727011e0
CM
4475 if (areas_overlap(src_off, dst_off, len))
4476 must_memmove = 1;
3387206f 4477 }
d1310b2e 4478
727011e0
CM
4479 if (must_memmove)
4480 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4481 else
4482 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
4483}
4484
2b48966a
DS
4485void memcpy_extent_buffer(const struct extent_buffer *dst,
4486 unsigned long dst_offset, unsigned long src_offset,
4487 unsigned long len)
d1310b2e
CM
4488{
4489 size_t cur;
4490 size_t dst_off_in_page;
4491 size_t src_off_in_page;
d1310b2e
CM
4492 unsigned long dst_i;
4493 unsigned long src_i;
4494
f98b6215
QW
4495 if (check_eb_range(dst, dst_offset, len) ||
4496 check_eb_range(dst, src_offset, len))
4497 return;
d1310b2e 4498
d397712b 4499 while (len > 0) {
884b07d0
QW
4500 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
4501 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
d1310b2e 4502
884b07d0
QW
4503 dst_i = get_eb_page_index(dst_offset);
4504 src_i = get_eb_page_index(src_offset);
d1310b2e 4505
09cbfeaf 4506 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
4507 src_off_in_page));
4508 cur = min_t(unsigned long, cur,
09cbfeaf 4509 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 4510
fb85fc9a 4511 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
4512 dst_off_in_page, src_off_in_page, cur);
4513
4514 src_offset += cur;
4515 dst_offset += cur;
4516 len -= cur;
4517 }
4518}
d1310b2e 4519
2b48966a
DS
4520void memmove_extent_buffer(const struct extent_buffer *dst,
4521 unsigned long dst_offset, unsigned long src_offset,
4522 unsigned long len)
d1310b2e
CM
4523{
4524 size_t cur;
4525 size_t dst_off_in_page;
4526 size_t src_off_in_page;
4527 unsigned long dst_end = dst_offset + len - 1;
4528 unsigned long src_end = src_offset + len - 1;
d1310b2e
CM
4529 unsigned long dst_i;
4530 unsigned long src_i;
4531
f98b6215
QW
4532 if (check_eb_range(dst, dst_offset, len) ||
4533 check_eb_range(dst, src_offset, len))
4534 return;
727011e0 4535 if (dst_offset < src_offset) {
d1310b2e
CM
4536 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4537 return;
4538 }
d397712b 4539 while (len > 0) {
884b07d0
QW
4540 dst_i = get_eb_page_index(dst_end);
4541 src_i = get_eb_page_index(src_end);
d1310b2e 4542
884b07d0
QW
4543 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
4544 src_off_in_page = get_eb_offset_in_page(dst, src_end);
d1310b2e
CM
4545
4546 cur = min_t(unsigned long, len, src_off_in_page + 1);
4547 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 4548 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
4549 dst_off_in_page - cur + 1,
4550 src_off_in_page - cur + 1, cur);
4551
4552 dst_end -= cur;
4553 src_end -= cur;
4554 len -= cur;
4555 }
4556}
6af118ce 4557
01cd3909 4558#define GANG_LOOKUP_SIZE 16
d1e86e3f
QW
4559static struct extent_buffer *get_next_extent_buffer(
4560 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
4561{
01cd3909
DS
4562 struct extent_buffer *gang[GANG_LOOKUP_SIZE];
4563 struct extent_buffer *found = NULL;
d1e86e3f 4564 u64 page_start = page_offset(page);
01cd3909 4565 u64 cur = page_start;
d1e86e3f
QW
4566
4567 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
d1e86e3f
QW
4568 lockdep_assert_held(&fs_info->buffer_lock);
4569
01cd3909
DS
4570 while (cur < page_start + PAGE_SIZE) {
4571 int ret;
4572 int i;
4573
4574 ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
4575 (void **)gang, cur >> fs_info->sectorsize_bits,
4576 min_t(unsigned int, GANG_LOOKUP_SIZE,
4577 PAGE_SIZE / fs_info->nodesize));
4578 if (ret == 0)
4579 goto out;
4580 for (i = 0; i < ret; i++) {
4581 /* Already beyond page end */
4582 if (gang[i]->start >= page_start + PAGE_SIZE)
4583 goto out;
4584 /* Found one */
4585 if (gang[i]->start >= bytenr) {
4586 found = gang[i];
4587 goto out;
4588 }
4589 }
4590 cur = gang[ret - 1]->start + gang[ret - 1]->len;
d1e86e3f 4591 }
01cd3909
DS
4592out:
4593 return found;
d1e86e3f
QW
4594}
4595
4596static int try_release_subpage_extent_buffer(struct page *page)
4597{
4598 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4599 u64 cur = page_offset(page);
4600 const u64 end = page_offset(page) + PAGE_SIZE;
4601 int ret;
4602
4603 while (cur < end) {
4604 struct extent_buffer *eb = NULL;
4605
		/*
		 * Unlike try_release_extent_buffer(), which uses
		 * page->private to grab the buffer, for the subpage case we
		 * rely on the radix tree, thus we need to ensure radix tree
		 * consistency.
		 *
		 * We also want an atomic snapshot of the radix tree, so go
		 * with the spinlock rather than RCU.
		 */
4614 spin_lock(&fs_info->buffer_lock);
4615 eb = get_next_extent_buffer(fs_info, page, cur);
4616 if (!eb) {
4617 /* No more eb in the page range after or at cur */
4618 spin_unlock(&fs_info->buffer_lock);
4619 break;
4620 }
4621 cur = eb->start + eb->len;
4622
4623 /*
4624 * The same as try_release_extent_buffer(), to ensure the eb
4625 * won't disappear out from under us.
4626 */
4627 spin_lock(&eb->refs_lock);
4628 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
4629 spin_unlock(&eb->refs_lock);
4630 spin_unlock(&fs_info->buffer_lock);
4631 break;
4632 }
4633 spin_unlock(&fs_info->buffer_lock);
4634
4635 /*
4636 * If tree ref isn't set then we know the ref on this eb is a
4637 * real ref, so just return, this eb will likely be freed soon
4638 * anyway.
4639 */
4640 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4641 spin_unlock(&eb->refs_lock);
4642 break;
4643 }
4644
4645 /*
4646 * Here we don't care about the return value, we will always
4647 * check the page private at the end. And
4648 * release_extent_buffer() will release the refs_lock.
4649 */
4650 release_extent_buffer(eb);
4651 }
	/*
	 * Finally, check whether we have cleared the page private: if we
	 * have released all ebs in the page, the page private should be
	 * cleared by now.
	 */
4656 spin_lock(&page->mapping->private_lock);
4657 if (!PagePrivate(page))
4658 ret = 1;
4659 else
4660 ret = 0;
4661 spin_unlock(&page->mapping->private_lock);
4662 return ret;
4663
4664}
4665
f7a52a40 4666int try_release_extent_buffer(struct page *page)
19fe0a8b 4667{
6af118ce 4668 struct extent_buffer *eb;
6af118ce 4669
fbca46eb 4670 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
d1e86e3f
QW
4671 return try_release_subpage_extent_buffer(page);
4672
3083ee2e 4673 /*
d1e86e3f
QW
4674 * We need to make sure nobody is changing page->private, as we rely on
4675 * page->private as the pointer to extent buffer.
3083ee2e
JB
4676 */
4677 spin_lock(&page->mapping->private_lock);
4678 if (!PagePrivate(page)) {
4679 spin_unlock(&page->mapping->private_lock);
4f2de97a 4680 return 1;
45f49bce 4681 }
6af118ce 4682
3083ee2e
JB
4683 eb = (struct extent_buffer *)page->private;
4684 BUG_ON(!eb);
19fe0a8b
MX
4685
4686 /*
3083ee2e
JB
4687 * This is a little awful but should be ok, we need to make sure that
4688 * the eb doesn't disappear out from under us while we're looking at
4689 * this page.
19fe0a8b 4690 */
3083ee2e 4691 spin_lock(&eb->refs_lock);
0b32f4bb 4692 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
4693 spin_unlock(&eb->refs_lock);
4694 spin_unlock(&page->mapping->private_lock);
4695 return 0;
b9473439 4696 }
3083ee2e 4697 spin_unlock(&page->mapping->private_lock);
897ca6e9 4698
19fe0a8b 4699 /*
3083ee2e
JB
4700 * If tree ref isn't set then we know the ref on this eb is a real ref,
4701 * so just return, this page will likely be freed soon anyway.
19fe0a8b 4702 */
3083ee2e
JB
4703 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4704 spin_unlock(&eb->refs_lock);
4705 return 0;
b9473439 4706 }
19fe0a8b 4707
f7a52a40 4708 return release_extent_buffer(eb);
6af118ce 4709}
bfb484d9
JB
4710
4711/*
4712 * btrfs_readahead_tree_block - attempt to readahead a child block
4713 * @fs_info: the fs_info
4714 * @bytenr: bytenr to read
3fbaf258 4715 * @owner_root: objectid of the root that owns this eb
bfb484d9 4716 * @gen: generation for the uptodate check, can be 0
3fbaf258 4717 * @level: level for the eb
bfb484d9
JB
4718 *
4719 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
4720 * normal uptodate check of the eb, without checking the generation. If we have
4721 * to read the block we will not block on anything.
4722 */
4723void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
3fbaf258 4724 u64 bytenr, u64 owner_root, u64 gen, int level)
bfb484d9 4725{
947a6299
QW
4726 struct btrfs_tree_parent_check check = {
4727 .has_first_key = 0,
4728 .level = level,
4729 .transid = gen
4730 };
bfb484d9
JB
4731 struct extent_buffer *eb;
4732 int ret;
4733
3fbaf258 4734 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
bfb484d9
JB
4735 if (IS_ERR(eb))
4736 return;
4737
4738 if (btrfs_buffer_uptodate(eb, gen, 1)) {
4739 free_extent_buffer(eb);
4740 return;
4741 }
4742
947a6299 4743 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check);
bfb484d9
JB
4744 if (ret < 0)
4745 free_extent_buffer_stale(eb);
4746 else
4747 free_extent_buffer(eb);
4748}
4749
4750/*
4751 * btrfs_readahead_node_child - readahead a node's child block
4752 * @node: parent node we're reading from
4753 * @slot: slot in the parent node for the child we want to read
4754 *
4755 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
4756 * the slot in the node provided.
4757 */
4758void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
4759{
4760 btrfs_readahead_tree_block(node->fs_info,
4761 btrfs_node_blockptr(node, slot),
3fbaf258
JB
4762 btrfs_header_owner(node),
4763 btrfs_node_ptr_generation(node, slot),
4764 btrfs_header_level(node) - 1);
bfb484d9 4765}
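
As a closing illustration, a hedged sketch (not part of this file, name invented) that kicks off readahead for every child pointer of an interior node using the helpers above:

/* Hypothetical helper: readahead every child block of an interior node. */
static void demo_readahead_all_children(struct extent_buffer *node)
{
	int nritems = btrfs_header_nritems(node);
	int slot;

	if (btrfs_header_level(node) == 0)
		return;		/* leaves carry items, not child pointers */

	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}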