btrfs: move super prototypes into super.h
[linux-block.git] / fs / btrfs / extent_io.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
c1d7c514 2
d1310b2e
CM
3#include <linux/bitops.h>
4#include <linux/slab.h>
5#include <linux/bio.h>
6#include <linux/mm.h>
d1310b2e
CM
7#include <linux/pagemap.h>
8#include <linux/page-flags.h>
395cb57e 9#include <linux/sched/mm.h>
d1310b2e
CM
10#include <linux/spinlock.h>
11#include <linux/blkdev.h>
12#include <linux/swap.h>
d1310b2e
CM
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
268bb0ce 15#include <linux/prefetch.h>
14605409 16#include <linux/fsverity.h>
cea62800 17#include "misc.h"
d1310b2e 18#include "extent_io.h"
9c7d3a54 19#include "extent-io-tree.h"
d1310b2e 20#include "extent_map.h"
902b22f3
DW
21#include "ctree.h"
22#include "btrfs_inode.h"
4a54c8c1 23#include "volumes.h"
21adbd5c 24#include "check-integrity.h"
0b32f4bb 25#include "locking.h"
606686ee 26#include "rcu-string.h"
fe09e16c 27#include "backref.h"
6af49dbd 28#include "disk-io.h"
760f991f 29#include "subpage.h"
d3575156 30#include "zoned.h"
0bc09ca1 31#include "block-group.h"
2a5232a8 32#include "compression.h"
ec8eb376 33#include "fs.h"
07e81dc9 34#include "accessors.h"
7c8ede16 35#include "file-item.h"
af142b6f 36#include "file.h"
77407dc0 37#include "dev-replace.h"
d1310b2e 38
d1310b2e
CM
39static struct kmem_cache *extent_buffer_cache;
40
6d49ba1b 41#ifdef CONFIG_BTRFS_DEBUG
a40246e8
JB
42static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
43{
44 struct btrfs_fs_info *fs_info = eb->fs_info;
45 unsigned long flags;
46
47 spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
48 list_add(&eb->leak_list, &fs_info->allocated_ebs);
49 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
50}
51
a40246e8
JB
52static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
53{
54 struct btrfs_fs_info *fs_info = eb->fs_info;
55 unsigned long flags;
56
57 spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
58 list_del(&eb->leak_list);
59 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
6d49ba1b
ES
60}
61
3fd63727 62void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
6d49ba1b 63{
6d49ba1b 64 struct extent_buffer *eb;
3fd63727 65 unsigned long flags;
6d49ba1b 66
8c38938c
JB
67 /*
68 * If we didn't get into open_ctree our allocated_ebs will not be
69 * initialized, so just skip this.
70 */
71 if (!fs_info->allocated_ebs.next)
72 return;
73
b95b78e6 74 WARN_ON(!list_empty(&fs_info->allocated_ebs));
3fd63727
JB
75 spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
76 while (!list_empty(&fs_info->allocated_ebs)) {
77 eb = list_first_entry(&fs_info->allocated_ebs,
78 struct extent_buffer, leak_list);
8c38938c
JB
79 pr_err(
80 "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
81 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
82 btrfs_header_owner(eb));
33ca832f
JB
83 list_del(&eb->leak_list);
84 kmem_cache_free(extent_buffer_cache, eb);
85 }
3fd63727 86 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
33ca832f 87}
6d49ba1b 88#else
a40246e8 89#define btrfs_leak_debug_add_eb(eb) do {} while (0)
a40246e8 90#define btrfs_leak_debug_del_eb(eb) do {} while (0)
4bef0848 91#endif
d1310b2e 92
7aab8b32
CH
93/*
94 * Structure to record info about the bio being assembled, and other info like
95 * how many bytes are there before stripe/ordered extent boundary.
96 */
97struct btrfs_bio_ctrl {
98 struct bio *bio;
722c82ac 99 int mirror_num;
0f07003b 100 enum btrfs_compression_type compress_type;
7aab8b32
CH
101 u32 len_to_stripe_boundary;
102 u32 len_to_oe_boundary;
5467abba 103 btrfs_bio_end_io_t end_io_func;
7aab8b32
CH
104};
105
d1310b2e 106struct extent_page_data {
390ed29b 107 struct btrfs_bio_ctrl bio_ctrl;
771ed689
CM
108 /* tells writepage not to lock the state bits for this range
109 * it still does the unlocking
110 */
ffbd517d
CM
111 unsigned int extent_locked:1;
112
70fd7614 113 /* tells the submit_bio code to use REQ_SYNC */
ffbd517d 114 unsigned int sync_io:1;
d1310b2e
CM
115};
116
722c82ac 117static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
bb58eb9e 118{
722c82ac 119 struct bio *bio;
7aa51232 120 struct bio_vec *bv;
722c82ac
CH
121 struct inode *inode;
122 int mirror_num;
123
124 if (!bio_ctrl->bio)
125 return;
bb58eb9e 126
722c82ac 127 bio = bio_ctrl->bio;
7aa51232
CH
128 bv = bio_first_bvec_all(bio);
129 inode = bv->bv_page->mapping->host;
722c82ac 130 mirror_num = bio_ctrl->mirror_num;
bb58eb9e 131
e0eefe07
QW
132 /* Caller should ensure the bio has at least some range added */
133 ASSERT(bio->bi_iter.bi_size);
c9583ada 134
7aa51232 135 btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
bb58eb9e 136
c93104e7
CH
137 if (!is_data_inode(inode))
138 btrfs_submit_metadata_bio(inode, bio, mirror_num);
139 else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
140 btrfs_submit_data_write_bio(inode, bio, mirror_num);
908930f3 141 else
722c82ac
CH
142 btrfs_submit_data_read_bio(inode, bio, mirror_num,
143 bio_ctrl->compress_type);
390ed29b 144
917f32a2 145 /* The bio is owned by the end_io handler now */
722c82ac 146 bio_ctrl->bio = NULL;
3065976b
QW
147}
148
f4340622 149/*
9845e5dd 150 * Submit or fail the current bio in an extent_page_data structure.
f4340622 151 */
9845e5dd 152static void submit_write_bio(struct extent_page_data *epd, int ret)
bb58eb9e 153{
390ed29b 154 struct bio *bio = epd->bio_ctrl.bio;
bb58eb9e 155
9845e5dd
CH
156 if (!bio)
157 return;
158
159 if (ret) {
160 ASSERT(ret < 0);
917f32a2
CH
161 btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
162 /* The bio is owned by the end_io handler now */
390ed29b 163 epd->bio_ctrl.bio = NULL;
9845e5dd 164 } else {
722c82ac 165 submit_one_bio(&epd->bio_ctrl);
bb58eb9e
QW
166 }
167}
e2932ee0 168
a62a3bd9
JB
169int __init extent_buffer_init_cachep(void)
170{
837e1972 171 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
9601e3f6 172 sizeof(struct extent_buffer), 0,
fba4b697 173 SLAB_MEM_SPREAD, NULL);
a62a3bd9 174 if (!extent_buffer_cache)
6f0d04f8 175 return -ENOMEM;
b208c2f7 176
d1310b2e 177 return 0;
d1310b2e
CM
178}
179
a62a3bd9 180void __cold extent_buffer_free_cachep(void)
d1310b2e 181{
8c0a8537
KS
182 /*
183 * Make sure all delayed rcu free are flushed before we
184 * destroy caches.
185 */
186 rcu_barrier();
5598e900 187 kmem_cache_destroy(extent_buffer_cache);
d1310b2e
CM
188}
189
bd1fa4f0 190void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
4adaa611 191{
09cbfeaf
KS
192 unsigned long index = start >> PAGE_SHIFT;
193 unsigned long end_index = end >> PAGE_SHIFT;
4adaa611
CM
194 struct page *page;
195
196 while (index <= end_index) {
197 page = find_get_page(inode->i_mapping, index);
198 BUG_ON(!page); /* Pages should be in the extent_io_tree */
199 clear_page_dirty_for_io(page);
09cbfeaf 200 put_page(page);
4adaa611
CM
201 index++;
202 }
4adaa611
CM
203}
204
f6311572 205void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
4adaa611 206{
ebf55c88 207 struct address_space *mapping = inode->i_mapping;
09cbfeaf
KS
208 unsigned long index = start >> PAGE_SHIFT;
209 unsigned long end_index = end >> PAGE_SHIFT;
ebf55c88 210 struct folio *folio;
4adaa611
CM
211
212 while (index <= end_index) {
ebf55c88
MWO
213 folio = filemap_get_folio(mapping, index);
214 filemap_dirty_folio(mapping, folio);
215 folio_account_redirty(folio);
216 index += folio_nr_pages(folio);
217 folio_put(folio);
4adaa611 218 }
4adaa611
CM
219}
220
ed8f13bf
QW
221/*
222 * Process one page for __process_pages_contig().
223 *
224 * Return >0 if we hit @page == @locked_page.
225 * Return 0 if we updated the page status.
226 * Return -EGAIN if the we need to try again.
227 * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
228 */
e38992be
QW
229static int process_one_page(struct btrfs_fs_info *fs_info,
230 struct address_space *mapping,
ed8f13bf 231 struct page *page, struct page *locked_page,
e38992be 232 unsigned long page_ops, u64 start, u64 end)
ed8f13bf 233{
e38992be
QW
234 u32 len;
235
236 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
237 len = end + 1 - start;
238
ed8f13bf 239 if (page_ops & PAGE_SET_ORDERED)
b945a463 240 btrfs_page_clamp_set_ordered(fs_info, page, start, len);
ed8f13bf 241 if (page_ops & PAGE_SET_ERROR)
e38992be 242 btrfs_page_clamp_set_error(fs_info, page, start, len);
ed8f13bf 243 if (page_ops & PAGE_START_WRITEBACK) {
e38992be
QW
244 btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
245 btrfs_page_clamp_set_writeback(fs_info, page, start, len);
ed8f13bf
QW
246 }
247 if (page_ops & PAGE_END_WRITEBACK)
e38992be 248 btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
a33a8e9a
QW
249
250 if (page == locked_page)
251 return 1;
252
ed8f13bf 253 if (page_ops & PAGE_LOCK) {
1e1de387
QW
254 int ret;
255
256 ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
257 if (ret)
258 return ret;
ed8f13bf 259 if (!PageDirty(page) || page->mapping != mapping) {
1e1de387 260 btrfs_page_end_writer_lock(fs_info, page, start, len);
ed8f13bf
QW
261 return -EAGAIN;
262 }
263 }
264 if (page_ops & PAGE_UNLOCK)
1e1de387 265 btrfs_page_end_writer_lock(fs_info, page, start, len);
ed8f13bf
QW
266 return 0;
267}
268
da2c7009
LB
269static int __process_pages_contig(struct address_space *mapping,
270 struct page *locked_page,
98af9ab1 271 u64 start, u64 end, unsigned long page_ops,
ed8f13bf
QW
272 u64 *processed_end)
273{
e38992be 274 struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
ed8f13bf
QW
275 pgoff_t start_index = start >> PAGE_SHIFT;
276 pgoff_t end_index = end >> PAGE_SHIFT;
277 pgoff_t index = start_index;
ed8f13bf 278 unsigned long pages_processed = 0;
04c6b79a 279 struct folio_batch fbatch;
ed8f13bf
QW
280 int err = 0;
281 int i;
282
283 if (page_ops & PAGE_LOCK) {
284 ASSERT(page_ops == PAGE_LOCK);
285 ASSERT(processed_end && *processed_end == start);
286 }
287
04c6b79a 288 if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
ed8f13bf
QW
289 mapping_set_error(mapping, -EIO);
290
04c6b79a
VMO
291 folio_batch_init(&fbatch);
292 while (index <= end_index) {
293 int found_folios;
294
295 found_folios = filemap_get_folios_contig(mapping, &index,
296 end_index, &fbatch);
ed8f13bf 297
04c6b79a 298 if (found_folios == 0) {
ed8f13bf
QW
299 /*
300 * Only if we're going to lock these pages, we can find
301 * nothing at @index.
302 */
303 ASSERT(page_ops & PAGE_LOCK);
304 err = -EAGAIN;
305 goto out;
306 }
307
04c6b79a 308 for (i = 0; i < found_folios; i++) {
ed8f13bf 309 int process_ret;
04c6b79a 310 struct folio *folio = fbatch.folios[i];
e38992be 311 process_ret = process_one_page(fs_info, mapping,
04c6b79a 312 &folio->page, locked_page, page_ops,
e38992be 313 start, end);
ed8f13bf 314 if (process_ret < 0) {
ed8f13bf 315 err = -EAGAIN;
04c6b79a 316 folio_batch_release(&fbatch);
ed8f13bf
QW
317 goto out;
318 }
04c6b79a 319 pages_processed += folio_nr_pages(folio);
ed8f13bf 320 }
04c6b79a 321 folio_batch_release(&fbatch);
ed8f13bf
QW
322 cond_resched();
323 }
324out:
325 if (err && processed_end) {
326 /*
327 * Update @processed_end. I know this is awful since it has
328 * two different return value patterns (inclusive vs exclusive).
329 *
330 * But the exclusive pattern is necessary if @start is 0, or we
331 * underflow and check against processed_end won't work as
332 * expected.
333 */
334 if (pages_processed)
335 *processed_end = min(end,
336 ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
337 else
338 *processed_end = start;
339 }
340 return err;
341}
da2c7009 342
143bede5
JM
343static noinline void __unlock_for_delalloc(struct inode *inode,
344 struct page *locked_page,
345 u64 start, u64 end)
c8b97818 346{
09cbfeaf
KS
347 unsigned long index = start >> PAGE_SHIFT;
348 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 349
76c0021d 350 ASSERT(locked_page);
c8b97818 351 if (index == locked_page->index && end_index == index)
143bede5 352 return;
c8b97818 353
98af9ab1 354 __process_pages_contig(inode->i_mapping, locked_page, start, end,
76c0021d 355 PAGE_UNLOCK, NULL);
c8b97818
CM
356}
357
358static noinline int lock_delalloc_pages(struct inode *inode,
359 struct page *locked_page,
360 u64 delalloc_start,
361 u64 delalloc_end)
362{
09cbfeaf 363 unsigned long index = delalloc_start >> PAGE_SHIFT;
09cbfeaf 364 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
98af9ab1 365 u64 processed_end = delalloc_start;
c8b97818 366 int ret;
c8b97818 367
76c0021d 368 ASSERT(locked_page);
c8b97818
CM
369 if (index == locked_page->index && index == end_index)
370 return 0;
371
98af9ab1
QW
372 ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
373 delalloc_end, PAGE_LOCK, &processed_end);
374 if (ret == -EAGAIN && processed_end > delalloc_start)
76c0021d 375 __unlock_for_delalloc(inode, locked_page, delalloc_start,
98af9ab1 376 processed_end);
c8b97818
CM
377 return ret;
378}
379
380/*
3522e903 381 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
2749f7ef 382 * more than @max_bytes.
c8b97818 383 *
2749f7ef
QW
384 * @start: The original start bytenr to search.
385 * Will store the extent range start bytenr.
386 * @end: The original end bytenr of the search range
387 * Will store the extent range end bytenr.
388 *
389 * Return true if we find a delalloc range which starts inside the original
390 * range, and @start/@end will store the delalloc range start/end.
391 *
392 * Return false if we can't find any delalloc range which starts inside the
393 * original range, and @start/@end will be the non-delalloc range start/end.
c8b97818 394 */
ce9f967f 395EXPORT_FOR_TESTS
3522e903 396noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 397 struct page *locked_page, u64 *start,
917aacec 398 u64 *end)
c8b97818 399{
f7b12a62 400 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9978059b 401 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2749f7ef
QW
402 const u64 orig_start = *start;
403 const u64 orig_end = *end;
f7b12a62
NA
404 /* The sanity tests may not set a valid fs_info. */
405 u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
406 u64 delalloc_start;
407 u64 delalloc_end;
3522e903 408 bool found;
9655d298 409 struct extent_state *cached_state = NULL;
c8b97818
CM
410 int ret;
411 int loops = 0;
412
2749f7ef
QW
413 /* Caller should pass a valid @end to indicate the search range end */
414 ASSERT(orig_end > orig_start);
415
416 /* The range should at least cover part of the page */
417 ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
418 orig_end <= page_offset(locked_page)));
c8b97818
CM
419again:
420 /* step one, find a bunch of delalloc bytes starting at start */
421 delalloc_start = *start;
422 delalloc_end = 0;
083e75e7
JB
423 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
424 max_bytes, &cached_state);
2749f7ef 425 if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
c8b97818 426 *start = delalloc_start;
2749f7ef
QW
427
428 /* @delalloc_end can be -1, never go beyond @orig_end */
429 *end = min(delalloc_end, orig_end);
c2a128d2 430 free_extent_state(cached_state);
3522e903 431 return false;
c8b97818
CM
432 }
433
70b99e69
CM
434 /*
435 * start comes from the offset of locked_page. We have to lock
436 * pages in order, so we can't process delalloc bytes before
437 * locked_page
438 */
d397712b 439 if (delalloc_start < *start)
70b99e69 440 delalloc_start = *start;
70b99e69 441
c8b97818
CM
442 /*
443 * make sure to limit the number of pages we try to lock down
c8b97818 444 */
7bf811a5
JB
445 if (delalloc_end + 1 - delalloc_start > max_bytes)
446 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 447
c8b97818
CM
448 /* step two, lock all the pages after the page that has start */
449 ret = lock_delalloc_pages(inode, locked_page,
450 delalloc_start, delalloc_end);
9bfd61d9 451 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
452 if (ret == -EAGAIN) {
453 /* some of the pages are gone, lets avoid looping by
454 * shortening the size of the delalloc range we're searching
455 */
9655d298 456 free_extent_state(cached_state);
7d788742 457 cached_state = NULL;
c8b97818 458 if (!loops) {
09cbfeaf 459 max_bytes = PAGE_SIZE;
c8b97818
CM
460 loops = 1;
461 goto again;
462 } else {
3522e903 463 found = false;
c8b97818
CM
464 goto out_failed;
465 }
466 }
c8b97818
CM
467
468 /* step three, lock the state bits for the whole range */
570eb97b 469 lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
470
471 /* then test to make sure it is all still delalloc */
472 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 473 EXTENT_DELALLOC, 1, cached_state);
c8b97818 474 if (!ret) {
570eb97b
JB
475 unlock_extent(tree, delalloc_start, delalloc_end,
476 &cached_state);
c8b97818
CM
477 __unlock_for_delalloc(inode, locked_page,
478 delalloc_start, delalloc_end);
479 cond_resched();
480 goto again;
481 }
9655d298 482 free_extent_state(cached_state);
c8b97818
CM
483 *start = delalloc_start;
484 *end = delalloc_end;
485out_failed:
486 return found;
487}
488
ad7ff17b 489void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
74e9194a 490 struct page *locked_page,
f97e27e9 491 u32 clear_bits, unsigned long page_ops)
873695b3 492{
bd015294 493 clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
873695b3 494
ad7ff17b 495 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
98af9ab1 496 start, end, page_ops, NULL);
873695b3
LB
497}
498
87c11705
JB
499static int insert_failrec(struct btrfs_inode *inode,
500 struct io_failure_record *failrec)
d1310b2e 501{
87c11705 502 struct rb_node *exist;
d1310b2e 503
87c11705
JB
504 spin_lock(&inode->io_failure_lock);
505 exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
506 &failrec->rb_node);
507 spin_unlock(&inode->io_failure_lock);
508
509 return (exist == NULL) ? 0 : -EEXIST;
d1310b2e
CM
510}
511
87c11705 512static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
d1310b2e
CM
513{
514 struct rb_node *node;
87c11705 515 struct io_failure_record *failrec = ERR_PTR(-ENOENT);
d1310b2e 516
87c11705
JB
517 spin_lock(&inode->io_failure_lock);
518 node = rb_simple_search(&inode->io_failure_tree, start);
519 if (node)
520 failrec = rb_entry(node, struct io_failure_record, rb_node);
521 spin_unlock(&inode->io_failure_lock);
2279a270 522 return failrec;
d1310b2e
CM
523}
524
bd86a532
CH
525static void free_io_failure(struct btrfs_inode *inode,
526 struct io_failure_record *rec)
4a54c8c1 527{
87c11705
JB
528 spin_lock(&inode->io_failure_lock);
529 rb_erase(&rec->rb_node, &inode->io_failure_tree);
530 spin_unlock(&inode->io_failure_lock);
4a54c8c1 531
4a54c8c1 532 kfree(rec);
4a54c8c1
JS
533}
534
4a54c8c1
JS
535/*
536 * this bypasses the standard btrfs submit functions deliberately, as
537 * the standard behavior is to write all copies in a raid setup. here we only
538 * want to write the one bad copy. so we do the mapping for ourselves and issue
539 * submit_bio directly.
3ec706c8 540 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
541 * actually prevents the read that triggered the error from finishing.
542 * currently, there can be no more than two copies of every data bit. thus,
543 * exactly one rewrite is required.
544 */
38d5e541
QW
545static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
546 u64 length, u64 logical, struct page *page,
547 unsigned int pg_offset, int mirror_num)
4a54c8c1 548{
4a54c8c1 549 struct btrfs_device *dev;
e9458bfe
CH
550 struct bio_vec bvec;
551 struct bio bio;
4a54c8c1
JS
552 u64 map_length = 0;
553 u64 sector;
4c664611 554 struct btrfs_io_context *bioc = NULL;
e9458bfe 555 int ret = 0;
4a54c8c1 556
1751e8a6 557 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
558 BUG_ON(!mirror_num);
559
554aed7d
JT
560 if (btrfs_repair_one_zone(fs_info, logical))
561 return 0;
f7ef5287 562
4a54c8c1
JS
563 map_length = length;
564
b5de8d0d 565 /*
4c664611 566 * Avoid races with device replace and make sure our bioc has devices
b5de8d0d
FM
567 * associated to its stripes that don't go away while we are doing the
568 * read repair operation.
569 */
570 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 571 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
572 /*
573 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
574 * to update all raid stripes, but here we just want to correct
575 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
576 * stripe's dev and sector.
577 */
578 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
4c664611 579 &map_length, &bioc, 0);
e9458bfe
CH
580 if (ret)
581 goto out_counter_dec;
4c664611 582 ASSERT(bioc->mirror_num == 1);
c725328c
LB
583 } else {
584 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
4c664611 585 &map_length, &bioc, mirror_num);
e9458bfe
CH
586 if (ret)
587 goto out_counter_dec;
4c664611 588 BUG_ON(mirror_num != bioc->mirror_num);
4a54c8c1 589 }
c725328c 590
4c664611 591 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
4c664611
QW
592 dev = bioc->stripes[bioc->mirror_num - 1].dev;
593 btrfs_put_bioc(bioc);
e9458bfe 594
ebbede42
AJ
595 if (!dev || !dev->bdev ||
596 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
e9458bfe
CH
597 ret = -EIO;
598 goto out_counter_dec;
4a54c8c1 599 }
4a54c8c1 600
e9458bfe
CH
601 bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
602 bio.bi_iter.bi_sector = sector;
603 __bio_add_page(&bio, page, length, pg_offset);
604
605 btrfsic_check_bio(&bio);
606 ret = submit_bio_wait(&bio);
607 if (ret) {
4a54c8c1 608 /* try to remap that extent elsewhere? */
442a4f63 609 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
e9458bfe 610 goto out_bio_uninit;
4a54c8c1
JS
611 }
612
b14af3b4
DS
613 btrfs_info_rl_in_rcu(fs_info,
614 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 615 ino, start,
1203b681 616 rcu_str_deref(dev->name), sector);
e9458bfe
CH
617 ret = 0;
618
619out_bio_uninit:
620 bio_uninit(&bio);
621out_counter_dec:
b5de8d0d 622 btrfs_bio_counter_dec(fs_info);
e9458bfe 623 return ret;
4a54c8c1
JS
624}
625
2b48966a 626int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
ea466794 627{
20a1fbf9 628 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 629 u64 start = eb->start;
cc5e31a4 630 int i, num_pages = num_extent_pages(eb);
d95603b2 631 int ret = 0;
ea466794 632
bc98a42c 633 if (sb_rdonly(fs_info->sb))
908960c6
ID
634 return -EROFS;
635
ea466794 636 for (i = 0; i < num_pages; i++) {
fb85fc9a 637 struct page *p = eb->pages[i];
1203b681 638
6ec656bc 639 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 640 start - page_offset(p), mirror_num);
ea466794
JB
641 if (ret)
642 break;
09cbfeaf 643 start += PAGE_SIZE;
ea466794
JB
644 }
645
646 return ret;
647}
648
c144c63f
CH
649static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
650{
651 if (cur_mirror == failrec->num_copies)
652 return cur_mirror + 1 - failrec->num_copies;
653 return cur_mirror + 1;
654}
655
656static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
657{
658 if (cur_mirror == 1)
659 return failrec->num_copies;
660 return cur_mirror - 1;
661}
662
4a54c8c1
JS
663/*
664 * each time an IO finishes, we do a fast check in the IO failure tree
665 * to see if we need to process or clean up an io_failure_record
666 */
0d0a762c
JB
667int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
668 struct page *page, unsigned int pg_offset)
4a54c8c1 669{
0d0a762c 670 struct btrfs_fs_info *fs_info = inode->root->fs_info;
0d0a762c
JB
671 struct extent_io_tree *io_tree = &inode->io_tree;
672 u64 ino = btrfs_ino(inode);
cdca85b0 673 u64 locked_start, locked_end;
4a54c8c1 674 struct io_failure_record *failrec;
c144c63f 675 int mirror;
cdca85b0 676 int ret;
4a54c8c1 677
87c11705 678 failrec = get_failrec(inode, start);
2279a270 679 if (IS_ERR(failrec))
4a54c8c1
JS
680 return 0;
681
4a54c8c1
JS
682 BUG_ON(!failrec->this_mirror);
683
bc98a42c 684 if (sb_rdonly(fs_info->sb))
908960c6 685 goto out;
4a54c8c1 686
cdca85b0
JB
687 ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
688 &locked_end, EXTENT_LOCKED, NULL);
689 if (ret || locked_start > failrec->bytenr ||
690 locked_end < failrec->bytenr + failrec->len - 1)
c144c63f
CH
691 goto out;
692
693 mirror = failrec->this_mirror;
694 do {
695 mirror = prev_mirror(failrec, mirror);
696 repair_io_failure(fs_info, ino, start, failrec->len,
697 failrec->logical, page, pg_offset, mirror);
698 } while (mirror != failrec->failed_mirror);
4a54c8c1
JS
699
700out:
87c11705 701 free_io_failure(inode, failrec);
454ff3de 702 return 0;
4a54c8c1
JS
703}
704
f612496b
MX
705/*
706 * Can be called when
707 * - hold extent lock
708 * - under ordered extent
709 * - the inode is freeing
710 */
7ab7956e 711void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 712{
f612496b 713 struct io_failure_record *failrec;
87c11705 714 struct rb_node *node, *next;
f612496b 715
87c11705 716 if (RB_EMPTY_ROOT(&inode->io_failure_tree))
f612496b
MX
717 return;
718
87c11705
JB
719 spin_lock(&inode->io_failure_lock);
720 node = rb_simple_search_first(&inode->io_failure_tree, start);
721 while (node) {
722 failrec = rb_entry(node, struct io_failure_record, rb_node);
723 if (failrec->bytenr > end)
f612496b
MX
724 break;
725
87c11705
JB
726 next = rb_next(node);
727 rb_erase(&failrec->rb_node, &inode->io_failure_tree);
f612496b
MX
728 kfree(failrec);
729
87c11705 730 node = next;
f612496b 731 }
87c11705 732 spin_unlock(&inode->io_failure_lock);
f612496b
MX
733}
734
3526302f 735static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
7aa51232
CH
736 struct btrfs_bio *bbio,
737 unsigned int bio_offset)
4a54c8c1 738{
ab8d0fc4 739 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7aa51232 740 u64 start = bbio->file_offset + bio_offset;
2fe6303e 741 struct io_failure_record *failrec;
150e4b05 742 const u32 sectorsize = fs_info->sectorsize;
4a54c8c1 743 int ret;
4a54c8c1 744
87c11705 745 failrec = get_failrec(BTRFS_I(inode), start);
3526302f 746 if (!IS_ERR(failrec)) {
ab8d0fc4 747 btrfs_debug(fs_info,
1245835d 748 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
87c11705 749 failrec->logical, failrec->bytenr, failrec->len);
4a54c8c1
JS
750 /*
751 * when data can be on disk more than twice, add to failrec here
752 * (e.g. with a list for failed_mirror) to make
753 * clean_io_failure() clean all those errors at once.
754 */
7aa51232 755 ASSERT(failrec->this_mirror == bbio->mirror_num);
c144c63f 756 ASSERT(failrec->len == fs_info->sectorsize);
3526302f 757 return failrec;
4a54c8c1 758 }
2fe6303e 759
3526302f
NB
760 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
761 if (!failrec)
762 return ERR_PTR(-ENOMEM);
2fe6303e 763
87c11705
JB
764 RB_CLEAR_NODE(&failrec->rb_node);
765 failrec->bytenr = start;
150e4b05 766 failrec->len = sectorsize;
7aa51232
CH
767 failrec->failed_mirror = bbio->mirror_num;
768 failrec->this_mirror = bbio->mirror_num;
81bd9328 769 failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
3526302f 770
3526302f 771 btrfs_debug(fs_info,
81bd9328
CH
772 "new io failure record logical %llu start %llu",
773 failrec->logical, start);
3526302f 774
81bd9328 775 failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
c144c63f
CH
776 if (failrec->num_copies == 1) {
777 /*
778 * We only have a single copy of the data, so don't bother with
779 * all the retry and error correction code that follows. No
780 * matter what the error is, it is very likely to persist.
781 */
782 btrfs_debug(fs_info,
783 "cannot repair logical %llu num_copies %d",
784 failrec->logical, failrec->num_copies);
3526302f
NB
785 kfree(failrec);
786 return ERR_PTR(-EIO);
787 }
788
3526302f 789 /* Set the bits in the private failure tree */
87c11705
JB
790 ret = insert_failrec(BTRFS_I(inode), failrec);
791 if (ret) {
3526302f
NB
792 kfree(failrec);
793 return ERR_PTR(ret);
794 }
795
796 return failrec;
2fe6303e
MX
797}
798
7aa51232
CH
799int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
800 u32 bio_offset, struct page *page, unsigned int pgoff,
150e4b05 801 submit_bio_hook_t *submit_bio_hook)
2fe6303e 802{
7aa51232 803 u64 start = failed_bbio->file_offset + bio_offset;
2fe6303e 804 struct io_failure_record *failrec;
77d5d689 805 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7aa51232 806 struct bio *failed_bio = &failed_bbio->bio;
7ffd27e3 807 const int icsum = bio_offset >> fs_info->sectorsize_bits;
77d5d689 808 struct bio *repair_bio;
c3a3b19b 809 struct btrfs_bio *repair_bbio;
2fe6303e 810
77d5d689
OS
811 btrfs_debug(fs_info,
812 "repair read error: read error at %llu", start);
2fe6303e 813
1f7ad75b 814 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e 815
7aa51232 816 failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
3526302f 817 if (IS_ERR(failrec))
150e4b05 818 return PTR_ERR(failrec);
2fe6303e 819
c144c63f
CH
820 /*
821 * There are two premises:
822 * a) deliver good data to the caller
823 * b) correct the bad sectors on disk
824 *
825 * Since we're only doing repair for one sector, we only need to get
826 * a good copy of the failed sector and if we succeed, we have setup
827 * everything for repair_io_failure to do the rest for us.
828 */
829 failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
830 if (failrec->this_mirror == failrec->failed_mirror) {
831 btrfs_debug(fs_info,
832 "failed to repair num_copies %d this_mirror %d failed_mirror %d",
833 failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
87c11705 834 free_io_failure(BTRFS_I(inode), failrec);
150e4b05 835 return -EIO;
2fe6303e
MX
836 }
837
917f32a2
CH
838 repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
839 failed_bbio->private);
c3a3b19b 840 repair_bbio = btrfs_bio(repair_bio);
00d82525 841 repair_bbio->file_offset = start;
77d5d689 842 repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2fe6303e 843
c3a3b19b 844 if (failed_bbio->csum) {
223486c2 845 const u32 csum_size = fs_info->csum_size;
77d5d689 846
c3a3b19b
QW
847 repair_bbio->csum = repair_bbio->csum_inline;
848 memcpy(repair_bbio->csum,
849 failed_bbio->csum + csum_size * icsum, csum_size);
77d5d689 850 }
2fe6303e 851
77d5d689 852 bio_add_page(repair_bio, page, failrec->len, pgoff);
c3a3b19b 853 repair_bbio->iter = repair_bio->bi_iter;
4a54c8c1 854
ab8d0fc4 855 btrfs_debug(btrfs_sb(inode->i_sb),
1245835d
QW
856 "repair read error: submitting new read to mirror %d",
857 failrec->this_mirror);
4a54c8c1 858
8cbc3001
JB
859 /*
860 * At this point we have a bio, so any errors from submit_bio_hook()
861 * will be handled by the endio on the repair_bio, so we can't return an
862 * error here.
863 */
81bd9328 864 submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
8cbc3001 865 return BLK_STS_OK;
150e4b05
QW
866}
867
868static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
869{
870 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
871
872 ASSERT(page_offset(page) <= start &&
873 start + len <= page_offset(page) + PAGE_SIZE);
874
150e4b05 875 if (uptodate) {
14605409
BB
876 if (fsverity_active(page->mapping->host) &&
877 !PageError(page) &&
878 !PageUptodate(page) &&
879 start < i_size_read(page->mapping->host) &&
880 !fsverity_verify_page(page)) {
881 btrfs_page_set_error(fs_info, page, start, len);
882 } else {
883 btrfs_page_set_uptodate(fs_info, page, start, len);
884 }
150e4b05
QW
885 } else {
886 btrfs_page_clear_uptodate(fs_info, page, start, len);
887 btrfs_page_set_error(fs_info, page, start, len);
888 }
889
fbca46eb 890 if (!btrfs_is_subpage(fs_info, page))
150e4b05 891 unlock_page(page);
3d078efa 892 else
150e4b05
QW
893 btrfs_subpage_end_reader(fs_info, page, start, len);
894}
895
a5aa7ab6
CH
896static void end_sector_io(struct page *page, u64 offset, bool uptodate)
897{
898 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
899 const u32 sectorsize = inode->root->fs_info->sectorsize;
900 struct extent_state *cached = NULL;
901
902 end_page_read(page, uptodate, offset, sectorsize);
903 if (uptodate)
904 set_extent_uptodate(&inode->io_tree, offset,
48acc47d
JB
905 offset + sectorsize - 1, &cached, GFP_NOFS);
906 unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1,
907 &cached);
a5aa7ab6
CH
908}
909
7aa51232
CH
910static void submit_data_read_repair(struct inode *inode,
911 struct btrfs_bio *failed_bbio,
fd5a6f63 912 u32 bio_offset, const struct bio_vec *bvec,
7aa51232 913 unsigned int error_bitmap)
150e4b05 914{
fd5a6f63 915 const unsigned int pgoff = bvec->bv_offset;
150e4b05 916 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
fd5a6f63
QW
917 struct page *page = bvec->bv_page;
918 const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
919 const u64 end = start + bvec->bv_len - 1;
150e4b05
QW
920 const u32 sectorsize = fs_info->sectorsize;
921 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
150e4b05
QW
922 int i;
923
7aa51232 924 BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
150e4b05 925
c0111c44
QW
926 /* This repair is only for data */
927 ASSERT(is_data_inode(inode));
928
150e4b05
QW
929 /* We're here because we had some read errors or csum mismatch */
930 ASSERT(error_bitmap);
931
932 /*
933 * We only get called on buffered IO, thus page must be mapped and bio
934 * must not be cloned.
935 */
7aa51232 936 ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
150e4b05
QW
937
938 /* Iterate through all the sectors in the range */
939 for (i = 0; i < nr_bits; i++) {
940 const unsigned int offset = i * sectorsize;
150e4b05
QW
941 bool uptodate = false;
942 int ret;
943
944 if (!(error_bitmap & (1U << i))) {
945 /*
946 * This sector has no error, just end the page read
947 * and unlock the range.
948 */
949 uptodate = true;
950 goto next;
951 }
952
7aa51232
CH
953 ret = btrfs_repair_one_sector(inode, failed_bbio,
954 bio_offset + offset, page, pgoff + offset,
955 btrfs_submit_data_read_bio);
150e4b05
QW
956 if (!ret) {
957 /*
958 * We have submitted the read repair, the page release
959 * will be handled by the endio function of the
960 * submitted repair bio.
961 * Thus we don't need to do any thing here.
962 */
963 continue;
964 }
965 /*
fd5a6f63
QW
966 * Continue on failed repair, otherwise the remaining sectors
967 * will not be properly unlocked.
150e4b05 968 */
150e4b05 969next:
a5aa7ab6 970 end_sector_io(page, start + offset, uptodate);
150e4b05 971 }
4a54c8c1
JS
972}
973
d1310b2e
CM
974/* lots and lots of room for performance fixes in the end_bio funcs */
975
b5227c07 976void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0 977{
38a39ac7 978 struct btrfs_inode *inode;
25c1252a 979 const bool uptodate = (err == 0);
3e2426bd 980 int ret = 0;
87826df0 981
38a39ac7
QW
982 ASSERT(page && page->mapping);
983 inode = BTRFS_I(page->mapping->host);
984 btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
87826df0 985
87826df0 986 if (!uptodate) {
963e4db8
QW
987 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
988 u32 len;
989
990 ASSERT(end + 1 - start <= U32_MAX);
991 len = end + 1 - start;
992
993 btrfs_page_clear_uptodate(fs_info, page, start, len);
994 btrfs_page_set_error(fs_info, page, start, len);
bff5baf8 995 ret = err < 0 ? err : -EIO;
5dca6eea 996 mapping_set_error(page->mapping, ret);
87826df0 997 }
87826df0
JM
998}
999
d1310b2e
CM
1000/*
1001 * after a writepage IO is done, we need to:
1002 * clear the uptodate bits on error
1003 * clear the writeback bits in the extent tree for this IO
1004 * end_page_writeback if the page has no more pending IO
1005 *
1006 * Scheduling is not allowed, so the extent state tree is expected
1007 * to have one and only one object corresponding to this IO.
1008 */
917f32a2 1009static void end_bio_extent_writepage(struct btrfs_bio *bbio)
d1310b2e 1010{
917f32a2 1011 struct bio *bio = &bbio->bio;
4e4cbee9 1012 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 1013 struct bio_vec *bvec;
d1310b2e
CM
1014 u64 start;
1015 u64 end;
6dc4f100 1016 struct bvec_iter_all iter_all;
d8e3fb10 1017 bool first_bvec = true;
d1310b2e 1018
c09abff8 1019 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 1020 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 1021 struct page *page = bvec->bv_page;
0b246afa
JM
1022 struct inode *inode = page->mapping->host;
1023 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
321a02db
QW
1024 const u32 sectorsize = fs_info->sectorsize;
1025
1026 /* Our read/write should always be sector aligned. */
1027 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
1028 btrfs_err(fs_info,
1029 "partial page write in btrfs with offset %u and length %u",
1030 bvec->bv_offset, bvec->bv_len);
1031 else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
1032 btrfs_info(fs_info,
1033 "incomplete page write with offset %u and length %u",
1034 bvec->bv_offset, bvec->bv_len);
1035
1036 start = page_offset(page) + bvec->bv_offset;
1037 end = start + bvec->bv_len - 1;
d1310b2e 1038
d8e3fb10
NA
1039 if (first_bvec) {
1040 btrfs_record_physical_zoned(inode, start, bio);
1041 first_bvec = false;
1042 }
1043
4e4cbee9 1044 end_extent_writepage(page, error, start, end);
9047e317
QW
1045
1046 btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
2c30c71b 1047 }
2b1f55b0 1048
d1310b2e 1049 bio_put(bio);
d1310b2e
CM
1050}
1051
94e8c95c
QW
1052/*
1053 * Record previously processed extent range
1054 *
1055 * For endio_readpage_release_extent() to handle a full extent range, reducing
1056 * the extent io operations.
1057 */
1058struct processed_extent {
1059 struct btrfs_inode *inode;
1060 /* Start of the range in @inode */
1061 u64 start;
2e626e56 1062 /* End of the range in @inode */
94e8c95c
QW
1063 u64 end;
1064 bool uptodate;
1065};
1066
1067/*
1068 * Try to release processed extent range
1069 *
1070 * May not release the extent range right now if the current range is
1071 * contiguous to processed extent.
1072 *
1073 * Will release processed extent when any of @inode, @uptodate, the range is
1074 * no longer contiguous to the processed range.
1075 *
1076 * Passing @inode == NULL will force processed extent to be released.
1077 */
1078static void endio_readpage_release_extent(struct processed_extent *processed,
1079 struct btrfs_inode *inode, u64 start, u64 end,
1080 bool uptodate)
883d0de4
MX
1081{
1082 struct extent_state *cached = NULL;
94e8c95c
QW
1083 struct extent_io_tree *tree;
1084
1085 /* The first extent, initialize @processed */
1086 if (!processed->inode)
1087 goto update;
883d0de4 1088
94e8c95c
QW
1089 /*
1090 * Contiguous to processed extent, just uptodate the end.
1091 *
1092 * Several things to notice:
1093 *
1094 * - bio can be merged as long as on-disk bytenr is contiguous
1095 * This means we can have page belonging to other inodes, thus need to
1096 * check if the inode still matches.
1097 * - bvec can contain range beyond current page for multi-page bvec
1098 * Thus we need to do processed->end + 1 >= start check
1099 */
1100 if (processed->inode == inode && processed->uptodate == uptodate &&
1101 processed->end + 1 >= start && end >= processed->end) {
1102 processed->end = end;
1103 return;
1104 }
1105
1106 tree = &processed->inode->io_tree;
1107 /*
1108 * Now we don't have range contiguous to the processed range, release
1109 * the processed range now.
1110 */
48acc47d 1111 unlock_extent(tree, processed->start, processed->end, &cached);
94e8c95c
QW
1112
1113update:
1114 /* Update processed to current range */
1115 processed->inode = inode;
1116 processed->start = start;
1117 processed->end = end;
1118 processed->uptodate = uptodate;
883d0de4
MX
1119}
1120
92082d40
QW
1121static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
1122{
1123 ASSERT(PageLocked(page));
fbca46eb 1124 if (!btrfs_is_subpage(fs_info, page))
92082d40
QW
1125 return;
1126
1127 ASSERT(PagePrivate(page));
1128 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
1129}
1130
d9bb77d5 1131/*
01cd3909 1132 * Find extent buffer for a givne bytenr.
d9bb77d5
QW
1133 *
1134 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
1135 * in endio context.
1136 */
1137static struct extent_buffer *find_extent_buffer_readpage(
1138 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
1139{
1140 struct extent_buffer *eb;
1141
1142 /*
1143 * For regular sectorsize, we can use page->private to grab extent
1144 * buffer
1145 */
fbca46eb 1146 if (fs_info->nodesize >= PAGE_SIZE) {
d9bb77d5
QW
1147 ASSERT(PagePrivate(page) && page->private);
1148 return (struct extent_buffer *)page->private;
1149 }
1150
01cd3909
DS
1151 /* For subpage case, we need to lookup buffer radix tree */
1152 rcu_read_lock();
1153 eb = radix_tree_lookup(&fs_info->buffer_radix,
1154 bytenr >> fs_info->sectorsize_bits);
1155 rcu_read_unlock();
d9bb77d5
QW
1156 ASSERT(eb);
1157 return eb;
1158}
1159
d1310b2e
CM
1160/*
1161 * after a readpage IO is done, we need to:
1162 * clear the uptodate bits on error
1163 * set the uptodate bits if things worked
1164 * set the page up to date if all extents in the tree are uptodate
1165 * clear the lock bit in the extent tree
1166 * unlock the page if there are no other extents locked for it
1167 *
1168 * Scheduling is not allowed, so the extent state tree is expected
1169 * to have one and only one object corresponding to this IO.
1170 */
917f32a2 1171static void end_bio_extent_readpage(struct btrfs_bio *bbio)
d1310b2e 1172{
917f32a2 1173 struct bio *bio = &bbio->bio;
2c30c71b 1174 struct bio_vec *bvec;
94e8c95c 1175 struct processed_extent processed = { 0 };
7ffd27e3
QW
1176 /*
1177 * The offset to the beginning of a bio, since one bio can never be
1178 * larger than UINT_MAX, u32 here is enough.
1179 */
1180 u32 bio_offset = 0;
5cf1ab56 1181 int mirror;
6dc4f100 1182 struct bvec_iter_all iter_all;
d1310b2e 1183
c09abff8 1184 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 1185 bio_for_each_segment_all(bvec, bio, iter_all) {
150e4b05 1186 bool uptodate = !bio->bi_status;
d1310b2e 1187 struct page *page = bvec->bv_page;
a71754fc 1188 struct inode *inode = page->mapping->host;
ab8d0fc4 1189 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7ffd27e3 1190 const u32 sectorsize = fs_info->sectorsize;
150e4b05 1191 unsigned int error_bitmap = (unsigned int)-1;
97861cd1 1192 bool repair = false;
7ffd27e3
QW
1193 u64 start;
1194 u64 end;
1195 u32 len;
507903b8 1196
ab8d0fc4
JM
1197 btrfs_debug(fs_info,
1198 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
1201b58b 1199 bio->bi_iter.bi_sector, bio->bi_status,
c3a3b19b 1200 bbio->mirror_num);
902b22f3 1201
8b8bbd46
QW
1202 /*
1203 * We always issue full-sector reads, but if some block in a
1204 * page fails to read, blk_update_request() will advance
1205 * bv_offset and adjust bv_len to compensate. Print a warning
1206 * for unaligned offsets, and an error if they don't add up to
1207 * a full sector.
1208 */
1209 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
1210 btrfs_err(fs_info,
1211 "partial page read in btrfs with offset %u and length %u",
1212 bvec->bv_offset, bvec->bv_len);
1213 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
1214 sectorsize))
1215 btrfs_info(fs_info,
1216 "incomplete page read with offset %u and length %u",
1217 bvec->bv_offset, bvec->bv_len);
1218
1219 start = page_offset(page) + bvec->bv_offset;
1220 end = start + bvec->bv_len - 1;
facc8a22 1221 len = bvec->bv_len;
d1310b2e 1222
c3a3b19b 1223 mirror = bbio->mirror_num;
78e62c02 1224 if (likely(uptodate)) {
150e4b05 1225 if (is_data_inode(inode)) {
c3a3b19b 1226 error_bitmap = btrfs_verify_data_csum(bbio,
5e295768 1227 bio_offset, page, start, end);
97861cd1
CH
1228 if (error_bitmap)
1229 uptodate = false;
150e4b05 1230 } else {
97861cd1
CH
1231 if (btrfs_validate_metadata_buffer(bbio,
1232 page, start, end, mirror))
1233 uptodate = false;
150e4b05 1234 }
d1310b2e 1235 }
ea466794 1236
883d0de4 1237 if (likely(uptodate)) {
a71754fc 1238 loff_t i_size = i_size_read(inode);
09cbfeaf 1239 pgoff_t end_index = i_size >> PAGE_SHIFT;
a71754fc 1240
0d0a762c 1241 btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
97861cd1 1242
c28ea613
QW
1243 /*
1244 * Zero out the remaining part if this range straddles
1245 * i_size.
1246 *
1247 * Here we should only zero the range inside the bvec,
1248 * not touch anything else.
1249 *
1250 * NOTE: i_size is exclusive while end is inclusive.
1251 */
1252 if (page->index == end_index && i_size <= end) {
1253 u32 zero_start = max(offset_in_page(i_size),
d2dcc8ed 1254 offset_in_page(start));
c28ea613
QW
1255
1256 zero_user_segment(page, zero_start,
1257 offset_in_page(end) + 1);
1258 }
97861cd1
CH
1259 } else if (is_data_inode(inode)) {
1260 /*
1261 * Only try to repair bios that actually made it to a
1262 * device. If the bio failed to be submitted mirror
1263 * is 0 and we need to fail it without retrying.
81bd9328
CH
1264 *
1265 * This also includes the high level bios for compressed
1266 * extents - these never make it to a device and repair
1267 * is already handled on the lower compressed bio.
97861cd1
CH
1268 */
1269 if (mirror > 0)
1270 repair = true;
1271 } else {
1272 struct extent_buffer *eb;
1273
1274 eb = find_extent_buffer_readpage(fs_info, page, start);
1275 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
1276 eb->read_mirror = mirror;
1277 atomic_dec(&eb->io_pages);
70dec807 1278 }
97861cd1
CH
1279
1280 if (repair) {
1281 /*
1282 * submit_data_read_repair() will handle all the good
1283 * and bad sectors, we just continue to the next bvec.
1284 */
7aa51232
CH
1285 submit_data_read_repair(inode, bbio, bio_offset, bvec,
1286 error_bitmap);
97861cd1
CH
1287 } else {
1288 /* Update page status and unlock */
1289 end_page_read(page, uptodate, start, len);
1290 endio_readpage_release_extent(&processed, BTRFS_I(inode),
1291 start, end, PageUptodate(page));
70dec807 1292 }
97861cd1 1293
7ffd27e3
QW
1294 ASSERT(bio_offset + len > bio_offset);
1295 bio_offset += len;
883d0de4 1296
2c30c71b 1297 }
94e8c95c
QW
1298 /* Release the last extent */
1299 endio_readpage_release_extent(&processed, NULL, 0, 0, false);
c3a3b19b 1300 btrfs_bio_free_csum(bbio);
d1310b2e 1301 bio_put(bio);
d1310b2e
CM
1302}
1303
43dd529a 1304/*
dd137dd1
STD
1305 * Populate every free slot in a provided array with pages.
1306 *
1307 * @nr_pages: number of pages to allocate
1308 * @page_array: the array to fill with pages; any existing non-null entries in
1309 * the array will be skipped
1310 *
1311 * Return: 0 if all pages were able to be allocated;
1312 * -ENOMEM otherwise, and the caller is responsible for freeing all
1313 * non-null page pointers in the array.
1314 */
1315int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
1316{
91d6ac1d 1317 unsigned int allocated;
dd137dd1 1318
91d6ac1d
STD
1319 for (allocated = 0; allocated < nr_pages;) {
1320 unsigned int last = allocated;
dd137dd1 1321
91d6ac1d
STD
1322 allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
1323
395cb57e
STD
1324 if (allocated == nr_pages)
1325 return 0;
1326
91d6ac1d
STD
1327 /*
1328 * During this iteration, no page could be allocated, even
1329 * though alloc_pages_bulk_array() falls back to alloc_page()
1330 * if it could not bulk-allocate. So we must be out of memory.
1331 */
1332 if (allocated == last)
dd137dd1 1333 return -ENOMEM;
395cb57e
STD
1334
1335 memalloc_retry_wait(GFP_NOFS);
dd137dd1
STD
1336 }
1337 return 0;
1338}
1339
43dd529a
DS
1340/*
1341 * Attempt to add a page to bio.
953651eb 1342 *
43dd529a
DS
1343 * @bio_ctrl: record both the bio, and its bio_flags
1344 * @page: page to add to the bio
1345 * @disk_bytenr: offset of the new bio or to check whether we are adding
1346 * a contiguous page to the previous one
1347 * @size: portion of page that we want to write
1348 * @pg_offset: starting offset in the page
1349 * @compress_type: compression type of the current bio to see if we can merge them
953651eb
NA
1350 *
1351 * Attempt to add a page to bio considering stripe alignment etc.
1352 *
e0eefe07
QW
1353 * Return >= 0 for the number of bytes added to the bio.
1354 * Can return 0 if the current bio is already at stripe/zone boundary.
1355 * Return <0 for error.
953651eb 1356 */
e0eefe07
QW
1357static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
1358 struct page *page,
1359 u64 disk_bytenr, unsigned int size,
1360 unsigned int pg_offset,
cb3a12d9 1361 enum btrfs_compression_type compress_type)
953651eb 1362{
390ed29b
QW
1363 struct bio *bio = bio_ctrl->bio;
1364 u32 bio_size = bio->bi_iter.bi_size;
e0eefe07 1365 u32 real_size;
953651eb 1366 const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
4a445b7b 1367 bool contig = false;
e1326f03 1368 int ret;
953651eb 1369
390ed29b
QW
1370 ASSERT(bio);
1371 /* The limit should be calculated when bio_ctrl->bio is allocated */
1372 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
0f07003b 1373 if (bio_ctrl->compress_type != compress_type)
e0eefe07 1374 return 0;
953651eb 1375
4a445b7b
QW
1376
1377 if (bio->bi_iter.bi_size == 0) {
1378 /* We can always add a page into an empty bio. */
1379 contig = true;
1380 } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
1381 struct bio_vec *bvec = bio_last_bvec_all(bio);
1382
1383 /*
1384 * The contig check requires the following conditions to be met:
1385 * 1) The pages are belonging to the same inode
1386 * This is implied by the call chain.
1387 *
1388 * 2) The range has adjacent logical bytenr
1389 *
1390 * 3) The range has adjacent file offset
1391 * This is required for the usage of btrfs_bio->file_offset.
1392 */
1393 if (bio_end_sector(bio) == sector &&
1394 page_offset(bvec->bv_page) + bvec->bv_offset +
1395 bvec->bv_len == page_offset(page) + pg_offset)
1396 contig = true;
1397 } else {
1398 /*
1399 * For compression, all IO should have its logical bytenr
1400 * set to the starting bytenr of the compressed extent.
1401 */
953651eb 1402 contig = bio->bi_iter.bi_sector == sector;
4a445b7b
QW
1403 }
1404
953651eb 1405 if (!contig)
e0eefe07 1406 return 0;
953651eb 1407
e0eefe07
QW
1408 real_size = min(bio_ctrl->len_to_oe_boundary,
1409 bio_ctrl->len_to_stripe_boundary) - bio_size;
1410 real_size = min(real_size, size);
1411
1412 /*
1413 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
1414 * bio will still execute its endio function on the page!
1415 */
1416 if (real_size == 0)
1417 return 0;
953651eb 1418
390ed29b 1419 if (bio_op(bio) == REQ_OP_ZONE_APPEND)
e0eefe07 1420 ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
390ed29b 1421 else
e0eefe07 1422 ret = bio_add_page(bio, page, real_size, pg_offset);
e1326f03 1423
e0eefe07 1424 return ret;
953651eb
NA
1425}
1426
390ed29b 1427static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
939c7feb 1428 struct btrfs_inode *inode, u64 file_offset)
390ed29b
QW
1429{
1430 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1431 struct btrfs_io_geometry geom;
1432 struct btrfs_ordered_extent *ordered;
1433 struct extent_map *em;
1434 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
1435 int ret;
1436
1437 /*
1438 * Pages for compressed extent are never submitted to disk directly,
1439 * thus it has no real boundary, just set them to U32_MAX.
1440 *
1441 * The split happens for real compressed bio, which happens in
1442 * btrfs_submit_compressed_read/write().
1443 */
0f07003b 1444 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
390ed29b
QW
1445 bio_ctrl->len_to_oe_boundary = U32_MAX;
1446 bio_ctrl->len_to_stripe_boundary = U32_MAX;
1447 return 0;
1448 }
1449 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
1450 if (IS_ERR(em))
1451 return PTR_ERR(em);
1452 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
1453 logical, &geom);
1454 free_extent_map(em);
1455 if (ret < 0) {
1456 return ret;
1457 }
1458 if (geom.len > U32_MAX)
1459 bio_ctrl->len_to_stripe_boundary = U32_MAX;
1460 else
1461 bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
1462
73672710 1463 if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
390ed29b
QW
1464 bio_ctrl->len_to_oe_boundary = U32_MAX;
1465 return 0;
1466 }
1467
390ed29b 1468 /* Ordered extent not yet created, so we're good */
939c7feb 1469 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
390ed29b
QW
1470 if (!ordered) {
1471 bio_ctrl->len_to_oe_boundary = U32_MAX;
1472 return 0;
1473 }
1474
1475 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
1476 ordered->disk_bytenr + ordered->disk_num_bytes - logical);
1477 btrfs_put_ordered_extent(ordered);
1478 return 0;
1479}
1480
e0eefe07
QW
1481static int alloc_new_bio(struct btrfs_inode *inode,
1482 struct btrfs_bio_ctrl *bio_ctrl,
1483 struct writeback_control *wbc,
bf9486d6 1484 blk_opf_t opf,
939c7feb 1485 u64 disk_bytenr, u32 offset, u64 file_offset,
cb3a12d9 1486 enum btrfs_compression_type compress_type)
e0eefe07
QW
1487{
1488 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1489 struct bio *bio;
1490 int ret;
1491
5467abba
QW
1492 ASSERT(bio_ctrl->end_io_func);
1493
1494 bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
e0eefe07
QW
1495 /*
1496 * For compressed page range, its disk_bytenr is always @disk_bytenr
1497 * passed in, no matter if we have added any range into previous bio.
1498 */
cb3a12d9 1499 if (compress_type != BTRFS_COMPRESS_NONE)
cd8e0cca 1500 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
e0eefe07 1501 else
cd8e0cca 1502 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
e0eefe07 1503 bio_ctrl->bio = bio;
0f07003b 1504 bio_ctrl->compress_type = compress_type;
939c7feb
NA
1505 ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
1506 if (ret < 0)
1507 goto error;
e0eefe07 1508
50f1cff3
CH
1509 if (wbc) {
1510 /*
1511 * For Zone append we need the correct block_device that we are
1512 * going to write to set in the bio to be able to respect the
1513 * hardware limitation. Look it up here:
1514 */
1515 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1516 struct btrfs_device *dev;
1517
1518 dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
1519 fs_info->sectorsize);
1520 if (IS_ERR(dev)) {
1521 ret = PTR_ERR(dev);
1522 goto error;
1523 }
e0eefe07 1524
50f1cff3
CH
1525 bio_set_dev(bio, dev->bdev);
1526 } else {
1527 /*
1528 * Otherwise pick the last added device to support
1529 * cgroup writeback. For multi-device file systems this
1530 * means blk-cgroup policies have to always be set on the
1531 * last added/replaced device. This is a bit odd but has
1532 * been like that for a long time.
1533 */
1534 bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
e0eefe07 1535 }
50f1cff3
CH
1536 wbc_init_bio(wbc, bio);
1537 } else {
1538 ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
e0eefe07
QW
1539 }
1540 return 0;
1541error:
1542 bio_ctrl->bio = NULL;
917f32a2 1543 btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
e0eefe07
QW
1544 return ret;
1545}
1546
4b81ba48
DS
1547/*
1548 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625 1549 * @wbc: optional writeback control for io accounting
0c64c33c 1550 * @disk_bytenr: logical bytenr where the write will be
209ecde5 1551 * @page: page to add to the bio
0c64c33c 1552 * @size: portion of page that we want to write to
b8b3d625
DS
1553 * @pg_offset: offset of the new bio or to check whether we are adding
1554 * a contiguous page to the previous one
cb3a12d9 1555 * @compress_type: compress type for current bio
814b6f91
QW
1556 *
1557 * The will either add the page into the existing @bio_ctrl->bio, or allocate a
1558 * new one in @bio_ctrl->bio.
1559 * The mirror number for this IO should already be initizlied in
1560 * @bio_ctrl->mirror_num.
4b81ba48 1561 */
bf9486d6 1562static int submit_extent_page(blk_opf_t opf,
da2f0f74 1563 struct writeback_control *wbc,
390ed29b 1564 struct btrfs_bio_ctrl *bio_ctrl,
209ecde5 1565 u64 disk_bytenr, struct page *page,
6c5a4e2c 1566 size_t size, unsigned long pg_offset,
cb3a12d9 1567 enum btrfs_compression_type compress_type,
005efedf 1568 bool force_bio_submit)
d1310b2e
CM
1569{
1570 int ret = 0;
e1326f03 1571 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
e0eefe07 1572 unsigned int cur = pg_offset;
d1310b2e 1573
390ed29b 1574 ASSERT(bio_ctrl);
5c2b1fd7 1575
390ed29b
QW
1576 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
1577 pg_offset + size <= PAGE_SIZE);
5467abba
QW
1578
1579 ASSERT(bio_ctrl->end_io_func);
1580
722c82ac
CH
1581 if (force_bio_submit)
1582 submit_one_bio(bio_ctrl);
e0eefe07
QW
1583
1584 while (cur < pg_offset + size) {
1585 u32 offset = cur - pg_offset;
1586 int added;
1587
1588 /* Allocate new bio if needed */
1589 if (!bio_ctrl->bio) {
1590 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
5467abba 1591 disk_bytenr, offset,
939c7feb 1592 page_offset(page) + cur,
cb3a12d9 1593 compress_type);
e0eefe07
QW
1594 if (ret < 0)
1595 return ret;
1596 }
1597 /*
1598 * We must go through btrfs_bio_add_page() to ensure each
1599 * page range won't cross various boundaries.
1600 */
cb3a12d9 1601 if (compress_type != BTRFS_COMPRESS_NONE)
e0eefe07
QW
1602 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
1603 size - offset, pg_offset + offset,
cb3a12d9 1604 compress_type);
e0eefe07
QW
1605 else
1606 added = btrfs_bio_add_page(bio_ctrl, page,
1607 disk_bytenr + offset, size - offset,
cb3a12d9 1608 pg_offset + offset, compress_type);
e0eefe07
QW
1609
1610 /* Metadata page range should never be split */
1611 if (!is_data_inode(&inode->vfs_inode))
1612 ASSERT(added == 0 || added == size - offset);
1613
1614 /* At least we added some page, update the account */
1615 if (wbc && added)
1616 wbc_account_cgroup_owner(wbc, page, added);
1617
1618 /* We have reached boundary, submit right now */
1619 if (added < size - offset) {
1620 /* The bio should contain some page(s) */
1621 ASSERT(bio_ctrl->bio->bi_iter.bi_size);
722c82ac 1622 submit_one_bio(bio_ctrl);
d1310b2e 1623 }
e0eefe07 1624 cur += added;
d1310b2e 1625 }
e0eefe07 1626 return 0;
d1310b2e
CM
1627}
1628
760f991f
QW
1629static int attach_extent_buffer_page(struct extent_buffer *eb,
1630 struct page *page,
1631 struct btrfs_subpage *prealloc)
d1310b2e 1632{
760f991f
QW
1633 struct btrfs_fs_info *fs_info = eb->fs_info;
1634 int ret = 0;
1635
0d01e247
QW
1636 /*
1637 * If the page is mapped to btree inode, we should hold the private
1638 * lock to prevent race.
1639 * For cloned or dummy extent buffers, their pages are not mapped and
1640 * will not race with any other ebs.
1641 */
1642 if (page->mapping)
1643 lockdep_assert_held(&page->mapping->private_lock);
1644
fbca46eb 1645 if (fs_info->nodesize >= PAGE_SIZE) {
760f991f
QW
1646 if (!PagePrivate(page))
1647 attach_page_private(page, eb);
1648 else
1649 WARN_ON(page->private != (unsigned long)eb);
1650 return 0;
1651 }
1652
1653 /* Already mapped, just free prealloc */
1654 if (PagePrivate(page)) {
1655 btrfs_free_subpage(prealloc);
1656 return 0;
1657 }
1658
1659 if (prealloc)
1660 /* Has preallocated memory for subpage */
1661 attach_page_private(page, prealloc);
d1b89bc0 1662 else
760f991f
QW
1663 /* Do new allocation to attach subpage */
1664 ret = btrfs_attach_subpage(fs_info, page,
1665 BTRFS_SUBPAGE_METADATA);
1666 return ret;
d1310b2e
CM
1667}
1668
32443de3 1669int set_page_extent_mapped(struct page *page)
d1310b2e 1670{
32443de3
QW
1671 struct btrfs_fs_info *fs_info;
1672
1673 ASSERT(page->mapping);
1674
1675 if (PagePrivate(page))
1676 return 0;
1677
1678 fs_info = btrfs_sb(page->mapping->host->i_sb);
1679
fbca46eb 1680 if (btrfs_is_subpage(fs_info, page))
32443de3
QW
1681 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
1682
1683 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
1684 return 0;
1685}
1686
1687void clear_page_extent_mapped(struct page *page)
1688{
1689 struct btrfs_fs_info *fs_info;
1690
1691 ASSERT(page->mapping);
1692
d1b89bc0 1693 if (!PagePrivate(page))
32443de3
QW
1694 return;
1695
1696 fs_info = btrfs_sb(page->mapping->host->i_sb);
fbca46eb 1697 if (btrfs_is_subpage(fs_info, page))
32443de3
QW
1698 return btrfs_detach_subpage(fs_info, page);
1699
1700 detach_page_private(page);
d1310b2e
CM
1701}
1702
125bac01
MX
1703static struct extent_map *
1704__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
1a5ee1e6 1705 u64 start, u64 len, struct extent_map **em_cached)
125bac01
MX
1706{
1707 struct extent_map *em;
1708
1709 if (em_cached && *em_cached) {
1710 em = *em_cached;
cbc0e928 1711 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 1712 start < extent_map_end(em)) {
490b54d6 1713 refcount_inc(&em->refs);
125bac01
MX
1714 return em;
1715 }
1716
1717 free_extent_map(em);
1718 *em_cached = NULL;
1719 }
1720
1a5ee1e6 1721 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
c0347550 1722 if (em_cached && !IS_ERR(em)) {
125bac01 1723 BUG_ON(*em_cached);
490b54d6 1724 refcount_inc(&em->refs);
125bac01
MX
1725 *em_cached = em;
1726 }
1727 return em;
1728}
d1310b2e
CM
1729/*
1730 * basic readpage implementation. Locked extent state structs are inserted
1731 * into the tree that are removed when the IO is done (by the end_io
1732 * handlers)
79787eaa 1733 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 1734 * return 0 on success, otherwise return error
d1310b2e 1735 */
7aab8b32 1736static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
390ed29b 1737 struct btrfs_bio_ctrl *bio_ctrl,
bf9486d6 1738 blk_opf_t read_flags, u64 *prev_em_start)
d1310b2e
CM
1739{
1740 struct inode *inode = page->mapping->host;
92082d40 1741 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4eee4fa4 1742 u64 start = page_offset(page);
8eec8296 1743 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
1744 u64 cur = start;
1745 u64 extent_offset;
1746 u64 last_byte = i_size_read(inode);
1747 u64 block_start;
d1310b2e 1748 struct extent_map *em;
baf863b9 1749 int ret = 0;
306e16ce 1750 size_t pg_offset = 0;
d1310b2e
CM
1751 size_t iosize;
1752 size_t blocksize = inode->i_sb->s_blocksize;
f657a31c 1753 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ae6957eb 1754
32443de3
QW
1755 ret = set_page_extent_mapped(page);
1756 if (ret < 0) {
570eb97b 1757 unlock_extent(tree, start, end, NULL);
92082d40
QW
1758 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
1759 unlock_page(page);
32443de3
QW
1760 goto out;
1761 }
d1310b2e 1762
09cbfeaf 1763 if (page->index == last_byte >> PAGE_SHIFT) {
7073017a 1764 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
1765
1766 if (zero_offset) {
09cbfeaf 1767 iosize = PAGE_SIZE - zero_offset;
d048b9c2 1768 memzero_page(page, zero_offset, iosize);
c8b97818
CM
1769 }
1770 }
5467abba 1771 bio_ctrl->end_io_func = end_bio_extent_readpage;
92082d40 1772 begin_page_read(fs_info, page);
d1310b2e 1773 while (cur <= end) {
4c37a793 1774 unsigned long this_bio_flag = 0;
005efedf 1775 bool force_bio_submit = false;
0c64c33c 1776 u64 disk_bytenr;
c8f2f24b 1777
6a404910 1778 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
d1310b2e 1779 if (cur >= last_byte) {
507903b8
AJ
1780 struct extent_state *cached = NULL;
1781
09cbfeaf 1782 iosize = PAGE_SIZE - pg_offset;
d048b9c2 1783 memzero_page(page, pg_offset, iosize);
d1310b2e 1784 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 1785 &cached, GFP_NOFS);
570eb97b 1786 unlock_extent(tree, cur, cur + iosize - 1, &cached);
92082d40 1787 end_page_read(page, true, cur, iosize);
d1310b2e
CM
1788 break;
1789 }
125bac01 1790 em = __get_extent_map(inode, page, pg_offset, cur,
1a5ee1e6 1791 end - cur + 1, em_cached);
c0347550 1792 if (IS_ERR(em)) {
570eb97b 1793 unlock_extent(tree, cur, end, NULL);
92082d40 1794 end_page_read(page, false, cur, end + 1 - cur);
bbf0ea7e 1795 ret = PTR_ERR(em);
d1310b2e
CM
1796 break;
1797 }
d1310b2e
CM
1798 extent_offset = cur - em->start;
1799 BUG_ON(extent_map_end(em) <= cur);
1800 BUG_ON(end < cur);
1801
7f6ca7f2
DS
1802 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1803 this_bio_flag = em->compress_type;
c8b97818 1804
d1310b2e 1805 iosize = min(extent_map_end(em) - cur, end - cur + 1);
fda2832f 1806 iosize = ALIGN(iosize, blocksize);
2a5232a8 1807 if (this_bio_flag != BTRFS_COMPRESS_NONE)
0c64c33c 1808 disk_bytenr = em->block_start;
949b3273 1809 else
0c64c33c 1810 disk_bytenr = em->block_start + extent_offset;
d1310b2e 1811 block_start = em->block_start;
d899e052
YZ
1812 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
1813 block_start = EXTENT_MAP_HOLE;
005efedf
FM
1814
1815 /*
1816 * If we have a file range that points to a compressed extent
260db43c 1817 * and it's followed by a consecutive file range that points
005efedf
FM
1818 * to the same compressed extent (possibly with a different
1819 * offset and/or length, so it either points to the whole extent
1820 * or only part of it), we must make sure we do not submit a
1821 * single bio to populate the pages for the 2 ranges because
1822 * this makes the compressed extent read zero out the pages
1823 * belonging to the 2nd range. Imagine the following scenario:
1824 *
1825 * File layout
1826 * [0 - 8K] [8K - 24K]
1827 * | |
1828 * | |
1829 * points to extent X, points to extent X,
1830 * offset 4K, length of 8K offset 0, length 16K
1831 *
1832 * [extent X, compressed length = 4K uncompressed length = 16K]
1833 *
1834 * If the bio to read the compressed extent covers both ranges,
1835 * it will decompress extent X into the pages belonging to the
1836 * first range and then it will stop, zeroing out the remaining
1837 * pages that belong to the other range that points to extent X.
1838 * So here we make sure we submit 2 bios, one for the first
1839 * range and another one for the third range. Both will target
1840 * the same physical extent from disk, but we can't currently
1841 * make the compressed bio endio callback populate the pages
1842 * for both ranges because each compressed bio is tightly
1843 * coupled with a single extent map, and each range can have
1844 * an extent map with a different offset value relative to the
1845 * uncompressed data of our extent and different lengths. This
1846 * is a corner case so we prioritize correctness over
1847 * non-optimal behavior (submitting 2 bios for the same extent).
1848 */
1849 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
1850 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 1851 *prev_em_start != em->start)
005efedf
FM
1852 force_bio_submit = true;
1853
1854 if (prev_em_start)
8e928218 1855 *prev_em_start = em->start;
005efedf 1856
d1310b2e
CM
1857 free_extent_map(em);
1858 em = NULL;
1859
1860 /* we've found a hole, just zero and go on */
1861 if (block_start == EXTENT_MAP_HOLE) {
507903b8
AJ
1862 struct extent_state *cached = NULL;
1863
d048b9c2 1864 memzero_page(page, pg_offset, iosize);
d1310b2e
CM
1865
1866 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 1867 &cached, GFP_NOFS);
570eb97b 1868 unlock_extent(tree, cur, cur + iosize - 1, &cached);
92082d40 1869 end_page_read(page, true, cur, iosize);
d1310b2e 1870 cur = cur + iosize;
306e16ce 1871 pg_offset += iosize;
d1310b2e
CM
1872 continue;
1873 }
1874 /* the get_extent function already copied into the page */
70dec807 1875 if (block_start == EXTENT_MAP_INLINE) {
570eb97b 1876 unlock_extent(tree, cur, cur + iosize - 1, NULL);
52b029f4 1877 end_page_read(page, true, cur, iosize);
70dec807 1878 cur = cur + iosize;
306e16ce 1879 pg_offset += iosize;
70dec807
CM
1880 continue;
1881 }
d1310b2e 1882
0ceb34bf 1883 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
209ecde5 1884 bio_ctrl, disk_bytenr, page, iosize,
5467abba
QW
1885 pg_offset, this_bio_flag,
1886 force_bio_submit);
ad3fc794 1887 if (ret) {
10f7f6f8
QW
1888 /*
1889 * We have to unlock the remaining range, or the page
1890 * will never be unlocked.
1891 */
570eb97b 1892 unlock_extent(tree, cur, end, NULL);
10f7f6f8 1893 end_page_read(page, false, cur, end + 1 - cur);
baf863b9 1894 goto out;
edd33c99 1895 }
d1310b2e 1896 cur = cur + iosize;
306e16ce 1897 pg_offset += iosize;
d1310b2e 1898 }
90a887c9 1899out:
baf863b9 1900 return ret;
d1310b2e
CM
1901}
1902
fdaf9a58 1903int btrfs_read_folio(struct file *file, struct folio *folio)
7aab8b32 1904{
fdaf9a58 1905 struct page *page = &folio->page;
7aab8b32
CH
1906 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
1907 u64 start = page_offset(page);
1908 u64 end = start + PAGE_SIZE - 1;
1909 struct btrfs_bio_ctrl bio_ctrl = { 0 };
1910 int ret;
1911
1912 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
1913
1914 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
1915 /*
1916 * If btrfs_do_readpage() failed we will want to submit the assembled
1917 * bio to do the cleanup.
1918 */
722c82ac 1919 submit_one_bio(&bio_ctrl);
7aab8b32
CH
1920 return ret;
1921}
1922
b6660e80 1923static inline void contiguous_readpages(struct page *pages[], int nr_pages,
390ed29b
QW
1924 u64 start, u64 end,
1925 struct extent_map **em_cached,
1926 struct btrfs_bio_ctrl *bio_ctrl,
1927 u64 *prev_em_start)
9974090b 1928{
23d31bd4 1929 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
9974090b
MX
1930 int index;
1931
b272ae22 1932 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
1933
1934 for (index = 0; index < nr_pages; index++) {
390ed29b 1935 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
0f208812 1936 REQ_RAHEAD, prev_em_start);
09cbfeaf 1937 put_page(pages[index]);
9974090b
MX
1938 }
1939}
1940
d1310b2e 1941/*
40f76580
CM
1942 * helper for __extent_writepage, doing all of the delayed allocation setup.
1943 *
5eaad97a 1944 * This returns 1 if btrfs_run_delalloc_range function did all the work required
40f76580
CM
1945 * to write the page (copy into inline extent). In this case the IO has
1946 * been started and the page is already unlocked.
1947 *
1948 * This returns 0 if all went well (page still locked)
1949 * This returns < 0 if there were errors (page still locked)
d1310b2e 1950 */
cd4c0bf9 1951static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
83f1b680 1952 struct page *page, struct writeback_control *wbc)
40f76580 1953{
2749f7ef 1954 const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
cf3075fb 1955 u64 delalloc_start = page_offset(page);
40f76580 1956 u64 delalloc_to_write = 0;
83f1b680
QW
1957 /* How many pages are started by btrfs_run_delalloc_range() */
1958 unsigned long nr_written = 0;
40f76580
CM
1959 int ret;
1960 int page_started = 0;
1961
2749f7ef
QW
1962 while (delalloc_start < page_end) {
1963 u64 delalloc_end = page_end;
1964 bool found;
40f76580 1965
cd4c0bf9 1966 found = find_lock_delalloc_range(&inode->vfs_inode, page,
40f76580 1967 &delalloc_start,
917aacec 1968 &delalloc_end);
3522e903 1969 if (!found) {
40f76580
CM
1970 delalloc_start = delalloc_end + 1;
1971 continue;
1972 }
cd4c0bf9 1973 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
83f1b680 1974 delalloc_end, &page_started, &nr_written, wbc);
40f76580 1975 if (ret) {
963e4db8
QW
1976 btrfs_page_set_error(inode->root->fs_info, page,
1977 page_offset(page), PAGE_SIZE);
7361b4ae 1978 return ret;
40f76580
CM
1979 }
1980 /*
ea1754a0
KS
1981 * delalloc_end is already one less than the total length, so
1982 * we don't subtract one from PAGE_SIZE
40f76580
CM
1983 */
1984 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 1985 PAGE_SIZE) >> PAGE_SHIFT;
40f76580
CM
1986 delalloc_start = delalloc_end + 1;
1987 }
1988 if (wbc->nr_to_write < delalloc_to_write) {
1989 int thresh = 8192;
1990
1991 if (delalloc_to_write < thresh * 2)
1992 thresh = delalloc_to_write;
1993 wbc->nr_to_write = min_t(u64, delalloc_to_write,
1994 thresh);
1995 }
1996
83f1b680 1997 /* Did btrfs_run_dealloc_range() already unlock and start the IO? */
40f76580
CM
1998 if (page_started) {
1999 /*
83f1b680
QW
2000 * We've unlocked the page, so we can't update the mapping's
2001 * writeback index, just update nr_to_write.
40f76580 2002 */
83f1b680 2003 wbc->nr_to_write -= nr_written;
40f76580
CM
2004 return 1;
2005 }
2006
b69d1ee9 2007 return 0;
40f76580
CM
2008}
2009
c5ef5c6c
QW
2010/*
2011 * Find the first byte we need to write.
2012 *
2013 * For subpage, one page can contain several sectors, and
2014 * __extent_writepage_io() will just grab all extent maps in the page
2015 * range and try to submit all non-inline/non-compressed extents.
2016 *
2017 * This is a big problem for subpage, we shouldn't re-submit already written
2018 * data at all.
2019 * This function will lookup subpage dirty bit to find which range we really
2020 * need to submit.
2021 *
2022 * Return the next dirty range in [@start, @end).
2023 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
2024 */
2025static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
2026 struct page *page, u64 *start, u64 *end)
2027{
2028 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
72a69cd0 2029 struct btrfs_subpage_info *spi = fs_info->subpage_info;
c5ef5c6c
QW
2030 u64 orig_start = *start;
2031 /* Declare as unsigned long so we can use bitmap ops */
c5ef5c6c 2032 unsigned long flags;
72a69cd0 2033 int range_start_bit;
c5ef5c6c
QW
2034 int range_end_bit;
2035
2036 /*
2037 * For regular sector size == page size case, since one page only
2038 * contains one sector, we return the page offset directly.
2039 */
fbca46eb 2040 if (!btrfs_is_subpage(fs_info, page)) {
c5ef5c6c
QW
2041 *start = page_offset(page);
2042 *end = page_offset(page) + PAGE_SIZE;
2043 return;
2044 }
2045
72a69cd0
QW
2046 range_start_bit = spi->dirty_offset +
2047 (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
2048
c5ef5c6c
QW
2049 /* We should have the page locked, but just in case */
2050 spin_lock_irqsave(&subpage->lock, flags);
72a69cd0
QW
2051 bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
2052 spi->dirty_offset + spi->bitmap_nr_bits);
c5ef5c6c
QW
2053 spin_unlock_irqrestore(&subpage->lock, flags);
2054
72a69cd0
QW
2055 range_start_bit -= spi->dirty_offset;
2056 range_end_bit -= spi->dirty_offset;
2057
c5ef5c6c
QW
2058 *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
2059 *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
2060}
2061
40f76580
CM
2062/*
2063 * helper for __extent_writepage. This calls the writepage start hooks,
2064 * and does the loop to map the page into extents and bios.
2065 *
2066 * We return 1 if the IO is started and the page is unlocked,
2067 * 0 if all went well (page still locked)
2068 * < 0 if there were errors (page still locked)
2069 */
d4580fe2 2070static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
40f76580
CM
2071 struct page *page,
2072 struct writeback_control *wbc,
2073 struct extent_page_data *epd,
2074 loff_t i_size,
57e5ffeb 2075 int *nr_ret)
d1310b2e 2076{
6bc5636a 2077 struct btrfs_fs_info *fs_info = inode->root->fs_info;
a129ffb8
QW
2078 u64 cur = page_offset(page);
2079 u64 end = cur + PAGE_SIZE - 1;
d1310b2e 2080 u64 extent_offset;
d1310b2e 2081 u64 block_start;
d1310b2e 2082 struct extent_map *em;
44e5801f 2083 int saved_ret = 0;
40f76580
CM
2084 int ret = 0;
2085 int nr = 0;
bf9486d6
BVA
2086 enum req_op op = REQ_OP_WRITE;
2087 const blk_opf_t write_flags = wbc_to_write_flags(wbc);
44e5801f 2088 bool has_error = false;
40f76580 2089 bool compressed;
c8b97818 2090
a129ffb8 2091 ret = btrfs_writepage_cow_fixup(page);
d75855b4
NB
2092 if (ret) {
2093 /* Fixup worker will requeue */
5ab58055 2094 redirty_page_for_writepage(wbc, page);
d75855b4
NB
2095 unlock_page(page);
2096 return 1;
247e743c
CM
2097 }
2098
11c8349b
CM
2099 /*
2100 * we don't want to touch the inode after unlocking the page,
2101 * so we update the mapping writeback index now
2102 */
572f3dad 2103 wbc->nr_to_write--;
771ed689 2104
5467abba 2105 epd->bio_ctrl.end_io_func = end_bio_extent_writepage;
d1310b2e 2106 while (cur <= end) {
0c64c33c 2107 u64 disk_bytenr;
40f76580 2108 u64 em_end;
c5ef5c6c
QW
2109 u64 dirty_range_start = cur;
2110 u64 dirty_range_end;
6bc5636a 2111 u32 iosize;
58409edd 2112
40f76580 2113 if (cur >= i_size) {
38a39ac7 2114 btrfs_writepage_endio_finish_ordered(inode, page, cur,
25c1252a 2115 end, true);
cc1d0d93
QW
2116 /*
2117 * This range is beyond i_size, thus we don't need to
2118 * bother writing back.
2119 * But we still need to clear the dirty subpage bit, or
2120 * the next time the page gets dirtied, we will try to
2121 * writeback the sectors with subpage dirty bits,
2122 * causing writeback without ordered extent.
2123 */
2124 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
d1310b2e
CM
2125 break;
2126 }
c5ef5c6c
QW
2127
2128 find_next_dirty_byte(fs_info, page, &dirty_range_start,
2129 &dirty_range_end);
2130 if (cur < dirty_range_start) {
2131 cur = dirty_range_start;
2132 continue;
2133 }
2134
d4580fe2 2135 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
c0347550 2136 if (IS_ERR(em)) {
c5ef5c6c 2137 btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
61391d56 2138 ret = PTR_ERR_OR_ZERO(em);
44e5801f
QW
2139 has_error = true;
2140 if (!saved_ret)
2141 saved_ret = ret;
d1310b2e
CM
2142 break;
2143 }
2144
2145 extent_offset = cur - em->start;
40f76580 2146 em_end = extent_map_end(em);
6bc5636a
QW
2147 ASSERT(cur <= em_end);
2148 ASSERT(cur < end);
2149 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
2150 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
d1310b2e 2151 block_start = em->block_start;
c8b97818 2152 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6bc5636a
QW
2153 disk_bytenr = em->block_start + extent_offset;
2154
c5ef5c6c
QW
2155 /*
2156 * Note that em_end from extent_map_end() and dirty_range_end from
2157 * find_next_dirty_byte() are all exclusive
2158 */
2159 iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
d8e3fb10 2160
e380adfc 2161 if (btrfs_use_zone_append(inode, em->block_start))
bf9486d6 2162 op = REQ_OP_ZONE_APPEND;
d8e3fb10 2163
d1310b2e
CM
2164 free_extent_map(em);
2165 em = NULL;
2166
c8b97818
CM
2167 /*
2168 * compressed and inline extents are written through other
2169 * paths in the FS
2170 */
2171 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 2172 block_start == EXTENT_MAP_INLINE) {
c8b04030 2173 if (compressed)
c8b97818 2174 nr++;
c8b04030 2175 else
38a39ac7 2176 btrfs_writepage_endio_finish_ordered(inode,
25c1252a 2177 page, cur, cur + iosize - 1, true);
cc1d0d93 2178 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
c8b97818 2179 cur += iosize;
d1310b2e
CM
2180 continue;
2181 }
c8b97818 2182
d2a91064 2183 btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
58409edd 2184 if (!PageWriteback(page)) {
d4580fe2 2185 btrfs_err(inode->root->fs_info,
58409edd
DS
2186 "page %lu not writeback, cur %llu end %llu",
2187 page->index, cur, end);
d1310b2e 2188 }
7f3c74fb 2189
c5ef5c6c
QW
2190 /*
2191 * Although the PageDirty bit is cleared before entering this
2192 * function, subpage dirty bit is not cleared.
2193 * So clear subpage dirty bit here so next time we won't submit
2194 * page for range already written to disk.
2195 */
2196 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
2197
bf9486d6 2198 ret = submit_extent_page(op | write_flags, wbc,
209ecde5
QW
2199 &epd->bio_ctrl, disk_bytenr,
2200 page, iosize,
390ed29b 2201 cur - page_offset(page),
722c82ac 2202 0, false);
fe01aa65 2203 if (ret) {
44e5801f
QW
2204 has_error = true;
2205 if (!saved_ret)
2206 saved_ret = ret;
2207
c5ef5c6c 2208 btrfs_page_set_error(fs_info, page, cur, iosize);
fe01aa65 2209 if (PageWriteback(page))
c5ef5c6c
QW
2210 btrfs_page_clear_writeback(fs_info, page, cur,
2211 iosize);
fe01aa65 2212 }
d1310b2e 2213
6bc5636a 2214 cur += iosize;
d1310b2e
CM
2215 nr++;
2216 }
cc1d0d93
QW
2217 /*
2218 * If we finish without problem, we should not only clear page dirty,
2219 * but also empty subpage dirty bits
2220 */
44e5801f 2221 if (!has_error)
cc1d0d93 2222 btrfs_page_assert_not_dirty(fs_info, page);
44e5801f
QW
2223 else
2224 ret = saved_ret;
40f76580 2225 *nr_ret = nr;
40f76580
CM
2226 return ret;
2227}
2228
2229/*
2230 * the writepage semantics are similar to regular writepage. extent
2231 * records are inserted to lock ranges in the tree, and as dirty areas
2232 * are found, they are marked writeback. Then the lock bits are removed
2233 * and the end_io handler clears the writeback ranges
3065976b
QW
2234 *
2235 * Return 0 if everything goes well.
2236 * Return <0 for error.
40f76580
CM
2237 */
2238static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 2239 struct extent_page_data *epd)
40f76580 2240{
8e1dec8e 2241 struct folio *folio = page_folio(page);
40f76580 2242 struct inode *inode = page->mapping->host;
e55a0de1 2243 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
cf3075fb
QW
2244 const u64 page_start = page_offset(page);
2245 const u64 page_end = page_start + PAGE_SIZE - 1;
40f76580
CM
2246 int ret;
2247 int nr = 0;
eb70d222 2248 size_t pg_offset;
40f76580 2249 loff_t i_size = i_size_read(inode);
09cbfeaf 2250 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580 2251
40f76580
CM
2252 trace___extent_writepage(page, inode, wbc);
2253
2254 WARN_ON(!PageLocked(page));
2255
963e4db8
QW
2256 btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
2257 page_offset(page), PAGE_SIZE);
40f76580 2258
7073017a 2259 pg_offset = offset_in_page(i_size);
40f76580
CM
2260 if (page->index > end_index ||
2261 (page->index == end_index && !pg_offset)) {
8e1dec8e
MWO
2262 folio_invalidate(folio, 0, folio_size(folio));
2263 folio_unlock(folio);
40f76580
CM
2264 return 0;
2265 }
2266
21a8935e 2267 if (page->index == end_index)
d048b9c2 2268 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
40f76580 2269
32443de3
QW
2270 ret = set_page_extent_mapped(page);
2271 if (ret < 0) {
2272 SetPageError(page);
2273 goto done;
2274 }
40f76580 2275
7789a55a 2276 if (!epd->extent_locked) {
83f1b680 2277 ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
7789a55a 2278 if (ret == 1)
169d2c87 2279 return 0;
7789a55a
NB
2280 if (ret)
2281 goto done;
2282 }
40f76580 2283
d4580fe2 2284 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
83f1b680 2285 &nr);
40f76580 2286 if (ret == 1)
169d2c87 2287 return 0;
40f76580 2288
d1310b2e
CM
2289done:
2290 if (nr == 0) {
2291 /* make sure the mapping tag for page dirty gets cleared */
2292 set_page_writeback(page);
2293 end_page_writeback(page);
2294 }
963e4db8
QW
2295 /*
2296 * Here we used to have a check for PageError() and then set @ret and
2297 * call end_extent_writepage().
2298 *
2299 * But in fact setting @ret here will cause different error paths
2300 * between subpage and regular sectorsize.
2301 *
2302 * For regular page size, we never submit current page, but only add
2303 * current page to current bio.
2304 * The bio submission can only happen in next page.
2305 * Thus if we hit the PageError() branch, @ret is already set to
2306 * non-zero value and will not get updated for regular sectorsize.
2307 *
2308 * But for subpage case, it's possible we submit part of current page,
2309 * thus can get PageError() set by submitted bio of the same page,
2310 * while our @ret is still 0.
2311 *
2312 * So here we unify the behavior and don't set @ret.
2313 * Error can still be properly passed to higher layer as page will
2314 * be set error, here we just don't handle the IO failure.
2315 *
2316 * NOTE: This is just a hotfix for subpage.
2317 * The root fix will be properly ending ordered extent when we hit
2318 * an error during writeback.
2319 *
2320 * But that needs a bigger refactoring, as we not only need to grab the
2321 * submitted OE, but also need to know exactly at which bytenr we hit
2322 * the error.
2323 * Currently the full page based __extent_writepage_io() is not
2324 * capable of that.
2325 */
2326 if (PageError(page))
cf3075fb 2327 end_extent_writepage(page, ret, page_start, page_end);
e55a0de1
QW
2328 if (epd->extent_locked) {
2329 /*
2330 * If epd->extent_locked, it's from extent_write_locked_range(),
2331 * the page can either be locked by lock_page() or
2332 * process_one_page().
2333 * Let btrfs_page_unlock_writer() handle both cases.
2334 */
2335 ASSERT(wbc);
2336 btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
2337 wbc->range_end + 1 - wbc->range_start);
2338 } else {
2339 unlock_page(page);
2340 }
3065976b 2341 ASSERT(ret <= 0);
40f76580 2342 return ret;
d1310b2e
CM
2343}
2344
fd8b2b61 2345void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 2346{
74316201
N
2347 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
2348 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
2349}
2350
18dfa711
FM
2351static void end_extent_buffer_writeback(struct extent_buffer *eb)
2352{
2353 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
2354 smp_mb__after_atomic();
2355 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
2356}
2357
2e3c2513 2358/*
a3efb2f0 2359 * Lock extent buffer status and pages for writeback.
2e3c2513 2360 *
a3efb2f0
QW
2361 * May try to flush write bio if we can't get the lock.
2362 *
2363 * Return 0 if the extent buffer doesn't need to be submitted.
2364 * (E.g. the extent buffer is not dirty)
2365 * Return >0 is the extent buffer is submitted to bio.
2366 * Return <0 if something went wrong, no page is locked.
2e3c2513 2367 */
9df76fb5 2368static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 2369 struct extent_page_data *epd)
0b32f4bb 2370{
9df76fb5 2371 struct btrfs_fs_info *fs_info = eb->fs_info;
c9583ada 2372 int i, num_pages;
0b32f4bb
JB
2373 int flush = 0;
2374 int ret = 0;
2375
2376 if (!btrfs_try_tree_write_lock(eb)) {
9845e5dd 2377 submit_write_bio(epd, 0);
2e3c2513 2378 flush = 1;
0b32f4bb
JB
2379 btrfs_tree_lock(eb);
2380 }
2381
2382 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
2383 btrfs_tree_unlock(eb);
2384 if (!epd->sync_io)
2385 return 0;
2386 if (!flush) {
9845e5dd 2387 submit_write_bio(epd, 0);
0b32f4bb
JB
2388 flush = 1;
2389 }
a098d8e8
CM
2390 while (1) {
2391 wait_on_extent_buffer_writeback(eb);
2392 btrfs_tree_lock(eb);
2393 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
2394 break;
0b32f4bb 2395 btrfs_tree_unlock(eb);
0b32f4bb
JB
2396 }
2397 }
2398
51561ffe
JB
2399 /*
2400 * We need to do this to prevent races in people who check if the eb is
2401 * under IO since we can end up having no IO bits set for a short period
2402 * of time.
2403 */
2404 spin_lock(&eb->refs_lock);
0b32f4bb
JB
2405 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2406 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 2407 spin_unlock(&eb->refs_lock);
0b32f4bb 2408 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
2409 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
2410 -eb->len,
2411 fs_info->dirty_metadata_batch);
0b32f4bb 2412 ret = 1;
51561ffe
JB
2413 } else {
2414 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
2415 }
2416
2417 btrfs_tree_unlock(eb);
2418
f3156df9
QW
2419 /*
2420 * Either we don't need to submit any tree block, or we're submitting
2421 * subpage eb.
2422 * Subpage metadata doesn't use page locking at all, so we can skip
2423 * the page locking.
2424 */
fbca46eb 2425 if (!ret || fs_info->nodesize < PAGE_SIZE)
0b32f4bb
JB
2426 return ret;
2427
65ad0104 2428 num_pages = num_extent_pages(eb);
0b32f4bb 2429 for (i = 0; i < num_pages; i++) {
fb85fc9a 2430 struct page *p = eb->pages[i];
0b32f4bb
JB
2431
2432 if (!trylock_page(p)) {
2433 if (!flush) {
9845e5dd 2434 submit_write_bio(epd, 0);
0b32f4bb
JB
2435 flush = 1;
2436 }
2437 lock_page(p);
2438 }
2439 }
2440
2e3c2513 2441 return ret;
0b32f4bb
JB
2442}
2443
5a2c6075 2444static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
656f30db 2445{
5a2c6075 2446 struct btrfs_fs_info *fs_info = eb->fs_info;
656f30db 2447
5a2c6075 2448 btrfs_page_set_error(fs_info, page, eb->start, eb->len);
656f30db
FM
2449 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
2450 return;
2451
c2e39305
JB
2452 /*
2453 * A read may stumble upon this buffer later, make sure that it gets an
2454 * error and knows there was an error.
2455 */
2456 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
2457
68b85589
JB
2458 /*
2459 * We need to set the mapping with the io error as well because a write
2460 * error will flip the file system readonly, and then syncfs() will
2461 * return a 0 because we are readonly if we don't modify the err seq for
2462 * the superblock.
2463 */
2464 mapping_set_error(page->mapping, -EIO);
2465
eb5b64f1
DZ
2466 /*
2467 * If we error out, we should add back the dirty_metadata_bytes
2468 * to make it consistent.
2469 */
eb5b64f1
DZ
2470 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
2471 eb->len, fs_info->dirty_metadata_batch);
2472
656f30db
FM
2473 /*
2474 * If writeback for a btree extent that doesn't belong to a log tree
2475 * failed, increment the counter transaction->eb_write_errors.
2476 * We do this because while the transaction is running and before it's
2477 * committing (when we call filemap_fdata[write|wait]_range against
2478 * the btree inode), we might have
2479 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
2480 * returns an error or an error happens during writeback, when we're
2481 * committing the transaction we wouldn't know about it, since the pages
2482 * can be no longer dirty nor marked anymore for writeback (if a
2483 * subsequent modification to the extent buffer didn't happen before the
2484 * transaction commit), which makes filemap_fdata[write|wait]_range not
2485 * able to find the pages tagged with SetPageError at transaction
2486 * commit time. So if this happens we must abort the transaction,
2487 * otherwise we commit a super block with btree roots that point to
2488 * btree nodes/leafs whose content on disk is invalid - either garbage
2489 * or the content of some node/leaf from a past generation that got
2490 * cowed or deleted and is no longer valid.
2491 *
2492 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
2493 * not be enough - we need to distinguish between log tree extents vs
2494 * non-log tree extents, and the next filemap_fdatawait_range() call
2495 * will catch and clear such errors in the mapping - and that call might
2496 * be from a log sync and not from a transaction commit. Also, checking
2497 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
2498 * not done and would not be reliable - the eb might have been released
2499 * from memory and reading it back again means that flag would not be
2500 * set (since it's a runtime flag, not persisted on disk).
2501 *
2502 * Using the flags below in the btree inode also makes us achieve the
2503 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
2504 * writeback for all dirty pages and before filemap_fdatawait_range()
2505 * is called, the writeback for all dirty pages had already finished
2506 * with errors - because we were not using AS_EIO/AS_ENOSPC,
2507 * filemap_fdatawait_range() would return success, as it could not know
2508 * that writeback errors happened (the pages were no longer tagged for
2509 * writeback).
2510 */
2511 switch (eb->log_index) {
2512 case -1:
5a2c6075 2513 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
656f30db
FM
2514 break;
2515 case 0:
5a2c6075 2516 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
656f30db
FM
2517 break;
2518 case 1:
5a2c6075 2519 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
656f30db
FM
2520 break;
2521 default:
2522 BUG(); /* unexpected, logic error */
2523 }
2524}
2525
2f3186d8
QW
2526/*
2527 * The endio specific version which won't touch any unsafe spinlock in endio
2528 * context.
2529 */
2530static struct extent_buffer *find_extent_buffer_nolock(
2531 struct btrfs_fs_info *fs_info, u64 start)
2532{
2533 struct extent_buffer *eb;
2534
2535 rcu_read_lock();
01cd3909
DS
2536 eb = radix_tree_lookup(&fs_info->buffer_radix,
2537 start >> fs_info->sectorsize_bits);
2f3186d8
QW
2538 if (eb && atomic_inc_not_zero(&eb->refs)) {
2539 rcu_read_unlock();
2540 return eb;
2541 }
2542 rcu_read_unlock();
2543 return NULL;
2544}
2545
2546/*
2547 * The endio function for subpage extent buffer write.
2548 *
2549 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
2550 * after all extent buffers in the page has finished their writeback.
2551 */
917f32a2 2552static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
2f3186d8 2553{
917f32a2 2554 struct bio *bio = &bbio->bio;
fa04c165 2555 struct btrfs_fs_info *fs_info;
2f3186d8
QW
2556 struct bio_vec *bvec;
2557 struct bvec_iter_all iter_all;
2558
fa04c165 2559 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
fbca46eb 2560 ASSERT(fs_info->nodesize < PAGE_SIZE);
fa04c165 2561
2f3186d8
QW
2562 ASSERT(!bio_flagged(bio, BIO_CLONED));
2563 bio_for_each_segment_all(bvec, bio, iter_all) {
2564 struct page *page = bvec->bv_page;
2565 u64 bvec_start = page_offset(page) + bvec->bv_offset;
2566 u64 bvec_end = bvec_start + bvec->bv_len - 1;
2567 u64 cur_bytenr = bvec_start;
2568
2569 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
2570
2571 /* Iterate through all extent buffers in the range */
2572 while (cur_bytenr <= bvec_end) {
2573 struct extent_buffer *eb;
2574 int done;
2575
2576 /*
2577 * Here we can't use find_extent_buffer(), as it may
2578 * try to lock eb->refs_lock, which is not safe in endio
2579 * context.
2580 */
2581 eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
2582 ASSERT(eb);
2583
2584 cur_bytenr = eb->start + eb->len;
2585
2586 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
2587 done = atomic_dec_and_test(&eb->io_pages);
2588 ASSERT(done);
2589
2590 if (bio->bi_status ||
2591 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
2592 ClearPageUptodate(page);
2593 set_btree_ioerr(page, eb);
2594 }
2595
2596 btrfs_subpage_clear_writeback(fs_info, page, eb->start,
2597 eb->len);
2598 end_extent_buffer_writeback(eb);
2599 /*
2600 * free_extent_buffer() will grab spinlock which is not
2601 * safe in endio context. Thus here we manually dec
2602 * the ref.
2603 */
2604 atomic_dec(&eb->refs);
2605 }
2606 }
2607 bio_put(bio);
2608}
2609
917f32a2 2610static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
0b32f4bb 2611{
917f32a2 2612 struct bio *bio = &bbio->bio;
2c30c71b 2613 struct bio_vec *bvec;
0b32f4bb 2614 struct extent_buffer *eb;
2b070cfe 2615 int done;
6dc4f100 2616 struct bvec_iter_all iter_all;
0b32f4bb 2617
c09abff8 2618 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2619 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
2620 struct page *page = bvec->bv_page;
2621
0b32f4bb
JB
2622 eb = (struct extent_buffer *)page->private;
2623 BUG_ON(!eb);
2624 done = atomic_dec_and_test(&eb->io_pages);
2625
4e4cbee9 2626 if (bio->bi_status ||
4246a0b6 2627 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 2628 ClearPageUptodate(page);
5a2c6075 2629 set_btree_ioerr(page, eb);
0b32f4bb
JB
2630 }
2631
2632 end_page_writeback(page);
2633
2634 if (!done)
2635 continue;
2636
2637 end_extent_buffer_writeback(eb);
2c30c71b 2638 }
0b32f4bb
JB
2639
2640 bio_put(bio);
0b32f4bb
JB
2641}
2642
fa04c165
QW
2643static void prepare_eb_write(struct extent_buffer *eb)
2644{
2645 u32 nritems;
2646 unsigned long start;
2647 unsigned long end;
2648
2649 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
2650 atomic_set(&eb->io_pages, num_extent_pages(eb));
2651
2652 /* Set btree blocks beyond nritems with 0 to avoid stale content */
2653 nritems = btrfs_header_nritems(eb);
2654 if (btrfs_header_level(eb) > 0) {
2655 end = btrfs_node_key_ptr_offset(nritems);
2656 memzero_extent_buffer(eb, end, eb->len - end);
2657 } else {
2658 /*
2659 * Leaf:
2660 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
2661 */
2662 start = btrfs_item_nr_offset(nritems);
2663 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
2664 memzero_extent_buffer(eb, start, end - start);
2665 }
2666}
2667
35b6ddfa
QW
2668/*
2669 * Unlike the work in write_one_eb(), we rely completely on extent locking.
2670 * Page locking is only utilized at minimum to keep the VMM code happy.
35b6ddfa
QW
2671 */
2672static int write_one_subpage_eb(struct extent_buffer *eb,
2673 struct writeback_control *wbc,
2674 struct extent_page_data *epd)
2675{
2676 struct btrfs_fs_info *fs_info = eb->fs_info;
2677 struct page *page = eb->pages[0];
353767e4 2678 blk_opf_t write_flags = wbc_to_write_flags(wbc);
35b6ddfa
QW
2679 bool no_dirty_ebs = false;
2680 int ret;
2681
fa04c165
QW
2682 prepare_eb_write(eb);
2683
35b6ddfa
QW
2684 /* clear_page_dirty_for_io() in subpage helper needs page locked */
2685 lock_page(page);
2686 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
2687
2688 /* Check if this is the last dirty bit to update nr_written */
2689 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
2690 eb->start, eb->len);
2691 if (no_dirty_ebs)
2692 clear_page_dirty_for_io(page);
2693
5467abba
QW
2694 epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage;
2695
390ed29b 2696 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
209ecde5 2697 &epd->bio_ctrl, eb->start, page, eb->len,
5467abba 2698 eb->start - page_offset(page), 0, false);
35b6ddfa
QW
2699 if (ret) {
2700 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
2701 set_btree_ioerr(page, eb);
2702 unlock_page(page);
2703
2704 if (atomic_dec_and_test(&eb->io_pages))
2705 end_extent_buffer_writeback(eb);
2706 return -EIO;
2707 }
2708 unlock_page(page);
2709 /*
2710 * Submission finished without problem, if no range of the page is
2711 * dirty anymore, we have submitted a page. Update nr_written in wbc.
2712 */
2713 if (no_dirty_ebs)
572f3dad 2714 wbc->nr_to_write--;
35b6ddfa
QW
2715 return ret;
2716}
2717
0e378df1 2718static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
2719 struct writeback_control *wbc,
2720 struct extent_page_data *epd)
2721{
0c64c33c 2722 u64 disk_bytenr = eb->start;
cc5e31a4 2723 int i, num_pages;
353767e4 2724 blk_opf_t write_flags = wbc_to_write_flags(wbc);
d7dbe9e7 2725 int ret = 0;
0b32f4bb 2726
fa04c165 2727 prepare_eb_write(eb);
35b6ddfa 2728
5467abba
QW
2729 epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage;
2730
fa04c165 2731 num_pages = num_extent_pages(eb);
0b32f4bb 2732 for (i = 0; i < num_pages; i++) {
fb85fc9a 2733 struct page *p = eb->pages[i];
0b32f4bb
JB
2734
2735 clear_page_dirty_for_io(p);
2736 set_page_writeback(p);
0ceb34bf 2737 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
209ecde5 2738 &epd->bio_ctrl, disk_bytenr, p,
5467abba 2739 PAGE_SIZE, 0, 0, false);
0b32f4bb 2740 if (ret) {
5a2c6075 2741 set_btree_ioerr(p, eb);
fe01aa65
TK
2742 if (PageWriteback(p))
2743 end_page_writeback(p);
0b32f4bb
JB
2744 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
2745 end_extent_buffer_writeback(eb);
2746 ret = -EIO;
2747 break;
2748 }
0c64c33c 2749 disk_bytenr += PAGE_SIZE;
572f3dad 2750 wbc->nr_to_write--;
0b32f4bb
JB
2751 unlock_page(p);
2752 }
2753
2754 if (unlikely(ret)) {
2755 for (; i < num_pages; i++) {
bbf65cf0 2756 struct page *p = eb->pages[i];
81465028 2757 clear_page_dirty_for_io(p);
0b32f4bb
JB
2758 unlock_page(p);
2759 }
2760 }
2761
2762 return ret;
2763}
2764
c4aec299
QW
2765/*
2766 * Submit one subpage btree page.
2767 *
2768 * The main difference to submit_eb_page() is:
2769 * - Page locking
2770 * For subpage, we don't rely on page locking at all.
2771 *
2772 * - Flush write bio
2773 * We only flush bio if we may be unable to fit current extent buffers into
2774 * current bio.
2775 *
2776 * Return >=0 for the number of submitted extent buffers.
2777 * Return <0 for fatal error.
2778 */
2779static int submit_eb_subpage(struct page *page,
2780 struct writeback_control *wbc,
2781 struct extent_page_data *epd)
2782{
2783 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2784 int submitted = 0;
2785 u64 page_start = page_offset(page);
2786 int bit_start = 0;
c4aec299
QW
2787 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
2788 int ret;
2789
2790 /* Lock and write each dirty extent buffers in the range */
72a69cd0 2791 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
c4aec299
QW
2792 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
2793 struct extent_buffer *eb;
2794 unsigned long flags;
2795 u64 start;
2796
2797 /*
2798 * Take private lock to ensure the subpage won't be detached
2799 * in the meantime.
2800 */
2801 spin_lock(&page->mapping->private_lock);
2802 if (!PagePrivate(page)) {
2803 spin_unlock(&page->mapping->private_lock);
2804 break;
2805 }
2806 spin_lock_irqsave(&subpage->lock, flags);
72a69cd0
QW
2807 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
2808 subpage->bitmaps)) {
c4aec299
QW
2809 spin_unlock_irqrestore(&subpage->lock, flags);
2810 spin_unlock(&page->mapping->private_lock);
2811 bit_start++;
2812 continue;
2813 }
2814
2815 start = page_start + bit_start * fs_info->sectorsize;
2816 bit_start += sectors_per_node;
2817
2818 /*
2819 * Here we just want to grab the eb without touching extra
2820 * spin locks, so call find_extent_buffer_nolock().
2821 */
2822 eb = find_extent_buffer_nolock(fs_info, start);
2823 spin_unlock_irqrestore(&subpage->lock, flags);
2824 spin_unlock(&page->mapping->private_lock);
2825
2826 /*
2827 * The eb has already reached 0 refs thus find_extent_buffer()
2828 * doesn't return it. We don't need to write back such eb
2829 * anyway.
2830 */
2831 if (!eb)
2832 continue;
2833
2834 ret = lock_extent_buffer_for_io(eb, epd);
2835 if (ret == 0) {
2836 free_extent_buffer(eb);
2837 continue;
2838 }
2839 if (ret < 0) {
2840 free_extent_buffer(eb);
2841 goto cleanup;
2842 }
fa04c165 2843 ret = write_one_subpage_eb(eb, wbc, epd);
c4aec299
QW
2844 free_extent_buffer(eb);
2845 if (ret < 0)
2846 goto cleanup;
2847 submitted++;
2848 }
2849 return submitted;
2850
2851cleanup:
2852 /* We hit error, end bio for the submitted extent buffers */
9845e5dd 2853 submit_write_bio(epd, ret);
c4aec299
QW
2854 return ret;
2855}
2856
f91e0d0c
QW
2857/*
2858 * Submit all page(s) of one extent buffer.
2859 *
2860 * @page: the page of one extent buffer
2861 * @eb_context: to determine if we need to submit this page, if current page
2862 * belongs to this eb, we don't need to submit
2863 *
2864 * The caller should pass each page in their bytenr order, and here we use
2865 * @eb_context to determine if we have submitted pages of one extent buffer.
2866 *
2867 * If we have, we just skip until we hit a new page that doesn't belong to
2868 * current @eb_context.
2869 *
2870 * If not, we submit all the page(s) of the extent buffer.
2871 *
2872 * Return >0 if we have submitted the extent buffer successfully.
2873 * Return 0 if we don't need to submit the page, as it's already submitted by
2874 * previous call.
2875 * Return <0 for fatal error.
2876 */
2877static int submit_eb_page(struct page *page, struct writeback_control *wbc,
2878 struct extent_page_data *epd,
2879 struct extent_buffer **eb_context)
2880{
2881 struct address_space *mapping = page->mapping;
0bc09ca1 2882 struct btrfs_block_group *cache = NULL;
f91e0d0c
QW
2883 struct extent_buffer *eb;
2884 int ret;
2885
2886 if (!PagePrivate(page))
2887 return 0;
2888
fbca46eb 2889 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
c4aec299
QW
2890 return submit_eb_subpage(page, wbc, epd);
2891
f91e0d0c
QW
2892 spin_lock(&mapping->private_lock);
2893 if (!PagePrivate(page)) {
2894 spin_unlock(&mapping->private_lock);
2895 return 0;
2896 }
2897
2898 eb = (struct extent_buffer *)page->private;
2899
2900 /*
2901 * Shouldn't happen and normally this would be a BUG_ON but no point
2902 * crashing the machine for something we can survive anyway.
2903 */
2904 if (WARN_ON(!eb)) {
2905 spin_unlock(&mapping->private_lock);
2906 return 0;
2907 }
2908
2909 if (eb == *eb_context) {
2910 spin_unlock(&mapping->private_lock);
2911 return 0;
2912 }
2913 ret = atomic_inc_not_zero(&eb->refs);
2914 spin_unlock(&mapping->private_lock);
2915 if (!ret)
2916 return 0;
2917
0bc09ca1
NA
2918 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
2919 /*
2920 * If for_sync, this hole will be filled with
2921 * trasnsaction commit.
2922 */
2923 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
2924 ret = -EAGAIN;
2925 else
2926 ret = 0;
2927 free_extent_buffer(eb);
2928 return ret;
2929 }
2930
f91e0d0c
QW
2931 *eb_context = eb;
2932
2933 ret = lock_extent_buffer_for_io(eb, epd);
2934 if (ret <= 0) {
0bc09ca1
NA
2935 btrfs_revert_meta_write_pointer(cache, eb);
2936 if (cache)
2937 btrfs_put_block_group(cache);
f91e0d0c
QW
2938 free_extent_buffer(eb);
2939 return ret;
2940 }
be1a1d7a 2941 if (cache) {
d3e29967
NB
2942 /*
2943 * Implies write in zoned mode. Mark the last eb in a block group.
2944 */
56fbb0a4 2945 btrfs_schedule_zone_finish_bg(cache, eb);
d3e29967 2946 btrfs_put_block_group(cache);
be1a1d7a 2947 }
f91e0d0c
QW
2948 ret = write_one_eb(eb, wbc, epd);
2949 free_extent_buffer(eb);
2950 if (ret < 0)
2951 return ret;
2952 return 1;
2953}
2954
0b32f4bb
JB
2955int btree_write_cache_pages(struct address_space *mapping,
2956 struct writeback_control *wbc)
2957{
f91e0d0c 2958 struct extent_buffer *eb_context = NULL;
0b32f4bb 2959 struct extent_page_data epd = {
390ed29b 2960 .bio_ctrl = { 0 },
0b32f4bb
JB
2961 .extent_locked = 0,
2962 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2963 };
b3ff8f1d 2964 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
0b32f4bb
JB
2965 int ret = 0;
2966 int done = 0;
2967 int nr_to_write_done = 0;
2968 struct pagevec pvec;
2969 int nr_pages;
2970 pgoff_t index;
2971 pgoff_t end; /* Inclusive */
2972 int scanned = 0;
10bbd235 2973 xa_mark_t tag;
0b32f4bb 2974
86679820 2975 pagevec_init(&pvec);
0b32f4bb
JB
2976 if (wbc->range_cyclic) {
2977 index = mapping->writeback_index; /* Start from prev offset */
2978 end = -1;
556755a8
JB
2979 /*
2980 * Start from the beginning does not need to cycle over the
2981 * range, mark it as scanned.
2982 */
2983 scanned = (index == 0);
0b32f4bb 2984 } else {
09cbfeaf
KS
2985 index = wbc->range_start >> PAGE_SHIFT;
2986 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
2987 scanned = 1;
2988 }
2989 if (wbc->sync_mode == WB_SYNC_ALL)
2990 tag = PAGECACHE_TAG_TOWRITE;
2991 else
2992 tag = PAGECACHE_TAG_DIRTY;
0bc09ca1 2993 btrfs_zoned_meta_io_lock(fs_info);
0b32f4bb
JB
2994retry:
2995 if (wbc->sync_mode == WB_SYNC_ALL)
2996 tag_pages_for_writeback(mapping, index, end);
2997 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 2998 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 2999 tag))) {
0b32f4bb
JB
3000 unsigned i;
3001
0b32f4bb
JB
3002 for (i = 0; i < nr_pages; i++) {
3003 struct page *page = pvec.pages[i];
3004
f91e0d0c
QW
3005 ret = submit_eb_page(page, wbc, &epd, &eb_context);
3006 if (ret == 0)
0b32f4bb 3007 continue;
f91e0d0c 3008 if (ret < 0) {
0b32f4bb 3009 done = 1;
0b32f4bb
JB
3010 break;
3011 }
0b32f4bb
JB
3012
3013 /*
3014 * the filesystem may choose to bump up nr_to_write.
3015 * We have to make sure to honor the new nr_to_write
3016 * at any time
3017 */
3018 nr_to_write_done = wbc->nr_to_write <= 0;
3019 }
3020 pagevec_release(&pvec);
3021 cond_resched();
3022 }
3023 if (!scanned && !done) {
3024 /*
3025 * We hit the last page and there is more work to be done: wrap
3026 * back to the start of the file
3027 */
3028 scanned = 1;
3029 index = 0;
3030 goto retry;
3031 }
b3ff8f1d
QW
3032 /*
3033 * If something went wrong, don't allow any metadata write bio to be
3034 * submitted.
3035 *
3036 * This would prevent use-after-free if we had dirty pages not
3037 * cleaned up, which can still happen by fuzzed images.
3038 *
3039 * - Bad extent tree
3040 * Allowing existing tree block to be allocated for other trees.
3041 *
3042 * - Log tree operations
3043 * Exiting tree blocks get allocated to log tree, bumps its
3044 * generation, then get cleaned in tree re-balance.
3045 * Such tree block will not be written back, since it's clean,
3046 * thus no WRITTEN flag set.
3047 * And after log writes back, this tree block is not traced by
3048 * any dirty extent_io_tree.
3049 *
3050 * - Offending tree block gets re-dirtied from its original owner
3051 * Since it has bumped generation, no WRITTEN flag, it can be
3052 * reused without COWing. This tree block will not be traced
3053 * by btrfs_transaction::dirty_pages.
3054 *
3055 * Now such dirty tree block will not be cleaned by any dirty
3056 * extent io tree. Thus we don't want to submit such wild eb
3057 * if the fs already has error.
9845e5dd 3058 *
c9583ada
QW
3059 * We can get ret > 0 from submit_extent_page() indicating how many ebs
3060 * were submitted. Reset it to 0 to avoid false alerts for the caller.
3061 */
3062 if (ret > 0)
3063 ret = 0;
9845e5dd
CH
3064 if (!ret && BTRFS_FS_ERROR(fs_info))
3065 ret = -EROFS;
3066 submit_write_bio(&epd, ret);
3067
3068 btrfs_zoned_meta_io_unlock(fs_info);
0b32f4bb
JB
3069 return ret;
3070}
3071
43dd529a 3072/*
3bed2da1
NB
3073 * Walk the list of dirty pages of the given address space and write all of them.
3074 *
d1310b2e 3075 * @mapping: address space structure to write
3bed2da1
NB
3076 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3077 * @epd: holds context for the write, namely the bio
d1310b2e
CM
3078 *
3079 * If a page is already under I/O, write_cache_pages() skips it, even
3080 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
3081 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
3082 * and msync() need to guarantee that all the data which was dirty at the time
3083 * the call was made get new I/O started against them. If wbc->sync_mode is
3084 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3085 * existing IO to complete.
3086 */
4242b64a 3087static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 3088 struct writeback_control *wbc,
aab6e9ed 3089 struct extent_page_data *epd)
d1310b2e 3090{
7fd1a3f7 3091 struct inode *inode = mapping->host;
d1310b2e
CM
3092 int ret = 0;
3093 int done = 0;
f85d7d6c 3094 int nr_to_write_done = 0;
d1310b2e
CM
3095 struct pagevec pvec;
3096 int nr_pages;
3097 pgoff_t index;
3098 pgoff_t end; /* Inclusive */
a9132667
LB
3099 pgoff_t done_index;
3100 int range_whole = 0;
d1310b2e 3101 int scanned = 0;
10bbd235 3102 xa_mark_t tag;
d1310b2e 3103
7fd1a3f7
JB
3104 /*
3105 * We have to hold onto the inode so that ordered extents can do their
3106 * work when the IO finishes. The alternative to this is failing to add
3107 * an ordered extent if the igrab() fails there and that is a huge pain
3108 * to deal with, so instead just hold onto the inode throughout the
3109 * writepages operation. If it fails here we are freeing up the inode
3110 * anyway and we'd rather not waste our time writing out stuff that is
3111 * going to be truncated anyway.
3112 */
3113 if (!igrab(inode))
3114 return 0;
3115
86679820 3116 pagevec_init(&pvec);
d1310b2e
CM
3117 if (wbc->range_cyclic) {
3118 index = mapping->writeback_index; /* Start from prev offset */
3119 end = -1;
556755a8
JB
3120 /*
3121 * Start from the beginning does not need to cycle over the
3122 * range, mark it as scanned.
3123 */
3124 scanned = (index == 0);
d1310b2e 3125 } else {
09cbfeaf
KS
3126 index = wbc->range_start >> PAGE_SHIFT;
3127 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
3128 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
3129 range_whole = 1;
d1310b2e
CM
3130 scanned = 1;
3131 }
3cd24c69
EL
3132
3133 /*
3134 * We do the tagged writepage as long as the snapshot flush bit is set
3135 * and we are the first one who do the filemap_flush() on this inode.
3136 *
3137 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
3138 * not race in and drop the bit.
3139 */
3140 if (range_whole && wbc->nr_to_write == LONG_MAX &&
3141 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
3142 &BTRFS_I(inode)->runtime_flags))
3143 wbc->tagged_writepages = 1;
3144
3145 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
3146 tag = PAGECACHE_TAG_TOWRITE;
3147 else
3148 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 3149retry:
3cd24c69 3150 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 3151 tag_pages_for_writeback(mapping, index, end);
a9132667 3152 done_index = index;
f85d7d6c 3153 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
3154 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
3155 &index, end, tag))) {
d1310b2e
CM
3156 unsigned i;
3157
d1310b2e
CM
3158 for (i = 0; i < nr_pages; i++) {
3159 struct page *page = pvec.pages[i];
3160
f7bddf1e 3161 done_index = page->index + 1;
d1310b2e 3162 /*
b93b0163
MW
3163 * At this point we hold neither the i_pages lock nor
3164 * the page lock: the page may be truncated or
3165 * invalidated (changing page->mapping to NULL),
3166 * or even swizzled back from swapper_space to
3167 * tmpfs file mapping
d1310b2e 3168 */
c8f2f24b 3169 if (!trylock_page(page)) {
9845e5dd 3170 submit_write_bio(epd, 0);
c8f2f24b 3171 lock_page(page);
01d658f2 3172 }
d1310b2e
CM
3173
3174 if (unlikely(page->mapping != mapping)) {
3175 unlock_page(page);
3176 continue;
3177 }
3178
d2c3f4f6 3179 if (wbc->sync_mode != WB_SYNC_NONE) {
c9583ada 3180 if (PageWriteback(page))
9845e5dd 3181 submit_write_bio(epd, 0);
d1310b2e 3182 wait_on_page_writeback(page);
d2c3f4f6 3183 }
d1310b2e
CM
3184
3185 if (PageWriteback(page) ||
3186 !clear_page_dirty_for_io(page)) {
3187 unlock_page(page);
3188 continue;
3189 }
3190
aab6e9ed 3191 ret = __extent_writepage(page, wbc, epd);
a9132667 3192 if (ret < 0) {
a9132667
LB
3193 done = 1;
3194 break;
3195 }
f85d7d6c
CM
3196
3197 /*
3198 * the filesystem may choose to bump up nr_to_write.
3199 * We have to make sure to honor the new nr_to_write
3200 * at any time
3201 */
3202 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
3203 }
3204 pagevec_release(&pvec);
3205 cond_resched();
3206 }
894b36e3 3207 if (!scanned && !done) {
d1310b2e
CM
3208 /*
3209 * We hit the last page and there is more work to be done: wrap
3210 * back to the start of the file
3211 */
3212 scanned = 1;
3213 index = 0;
42ffb0bf
JB
3214
3215 /*
3216 * If we're looping we could run into a page that is locked by a
3217 * writer and that writer could be waiting on writeback for a
3218 * page in our current bio, and thus deadlock, so flush the
3219 * write bio here.
3220 */
9845e5dd 3221 submit_write_bio(epd, 0);
c9583ada 3222 goto retry;
d1310b2e 3223 }
a9132667
LB
3224
3225 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
3226 mapping->writeback_index = done_index;
3227
7fd1a3f7 3228 btrfs_add_delayed_iput(inode);
894b36e3 3229 return ret;
d1310b2e 3230}
d1310b2e 3231
2bd0fc93
QW
3232/*
3233 * Submit the pages in the range to bio for call sites which delalloc range has
3234 * already been ran (aka, ordered extent inserted) and all pages are still
3235 * locked.
3236 */
3237int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
771ed689 3238{
2bd0fc93
QW
3239 bool found_error = false;
3240 int first_error = 0;
771ed689
CM
3241 int ret = 0;
3242 struct address_space *mapping = inode->i_mapping;
3243 struct page *page;
2bd0fc93 3244 u64 cur = start;
66448b9d
QW
3245 unsigned long nr_pages;
3246 const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
771ed689 3247 struct extent_page_data epd = {
390ed29b 3248 .bio_ctrl = { 0 },
771ed689 3249 .extent_locked = 1,
2bd0fc93 3250 .sync_io = 1,
771ed689
CM
3251 };
3252 struct writeback_control wbc_writepages = {
2bd0fc93 3253 .sync_mode = WB_SYNC_ALL,
771ed689
CM
3254 .range_start = start,
3255 .range_end = end + 1,
ec39f769
CM
3256 /* We're called from an async helper function */
3257 .punt_to_cgroup = 1,
3258 .no_cgroup_owner = 1,
771ed689
CM
3259 };
3260
66448b9d
QW
3261 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
3262 nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
3263 PAGE_SHIFT;
3264 wbc_writepages.nr_to_write = nr_pages * 2;
3265
dbb70bec 3266 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
2bd0fc93 3267 while (cur <= end) {
66448b9d
QW
3268 u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
3269
2bd0fc93
QW
3270 page = find_get_page(mapping, cur >> PAGE_SHIFT);
3271 /*
3272 * All pages in the range are locked since
3273 * btrfs_run_delalloc_range(), thus there is no way to clear
3274 * the page dirty flag.
3275 */
66448b9d 3276 ASSERT(PageLocked(page));
2bd0fc93
QW
3277 ASSERT(PageDirty(page));
3278 clear_page_dirty_for_io(page);
3279 ret = __extent_writepage(page, &wbc_writepages, &epd);
3280 ASSERT(ret <= 0);
3281 if (ret < 0) {
3282 found_error = true;
3283 first_error = ret;
771ed689 3284 }
09cbfeaf 3285 put_page(page);
66448b9d 3286 cur = cur_end + 1;
771ed689
CM
3287 }
3288
9845e5dd 3289 submit_write_bio(&epd, found_error ? ret : 0);
dbb70bec
CM
3290
3291 wbc_detach_inode(&wbc_writepages);
2bd0fc93
QW
3292 if (found_error)
3293 return first_error;
771ed689
CM
3294 return ret;
3295}
d1310b2e 3296
8ae225a8 3297int extent_writepages(struct address_space *mapping,
d1310b2e
CM
3298 struct writeback_control *wbc)
3299{
35156d85 3300 struct inode *inode = mapping->host;
d1310b2e
CM
3301 int ret = 0;
3302 struct extent_page_data epd = {
390ed29b 3303 .bio_ctrl = { 0 },
771ed689 3304 .extent_locked = 0,
ffbd517d 3305 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
3306 };
3307
35156d85
JT
3308 /*
3309 * Allow only a single thread to do the reloc work in zoned mode to
3310 * protect the write pointer updates.
3311 */
869f4cdc 3312 btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
935db853 3313 ret = extent_write_cache_pages(mapping, wbc, &epd);
9845e5dd 3314 submit_write_bio(&epd, ret);
19ab78ca 3315 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
d1310b2e
CM
3316 return ret;
3317}
d1310b2e 3318
ba206a02 3319void extent_readahead(struct readahead_control *rac)
d1310b2e 3320{
390ed29b 3321 struct btrfs_bio_ctrl bio_ctrl = { 0 };
67c9684f 3322 struct page *pagepool[16];
125bac01 3323 struct extent_map *em_cached = NULL;
808f80b4 3324 u64 prev_em_start = (u64)-1;
ba206a02 3325 int nr;
d1310b2e 3326
ba206a02 3327 while ((nr = readahead_page_batch(rac, pagepool))) {
32c0a6bc
MWO
3328 u64 contig_start = readahead_pos(rac);
3329 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
e65ef21e 3330
ba206a02 3331 contiguous_readpages(pagepool, nr, contig_start, contig_end,
390ed29b 3332 &em_cached, &bio_ctrl, &prev_em_start);
d1310b2e 3333 }
67c9684f 3334
125bac01
MX
3335 if (em_cached)
3336 free_extent_map(em_cached);
722c82ac 3337 submit_one_bio(&bio_ctrl);
d1310b2e 3338}
d1310b2e
CM
3339
3340/*
895586eb
MWO
3341 * basic invalidate_folio code, this waits on any locked or writeback
3342 * ranges corresponding to the folio, and then deletes any extent state
d1310b2e
CM
3343 * records from the tree
3344 */
895586eb
MWO
3345int extent_invalidate_folio(struct extent_io_tree *tree,
3346 struct folio *folio, size_t offset)
d1310b2e 3347{
2ac55d41 3348 struct extent_state *cached_state = NULL;
895586eb
MWO
3349 u64 start = folio_pos(folio);
3350 u64 end = start + folio_size(folio) - 1;
3351 size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
d1310b2e 3352
829ddec9
QW
3353 /* This function is only called for the btree inode */
3354 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
3355
fda2832f 3356 start += ALIGN(offset, blocksize);
d1310b2e
CM
3357 if (start > end)
3358 return 0;
3359
570eb97b 3360 lock_extent(tree, start, end, &cached_state);
895586eb 3361 folio_wait_writeback(folio);
829ddec9
QW
3362
3363 /*
3364 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
3365 * so here we only need to unlock the extent range to free any
3366 * existing extent state.
3367 */
570eb97b 3368 unlock_extent(tree, start, end, &cached_state);
d1310b2e
CM
3369 return 0;
3370}
d1310b2e 3371
7b13b7b1 3372/*
f913cff3 3373 * a helper for release_folio, this tests for areas of the page that
7b13b7b1
CM
3374 * are locked or under IO and drops the related state bits if it is safe
3375 * to drop the page.
3376 */
29c68b2d 3377static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 3378 struct page *page, gfp_t mask)
7b13b7b1 3379{
4eee4fa4 3380 u64 start = page_offset(page);
09cbfeaf 3381 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
3382 int ret = 1;
3383
8882679e 3384 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 3385 ret = 0;
8882679e 3386 } else {
b71fb16b
JB
3387 u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
3388 EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
3389
11ef160f 3390 /*
2766ff61
FM
3391 * At this point we can safely clear everything except the
3392 * locked bit, the nodatasum bit and the delalloc new bit.
3393 * The delalloc new bit will be cleared by ordered extent
3394 * completion.
11ef160f 3395 */
bd015294 3396 ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
b71fb16b 3397 mask, NULL);
e3f24cc5
CM
3398
3399 /* if clear_extent_bit failed for enomem reasons,
3400 * we can't allow the release to continue.
3401 */
3402 if (ret < 0)
3403 ret = 0;
3404 else
3405 ret = 1;
7b13b7b1
CM
3406 }
3407 return ret;
3408}
7b13b7b1 3409
d1310b2e 3410/*
f913cff3 3411 * a helper for release_folio. As long as there are no locked extents
d1310b2e
CM
3412 * in the range corresponding to the page, both state records and extent
3413 * map records are removed
3414 */
477a30ba 3415int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
3416{
3417 struct extent_map *em;
4eee4fa4 3418 u64 start = page_offset(page);
09cbfeaf 3419 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
3420 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
3421 struct extent_io_tree *tree = &btrfs_inode->io_tree;
3422 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 3423
d0164adc 3424 if (gfpflags_allow_blocking(mask) &&
ee22184b 3425 page->mapping->host->i_size > SZ_16M) {
39b5637f 3426 u64 len;
70dec807 3427 while (start <= end) {
fbc2bd7e
FM
3428 struct btrfs_fs_info *fs_info;
3429 u64 cur_gen;
3430
39b5637f 3431 len = end - start + 1;
890871be 3432 write_lock(&map->lock);
39b5637f 3433 em = lookup_extent_mapping(map, start, len);
285190d9 3434 if (!em) {
890871be 3435 write_unlock(&map->lock);
70dec807
CM
3436 break;
3437 }
7f3c74fb
CM
3438 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3439 em->start != start) {
890871be 3440 write_unlock(&map->lock);
70dec807
CM
3441 free_extent_map(em);
3442 break;
3443 }
3d6448e6
FM
3444 if (test_range_bit(tree, em->start,
3445 extent_map_end(em) - 1,
3446 EXTENT_LOCKED, 0, NULL))
3447 goto next;
3448 /*
3449 * If it's not in the list of modified extents, used
3450 * by a fast fsync, we can remove it. If it's being
3451 * logged we can safely remove it since fsync took an
3452 * extra reference on the em.
3453 */
3454 if (list_empty(&em->list) ||
fbc2bd7e
FM
3455 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
3456 goto remove_em;
3457 /*
3458 * If it's in the list of modified extents, remove it
3459 * only if its generation is older then the current one,
3460 * in which case we don't need it for a fast fsync.
3461 * Otherwise don't remove it, we could be racing with an
3462 * ongoing fast fsync that could miss the new extent.
3463 */
3464 fs_info = btrfs_inode->root->fs_info;
3465 spin_lock(&fs_info->trans_lock);
3466 cur_gen = fs_info->generation;
3467 spin_unlock(&fs_info->trans_lock);
3468 if (em->generation >= cur_gen)
3469 goto next;
3470remove_em:
5e548b32
FM
3471 /*
3472 * We only remove extent maps that are not in the list of
3473 * modified extents or that are in the list but with a
3474 * generation lower then the current generation, so there
3475 * is no need to set the full fsync flag on the inode (it
3476 * hurts the fsync performance for workloads with a data
3477 * size that exceeds or is close to the system's memory).
3478 */
fbc2bd7e
FM
3479 remove_extent_mapping(map, em);
3480 /* once for the rb tree */
3481 free_extent_map(em);
3d6448e6 3482next:
70dec807 3483 start = extent_map_end(em);
890871be 3484 write_unlock(&map->lock);
70dec807
CM
3485
3486 /* once for us */
d1310b2e 3487 free_extent_map(em);
9f47eb54
PM
3488
3489 cond_resched(); /* Allow large-extent preemption. */
d1310b2e 3490 }
d1310b2e 3491 }
29c68b2d 3492 return try_release_extent_state(tree, page, mask);
d1310b2e 3493}
d1310b2e 3494
4751832d
QW
3495/*
3496 * To cache previous fiemap extent
3497 *
3498 * Will be used for merging fiemap extent
3499 */
3500struct fiemap_cache {
3501 u64 offset;
3502 u64 phys;
3503 u64 len;
3504 u32 flags;
3505 bool cached;
3506};
3507
3508/*
3509 * Helper to submit fiemap extent.
3510 *
3511 * Will try to merge current fiemap extent specified by @offset, @phys,
3512 * @len and @flags with cached one.
3513 * And only when we fails to merge, cached one will be submitted as
3514 * fiemap extent.
3515 *
3516 * Return value is the same as fiemap_fill_next_extent().
3517 */
3518static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
3519 struct fiemap_cache *cache,
3520 u64 offset, u64 phys, u64 len, u32 flags)
3521{
3522 int ret = 0;
3523
ac3c0d36
FM
3524 /* Set at the end of extent_fiemap(). */
3525 ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
3526
4751832d
QW
3527 if (!cache->cached)
3528 goto assign;
3529
3530 /*
3531 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 3532 * fiemap extent won't overlap with cached one.
4751832d
QW
3533 * Not recoverable.
3534 *
3535 * NOTE: Physical address can overlap, due to compression
3536 */
3537 if (cache->offset + cache->len > offset) {
3538 WARN_ON(1);
3539 return -EINVAL;
3540 }
3541
3542 /*
3543 * Only merges fiemap extents if
3544 * 1) Their logical addresses are continuous
3545 *
3546 * 2) Their physical addresses are continuous
3547 * So truly compressed (physical size smaller than logical size)
3548 * extents won't get merged with each other
3549 *
ac3c0d36 3550 * 3) Share same flags
4751832d
QW
3551 */
3552 if (cache->offset + cache->len == offset &&
3553 cache->phys + cache->len == phys &&
ac3c0d36 3554 cache->flags == flags) {
4751832d 3555 cache->len += len;
ac3c0d36 3556 return 0;
4751832d
QW
3557 }
3558
3559 /* Not mergeable, need to submit cached one */
3560 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
3561 cache->len, cache->flags);
3562 cache->cached = false;
3563 if (ret)
3564 return ret;
3565assign:
3566 cache->cached = true;
3567 cache->offset = offset;
3568 cache->phys = phys;
3569 cache->len = len;
3570 cache->flags = flags;
ac3c0d36
FM
3571
3572 return 0;
4751832d
QW
3573}
3574
3575/*
848c23b7 3576 * Emit last fiemap cache
4751832d 3577 *
848c23b7
QW
3578 * The last fiemap cache may still be cached in the following case:
3579 * 0 4k 8k
3580 * |<- Fiemap range ->|
3581 * |<------------ First extent ----------->|
3582 *
3583 * In this case, the first extent range will be cached but not emitted.
3584 * So we must emit it before ending extent_fiemap().
4751832d 3585 */
5c5aff98 3586static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 3587 struct fiemap_cache *cache)
4751832d
QW
3588{
3589 int ret;
3590
3591 if (!cache->cached)
3592 return 0;
3593
4751832d
QW
3594 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
3595 cache->len, cache->flags);
3596 cache->cached = false;
3597 if (ret > 0)
3598 ret = 0;
3599 return ret;
3600}
3601
ac3c0d36 3602static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
1506fcc8 3603{
ac3c0d36
FM
3604 struct extent_buffer *clone;
3605 struct btrfs_key key;
3606 int slot;
3607 int ret;
3608
3609 path->slots[0]++;
3610 if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
3611 return 0;
3612
3613 ret = btrfs_next_leaf(inode->root, path);
3614 if (ret != 0)
3615 return ret;
3616
3617 /*
3618 * Don't bother with cloning if there are no more file extent items for
3619 * our inode.
3620 */
3621 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3622 if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
3623 return 1;
3624
3625 /* See the comment at fiemap_search_slot() about why we clone. */
3626 clone = btrfs_clone_extent_buffer(path->nodes[0]);
3627 if (!clone)
3628 return -ENOMEM;
3629
3630 slot = path->slots[0];
3631 btrfs_release_path(path);
3632 path->nodes[0] = clone;
3633 path->slots[0] = slot;
3634
3635 return 0;
3636}
3637
3638/*
3639 * Search for the first file extent item that starts at a given file offset or
3640 * the one that starts immediately before that offset.
3641 * Returns: 0 on success, < 0 on error, 1 if not found.
3642 */
3643static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
3644 u64 file_offset)
3645{
3646 const u64 ino = btrfs_ino(inode);
facee0a0 3647 struct btrfs_root *root = inode->root;
ac3c0d36
FM
3648 struct extent_buffer *clone;
3649 struct btrfs_key key;
3650 int slot;
3651 int ret;
1506fcc8 3652
ac3c0d36
FM
3653 key.objectid = ino;
3654 key.type = BTRFS_EXTENT_DATA_KEY;
3655 key.offset = file_offset;
3656
3657 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3658 if (ret < 0)
3659 return ret;
3660
3661 if (ret > 0 && path->slots[0] > 0) {
3662 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3663 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3664 path->slots[0]--;
3665 }
3666
3667 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
3668 ret = btrfs_next_leaf(root, path);
3669 if (ret != 0)
3670 return ret;
3671
3672 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3673 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3674 return 1;
5911c8fe
DS
3675 }
3676
15c7745c 3677 /*
ac3c0d36
FM
3678 * We clone the leaf and use it during fiemap. This is because while
3679 * using the leaf we do expensive things like checking if an extent is
3680 * shared, which can take a long time. In order to prevent blocking
3681 * other tasks for too long, we use a clone of the leaf. We have locked
3682 * the file range in the inode's io tree, so we know none of our file
3683 * extent items can change. This way we avoid blocking other tasks that
3684 * want to insert items for other inodes in the same leaf or b+tree
3685 * rebalance operations (triggered for example when someone is trying
3686 * to push items into this leaf when trying to insert an item in a
3687 * neighbour leaf).
3688 * We also need the private clone because holding a read lock on an
3689 * extent buffer of the subvolume's b+tree will make lockdep unhappy
3690 * when we call fiemap_fill_next_extent(), because that may cause a page
3691 * fault when filling the user space buffer with fiemap data.
15c7745c 3692 */
ac3c0d36
FM
3693 clone = btrfs_clone_extent_buffer(path->nodes[0]);
3694 if (!clone)
3695 return -ENOMEM;
3696
3697 slot = path->slots[0];
3698 btrfs_release_path(path);
3699 path->nodes[0] = clone;
3700 path->slots[0] = slot;
3701
3702 return 0;
3703}
3704
3705/*
3706 * Process a range which is a hole or a prealloc extent in the inode's subvolume
3707 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
3708 * extent. The end offset (@end) is inclusive.
3709 */
3710static int fiemap_process_hole(struct btrfs_inode *inode,
3711 struct fiemap_extent_info *fieinfo,
3712 struct fiemap_cache *cache,
61dbb952 3713 struct btrfs_backref_share_check_ctx *backref_ctx,
ac3c0d36
FM
3714 u64 disk_bytenr, u64 extent_offset,
3715 u64 extent_gen,
ac3c0d36
FM
3716 u64 start, u64 end)
3717{
3718 const u64 i_size = i_size_read(&inode->vfs_inode);
ac3c0d36
FM
3719 u64 cur_offset = start;
3720 u64 last_delalloc_end = 0;
3721 u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
3722 bool checked_extent_shared = false;
3723 int ret;
4d479cf0 3724
ec29ed5b 3725 /*
ac3c0d36
FM
3726 * There can be no delalloc past i_size, so don't waste time looking for
3727 * it beyond i_size.
ec29ed5b 3728 */
ac3c0d36
FM
3729 while (cur_offset < end && cur_offset < i_size) {
3730 u64 delalloc_start;
3731 u64 delalloc_end;
3732 u64 prealloc_start;
3733 u64 prealloc_len = 0;
3734 bool delalloc;
3735
3736 delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
3737 &delalloc_start,
3738 &delalloc_end);
3739 if (!delalloc)
3740 break;
2d324f59 3741
ec29ed5b 3742 /*
ac3c0d36
FM
3743 * If this is a prealloc extent we have to report every section
3744 * of it that has no delalloc.
ec29ed5b 3745 */
ac3c0d36
FM
3746 if (disk_bytenr != 0) {
3747 if (last_delalloc_end == 0) {
3748 prealloc_start = start;
3749 prealloc_len = delalloc_start - start;
3750 } else {
3751 prealloc_start = last_delalloc_end + 1;
3752 prealloc_len = delalloc_start - prealloc_start;
3753 }
3754 }
3755
3756 if (prealloc_len > 0) {
3757 if (!checked_extent_shared && fieinfo->fi_extents_max) {
ceb707da 3758 ret = btrfs_is_data_extent_shared(inode,
84a7949d
FM
3759 disk_bytenr,
3760 extent_gen,
3761 backref_ctx);
ac3c0d36
FM
3762 if (ret < 0)
3763 return ret;
3764 else if (ret > 0)
3765 prealloc_flags |= FIEMAP_EXTENT_SHARED;
3766
3767 checked_extent_shared = true;
3768 }
3769 ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
3770 disk_bytenr + extent_offset,
3771 prealloc_len, prealloc_flags);
3772 if (ret)
3773 return ret;
3774 extent_offset += prealloc_len;
3775 }
3776
3777 ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
3778 delalloc_end + 1 - delalloc_start,
3779 FIEMAP_EXTENT_DELALLOC |
3780 FIEMAP_EXTENT_UNKNOWN);
3781 if (ret)
3782 return ret;
3783
3784 last_delalloc_end = delalloc_end;
3785 cur_offset = delalloc_end + 1;
3786 extent_offset += cur_offset - delalloc_start;
3787 cond_resched();
3788 }
3789
3790 /*
3791 * Either we found no delalloc for the whole prealloc extent or we have
3792 * a prealloc extent that spans i_size or starts at or after i_size.
3793 */
3794 if (disk_bytenr != 0 && last_delalloc_end < end) {
3795 u64 prealloc_start;
3796 u64 prealloc_len;
3797
3798 if (last_delalloc_end == 0) {
3799 prealloc_start = start;
3800 prealloc_len = end + 1 - start;
3801 } else {
3802 prealloc_start = last_delalloc_end + 1;
3803 prealloc_len = end + 1 - prealloc_start;
3804 }
3805
3806 if (!checked_extent_shared && fieinfo->fi_extents_max) {
ceb707da
FM
3807 ret = btrfs_is_data_extent_shared(inode,
3808 disk_bytenr,
84a7949d 3809 extent_gen,
61dbb952 3810 backref_ctx);
ac3c0d36
FM
3811 if (ret < 0)
3812 return ret;
3813 else if (ret > 0)
3814 prealloc_flags |= FIEMAP_EXTENT_SHARED;
3815 }
3816 ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
3817 disk_bytenr + extent_offset,
3818 prealloc_len, prealloc_flags);
3819 if (ret)
3820 return ret;
3821 }
3822
3823 return 0;
3824}
3825
3826static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
3827 struct btrfs_path *path,
3828 u64 *last_extent_end_ret)
3829{
3830 const u64 ino = btrfs_ino(inode);
3831 struct btrfs_root *root = inode->root;
3832 struct extent_buffer *leaf;
3833 struct btrfs_file_extent_item *ei;
3834 struct btrfs_key key;
3835 u64 disk_bytenr;
3836 int ret;
3837
3838 /*
3839 * Lookup the last file extent. We're not using i_size here because
3840 * there might be preallocation past i_size.
3841 */
3842 ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
3843 /* There can't be a file extent item at offset (u64)-1 */
3844 ASSERT(ret != 0);
3845 if (ret < 0)
3846 return ret;
3847
3848 /*
3849 * For a non-existing key, btrfs_search_slot() always leaves us at a
3850 * slot > 0, except if the btree is empty, which is impossible because
3851 * at least it has the inode item for this inode and all the items for
3852 * the root inode 256.
3853 */
3854 ASSERT(path->slots[0] > 0);
3855 path->slots[0]--;
3856 leaf = path->nodes[0];
3857 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3858 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
3859 /* No file extent items in the subvolume tree. */
3860 *last_extent_end_ret = 0;
3861 return 0;
975f84fe 3862 }
975f84fe 3863
ec29ed5b 3864 /*
ac3c0d36
FM
3865 * For an inline extent, the disk_bytenr is where inline data starts at,
3866 * so first check if we have an inline extent item before checking if we
3867 * have an implicit hole (disk_bytenr == 0).
ec29ed5b 3868 */
ac3c0d36
FM
3869 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
3870 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
3871 *last_extent_end_ret = btrfs_file_extent_end(path);
3872 return 0;
ec29ed5b
CM
3873 }
3874
ac3c0d36
FM
3875 /*
3876 * Find the last file extent item that is not a hole (when NO_HOLES is
3877 * not enabled). This should take at most 2 iterations in the worst
3878 * case: we have one hole file extent item at slot 0 of a leaf and
3879 * another hole file extent item as the last item in the previous leaf.
3880 * This is because we merge file extent items that represent holes.
3881 */
3882 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
3883 while (disk_bytenr == 0) {
3884 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
3885 if (ret < 0) {
3886 return ret;
3887 } else if (ret > 0) {
3888 /* No file extent items that are not holes. */
3889 *last_extent_end_ret = 0;
3890 return 0;
3891 }
3892 leaf = path->nodes[0];
3893 ei = btrfs_item_ptr(leaf, path->slots[0],
3894 struct btrfs_file_extent_item);
3895 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
3896 }
ec29ed5b 3897
ac3c0d36
FM
3898 *last_extent_end_ret = btrfs_file_extent_end(path);
3899 return 0;
3900}
3901
3902int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
3903 u64 start, u64 len)
3904{
3905 const u64 ino = btrfs_ino(inode);
3906 struct extent_state *cached_state = NULL;
3907 struct btrfs_path *path;
ac3c0d36 3908 struct fiemap_cache cache = { 0 };
61dbb952 3909 struct btrfs_backref_share_check_ctx *backref_ctx;
ac3c0d36
FM
3910 u64 last_extent_end;
3911 u64 prev_extent_end;
3912 u64 lockstart;
3913 u64 lockend;
3914 bool stopped = false;
3915 int ret;
3916
84a7949d 3917 backref_ctx = btrfs_alloc_backref_share_check_ctx();
ac3c0d36 3918 path = btrfs_alloc_path();
84a7949d 3919 if (!backref_ctx || !path) {
ac3c0d36 3920 ret = -ENOMEM;
1506fcc8
YS
3921 goto out;
3922 }
975f84fe 3923
ceb707da
FM
3924 lockstart = round_down(start, inode->root->fs_info->sectorsize);
3925 lockend = round_up(start + len, inode->root->fs_info->sectorsize);
ac3c0d36 3926 prev_extent_end = lockstart;
ea8efc74 3927
570eb97b 3928 lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ea8efc74 3929
ac3c0d36
FM
3930 ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
3931 if (ret < 0)
3932 goto out_unlock;
3933 btrfs_release_path(path);
1506fcc8 3934
ac3c0d36
FM
3935 path->reada = READA_FORWARD;
3936 ret = fiemap_search_slot(inode, path, lockstart);
3937 if (ret < 0) {
3938 goto out_unlock;
3939 } else if (ret > 0) {
ea8efc74 3940 /*
ac3c0d36
FM
3941 * No file extent item found, but we may have delalloc between
3942 * the current offset and i_size. So check for that.
ea8efc74 3943 */
ac3c0d36
FM
3944 ret = 0;
3945 goto check_eof_delalloc;
3946 }
3947
3948 while (prev_extent_end < lockend) {
3949 struct extent_buffer *leaf = path->nodes[0];
3950 struct btrfs_file_extent_item *ei;
3951 struct btrfs_key key;
3952 u64 extent_end;
3953 u64 extent_len;
3954 u64 extent_offset = 0;
3955 u64 extent_gen;
3956 u64 disk_bytenr = 0;
3957 u64 flags = 0;
3958 int extent_type;
3959 u8 compression;
3960
3961 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3962 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3963 break;
3964
3965 extent_end = btrfs_file_extent_end(path);
1506fcc8 3966
ea8efc74 3967 /*
ac3c0d36
FM
3968 * The first iteration can leave us at an extent item that ends
3969 * before our range's start. Move to the next item.
ea8efc74 3970 */
ac3c0d36
FM
3971 if (extent_end <= lockstart)
3972 goto next_item;
fe09e16c 3973
877c1476
FM
3974 backref_ctx->curr_leaf_bytenr = leaf->start;
3975
ac3c0d36
FM
3976 /* We have in implicit hole (NO_HOLES feature enabled). */
3977 if (prev_extent_end < key.offset) {
3978 const u64 range_end = min(key.offset, lockend) - 1;
b8f164e3 3979
ac3c0d36 3980 ret = fiemap_process_hole(inode, fieinfo, &cache,
61dbb952 3981 backref_ctx, 0, 0, 0,
ac3c0d36
FM
3982 prev_extent_end, range_end);
3983 if (ret < 0) {
3984 goto out_unlock;
3985 } else if (ret > 0) {
3986 /* fiemap_fill_next_extent() told us to stop. */
3987 stopped = true;
3988 break;
3989 }
1506fcc8 3990
ac3c0d36
FM
3991 /* We've reached the end of the fiemap range, stop. */
3992 if (key.offset >= lockend) {
3993 stopped = true;
3994 break;
3995 }
1506fcc8
YS
3996 }
3997
ac3c0d36
FM
3998 extent_len = extent_end - key.offset;
3999 ei = btrfs_item_ptr(leaf, path->slots[0],
4000 struct btrfs_file_extent_item);
4001 compression = btrfs_file_extent_compression(leaf, ei);
4002 extent_type = btrfs_file_extent_type(leaf, ei);
4003 extent_gen = btrfs_file_extent_generation(leaf, ei);
4004
4005 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4006 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
4007 if (compression == BTRFS_COMPRESS_NONE)
4008 extent_offset = btrfs_file_extent_offset(leaf, ei);
ec29ed5b 4009 }
ac3c0d36
FM
4010
4011 if (compression != BTRFS_COMPRESS_NONE)
4012 flags |= FIEMAP_EXTENT_ENCODED;
4013
4014 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4015 flags |= FIEMAP_EXTENT_DATA_INLINE;
4016 flags |= FIEMAP_EXTENT_NOT_ALIGNED;
4017 ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
4018 extent_len, flags);
4019 } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
4020 ret = fiemap_process_hole(inode, fieinfo, &cache,
61dbb952 4021 backref_ctx,
ac3c0d36 4022 disk_bytenr, extent_offset,
84a7949d
FM
4023 extent_gen, key.offset,
4024 extent_end - 1);
ac3c0d36
FM
4025 } else if (disk_bytenr == 0) {
4026 /* We have an explicit hole. */
4027 ret = fiemap_process_hole(inode, fieinfo, &cache,
61dbb952 4028 backref_ctx, 0, 0, 0,
ac3c0d36
FM
4029 key.offset, extent_end - 1);
4030 } else {
4031 /* We have a regular extent. */
4032 if (fieinfo->fi_extents_max) {
ceb707da 4033 ret = btrfs_is_data_extent_shared(inode,
ac3c0d36
FM
4034 disk_bytenr,
4035 extent_gen,
61dbb952 4036 backref_ctx);
ac3c0d36
FM
4037 if (ret < 0)
4038 goto out_unlock;
4039 else if (ret > 0)
4040 flags |= FIEMAP_EXTENT_SHARED;
4041 }
4042
4043 ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
4044 disk_bytenr + extent_offset,
4045 extent_len, flags);
975f84fe 4046 }
ac3c0d36
FM
4047
4048 if (ret < 0) {
4049 goto out_unlock;
4050 } else if (ret > 0) {
4051 /* fiemap_fill_next_extent() told us to stop. */
4052 stopped = true;
4053 break;
26e726af 4054 }
09fbc1c8 4055
ac3c0d36
FM
4056 prev_extent_end = extent_end;
4057next_item:
09fbc1c8
FM
4058 if (fatal_signal_pending(current)) {
4059 ret = -EINTR;
ac3c0d36 4060 goto out_unlock;
09fbc1c8 4061 }
ac3c0d36
FM
4062
4063 ret = fiemap_next_leaf_item(inode, path);
4064 if (ret < 0) {
4065 goto out_unlock;
4066 } else if (ret > 0) {
4067 /* No more file extent items for this inode. */
4068 break;
4069 }
4070 cond_resched();
1506fcc8 4071 }
5911c8fe 4072
ac3c0d36
FM
4073check_eof_delalloc:
4074 /*
4075 * Release (and free) the path before emitting any final entries to
4076 * fiemap_fill_next_extent() to keep lockdep happy. This is because
4077 * once we find no more file extent items exist, we may have a
4078 * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
4079 * faults when copying data to the user space buffer.
4080 */
4081 btrfs_free_path(path);
4082 path = NULL;
4083
4084 if (!stopped && prev_extent_end < lockend) {
61dbb952 4085 ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx,
84a7949d 4086 0, 0, 0, prev_extent_end, lockend - 1);
ac3c0d36
FM
4087 if (ret < 0)
4088 goto out_unlock;
4089 prev_extent_end = lockend;
4090 }
4091
4092 if (cache.cached && cache.offset + cache.len >= last_extent_end) {
4093 const u64 i_size = i_size_read(&inode->vfs_inode);
4094
4095 if (prev_extent_end < i_size) {
4096 u64 delalloc_start;
4097 u64 delalloc_end;
4098 bool delalloc;
4099
4100 delalloc = btrfs_find_delalloc_in_range(inode,
4101 prev_extent_end,
4102 i_size - 1,
4103 &delalloc_start,
4104 &delalloc_end);
4105 if (!delalloc)
4106 cache.flags |= FIEMAP_EXTENT_LAST;
4107 } else {
4108 cache.flags |= FIEMAP_EXTENT_LAST;
4109 }
4110 }
4111
4112 ret = emit_last_fiemap_cache(fieinfo, &cache);
4113
4114out_unlock:
570eb97b 4115 unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ac3c0d36 4116out:
84a7949d 4117 btrfs_free_backref_share_ctx(backref_ctx);
e02d48ea 4118 btrfs_free_path(path);
1506fcc8
YS
4119 return ret;
4120}
4121
727011e0
CM
4122static void __free_extent_buffer(struct extent_buffer *eb)
4123{
727011e0
CM
4124 kmem_cache_free(extent_buffer_cache, eb);
4125}
4126
2b48966a 4127int extent_buffer_under_io(const struct extent_buffer *eb)
db7f3436
JB
4128{
4129 return (atomic_read(&eb->io_pages) ||
4130 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4131 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4132}
4133
8ff8466d 4134static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
db7f3436 4135{
8ff8466d 4136 struct btrfs_subpage *subpage;
db7f3436 4137
8ff8466d 4138 lockdep_assert_held(&page->mapping->private_lock);
db7f3436 4139
8ff8466d
QW
4140 if (PagePrivate(page)) {
4141 subpage = (struct btrfs_subpage *)page->private;
4142 if (atomic_read(&subpage->eb_refs))
4143 return true;
3d078efa
QW
4144 /*
4145 * Even there is no eb refs here, we may still have
4146 * end_page_read() call relying on page::private.
4147 */
4148 if (atomic_read(&subpage->readers))
4149 return true;
8ff8466d
QW
4150 }
4151 return false;
4152}
db7f3436 4153
8ff8466d
QW
4154static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
4155{
4156 struct btrfs_fs_info *fs_info = eb->fs_info;
4157 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
4158
4159 /*
4160 * For mapped eb, we're going to change the page private, which should
4161 * be done under the private_lock.
4162 */
4163 if (mapped)
4164 spin_lock(&page->mapping->private_lock);
4165
4166 if (!PagePrivate(page)) {
5d2361db 4167 if (mapped)
8ff8466d
QW
4168 spin_unlock(&page->mapping->private_lock);
4169 return;
4170 }
4171
fbca46eb 4172 if (fs_info->nodesize >= PAGE_SIZE) {
5d2361db
FL
4173 /*
4174 * We do this since we'll remove the pages after we've
4175 * removed the eb from the radix tree, so we could race
4176 * and have this page now attached to the new eb. So
4177 * only clear page_private if it's still connected to
4178 * this eb.
4179 */
4180 if (PagePrivate(page) &&
4181 page->private == (unsigned long)eb) {
4182 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4183 BUG_ON(PageDirty(page));
4184 BUG_ON(PageWriteback(page));
db7f3436 4185 /*
5d2361db
FL
4186 * We need to make sure we haven't be attached
4187 * to a new eb.
db7f3436 4188 */
d1b89bc0 4189 detach_page_private(page);
db7f3436 4190 }
5d2361db
FL
4191 if (mapped)
4192 spin_unlock(&page->mapping->private_lock);
8ff8466d
QW
4193 return;
4194 }
4195
4196 /*
4197 * For subpage, we can have dummy eb with page private. In this case,
4198 * we can directly detach the private as such page is only attached to
4199 * one dummy eb, no sharing.
4200 */
4201 if (!mapped) {
4202 btrfs_detach_subpage(fs_info, page);
4203 return;
4204 }
4205
4206 btrfs_page_dec_eb_refs(fs_info, page);
4207
4208 /*
4209 * We can only detach the page private if there are no other ebs in the
3d078efa 4210 * page range and no unfinished IO.
8ff8466d
QW
4211 */
4212 if (!page_range_has_eb(fs_info, page))
4213 btrfs_detach_subpage(fs_info, page);
4214
4215 spin_unlock(&page->mapping->private_lock);
4216}
4217
4218/* Release all pages attached to the extent buffer */
4219static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
4220{
4221 int i;
4222 int num_pages;
4223
4224 ASSERT(!extent_buffer_under_io(eb));
4225
4226 num_pages = num_extent_pages(eb);
4227 for (i = 0; i < num_pages; i++) {
4228 struct page *page = eb->pages[i];
4229
4230 if (!page)
4231 continue;
4232
4233 detach_extent_buffer_page(eb, page);
5d2361db 4234
01327610 4235 /* One for when we allocated the page */
09cbfeaf 4236 put_page(page);
d64766fd 4237 }
db7f3436
JB
4238}
4239
4240/*
4241 * Helper for releasing the extent buffer.
4242 */
4243static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4244{
55ac0139 4245 btrfs_release_extent_buffer_pages(eb);
a40246e8 4246 btrfs_leak_debug_del_eb(eb);
db7f3436
JB
4247 __free_extent_buffer(eb);
4248}
4249
f28491e0
JB
4250static struct extent_buffer *
4251__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 4252 unsigned long len)
d1310b2e
CM
4253{
4254 struct extent_buffer *eb = NULL;
4255
d1b5c567 4256 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
4257 eb->start = start;
4258 eb->len = len;
f28491e0 4259 eb->fs_info = fs_info;
196d59ab 4260 init_rwsem(&eb->lock);
b4ce94de 4261
a40246e8 4262 btrfs_leak_debug_add_eb(eb);
d3575156 4263 INIT_LIST_HEAD(&eb->release_list);
6d49ba1b 4264
3083ee2e 4265 spin_lock_init(&eb->refs_lock);
d1310b2e 4266 atomic_set(&eb->refs, 1);
0b32f4bb 4267 atomic_set(&eb->io_pages, 0);
727011e0 4268
deb67895 4269 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
d1310b2e
CM
4270
4271 return eb;
4272}
4273
2b48966a 4274struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
815a51c7 4275{
cc5e31a4 4276 int i;
815a51c7 4277 struct extent_buffer *new;
cc5e31a4 4278 int num_pages = num_extent_pages(src);
dd137dd1 4279 int ret;
815a51c7 4280
3f556f78 4281 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
4282 if (new == NULL)
4283 return NULL;
4284
62c053fb
QW
4285 /*
4286 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
4287 * btrfs_release_extent_buffer() have different behavior for
4288 * UNMAPPED subpage extent buffer.
4289 */
4290 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
4291
dd137dd1
STD
4292 ret = btrfs_alloc_page_array(num_pages, new->pages);
4293 if (ret) {
4294 btrfs_release_extent_buffer(new);
4295 return NULL;
4296 }
4297
815a51c7 4298 for (i = 0; i < num_pages; i++) {
760f991f 4299 int ret;
dd137dd1 4300 struct page *p = new->pages[i];
760f991f 4301
760f991f
QW
4302 ret = attach_extent_buffer_page(new, p, NULL);
4303 if (ret < 0) {
760f991f
QW
4304 btrfs_release_extent_buffer(new);
4305 return NULL;
4306 }
815a51c7 4307 WARN_ON(PageDirty(p));
fba1acf9 4308 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7 4309 }
92d83e94 4310 set_extent_buffer_uptodate(new);
815a51c7
JS
4311
4312 return new;
4313}
4314
0f331229
OS
4315struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4316 u64 start, unsigned long len)
815a51c7
JS
4317{
4318 struct extent_buffer *eb;
cc5e31a4
DS
4319 int num_pages;
4320 int i;
dd137dd1 4321 int ret;
815a51c7 4322
3f556f78 4323 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
4324 if (!eb)
4325 return NULL;
4326
65ad0104 4327 num_pages = num_extent_pages(eb);
dd137dd1
STD
4328 ret = btrfs_alloc_page_array(num_pages, eb->pages);
4329 if (ret)
4330 goto err;
4331
815a51c7 4332 for (i = 0; i < num_pages; i++) {
dd137dd1 4333 struct page *p = eb->pages[i];
09bc1f0f 4334
dd137dd1 4335 ret = attach_extent_buffer_page(eb, p, NULL);
09bc1f0f
QW
4336 if (ret < 0)
4337 goto err;
815a51c7 4338 }
dd137dd1 4339
815a51c7
JS
4340 set_extent_buffer_uptodate(eb);
4341 btrfs_set_header_nritems(eb, 0);
b0132a3b 4342 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
4343
4344 return eb;
4345err:
dd137dd1
STD
4346 for (i = 0; i < num_pages; i++) {
4347 if (eb->pages[i]) {
4348 detach_extent_buffer_page(eb, eb->pages[i]);
4349 __free_page(eb->pages[i]);
4350 }
09bc1f0f 4351 }
815a51c7
JS
4352 __free_extent_buffer(eb);
4353 return NULL;
4354}
4355
0f331229 4356struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 4357 u64 start)
0f331229 4358{
da17066c 4359 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
4360}
4361
0b32f4bb
JB
4362static void check_buffer_tree_ref(struct extent_buffer *eb)
4363{
242e18c7 4364 int refs;
6bf9cd2e
BB
4365 /*
4366 * The TREE_REF bit is first set when the extent_buffer is added
4367 * to the radix tree. It is also reset, if unset, when a new reference
4368 * is created by find_extent_buffer.
0b32f4bb 4369 *
6bf9cd2e
BB
4370 * It is only cleared in two cases: freeing the last non-tree
4371 * reference to the extent_buffer when its STALE bit is set or
f913cff3 4372 * calling release_folio when the tree reference is the only reference.
0b32f4bb 4373 *
6bf9cd2e 4374 * In both cases, care is taken to ensure that the extent_buffer's
f913cff3 4375 * pages are not under io. However, release_folio can be concurrently
6bf9cd2e
BB
4376 * called with creating new references, which is prone to race
4377 * conditions between the calls to check_buffer_tree_ref in those
4378 * codepaths and clearing TREE_REF in try_release_extent_buffer.
0b32f4bb 4379 *
6bf9cd2e
BB
4380 * The actual lifetime of the extent_buffer in the radix tree is
4381 * adequately protected by the refcount, but the TREE_REF bit and
4382 * its corresponding reference are not. To protect against this
4383 * class of races, we call check_buffer_tree_ref from the codepaths
4384 * which trigger io after they set eb->io_pages. Note that once io is
4385 * initiated, TREE_REF can no longer be cleared, so that is the
4386 * moment at which any such race is best fixed.
0b32f4bb 4387 */
242e18c7
CM
4388 refs = atomic_read(&eb->refs);
4389 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4390 return;
4391
594831c4
JB
4392 spin_lock(&eb->refs_lock);
4393 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 4394 atomic_inc(&eb->refs);
594831c4 4395 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
4396}
4397
2457aec6
MG
4398static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4399 struct page *accessed)
5df4235e 4400{
cc5e31a4 4401 int num_pages, i;
5df4235e 4402
0b32f4bb
JB
4403 check_buffer_tree_ref(eb);
4404
65ad0104 4405 num_pages = num_extent_pages(eb);
5df4235e 4406 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
4407 struct page *p = eb->pages[i];
4408
2457aec6
MG
4409 if (p != accessed)
4410 mark_page_accessed(p);
5df4235e
JB
4411 }
4412}
4413
f28491e0
JB
4414struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4415 u64 start)
452c75c3
CS
4416{
4417 struct extent_buffer *eb;
4418
2f3186d8
QW
4419 eb = find_extent_buffer_nolock(fs_info, start);
4420 if (!eb)
4421 return NULL;
4422 /*
4423 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
4424 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
4425 * another task running free_extent_buffer() might have seen that flag
4426 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
4427 * writeback flags not set) and it's still in the tree (flag
4428 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
4429 * decrementing the extent buffer's reference count twice. So here we
4430 * could race and increment the eb's reference count, clear its stale
4431 * flag, mark it as dirty and drop our reference before the other task
4432 * finishes executing free_extent_buffer, which would later result in
4433 * an attempt to free an extent buffer that is dirty.
4434 */
4435 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
4436 spin_lock(&eb->refs_lock);
4437 spin_unlock(&eb->refs_lock);
452c75c3 4438 }
2f3186d8
QW
4439 mark_extent_buffer_accessed(eb, NULL);
4440 return eb;
452c75c3
CS
4441}
4442
faa2dbf0
JB
4443#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4444struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 4445 u64 start)
faa2dbf0
JB
4446{
4447 struct extent_buffer *eb, *exists = NULL;
4448 int ret;
4449
4450 eb = find_extent_buffer(fs_info, start);
4451 if (eb)
4452 return eb;
da17066c 4453 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 4454 if (!eb)
b6293c82 4455 return ERR_PTR(-ENOMEM);
faa2dbf0 4456 eb->fs_info = fs_info;
01cd3909
DS
4457again:
4458 ret = radix_tree_preload(GFP_NOFS);
4459 if (ret) {
4460 exists = ERR_PTR(ret);
4461 goto free_eb;
4462 }
4463 spin_lock(&fs_info->buffer_lock);
4464 ret = radix_tree_insert(&fs_info->buffer_radix,
4465 start >> fs_info->sectorsize_bits, eb);
4466 spin_unlock(&fs_info->buffer_lock);
4467 radix_tree_preload_end();
4468 if (ret == -EEXIST) {
4469 exists = find_extent_buffer(fs_info, start);
4470 if (exists)
faa2dbf0 4471 goto free_eb;
01cd3909
DS
4472 else
4473 goto again;
4474 }
faa2dbf0
JB
4475 check_buffer_tree_ref(eb);
4476 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4477
faa2dbf0
JB
4478 return eb;
4479free_eb:
4480 btrfs_release_extent_buffer(eb);
4481 return exists;
4482}
4483#endif
4484
81982210
QW
4485static struct extent_buffer *grab_extent_buffer(
4486 struct btrfs_fs_info *fs_info, struct page *page)
c0f0a9e7
QW
4487{
4488 struct extent_buffer *exists;
4489
81982210
QW
4490 /*
4491 * For subpage case, we completely rely on radix tree to ensure we
4492 * don't try to insert two ebs for the same bytenr. So here we always
4493 * return NULL and just continue.
4494 */
fbca46eb 4495 if (fs_info->nodesize < PAGE_SIZE)
81982210
QW
4496 return NULL;
4497
c0f0a9e7
QW
4498 /* Page not yet attached to an extent buffer */
4499 if (!PagePrivate(page))
4500 return NULL;
4501
4502 /*
4503 * We could have already allocated an eb for this page and attached one
4504 * so lets see if we can get a ref on the existing eb, and if we can we
4505 * know it's good and we can just return that one, else we know we can
4506 * just overwrite page->private.
4507 */
4508 exists = (struct extent_buffer *)page->private;
4509 if (atomic_inc_not_zero(&exists->refs))
4510 return exists;
4511
4512 WARN_ON(PageDirty(page));
4513 detach_page_private(page);
4514 return NULL;
4515}
4516
fbca46eb
QW
4517static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
4518{
4519 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
4520 btrfs_err(fs_info, "bad tree block start %llu", start);
4521 return -EINVAL;
4522 }
4523
4524 if (fs_info->nodesize < PAGE_SIZE &&
4525 offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
4526 btrfs_err(fs_info,
4527 "tree block crosses page boundary, start %llu nodesize %u",
4528 start, fs_info->nodesize);
4529 return -EINVAL;
4530 }
4531 if (fs_info->nodesize >= PAGE_SIZE &&
1280d2d1 4532 !PAGE_ALIGNED(start)) {
fbca46eb
QW
4533 btrfs_err(fs_info,
4534 "tree block is not page aligned, start %llu nodesize %u",
4535 start, fs_info->nodesize);
4536 return -EINVAL;
4537 }
4538 return 0;
4539}
4540
f28491e0 4541struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3fbaf258 4542 u64 start, u64 owner_root, int level)
d1310b2e 4543{
da17066c 4544 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
4545 int num_pages;
4546 int i;
09cbfeaf 4547 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 4548 struct extent_buffer *eb;
6af118ce 4549 struct extent_buffer *exists = NULL;
d1310b2e 4550 struct page *p;
f28491e0 4551 struct address_space *mapping = fs_info->btree_inode->i_mapping;
b40130b2 4552 u64 lockdep_owner = owner_root;
d1310b2e 4553 int uptodate = 1;
19fe0a8b 4554 int ret;
d1310b2e 4555
fbca46eb 4556 if (check_eb_alignment(fs_info, start))
c871b0f2 4557 return ERR_PTR(-EINVAL);
c871b0f2 4558
e9306ad4
QW
4559#if BITS_PER_LONG == 32
4560 if (start >= MAX_LFS_FILESIZE) {
4561 btrfs_err_rl(fs_info,
4562 "extent buffer %llu is beyond 32bit page cache limit", start);
4563 btrfs_err_32bit_limit(fs_info);
4564 return ERR_PTR(-EOVERFLOW);
4565 }
4566 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
4567 btrfs_warn_32bit_limit(fs_info);
4568#endif
4569
f28491e0 4570 eb = find_extent_buffer(fs_info, start);
452c75c3 4571 if (eb)
6af118ce 4572 return eb;
6af118ce 4573
23d79d81 4574 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 4575 if (!eb)
c871b0f2 4576 return ERR_PTR(-ENOMEM);
b40130b2
JB
4577
4578 /*
4579 * The reloc trees are just snapshots, so we need them to appear to be
4580 * just like any other fs tree WRT lockdep.
4581 */
4582 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
4583 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
4584
4585 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
d1310b2e 4586
65ad0104 4587 num_pages = num_extent_pages(eb);
727011e0 4588 for (i = 0; i < num_pages; i++, index++) {
760f991f
QW
4589 struct btrfs_subpage *prealloc = NULL;
4590
d1b5c567 4591 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
4592 if (!p) {
4593 exists = ERR_PTR(-ENOMEM);
6af118ce 4594 goto free_eb;
c871b0f2 4595 }
4f2de97a 4596
760f991f
QW
4597 /*
4598 * Preallocate page->private for subpage case, so that we won't
4599 * allocate memory with private_lock hold. The memory will be
4600 * freed by attach_extent_buffer_page() or freed manually if
4601 * we exit earlier.
4602 *
4603 * Although we have ensured one subpage eb can only have one
4604 * page, but it may change in the future for 16K page size
4605 * support, so we still preallocate the memory in the loop.
4606 */
fbca46eb 4607 if (fs_info->nodesize < PAGE_SIZE) {
651fb419
QW
4608 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
4609 if (IS_ERR(prealloc)) {
4610 ret = PTR_ERR(prealloc);
fdf250db
QW
4611 unlock_page(p);
4612 put_page(p);
4613 exists = ERR_PTR(ret);
4614 goto free_eb;
4615 }
760f991f
QW
4616 }
4617
4f2de97a 4618 spin_lock(&mapping->private_lock);
81982210 4619 exists = grab_extent_buffer(fs_info, p);
c0f0a9e7
QW
4620 if (exists) {
4621 spin_unlock(&mapping->private_lock);
4622 unlock_page(p);
4623 put_page(p);
4624 mark_extent_buffer_accessed(exists, p);
760f991f 4625 btrfs_free_subpage(prealloc);
c0f0a9e7 4626 goto free_eb;
d1310b2e 4627 }
760f991f
QW
4628 /* Should not fail, as we have preallocated the memory */
4629 ret = attach_extent_buffer_page(eb, p, prealloc);
4630 ASSERT(!ret);
8ff8466d
QW
4631 /*
4632 * To inform we have extra eb under allocation, so that
4633 * detach_extent_buffer_page() won't release the page private
4634 * when the eb hasn't yet been inserted into radix tree.
4635 *
4636 * The ref will be decreased when the eb released the page, in
4637 * detach_extent_buffer_page().
4638 * Thus needs no special handling in error path.
4639 */
4640 btrfs_page_inc_eb_refs(fs_info, p);
4f2de97a 4641 spin_unlock(&mapping->private_lock);
760f991f 4642
1e5eb3d6 4643 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
727011e0 4644 eb->pages[i] = p;
d1310b2e
CM
4645 if (!PageUptodate(p))
4646 uptodate = 0;
eb14ab8e
CM
4647
4648 /*
b16d011e
NB
4649 * We can't unlock the pages just yet since the extent buffer
4650 * hasn't been properly inserted in the radix tree, this
f913cff3 4651 * opens a race with btree_release_folio which can free a page
b16d011e
NB
4652 * while we are still filling in all pages for the buffer and
4653 * we could crash.
eb14ab8e 4654 */
d1310b2e
CM
4655 }
4656 if (uptodate)
b4ce94de 4657 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
01cd3909
DS
4658again:
4659 ret = radix_tree_preload(GFP_NOFS);
4660 if (ret) {
4661 exists = ERR_PTR(ret);
4662 goto free_eb;
4663 }
4664
4665 spin_lock(&fs_info->buffer_lock);
4666 ret = radix_tree_insert(&fs_info->buffer_radix,
4667 start >> fs_info->sectorsize_bits, eb);
4668 spin_unlock(&fs_info->buffer_lock);
4669 radix_tree_preload_end();
4670 if (ret == -EEXIST) {
4671 exists = find_extent_buffer(fs_info, start);
4672 if (exists)
452c75c3 4673 goto free_eb;
01cd3909
DS
4674 else
4675 goto again;
4676 }
6af118ce 4677 /* add one reference for the tree */
0b32f4bb 4678 check_buffer_tree_ref(eb);
34b41ace 4679 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
4680
4681 /*
b16d011e 4682 * Now it's safe to unlock the pages because any calls to
f913cff3 4683 * btree_release_folio will correctly detect that a page belongs to a
b16d011e 4684 * live buffer and won't free them prematurely.
eb14ab8e 4685 */
28187ae5
NB
4686 for (i = 0; i < num_pages; i++)
4687 unlock_page(eb->pages[i]);
d1310b2e
CM
4688 return eb;
4689
6af118ce 4690free_eb:
5ca64f45 4691 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
4692 for (i = 0; i < num_pages; i++) {
4693 if (eb->pages[i])
4694 unlock_page(eb->pages[i]);
4695 }
eb14ab8e 4696
897ca6e9 4697 btrfs_release_extent_buffer(eb);
6af118ce 4698 return exists;
d1310b2e 4699}
d1310b2e 4700
3083ee2e
JB
4701static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4702{
4703 struct extent_buffer *eb =
4704 container_of(head, struct extent_buffer, rcu_head);
4705
4706 __free_extent_buffer(eb);
4707}
4708
f7a52a40 4709static int release_extent_buffer(struct extent_buffer *eb)
5ce48d0f 4710 __releases(&eb->refs_lock)
3083ee2e 4711{
07e21c4d
NB
4712 lockdep_assert_held(&eb->refs_lock);
4713
3083ee2e
JB
4714 WARN_ON(atomic_read(&eb->refs) == 0);
4715 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 4716 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 4717 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 4718
815a51c7 4719 spin_unlock(&eb->refs_lock);
3083ee2e 4720
01cd3909
DS
4721 spin_lock(&fs_info->buffer_lock);
4722 radix_tree_delete(&fs_info->buffer_radix,
4723 eb->start >> fs_info->sectorsize_bits);
4724 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
4725 } else {
4726 spin_unlock(&eb->refs_lock);
815a51c7 4727 }
3083ee2e 4728
a40246e8 4729 btrfs_leak_debug_del_eb(eb);
3083ee2e 4730 /* Should be safe to release our pages at this point */
55ac0139 4731 btrfs_release_extent_buffer_pages(eb);
bcb7e449 4732#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 4733 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
4734 __free_extent_buffer(eb);
4735 return 1;
4736 }
4737#endif
3083ee2e 4738 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 4739 return 1;
3083ee2e
JB
4740 }
4741 spin_unlock(&eb->refs_lock);
e64860aa
JB
4742
4743 return 0;
3083ee2e
JB
4744}
4745
d1310b2e
CM
4746void free_extent_buffer(struct extent_buffer *eb)
4747{
242e18c7 4748 int refs;
d1310b2e
CM
4749 if (!eb)
4750 return;
4751
e5677f05 4752 refs = atomic_read(&eb->refs);
242e18c7 4753 while (1) {
46cc775e
NB
4754 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
4755 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
4756 refs == 1))
242e18c7 4757 break;
e5677f05 4758 if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
242e18c7
CM
4759 return;
4760 }
4761
3083ee2e
JB
4762 spin_lock(&eb->refs_lock);
4763 if (atomic_read(&eb->refs) == 2 &&
4764 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 4765 !extent_buffer_under_io(eb) &&
3083ee2e
JB
4766 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4767 atomic_dec(&eb->refs);
4768
4769 /*
4770 * I know this is terrible, but it's temporary until we stop tracking
4771 * the uptodate bits and such for the extent buffers.
4772 */
f7a52a40 4773 release_extent_buffer(eb);
3083ee2e
JB
4774}
4775
4776void free_extent_buffer_stale(struct extent_buffer *eb)
4777{
4778 if (!eb)
d1310b2e
CM
4779 return;
4780
3083ee2e
JB
4781 spin_lock(&eb->refs_lock);
4782 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
4783
0b32f4bb 4784 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
4785 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4786 atomic_dec(&eb->refs);
f7a52a40 4787 release_extent_buffer(eb);
d1310b2e 4788}
d1310b2e 4789
0d27797e
QW
4790static void btree_clear_page_dirty(struct page *page)
4791{
4792 ASSERT(PageDirty(page));
4793 ASSERT(PageLocked(page));
4794 clear_page_dirty_for_io(page);
4795 xa_lock_irq(&page->mapping->i_pages);
4796 if (!PageDirty(page))
4797 __xa_clear_mark(&page->mapping->i_pages,
4798 page_index(page), PAGECACHE_TAG_DIRTY);
4799 xa_unlock_irq(&page->mapping->i_pages);
4800}
4801
4802static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
4803{
4804 struct btrfs_fs_info *fs_info = eb->fs_info;
4805 struct page *page = eb->pages[0];
4806 bool last;
4807
4808 /* btree_clear_page_dirty() needs page locked */
4809 lock_page(page);
4810 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
4811 eb->len);
4812 if (last)
4813 btree_clear_page_dirty(page);
4814 unlock_page(page);
4815 WARN_ON(atomic_read(&eb->refs) == 0);
4816}
4817
2b48966a 4818void clear_extent_buffer_dirty(const struct extent_buffer *eb)
d1310b2e 4819{
cc5e31a4
DS
4820 int i;
4821 int num_pages;
d1310b2e
CM
4822 struct page *page;
4823
fbca46eb 4824 if (eb->fs_info->nodesize < PAGE_SIZE)
0d27797e
QW
4825 return clear_subpage_extent_buffer_dirty(eb);
4826
65ad0104 4827 num_pages = num_extent_pages(eb);
d1310b2e
CM
4828
4829 for (i = 0; i < num_pages; i++) {
fb85fc9a 4830 page = eb->pages[i];
b9473439 4831 if (!PageDirty(page))
d2c3f4f6 4832 continue;
a61e6f29 4833 lock_page(page);
0d27797e 4834 btree_clear_page_dirty(page);
bf0da8c1 4835 ClearPageError(page);
a61e6f29 4836 unlock_page(page);
d1310b2e 4837 }
0b32f4bb 4838 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 4839}
d1310b2e 4840
abb57ef3 4841bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 4842{
cc5e31a4
DS
4843 int i;
4844 int num_pages;
abb57ef3 4845 bool was_dirty;
d1310b2e 4846
0b32f4bb
JB
4847 check_buffer_tree_ref(eb);
4848
b9473439 4849 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 4850
65ad0104 4851 num_pages = num_extent_pages(eb);
3083ee2e 4852 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
4853 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4854
0d27797e 4855 if (!was_dirty) {
fbca46eb 4856 bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
51995c39 4857
0d27797e
QW
4858 /*
4859 * For subpage case, we can have other extent buffers in the
4860 * same page, and in clear_subpage_extent_buffer_dirty() we
4861 * have to clear page dirty without subpage lock held.
4862 * This can cause race where our page gets dirty cleared after
4863 * we just set it.
4864 *
4865 * Thankfully, clear_subpage_extent_buffer_dirty() has locked
4866 * its page for other reasons, we can use page lock to prevent
4867 * the above race.
4868 */
4869 if (subpage)
4870 lock_page(eb->pages[0]);
4871 for (i = 0; i < num_pages; i++)
4872 btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
4873 eb->start, eb->len);
4874 if (subpage)
4875 unlock_page(eb->pages[0]);
4876 }
51995c39
LB
4877#ifdef CONFIG_BTRFS_DEBUG
4878 for (i = 0; i < num_pages; i++)
4879 ASSERT(PageDirty(eb->pages[i]));
4880#endif
4881
b9473439 4882 return was_dirty;
d1310b2e 4883}
d1310b2e 4884
69ba3927 4885void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 4886{
251f2acc 4887 struct btrfs_fs_info *fs_info = eb->fs_info;
1259ab75 4888 struct page *page;
cc5e31a4 4889 int num_pages;
251f2acc 4890 int i;
1259ab75 4891
b4ce94de 4892 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 4893 num_pages = num_extent_pages(eb);
1259ab75 4894 for (i = 0; i < num_pages; i++) {
fb85fc9a 4895 page = eb->pages[i];
fbca46eb
QW
4896 if (!page)
4897 continue;
4898
4899 /*
4900 * This is special handling for metadata subpage, as regular
4901 * btrfs_is_subpage() can not handle cloned/dummy metadata.
4902 */
4903 if (fs_info->nodesize >= PAGE_SIZE)
4904 ClearPageUptodate(page);
4905 else
4906 btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
4907 eb->len);
1259ab75 4908 }
1259ab75
CM
4909}
4910
09c25a8c 4911void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 4912{
251f2acc 4913 struct btrfs_fs_info *fs_info = eb->fs_info;
d1310b2e 4914 struct page *page;
cc5e31a4 4915 int num_pages;
251f2acc 4916 int i;
d1310b2e 4917
0b32f4bb 4918 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 4919 num_pages = num_extent_pages(eb);
d1310b2e 4920 for (i = 0; i < num_pages; i++) {
fb85fc9a 4921 page = eb->pages[i];
fbca46eb
QW
4922
4923 /*
4924 * This is special handling for metadata subpage, as regular
4925 * btrfs_is_subpage() can not handle cloned/dummy metadata.
4926 */
4927 if (fs_info->nodesize >= PAGE_SIZE)
4928 SetPageUptodate(page);
4929 else
4930 btrfs_subpage_set_uptodate(fs_info, page, eb->start,
4931 eb->len);
d1310b2e 4932 }
d1310b2e 4933}
d1310b2e 4934
4012daf7
QW
4935static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
4936 int mirror_num)
4937{
4938 struct btrfs_fs_info *fs_info = eb->fs_info;
4939 struct extent_io_tree *io_tree;
4940 struct page *page = eb->pages[0];
e5e886ba 4941 struct extent_state *cached_state = NULL;
722c82ac
CH
4942 struct btrfs_bio_ctrl bio_ctrl = {
4943 .mirror_num = mirror_num,
4944 };
4012daf7
QW
4945 int ret = 0;
4946
4947 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
4948 ASSERT(PagePrivate(page));
4949 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
4950
4951 if (wait == WAIT_NONE) {
83ae4133 4952 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
e5e886ba 4953 &cached_state))
dc56219f 4954 return -EAGAIN;
4012daf7 4955 } else {
e5e886ba
JB
4956 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
4957 &cached_state);
4012daf7
QW
4958 if (ret < 0)
4959 return ret;
4960 }
4961
4962 ret = 0;
4963 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
4964 PageUptodate(page) ||
4965 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
4966 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
e5e886ba
JB
4967 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
4968 &cached_state);
4012daf7
QW
4969 return ret;
4970 }
4971
4972 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
4973 eb->read_mirror = 0;
4974 atomic_set(&eb->io_pages, 1);
4975 check_buffer_tree_ref(eb);
5467abba
QW
4976 bio_ctrl.end_io_func = end_bio_extent_readpage;
4977
4012daf7
QW
4978 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
4979
3d078efa 4980 btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
08a6f464 4981 ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
209ecde5 4982 eb->start, page, eb->len,
5467abba 4983 eb->start - page_offset(page), 0, true);
4012daf7
QW
4984 if (ret) {
4985 /*
4986 * In the endio function, if we hit something wrong we will
4987 * increase the io_pages, so here we need to decrease it for
4988 * error path.
4989 */
4990 atomic_dec(&eb->io_pages);
4991 }
722c82ac 4992 submit_one_bio(&bio_ctrl);
e5e886ba
JB
4993 if (ret || wait != WAIT_COMPLETE) {
4994 free_extent_state(cached_state);
4012daf7 4995 return ret;
e5e886ba 4996 }
4012daf7 4997
123a7f00 4998 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
e5e886ba 4999 EXTENT_LOCKED, &cached_state);
4012daf7
QW
5000 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5001 ret = -EIO;
5002 return ret;
5003}
5004
c2ccfbc6 5005int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 5006{
cc5e31a4 5007 int i;
d1310b2e
CM
5008 struct page *page;
5009 int err;
5010 int ret = 0;
ce9adaa5
CM
5011 int locked_pages = 0;
5012 int all_uptodate = 1;
cc5e31a4 5013 int num_pages;
727011e0 5014 unsigned long num_reads = 0;
722c82ac
CH
5015 struct btrfs_bio_ctrl bio_ctrl = {
5016 .mirror_num = mirror_num,
5017 };
a86c12c7 5018
b4ce94de 5019 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
5020 return 0;
5021
651740a5
JB
5022 /*
5023 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
5024 * operation, which could potentially still be in flight. In this case
5025 * we simply want to return an error.
5026 */
5027 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
5028 return -EIO;
5029
fbca46eb 5030 if (eb->fs_info->nodesize < PAGE_SIZE)
4012daf7
QW
5031 return read_extent_buffer_subpage(eb, wait, mirror_num);
5032
65ad0104 5033 num_pages = num_extent_pages(eb);
8436ea91 5034 for (i = 0; i < num_pages; i++) {
fb85fc9a 5035 page = eb->pages[i];
bb82ab88 5036 if (wait == WAIT_NONE) {
2c4d8cb7
QW
5037 /*
5038 * WAIT_NONE is only utilized by readahead. If we can't
5039 * acquire the lock atomically it means either the eb
5040 * is being read out or under modification.
5041 * Either way the eb will be or has been cached,
5042 * readahead can exit safely.
5043 */
2db04966 5044 if (!trylock_page(page))
ce9adaa5 5045 goto unlock_exit;
d1310b2e
CM
5046 } else {
5047 lock_page(page);
5048 }
ce9adaa5 5049 locked_pages++;
2571e739
LB
5050 }
5051 /*
5052 * We need to firstly lock all pages to make sure that
5053 * the uptodate bit of our pages won't be affected by
5054 * clear_extent_buffer_uptodate().
5055 */
8436ea91 5056 for (i = 0; i < num_pages; i++) {
2571e739 5057 page = eb->pages[i];
727011e0
CM
5058 if (!PageUptodate(page)) {
5059 num_reads++;
ce9adaa5 5060 all_uptodate = 0;
727011e0 5061 }
ce9adaa5 5062 }
2571e739 5063
ce9adaa5 5064 if (all_uptodate) {
8436ea91 5065 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
5066 goto unlock_exit;
5067 }
5068
656f30db 5069 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 5070 eb->read_mirror = 0;
0b32f4bb 5071 atomic_set(&eb->io_pages, num_reads);
6bf9cd2e 5072 /*
f913cff3 5073 * It is possible for release_folio to clear the TREE_REF bit before we
6bf9cd2e
BB
5074 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
5075 */
5076 check_buffer_tree_ref(eb);
5467abba 5077 bio_ctrl.end_io_func = end_bio_extent_readpage;
8436ea91 5078 for (i = 0; i < num_pages; i++) {
fb85fc9a 5079 page = eb->pages[i];
baf863b9 5080
ce9adaa5 5081 if (!PageUptodate(page)) {
baf863b9
LB
5082 if (ret) {
5083 atomic_dec(&eb->io_pages);
5084 unlock_page(page);
5085 continue;
5086 }
5087
f188591e 5088 ClearPageError(page);
08a6f464 5089 err = submit_extent_page(REQ_OP_READ, NULL,
209ecde5 5090 &bio_ctrl, page_offset(page), page,
5467abba 5091 PAGE_SIZE, 0, 0, false);
baf863b9 5092 if (err) {
baf863b9 5093 /*
0420177c
NB
5094 * We failed to submit the bio so it's the
5095 * caller's responsibility to perform cleanup
5096 * i.e unlock page/set error bit.
baf863b9 5097 */
0420177c
NB
5098 ret = err;
5099 SetPageError(page);
5100 unlock_page(page);
baf863b9
LB
5101 atomic_dec(&eb->io_pages);
5102 }
d1310b2e
CM
5103 } else {
5104 unlock_page(page);
5105 }
5106 }
5107
722c82ac 5108 submit_one_bio(&bio_ctrl);
a86c12c7 5109
bb82ab88 5110 if (ret || wait != WAIT_COMPLETE)
d1310b2e 5111 return ret;
d397712b 5112
8436ea91 5113 for (i = 0; i < num_pages; i++) {
fb85fc9a 5114 page = eb->pages[i];
d1310b2e 5115 wait_on_page_locked(page);
d397712b 5116 if (!PageUptodate(page))
d1310b2e 5117 ret = -EIO;
d1310b2e 5118 }
d397712b 5119
d1310b2e 5120 return ret;
ce9adaa5
CM
5121
5122unlock_exit:
d397712b 5123 while (locked_pages > 0) {
ce9adaa5 5124 locked_pages--;
8436ea91
JB
5125 page = eb->pages[locked_pages];
5126 unlock_page(page);
ce9adaa5
CM
5127 }
5128 return ret;
d1310b2e 5129}
d1310b2e 5130
f98b6215
QW
5131static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
5132 unsigned long len)
5133{
5134 btrfs_warn(eb->fs_info,
5135 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
5136 eb->start, eb->len, start, len);
5137 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5138
5139 return true;
5140}
5141
5142/*
5143 * Check if the [start, start + len) range is valid before reading/writing
5144 * the eb.
5145 * NOTE: @start and @len are offset inside the eb, not logical address.
5146 *
5147 * Caller should not touch the dst/src memory if this function returns error.
5148 */
5149static inline int check_eb_range(const struct extent_buffer *eb,
5150 unsigned long start, unsigned long len)
5151{
5152 unsigned long offset;
5153
5154 /* start, start + len should not go beyond eb->len nor overflow */
5155 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
5156 return report_eb_range(eb, start, len);
5157
5158 return false;
5159}
5160
1cbb1f45
JM
5161void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5162 unsigned long start, unsigned long len)
d1310b2e
CM
5163{
5164 size_t cur;
5165 size_t offset;
5166 struct page *page;
5167 char *kaddr;
5168 char *dst = (char *)dstv;
884b07d0 5169 unsigned long i = get_eb_page_index(start);
d1310b2e 5170
f98b6215 5171 if (check_eb_range(eb, start, len))
f716abd5 5172 return;
d1310b2e 5173
884b07d0 5174 offset = get_eb_offset_in_page(eb, start);
d1310b2e 5175
d397712b 5176 while (len > 0) {
fb85fc9a 5177 page = eb->pages[i];
d1310b2e 5178
09cbfeaf 5179 cur = min(len, (PAGE_SIZE - offset));
a6591715 5180 kaddr = page_address(page);
d1310b2e 5181 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
5182
5183 dst += cur;
5184 len -= cur;
5185 offset = 0;
5186 i++;
5187 }
5188}
d1310b2e 5189
a48b73ec
JB
5190int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
5191 void __user *dstv,
5192 unsigned long start, unsigned long len)
550ac1d8
GH
5193{
5194 size_t cur;
5195 size_t offset;
5196 struct page *page;
5197 char *kaddr;
5198 char __user *dst = (char __user *)dstv;
884b07d0 5199 unsigned long i = get_eb_page_index(start);
550ac1d8
GH
5200 int ret = 0;
5201
5202 WARN_ON(start > eb->len);
5203 WARN_ON(start + len > eb->start + eb->len);
5204
884b07d0 5205 offset = get_eb_offset_in_page(eb, start);
550ac1d8
GH
5206
5207 while (len > 0) {
fb85fc9a 5208 page = eb->pages[i];
550ac1d8 5209
09cbfeaf 5210 cur = min(len, (PAGE_SIZE - offset));
550ac1d8 5211 kaddr = page_address(page);
a48b73ec 5212 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
550ac1d8
GH
5213 ret = -EFAULT;
5214 break;
5215 }
5216
5217 dst += cur;
5218 len -= cur;
5219 offset = 0;
5220 i++;
5221 }
5222
5223 return ret;
5224}
5225
1cbb1f45
JM
5226int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5227 unsigned long start, unsigned long len)
d1310b2e
CM
5228{
5229 size_t cur;
5230 size_t offset;
5231 struct page *page;
5232 char *kaddr;
5233 char *ptr = (char *)ptrv;
884b07d0 5234 unsigned long i = get_eb_page_index(start);
d1310b2e
CM
5235 int ret = 0;
5236
f98b6215
QW
5237 if (check_eb_range(eb, start, len))
5238 return -EINVAL;
d1310b2e 5239
884b07d0 5240 offset = get_eb_offset_in_page(eb, start);
d1310b2e 5241
d397712b 5242 while (len > 0) {
fb85fc9a 5243 page = eb->pages[i];
d1310b2e 5244
09cbfeaf 5245 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 5246
a6591715 5247 kaddr = page_address(page);
d1310b2e 5248 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
5249 if (ret)
5250 break;
5251
5252 ptr += cur;
5253 len -= cur;
5254 offset = 0;
5255 i++;
5256 }
5257 return ret;
5258}
d1310b2e 5259
b8f95771
QW
5260/*
5261 * Check that the extent buffer is uptodate.
5262 *
5263 * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
5264 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
5265 */
5266static void assert_eb_page_uptodate(const struct extent_buffer *eb,
5267 struct page *page)
5268{
5269 struct btrfs_fs_info *fs_info = eb->fs_info;
5270
a50e1fcb
JB
5271 /*
5272 * If we are using the commit root we could potentially clear a page
5273 * Uptodate while we're using the extent buffer that we've previously
5274 * looked up. We don't want to complain in this case, as the page was
5275 * valid before, we just didn't write it out. Instead we want to catch
5276 * the case where we didn't actually read the block properly, which
5277 * would have !PageUptodate && !PageError, as we clear PageError before
5278 * reading.
5279 */
fbca46eb 5280 if (fs_info->nodesize < PAGE_SIZE) {
a50e1fcb 5281 bool uptodate, error;
b8f95771
QW
5282
5283 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
5284 eb->start, eb->len);
a50e1fcb
JB
5285 error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
5286 WARN_ON(!uptodate && !error);
b8f95771 5287 } else {
a50e1fcb 5288 WARN_ON(!PageUptodate(page) && !PageError(page));
b8f95771
QW
5289 }
5290}
5291
2b48966a 5292void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
f157bf76
DS
5293 const void *srcv)
5294{
5295 char *kaddr;
5296
b8f95771 5297 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
5298 kaddr = page_address(eb->pages[0]) +
5299 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
5300 chunk_tree_uuid));
5301 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
5302}
5303
2b48966a 5304void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
f157bf76
DS
5305{
5306 char *kaddr;
5307
b8f95771 5308 assert_eb_page_uptodate(eb, eb->pages[0]);
24880be5
DS
5309 kaddr = page_address(eb->pages[0]) +
5310 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
5311 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
f157bf76
DS
5312}
5313
2b48966a 5314void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
d1310b2e
CM
5315 unsigned long start, unsigned long len)
5316{
5317 size_t cur;
5318 size_t offset;
5319 struct page *page;
5320 char *kaddr;
5321 char *src = (char *)srcv;
884b07d0 5322 unsigned long i = get_eb_page_index(start);
d1310b2e 5323
d3575156
NA
5324 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
5325
f98b6215
QW
5326 if (check_eb_range(eb, start, len))
5327 return;
d1310b2e 5328
884b07d0 5329 offset = get_eb_offset_in_page(eb, start);
d1310b2e 5330
d397712b 5331 while (len > 0) {
fb85fc9a 5332 page = eb->pages[i];
b8f95771 5333 assert_eb_page_uptodate(eb, page);
d1310b2e 5334
09cbfeaf 5335 cur = min(len, PAGE_SIZE - offset);
a6591715 5336 kaddr = page_address(page);
d1310b2e 5337 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
5338
5339 src += cur;
5340 len -= cur;
5341 offset = 0;
5342 i++;
5343 }
5344}
d1310b2e 5345
2b48966a 5346void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
b159fa28 5347 unsigned long len)
d1310b2e
CM
5348{
5349 size_t cur;
5350 size_t offset;
5351 struct page *page;
5352 char *kaddr;
884b07d0 5353 unsigned long i = get_eb_page_index(start);
d1310b2e 5354
f98b6215
QW
5355 if (check_eb_range(eb, start, len))
5356 return;
d1310b2e 5357
884b07d0 5358 offset = get_eb_offset_in_page(eb, start);
d1310b2e 5359
d397712b 5360 while (len > 0) {
fb85fc9a 5361 page = eb->pages[i];
b8f95771 5362 assert_eb_page_uptodate(eb, page);
d1310b2e 5363
09cbfeaf 5364 cur = min(len, PAGE_SIZE - offset);
a6591715 5365 kaddr = page_address(page);
b159fa28 5366 memset(kaddr + offset, 0, cur);
d1310b2e
CM
5367
5368 len -= cur;
5369 offset = 0;
5370 i++;
5371 }
5372}
d1310b2e 5373
2b48966a
DS
5374void copy_extent_buffer_full(const struct extent_buffer *dst,
5375 const struct extent_buffer *src)
58e8012c
DS
5376{
5377 int i;
cc5e31a4 5378 int num_pages;
58e8012c
DS
5379
5380 ASSERT(dst->len == src->len);
5381
fbca46eb 5382 if (dst->fs_info->nodesize >= PAGE_SIZE) {
884b07d0
QW
5383 num_pages = num_extent_pages(dst);
5384 for (i = 0; i < num_pages; i++)
5385 copy_page(page_address(dst->pages[i]),
5386 page_address(src->pages[i]));
5387 } else {
5388 size_t src_offset = get_eb_offset_in_page(src, 0);
5389 size_t dst_offset = get_eb_offset_in_page(dst, 0);
5390
fbca46eb 5391 ASSERT(src->fs_info->nodesize < PAGE_SIZE);
884b07d0
QW
5392 memcpy(page_address(dst->pages[0]) + dst_offset,
5393 page_address(src->pages[0]) + src_offset,
5394 src->len);
5395 }
58e8012c
DS
5396}
5397
2b48966a
DS
5398void copy_extent_buffer(const struct extent_buffer *dst,
5399 const struct extent_buffer *src,
d1310b2e
CM
5400 unsigned long dst_offset, unsigned long src_offset,
5401 unsigned long len)
5402{
5403 u64 dst_len = dst->len;
5404 size_t cur;
5405 size_t offset;
5406 struct page *page;
5407 char *kaddr;
884b07d0 5408 unsigned long i = get_eb_page_index(dst_offset);
d1310b2e 5409
f98b6215
QW
5410 if (check_eb_range(dst, dst_offset, len) ||
5411 check_eb_range(src, src_offset, len))
5412 return;
5413
d1310b2e
CM
5414 WARN_ON(src->len != dst_len);
5415
884b07d0 5416 offset = get_eb_offset_in_page(dst, dst_offset);
d1310b2e 5417
d397712b 5418 while (len > 0) {
fb85fc9a 5419 page = dst->pages[i];
b8f95771 5420 assert_eb_page_uptodate(dst, page);
d1310b2e 5421
09cbfeaf 5422 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 5423
a6591715 5424 kaddr = page_address(page);
d1310b2e 5425 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
5426
5427 src_offset += cur;
5428 len -= cur;
5429 offset = 0;
5430 i++;
5431 }
5432}
d1310b2e 5433
3e1e8bb7
OS
5434/*
5435 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
5436 * given bit number
5437 * @eb: the extent buffer
5438 * @start: offset of the bitmap item in the extent buffer
5439 * @nr: bit number
5440 * @page_index: return index of the page in the extent buffer that contains the
5441 * given bit number
5442 * @page_offset: return offset into the page given by page_index
5443 *
5444 * This helper hides the ugliness of finding the byte in an extent buffer which
5445 * contains a given bit.
5446 */
2b48966a 5447static inline void eb_bitmap_offset(const struct extent_buffer *eb,
3e1e8bb7
OS
5448 unsigned long start, unsigned long nr,
5449 unsigned long *page_index,
5450 size_t *page_offset)
5451{
3e1e8bb7
OS
5452 size_t byte_offset = BIT_BYTE(nr);
5453 size_t offset;
5454
5455 /*
5456 * The byte we want is the offset of the extent buffer + the offset of
5457 * the bitmap item in the extent buffer + the offset of the byte in the
5458 * bitmap item.
5459 */
884b07d0 5460 offset = start + offset_in_page(eb->start) + byte_offset;
3e1e8bb7 5461
09cbfeaf 5462 *page_index = offset >> PAGE_SHIFT;
7073017a 5463 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
5464}
5465
43dd529a
DS
5466/*
5467 * Determine whether a bit in a bitmap item is set.
5468 *
5469 * @eb: the extent buffer
5470 * @start: offset of the bitmap item in the extent buffer
5471 * @nr: bit number to test
3e1e8bb7 5472 */
2b48966a 5473int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
5474 unsigned long nr)
5475{
2fe1d551 5476 u8 *kaddr;
3e1e8bb7
OS
5477 struct page *page;
5478 unsigned long i;
5479 size_t offset;
5480
5481 eb_bitmap_offset(eb, start, nr, &i, &offset);
5482 page = eb->pages[i];
b8f95771 5483 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
5484 kaddr = page_address(page);
5485 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5486}
5487
43dd529a
DS
5488/*
5489 * Set an area of a bitmap to 1.
5490 *
5491 * @eb: the extent buffer
5492 * @start: offset of the bitmap item in the extent buffer
5493 * @pos: bit number of the first bit
5494 * @len: number of bits to set
3e1e8bb7 5495 */
2b48966a 5496void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
5497 unsigned long pos, unsigned long len)
5498{
2fe1d551 5499 u8 *kaddr;
3e1e8bb7
OS
5500 struct page *page;
5501 unsigned long i;
5502 size_t offset;
5503 const unsigned int size = pos + len;
5504 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5505 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5506
5507 eb_bitmap_offset(eb, start, pos, &i, &offset);
5508 page = eb->pages[i];
b8f95771 5509 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
5510 kaddr = page_address(page);
5511
5512 while (len >= bits_to_set) {
5513 kaddr[offset] |= mask_to_set;
5514 len -= bits_to_set;
5515 bits_to_set = BITS_PER_BYTE;
9c894696 5516 mask_to_set = ~0;
09cbfeaf 5517 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5518 offset = 0;
5519 page = eb->pages[++i];
b8f95771 5520 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
5521 kaddr = page_address(page);
5522 }
5523 }
5524 if (len) {
5525 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5526 kaddr[offset] |= mask_to_set;
5527 }
5528}
5529
5530
43dd529a
DS
5531/*
5532 * Clear an area of a bitmap.
5533 *
5534 * @eb: the extent buffer
5535 * @start: offset of the bitmap item in the extent buffer
5536 * @pos: bit number of the first bit
5537 * @len: number of bits to clear
3e1e8bb7 5538 */
2b48966a
DS
5539void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
5540 unsigned long start, unsigned long pos,
5541 unsigned long len)
3e1e8bb7 5542{
2fe1d551 5543 u8 *kaddr;
3e1e8bb7
OS
5544 struct page *page;
5545 unsigned long i;
5546 size_t offset;
5547 const unsigned int size = pos + len;
5548 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5549 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5550
5551 eb_bitmap_offset(eb, start, pos, &i, &offset);
5552 page = eb->pages[i];
b8f95771 5553 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
5554 kaddr = page_address(page);
5555
5556 while (len >= bits_to_clear) {
5557 kaddr[offset] &= ~mask_to_clear;
5558 len -= bits_to_clear;
5559 bits_to_clear = BITS_PER_BYTE;
9c894696 5560 mask_to_clear = ~0;
09cbfeaf 5561 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5562 offset = 0;
5563 page = eb->pages[++i];
b8f95771 5564 assert_eb_page_uptodate(eb, page);
3e1e8bb7
OS
5565 kaddr = page_address(page);
5566 }
5567 }
5568 if (len) {
5569 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5570 kaddr[offset] &= ~mask_to_clear;
5571 }
5572}
5573
3387206f
ST
5574static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5575{
5576 unsigned long distance = (src > dst) ? src - dst : dst - src;
5577 return distance < len;
5578}
5579
d1310b2e
CM
5580static void copy_pages(struct page *dst_page, struct page *src_page,
5581 unsigned long dst_off, unsigned long src_off,
5582 unsigned long len)
5583{
a6591715 5584 char *dst_kaddr = page_address(dst_page);
d1310b2e 5585 char *src_kaddr;
727011e0 5586 int must_memmove = 0;
d1310b2e 5587
3387206f 5588 if (dst_page != src_page) {
a6591715 5589 src_kaddr = page_address(src_page);
3387206f 5590 } else {
d1310b2e 5591 src_kaddr = dst_kaddr;
727011e0
CM
5592 if (areas_overlap(src_off, dst_off, len))
5593 must_memmove = 1;
3387206f 5594 }
d1310b2e 5595
727011e0
CM
5596 if (must_memmove)
5597 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5598 else
5599 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
5600}
5601
2b48966a
DS
5602void memcpy_extent_buffer(const struct extent_buffer *dst,
5603 unsigned long dst_offset, unsigned long src_offset,
5604 unsigned long len)
d1310b2e
CM
5605{
5606 size_t cur;
5607 size_t dst_off_in_page;
5608 size_t src_off_in_page;
d1310b2e
CM
5609 unsigned long dst_i;
5610 unsigned long src_i;
5611
f98b6215
QW
5612 if (check_eb_range(dst, dst_offset, len) ||
5613 check_eb_range(dst, src_offset, len))
5614 return;
d1310b2e 5615
d397712b 5616 while (len > 0) {
884b07d0
QW
5617 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
5618 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
d1310b2e 5619
884b07d0
QW
5620 dst_i = get_eb_page_index(dst_offset);
5621 src_i = get_eb_page_index(src_offset);
d1310b2e 5622
09cbfeaf 5623 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
5624 src_off_in_page));
5625 cur = min_t(unsigned long, cur,
09cbfeaf 5626 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 5627
fb85fc9a 5628 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
5629 dst_off_in_page, src_off_in_page, cur);
5630
5631 src_offset += cur;
5632 dst_offset += cur;
5633 len -= cur;
5634 }
5635}
d1310b2e 5636
2b48966a
DS
5637void memmove_extent_buffer(const struct extent_buffer *dst,
5638 unsigned long dst_offset, unsigned long src_offset,
5639 unsigned long len)
d1310b2e
CM
5640{
5641 size_t cur;
5642 size_t dst_off_in_page;
5643 size_t src_off_in_page;
5644 unsigned long dst_end = dst_offset + len - 1;
5645 unsigned long src_end = src_offset + len - 1;
d1310b2e
CM
5646 unsigned long dst_i;
5647 unsigned long src_i;
5648
f98b6215
QW
5649 if (check_eb_range(dst, dst_offset, len) ||
5650 check_eb_range(dst, src_offset, len))
5651 return;
727011e0 5652 if (dst_offset < src_offset) {
d1310b2e
CM
5653 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
5654 return;
5655 }
d397712b 5656 while (len > 0) {
884b07d0
QW
5657 dst_i = get_eb_page_index(dst_end);
5658 src_i = get_eb_page_index(src_end);
d1310b2e 5659
884b07d0
QW
5660 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
5661 src_off_in_page = get_eb_offset_in_page(dst, src_end);
d1310b2e
CM
5662
5663 cur = min_t(unsigned long, len, src_off_in_page + 1);
5664 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 5665 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
5666 dst_off_in_page - cur + 1,
5667 src_off_in_page - cur + 1, cur);
5668
5669 dst_end -= cur;
5670 src_end -= cur;
5671 len -= cur;
5672 }
5673}
6af118ce 5674
01cd3909 5675#define GANG_LOOKUP_SIZE 16
d1e86e3f
QW
5676static struct extent_buffer *get_next_extent_buffer(
5677 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
5678{
01cd3909
DS
5679 struct extent_buffer *gang[GANG_LOOKUP_SIZE];
5680 struct extent_buffer *found = NULL;
d1e86e3f 5681 u64 page_start = page_offset(page);
01cd3909 5682 u64 cur = page_start;
d1e86e3f
QW
5683
5684 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
d1e86e3f
QW
5685 lockdep_assert_held(&fs_info->buffer_lock);
5686
01cd3909
DS
5687 while (cur < page_start + PAGE_SIZE) {
5688 int ret;
5689 int i;
5690
5691 ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
5692 (void **)gang, cur >> fs_info->sectorsize_bits,
5693 min_t(unsigned int, GANG_LOOKUP_SIZE,
5694 PAGE_SIZE / fs_info->nodesize));
5695 if (ret == 0)
5696 goto out;
5697 for (i = 0; i < ret; i++) {
5698 /* Already beyond page end */
5699 if (gang[i]->start >= page_start + PAGE_SIZE)
5700 goto out;
5701 /* Found one */
5702 if (gang[i]->start >= bytenr) {
5703 found = gang[i];
5704 goto out;
5705 }
5706 }
5707 cur = gang[ret - 1]->start + gang[ret - 1]->len;
d1e86e3f 5708 }
01cd3909
DS
5709out:
5710 return found;
d1e86e3f
QW
5711}
5712
5713static int try_release_subpage_extent_buffer(struct page *page)
5714{
5715 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
5716 u64 cur = page_offset(page);
5717 const u64 end = page_offset(page) + PAGE_SIZE;
5718 int ret;
5719
5720 while (cur < end) {
5721 struct extent_buffer *eb = NULL;
5722
5723 /*
5724 * Unlike try_release_extent_buffer() which uses page->private
5725 * to grab buffer, for subpage case we rely on radix tree, thus
5726 * we need to ensure radix tree consistency.
5727 *
5728 * We also want an atomic snapshot of the radix tree, thus go
5729 * with spinlock rather than RCU.
5730 */
5731 spin_lock(&fs_info->buffer_lock);
5732 eb = get_next_extent_buffer(fs_info, page, cur);
5733 if (!eb) {
5734 /* No more eb in the page range after or at cur */
5735 spin_unlock(&fs_info->buffer_lock);
5736 break;
5737 }
5738 cur = eb->start + eb->len;
5739
5740 /*
5741 * The same as try_release_extent_buffer(), to ensure the eb
5742 * won't disappear out from under us.
5743 */
5744 spin_lock(&eb->refs_lock);
5745 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5746 spin_unlock(&eb->refs_lock);
5747 spin_unlock(&fs_info->buffer_lock);
5748 break;
5749 }
5750 spin_unlock(&fs_info->buffer_lock);
5751
5752 /*
5753 * If tree ref isn't set then we know the ref on this eb is a
5754 * real ref, so just return, this eb will likely be freed soon
5755 * anyway.
5756 */
5757 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5758 spin_unlock(&eb->refs_lock);
5759 break;
5760 }
5761
5762 /*
5763 * Here we don't care about the return value, we will always
5764 * check the page private at the end. And
5765 * release_extent_buffer() will release the refs_lock.
5766 */
5767 release_extent_buffer(eb);
5768 }
5769 /*
5770 * Finally to check if we have cleared page private, as if we have
5771 * released all ebs in the page, the page private should be cleared now.
5772 */
5773 spin_lock(&page->mapping->private_lock);
5774 if (!PagePrivate(page))
5775 ret = 1;
5776 else
5777 ret = 0;
5778 spin_unlock(&page->mapping->private_lock);
5779 return ret;
5780
5781}
5782
f7a52a40 5783int try_release_extent_buffer(struct page *page)
19fe0a8b 5784{
6af118ce 5785 struct extent_buffer *eb;
6af118ce 5786
fbca46eb 5787 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
d1e86e3f
QW
5788 return try_release_subpage_extent_buffer(page);
5789
3083ee2e 5790 /*
d1e86e3f
QW
5791 * We need to make sure nobody is changing page->private, as we rely on
5792 * page->private as the pointer to extent buffer.
3083ee2e
JB
5793 */
5794 spin_lock(&page->mapping->private_lock);
5795 if (!PagePrivate(page)) {
5796 spin_unlock(&page->mapping->private_lock);
4f2de97a 5797 return 1;
45f49bce 5798 }
6af118ce 5799
3083ee2e
JB
5800 eb = (struct extent_buffer *)page->private;
5801 BUG_ON(!eb);
19fe0a8b
MX
5802
5803 /*
3083ee2e
JB
5804 * This is a little awful but should be ok, we need to make sure that
5805 * the eb doesn't disappear out from under us while we're looking at
5806 * this page.
19fe0a8b 5807 */
3083ee2e 5808 spin_lock(&eb->refs_lock);
0b32f4bb 5809 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
5810 spin_unlock(&eb->refs_lock);
5811 spin_unlock(&page->mapping->private_lock);
5812 return 0;
b9473439 5813 }
3083ee2e 5814 spin_unlock(&page->mapping->private_lock);
897ca6e9 5815
19fe0a8b 5816 /*
3083ee2e
JB
5817 * If tree ref isn't set then we know the ref on this eb is a real ref,
5818 * so just return, this page will likely be freed soon anyway.
19fe0a8b 5819 */
3083ee2e
JB
5820 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5821 spin_unlock(&eb->refs_lock);
5822 return 0;
b9473439 5823 }
19fe0a8b 5824
f7a52a40 5825 return release_extent_buffer(eb);
6af118ce 5826}
bfb484d9
JB
5827
5828/*
5829 * btrfs_readahead_tree_block - attempt to readahead a child block
5830 * @fs_info: the fs_info
5831 * @bytenr: bytenr to read
3fbaf258 5832 * @owner_root: objectid of the root that owns this eb
bfb484d9 5833 * @gen: generation for the uptodate check, can be 0
3fbaf258 5834 * @level: level for the eb
bfb484d9
JB
5835 *
5836 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
5837 * normal uptodate check of the eb, without checking the generation. If we have
5838 * to read the block we will not block on anything.
5839 */
5840void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
3fbaf258 5841 u64 bytenr, u64 owner_root, u64 gen, int level)
bfb484d9
JB
5842{
5843 struct extent_buffer *eb;
5844 int ret;
5845
3fbaf258 5846 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
bfb484d9
JB
5847 if (IS_ERR(eb))
5848 return;
5849
5850 if (btrfs_buffer_uptodate(eb, gen, 1)) {
5851 free_extent_buffer(eb);
5852 return;
5853 }
5854
5855 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
5856 if (ret < 0)
5857 free_extent_buffer_stale(eb);
5858 else
5859 free_extent_buffer(eb);
5860}
5861
5862/*
5863 * btrfs_readahead_node_child - readahead a node's child block
5864 * @node: parent node we're reading from
5865 * @slot: slot in the parent node for the child we want to read
5866 *
5867 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
5868 * the slot in the node provided.
5869 */
5870void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
5871{
5872 btrfs_readahead_tree_block(node->fs_info,
5873 btrfs_node_blockptr(node, slot),
3fbaf258
JB
5874 btrfs_header_owner(node),
5875 btrfs_node_ptr_generation(node, slot),
5876 btrfs_header_level(node) - 1);
bfb484d9 5877}