[linux-block.git] / fs / btrfs / scrub.c
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
a2de733c 2/*
b6bfebc1 3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
a2de733c
AJ
4 */
5
a2de733c 6#include <linux/blkdev.h>
558540c1 7#include <linux/ratelimit.h>
de2491fd 8#include <linux/sched/mm.h>
d5178578 9#include <crypto/hash.h>
a2de733c 10#include "ctree.h"
6e80d4f8 11#include "discard.h"
a2de733c
AJ
12#include "volumes.h"
13#include "disk-io.h"
14#include "ordered-data.h"
0ef8e451 15#include "transaction.h"
558540c1 16#include "backref.h"
5da6fcbc 17#include "extent_io.h"
ff023aac 18#include "dev-replace.h"
21adbd5c 19#include "check-integrity.h"
53b381b3 20#include "raid56.h"
aac0023c 21#include "block-group.h"
12659251 22#include "zoned.h"
c7f13d42 23#include "fs.h"
07e81dc9 24#include "accessors.h"
7c8ede16 25#include "file-item.h"
2fc6822c 26#include "scrub.h"
a2de733c
AJ
27
28/*
29 * This is only the first step towards a full-featured scrub. It reads all
30 * extents and super blocks and verifies the checksums. In case a bad checksum
31 * is found or the extent cannot be read, good data will be written back if
32 * any can be found.
33 *
34 * Future enhancements:
a2de733c
AJ
35 * - In case an unrepairable extent is encountered, track which files are
36 * affected and report them
a2de733c 37 * - track and record media errors, throw out bad devices
a2de733c 38 * - add a mode to also read unallocated space
a2de733c
AJ
39 */
40
d9d181c1 41struct scrub_ctx;
a2de733c 42
ff023aac 43/*
13a62fd9 44 * The following value only influences the performance.
c9d328c0 45 *
13a62fd9 46 * This determines the batch size of stripes submitted in one go.
ff023aac 47 */
54765392 48#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripes per device. */
7a9e9987
SB
49
50/*
0bb3acdc 51 * The following value times PAGE_SIZE needs to be large enough to match the
7a9e9987 52 * largest node/leaf/sector size that shall be supported.
7a9e9987 53 */
7e737cbc 54#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
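/*
 * A quick sanity check of the arithmetic above (assuming the usual 64K
 * BTRFS_MAX_METADATA_BLOCKSIZE):
 *
 *   SCRUB_MAX_SECTORS_PER_BLOCK = 64K / 4K = 16
 *
 * With 4K pages that gives 16 * PAGE_SIZE = 64K, enough to cover the
 * largest supported nodesize even at the minimum 4K sectorsize.
 */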
a2de733c 55
2af2aaf9
QW
56/* Represent one sector and the info needed to verify its content. */
57struct scrub_sector_verification {
58 bool is_metadata;
59
60 union {
61 /*
62 * Csum pointer for data csum verification. Should point to a
63 * sector csum inside scrub_stripe::csums.
64 *
65 * NULL if this data sector has no csum.
66 */
67 u8 *csum;
68
69 /*
70 * Extra info for metadata verification. All sectors inside a
71 * tree block share the same generation.
72 */
73 u64 generation;
74 };
75};
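/*
 * Illustrative use of the union above (a sketch based on how the stripe is
 * filled later in this file, not extra functionality):
 *
 *   if (is_metadata)                     // tree block sector
 *           sector->generation = extent_gen;
 *   else if (has_csum)                   // data sector with a checksum
 *           sector->csum = stripe->csums + sector_nr * fs_info->csum_size;
 *   else                                 // nodatasum data sector
 *           sector->csum = NULL;
 */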
76
77enum scrub_stripe_flags {
78 /* Set when @mirror_num, @dev, @physical and @logical are set. */
79 SCRUB_STRIPE_FLAG_INITIALIZED,
80
81 /* Set when the read-repair is finished. */
82 SCRUB_STRIPE_FLAG_REPAIR_DONE,
1009254b
QW
83
84 /*
85 * Set for data stripes if the scrub is triggered from a P/Q stripe.
86 * During such scrub, we should not report errors in data stripes, nor
87 * update the accounting.
88 */
89 SCRUB_STRIPE_FLAG_NO_REPORT,
2af2aaf9
QW
90};
91
92#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
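/*
 * For example, with the fixed 64K BTRFS_STRIPE_LEN this is 64K / 4K = 16
 * pages per scrub_stripe on 4K-page systems, and a single page on
 * 64K-page systems.
 */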
93
94/*
95 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
96 */
97struct scrub_stripe {
00965807 98 struct scrub_ctx *sctx;
2af2aaf9
QW
99 struct btrfs_block_group *bg;
100
101 struct page *pages[SCRUB_STRIPE_PAGES];
102 struct scrub_sector_verification *sectors;
103
104 struct btrfs_device *dev;
105 u64 logical;
106 u64 physical;
107
108 u16 mirror_num;
109
110 /* Should be BTRFS_STRIPE_LEN / sectorsize. */
111 u16 nr_sectors;
112
00965807
QW
113 /*
114 * How many data/meta extents are in this stripe. Only for scrub status
115 * reporting purposes.
116 */
117 u16 nr_data_extents;
118 u16 nr_meta_extents;
119
2af2aaf9
QW
120 atomic_t pending_io;
121 wait_queue_head_t io_wait;
9ecb5ef5 122 wait_queue_head_t repair_wait;
2af2aaf9
QW
123
124 /*
125 * Indicate the states of the stripe. Bits are defined in
126 * scrub_stripe_flags enum.
127 */
128 unsigned long state;
129
130 /* Indicate which sectors are covered by extent items. */
131 unsigned long extent_sector_bitmap;
132
133 /*
134 * The errors hit during the initial read of the stripe.
135 *
136 * Would be utilized for error reporting and repair.
79b8ee70
QW
137 *
138 * The remaining init_nr_* counters record the number of errors hit, only used
139 * by error reporting.
2af2aaf9
QW
140 */
141 unsigned long init_error_bitmap;
79b8ee70
QW
142 unsigned int init_nr_io_errors;
143 unsigned int init_nr_csum_errors;
144 unsigned int init_nr_meta_errors;
2af2aaf9
QW
145
146 /*
147 * The following error bitmaps are all for the current status.
148 * Every time we submit a new read, these bitmaps may be updated.
149 *
150 * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
151 *
152 * IO and csum errors can happen for both metadata and data.
153 */
154 unsigned long error_bitmap;
155 unsigned long io_error_bitmap;
156 unsigned long csum_error_bitmap;
157 unsigned long meta_error_bitmap;
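	/*
	 * Example of how the above bitmaps relate (hypothetical values):
	 * if sector 3 hit a checksum mismatch and sector 7 hit an IO error,
	 * then csum_error_bitmap has bit 3 set, io_error_bitmap has bit 7
	 * set, and error_bitmap has both bits 3 and 7 set.
	 */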
158
058e09e6
QW
159 /* For writeback (repair or replace) error reporting. */
160 unsigned long write_error_bitmap;
161
162 /* Writeback can be concurrent, thus we need to protect the bitmap. */
163 spinlock_t write_error_lock;
164
2af2aaf9
QW
165 /*
166 * Checksum for the whole stripe if this stripe is inside a data block
167 * group.
168 */
169 u8 *csums;
9ecb5ef5
QW
170
171 struct work_struct work;
2af2aaf9
QW
172};
173
d9d181c1 174struct scrub_ctx {
54765392 175 struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX];
1009254b 176 struct scrub_stripe *raid56_data_stripes;
fb456252 177 struct btrfs_fs_info *fs_info;
a2de733c 178 int first_free;
54765392 179 int cur_stripe;
a2de733c
AJ
180 struct list_head csum_list;
181 atomic_t cancel_req;
8628764e 182 int readonly;
e360d2f5 183 int sectors_per_bio;
63a212ab 184
eb3b5053
DS
185 /* State of IO submission throttling affecting the associated device */
186 ktime_t throttle_deadline;
187 u64 throttle_sent;
188
63a212ab 189 int is_dev_replace;
de17addc 190 u64 write_pointer;
3fb99303 191
3fb99303 192 struct mutex wr_lock;
3fb99303 193 struct btrfs_device *wr_tgtdev;
63a212ab 194
a2de733c
AJ
195 /*
196 * statistics
197 */
198 struct btrfs_scrub_progress stat;
199 spinlock_t stat_lock;
f55985f4
FM
200
201 /*
202 * Use a ref counter to avoid use-after-free issues. Scrub workers
203 * decrement bios_in_flight and workers_pending and then do a wakeup
204 * on the list_wait wait queue. We must ensure the main scrub task
205 * doesn't free the scrub context before or while the workers are
206 * doing the wakeup() call.
207 */
99f4cdb1 208 refcount_t refs;
a2de733c
AJ
209};
210
558540c1
JS
211struct scrub_warning {
212 struct btrfs_path *path;
213 u64 extent_item_size;
558540c1 214 const char *errstr;
6aa21263 215 u64 physical;
558540c1
JS
216 u64 logical;
217 struct btrfs_device *dev;
558540c1
JS
218};
219
2af2aaf9
QW
220static void release_scrub_stripe(struct scrub_stripe *stripe)
221{
222 if (!stripe)
223 return;
224
225 for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
226 if (stripe->pages[i])
227 __free_page(stripe->pages[i]);
228 stripe->pages[i] = NULL;
229 }
230 kfree(stripe->sectors);
231 kfree(stripe->csums);
232 stripe->sectors = NULL;
233 stripe->csums = NULL;
00965807 234 stripe->sctx = NULL;
2af2aaf9
QW
235 stripe->state = 0;
236}
237
54765392
QW
238static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
239 struct scrub_stripe *stripe)
2af2aaf9
QW
240{
241 int ret;
242
243 memset(stripe, 0, sizeof(*stripe));
244
245 stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
246 stripe->state = 0;
247
248 init_waitqueue_head(&stripe->io_wait);
9ecb5ef5 249 init_waitqueue_head(&stripe->repair_wait);
2af2aaf9 250 atomic_set(&stripe->pending_io, 0);
058e09e6 251 spin_lock_init(&stripe->write_error_lock);
2af2aaf9
QW
252
253 ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
254 if (ret < 0)
255 goto error;
256
257 stripe->sectors = kcalloc(stripe->nr_sectors,
258 sizeof(struct scrub_sector_verification),
259 GFP_KERNEL);
260 if (!stripe->sectors)
261 goto error;
262
263 stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
264 fs_info->csum_size, GFP_KERNEL);
265 if (!stripe->csums)
266 goto error;
267 return 0;
268error:
269 release_scrub_stripe(stripe);
270 return -ENOMEM;
271}
272
9ecb5ef5 273static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
2af2aaf9
QW
274{
275 wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
276}
277
f55985f4 278static void scrub_put_ctx(struct scrub_ctx *sctx);
1623edeb 279
cb7ab021 280static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
3cb0929a
WS
281{
282 while (atomic_read(&fs_info->scrub_pause_req)) {
283 mutex_unlock(&fs_info->scrub_lock);
284 wait_event(fs_info->scrub_pause_wait,
285 atomic_read(&fs_info->scrub_pause_req) == 0);
286 mutex_lock(&fs_info->scrub_lock);
287 }
288}
289
0e22be89 290static void scrub_pause_on(struct btrfs_fs_info *fs_info)
cb7ab021
WS
291{
292 atomic_inc(&fs_info->scrubs_paused);
293 wake_up(&fs_info->scrub_pause_wait);
0e22be89 294}
cb7ab021 295
0e22be89
Z
296static void scrub_pause_off(struct btrfs_fs_info *fs_info)
297{
cb7ab021
WS
298 mutex_lock(&fs_info->scrub_lock);
299 __scrub_blocked_if_needed(fs_info);
300 atomic_dec(&fs_info->scrubs_paused);
301 mutex_unlock(&fs_info->scrub_lock);
302
303 wake_up(&fs_info->scrub_pause_wait);
304}
305
0e22be89
Z
306static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
307{
308 scrub_pause_on(fs_info);
309 scrub_pause_off(fs_info);
310}
311
d9d181c1 312static void scrub_free_csums(struct scrub_ctx *sctx)
a2de733c 313{
d9d181c1 314 while (!list_empty(&sctx->csum_list)) {
a2de733c 315 struct btrfs_ordered_sum *sum;
d9d181c1 316 sum = list_first_entry(&sctx->csum_list,
a2de733c
AJ
317 struct btrfs_ordered_sum, list);
318 list_del(&sum->list);
319 kfree(sum);
320 }
321}
322
d9d181c1 323static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
a2de733c
AJ
324{
325 int i;
a2de733c 326
d9d181c1 327 if (!sctx)
a2de733c
AJ
328 return;
329
54765392
QW
330 for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
331 release_scrub_stripe(&sctx->stripes[i]);
332
d9d181c1
SB
333 scrub_free_csums(sctx);
334 kfree(sctx);
a2de733c
AJ
335}
336
f55985f4
FM
337static void scrub_put_ctx(struct scrub_ctx *sctx)
338{
99f4cdb1 339 if (refcount_dec_and_test(&sctx->refs))
f55985f4
FM
340 scrub_free_ctx(sctx);
341}
342
92f7ba43
DS
343static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
344 struct btrfs_fs_info *fs_info, int is_dev_replace)
a2de733c 345{
d9d181c1 346 struct scrub_ctx *sctx;
a2de733c 347 int i;
a2de733c 348
58c4e173 349 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
d9d181c1 350 if (!sctx)
a2de733c 351 goto nomem;
99f4cdb1 352 refcount_set(&sctx->refs, 1);
63a212ab 353 sctx->is_dev_replace = is_dev_replace;
92f7ba43 354 sctx->fs_info = fs_info;
e49be14b 355 INIT_LIST_HEAD(&sctx->csum_list);
54765392
QW
356 for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
357 int ret;
358
359 ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
360 if (ret < 0)
361 goto nomem;
362 sctx->stripes[i].sctx = sctx;
363 }
d9d181c1 364 sctx->first_free = 0;
d9d181c1 365 atomic_set(&sctx->cancel_req, 0);
d9d181c1 366
d9d181c1 367 spin_lock_init(&sctx->stat_lock);
eb3b5053 368 sctx->throttle_deadline = 0;
ff023aac 369
3fb99303 370 mutex_init(&sctx->wr_lock);
8fcdac3f 371 if (is_dev_replace) {
ded56184 372 WARN_ON(!fs_info->dev_replace.tgtdev);
ded56184 373 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
ff023aac 374 }
8fcdac3f 375
d9d181c1 376 return sctx;
a2de733c
AJ
377
378nomem:
d9d181c1 379 scrub_free_ctx(sctx);
a2de733c
AJ
380 return ERR_PTR(-ENOMEM);
381}
382
c7499a64
FM
383static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
384 u64 root, void *warn_ctx)
558540c1 385{
558540c1
JS
386 u32 nlink;
387 int ret;
388 int i;
de2491fd 389 unsigned nofs_flag;
558540c1
JS
390 struct extent_buffer *eb;
391 struct btrfs_inode_item *inode_item;
ff023aac 392 struct scrub_warning *swarn = warn_ctx;
fb456252 393 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
558540c1
JS
394 struct inode_fs_paths *ipath = NULL;
395 struct btrfs_root *local_root;
1d4c08e0 396 struct btrfs_key key;
558540c1 397
56e9357a 398 local_root = btrfs_get_fs_root(fs_info, root, true);
558540c1
JS
399 if (IS_ERR(local_root)) {
400 ret = PTR_ERR(local_root);
401 goto err;
402 }
403
14692cc1
DS
404 /*
405 * this makes the path point to (inum INODE_ITEM ioff)
406 */
1d4c08e0
DS
407 key.objectid = inum;
408 key.type = BTRFS_INODE_ITEM_KEY;
409 key.offset = 0;
410
411 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
558540c1 412 if (ret) {
00246528 413 btrfs_put_root(local_root);
558540c1
JS
414 btrfs_release_path(swarn->path);
415 goto err;
416 }
417
418 eb = swarn->path->nodes[0];
419 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
420 struct btrfs_inode_item);
558540c1
JS
421 nlink = btrfs_inode_nlink(eb, inode_item);
422 btrfs_release_path(swarn->path);
423
de2491fd
DS
424 /*
425 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
426 * uses GFP_NOFS in this context, so we keep it consistent but it does
427 * not seem to be strictly necessary.
428 */
429 nofs_flag = memalloc_nofs_save();
558540c1 430 ipath = init_ipath(4096, local_root, swarn->path);
de2491fd 431 memalloc_nofs_restore(nofs_flag);
26bdef54 432 if (IS_ERR(ipath)) {
00246528 433 btrfs_put_root(local_root);
26bdef54
DC
434 ret = PTR_ERR(ipath);
435 ipath = NULL;
436 goto err;
437 }
558540c1
JS
438 ret = paths_from_inode(inum, ipath);
439
440 if (ret < 0)
441 goto err;
442
443 /*
444 * We deliberately ignore the fact that ipath might have been too small to
445 * hold all of the paths here
446 */
447 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
5d163e0e 448 btrfs_warn_in_rcu(fs_info,
8df507cb 449"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
5d163e0e 450 swarn->errstr, swarn->logical,
cb3e217b 451 btrfs_dev_name(swarn->dev),
6aa21263 452 swarn->physical,
5d163e0e 453 root, inum, offset,
8df507cb 454 fs_info->sectorsize, nlink,
5d163e0e 455 (char *)(unsigned long)ipath->fspath->val[i]);
558540c1 456
00246528 457 btrfs_put_root(local_root);
558540c1
JS
458 free_ipath(ipath);
459 return 0;
460
461err:
5d163e0e 462 btrfs_warn_in_rcu(fs_info,
6aa21263 463 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
5d163e0e 464 swarn->errstr, swarn->logical,
cb3e217b 465 btrfs_dev_name(swarn->dev),
6aa21263 466 swarn->physical,
5d163e0e 467 root, inum, offset, ret);
558540c1
JS
468
469 free_ipath(ipath);
470 return 0;
471}
472
00965807
QW
473static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
474 bool is_super, u64 logical, u64 physical)
558540c1 475{
00965807 476 struct btrfs_fs_info *fs_info = dev->fs_info;
558540c1
JS
477 struct btrfs_path *path;
478 struct btrfs_key found_key;
479 struct extent_buffer *eb;
480 struct btrfs_extent_item *ei;
481 struct scrub_warning swarn;
69917e43 482 unsigned long ptr = 0;
69917e43 483 u64 flags = 0;
558540c1 484 u64 ref_root;
69917e43 485 u32 item_size;
07c9a8e0 486 u8 ref_level = 0;
69917e43 487 int ret;
558540c1 488
e69bf81c 489 /* Super block error, no need to search extent tree. */
00965807 490 if (is_super) {
e69bf81c 491 btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
00965807 492 errstr, btrfs_dev_name(dev), physical);
e69bf81c
QW
493 return;
494 }
558540c1 495 path = btrfs_alloc_path();
8b9456da
DS
496 if (!path)
497 return;
558540c1 498
00965807
QW
499 swarn.physical = physical;
500 swarn.logical = logical;
558540c1 501 swarn.errstr = errstr;
a36cf8b8 502 swarn.dev = NULL;
558540c1 503
69917e43
LB
504 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
505 &flags);
558540c1
JS
506 if (ret < 0)
507 goto out;
508
558540c1
JS
509 swarn.extent_item_size = found_key.offset;
510
511 eb = path->nodes[0];
512 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
3212fa14 513 item_size = btrfs_item_size(eb, path->slots[0]);
558540c1 514
69917e43 515 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
558540c1 516 do {
6eda71d0
LB
517 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
518 item_size, &ref_root,
519 &ref_level);
ecaeb14b 520 btrfs_warn_in_rcu(fs_info,
6aa21263 521"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
5d163e0e 522 errstr, swarn.logical,
cb3e217b 523 btrfs_dev_name(dev),
6aa21263 524 swarn.physical,
558540c1
JS
525 ref_level ? "node" : "leaf",
526 ret < 0 ? -1 : ref_level,
527 ret < 0 ? -1 : ref_root);
528 } while (ret != 1);
d8fe29e9 529 btrfs_release_path(path);
558540c1 530 } else {
a2c8d27e
FM
531 struct btrfs_backref_walk_ctx ctx = { 0 };
532
d8fe29e9 533 btrfs_release_path(path);
a2c8d27e
FM
534
535 ctx.bytenr = found_key.objectid;
536 ctx.extent_item_pos = swarn.logical - found_key.objectid;
537 ctx.fs_info = fs_info;
538
558540c1 539 swarn.path = path;
a36cf8b8 540 swarn.dev = dev;
a2c8d27e
FM
541
542 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
558540c1
JS
543 }
544
545out:
546 btrfs_free_path(path);
558540c1
JS
547}
548
4c664611 549static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
af8e2d1d 550{
4c664611 551 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
10f11900 552 return 2;
4c664611 553 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
10f11900
ZL
554 return 3;
555 else
4c664611 556 return (int)bioc->num_stripes;
af8e2d1d
MX
557}
558
10f11900 559static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
18d758a2 560 u64 full_stripe_logical,
af8e2d1d
MX
561 int nstripes, int mirror,
562 int *stripe_index,
563 u64 *stripe_offset)
564{
565 int i;
566
ffe2d203 567 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
18d758a2
QW
568 const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
569 nstripes - 1 : nstripes - 2;
570
af8e2d1d 571 /* RAID5/6 */
18d758a2
QW
572 for (i = 0; i < nr_data_stripes; i++) {
573 const u64 data_stripe_start = full_stripe_logical +
574 (i * BTRFS_STRIPE_LEN);
af8e2d1d 575
18d758a2
QW
576 if (logical >= data_stripe_start &&
577 logical < data_stripe_start + BTRFS_STRIPE_LEN)
af8e2d1d
MX
578 break;
579 }
580
581 *stripe_index = i;
18d758a2
QW
582 *stripe_offset = (logical - full_stripe_logical) &
583 BTRFS_STRIPE_LEN_MASK;
af8e2d1d
MX
584 } else {
585 /* The other RAID type */
586 *stripe_index = mirror;
587 *stripe_offset = 0;
588 }
589}
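/*
 * Worked example for the RAID5/6 branch above, assuming a 3-disk RAID5
 * (nstripes = 3, so nr_data_stripes = 2) and the 64K BTRFS_STRIPE_LEN:
 * with logical = full_stripe_logical + 80K, the loop stops at i = 1
 * (the second data stripe), so *stripe_index = 1 and
 * *stripe_offset = 80K & (64K - 1) = 16K.
 */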
590
de17addc
NA
591static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
592{
593 int ret = 0;
594 u64 length;
595
596 if (!btrfs_is_zoned(sctx->fs_info))
597 return 0;
598
7db1c5d1
NA
599 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
600 return 0;
601
de17addc
NA
602 if (sctx->write_pointer < physical) {
603 length = physical - sctx->write_pointer;
604
605 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
606 sctx->write_pointer, length);
607 if (!ret)
608 sctx->write_pointer = physical;
609 }
610 return ret;
611}
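/*
 * Example of the gap filling above (hypothetical numbers): on a zoned
 * target whose cached write pointer sits at physical 1M while the next
 * write must land at physical 1M + 128K, the 128K in between is zeroed
 * out with btrfs_zoned_issue_zeroout() and write_pointer is advanced to
 * 1M + 128K, keeping the zone's append-only sequence intact.
 */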
612
a3ddbaeb
QW
613static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
614{
615 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
616 int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
617
618 return stripe->pages[page_index];
619}
620
621static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
622 int sector_nr)
623{
624 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
625
626 return offset_in_page(sector_nr << fs_info->sectorsize_bits);
627}
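/*
 * Worked example for the two helpers above, assuming a 4K sectorsize:
 * on a 4K-page system sector_nr 5 maps to pages[5] at offset 0, while on
 * a 64K-page system the same sector maps to pages[0] at offset 20K,
 * since 5 << 12 = 20K and 20K >> PAGE_SHIFT = 0.
 */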
628
97cf8f37 629static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
a3ddbaeb
QW
630{
631 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
632 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
633 const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
634 const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
635 const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
636 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
637 u8 on_disk_csum[BTRFS_CSUM_SIZE];
638 u8 calculated_csum[BTRFS_CSUM_SIZE];
639 struct btrfs_header *header;
640
641 /*
642 * Here we don't have a good way to attach the pages (and subpages)
643 * to a dummy extent buffer, thus we have to directly grab the members
644 * from pages.
645 */
646 header = (struct btrfs_header *)(page_address(first_page) + first_off);
647 memcpy(on_disk_csum, header->csum, fs_info->csum_size);
648
649 if (logical != btrfs_stack_header_bytenr(header)) {
650 bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
651 bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
652 btrfs_warn_rl(fs_info,
653 "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
654 logical, stripe->mirror_num,
655 btrfs_stack_header_bytenr(header), logical);
656 return;
657 }
658 if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
659 bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
660 bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
661 btrfs_warn_rl(fs_info,
662 "tree block %llu mirror %u has bad fsid, has %pU want %pU",
663 logical, stripe->mirror_num,
664 header->fsid, fs_info->fs_devices->fsid);
665 return;
666 }
667 if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
668 BTRFS_UUID_SIZE) != 0) {
669 bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
670 bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
671 btrfs_warn_rl(fs_info,
672 "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
673 logical, stripe->mirror_num,
674 header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
675 return;
676 }
677
678 /* Now check tree block csum. */
679 shash->tfm = fs_info->csum_shash;
680 crypto_shash_init(shash);
681 crypto_shash_update(shash, page_address(first_page) + first_off +
682 BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
683
684 for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
685 struct page *page = scrub_stripe_get_page(stripe, i);
686 unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
687
688 crypto_shash_update(shash, page_address(page) + page_off,
689 fs_info->sectorsize);
690 }
691
692 crypto_shash_final(shash, calculated_csum);
693 if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
694 bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
695 bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
696 btrfs_warn_rl(fs_info,
697 "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
698 logical, stripe->mirror_num,
699 CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
700 CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
701 return;
702 }
703 if (stripe->sectors[sector_nr].generation !=
704 btrfs_stack_header_generation(header)) {
705 bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
706 bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
707 btrfs_warn_rl(fs_info,
708 "tree block %llu mirror %u has bad generation, has %llu want %llu",
709 logical, stripe->mirror_num,
710 btrfs_stack_header_generation(header),
711 stripe->sectors[sector_nr].generation);
712 return;
713 }
714 bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
715 bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
716 bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
717}
718
97cf8f37
QW
719static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
720{
721 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
722 struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
723 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
724 struct page *page = scrub_stripe_get_page(stripe, sector_nr);
725 unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
726 u8 csum_buf[BTRFS_CSUM_SIZE];
727 int ret;
728
729 ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
730
731 /* Sector not utilized, skip it. */
732 if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
733 return;
734
735 /* IO error, no need to check. */
736 if (test_bit(sector_nr, &stripe->io_error_bitmap))
737 return;
738
739 /* Metadata, verify the full tree block. */
740 if (sector->is_metadata) {
741 /*
742 * Check if the tree block crosses the stripe boundary. If it
743 * crosses the boundary, we cannot verify it and only give a
744 * warning.
745 *
746 * This can only happen on a very old filesystem where chunks
747 * are not ensured to be stripe aligned.
748 */
749 if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
750 btrfs_warn_rl(fs_info,
751 "tree block at %llu crosses stripe boundary %llu",
752 stripe->logical +
753 (sector_nr << fs_info->sectorsize_bits),
754 stripe->logical);
755 return;
756 }
757 scrub_verify_one_metadata(stripe, sector_nr);
758 return;
759 }
760
761 /*
762 * Data is easier, we just verify the data csum (if we have it). For
763 * cases without csum, we have no other choice but to trust it.
764 */
765 if (!sector->csum) {
766 clear_bit(sector_nr, &stripe->error_bitmap);
767 return;
768 }
769
770 ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
771 if (ret < 0) {
772 set_bit(sector_nr, &stripe->csum_error_bitmap);
773 set_bit(sector_nr, &stripe->error_bitmap);
774 } else {
775 clear_bit(sector_nr, &stripe->csum_error_bitmap);
776 clear_bit(sector_nr, &stripe->error_bitmap);
777 }
778}
779
780/* Verify specified sectors of a stripe. */
9ecb5ef5 781static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
97cf8f37
QW
782{
783 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
784 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
785 int sector_nr;
786
787 for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
788 scrub_verify_one_sector(stripe, sector_nr);
789 if (stripe->sectors[sector_nr].is_metadata)
790 sector_nr += sectors_per_tree - 1;
791 }
792}
793
9ecb5ef5
QW
794static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
795{
796 int i;
797
798 for (i = 0; i < stripe->nr_sectors; i++) {
799 if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
800 scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
801 break;
802 }
803 ASSERT(i < stripe->nr_sectors);
804 return i;
805}
806
807/*
808 * Repair read is different from the regular read:
809 *
810 * - Only reads the failed sectors
811 * - May have extra blocksize limits
812 */
813static void scrub_repair_read_endio(struct btrfs_bio *bbio)
814{
815 struct scrub_stripe *stripe = bbio->private;
816 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
817 struct bio_vec *bvec;
818 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
819 u32 bio_size = 0;
820 int i;
821
822 ASSERT(sector_nr < stripe->nr_sectors);
823
824 bio_for_each_bvec_all(bvec, &bbio->bio, i)
825 bio_size += bvec->bv_len;
826
827 if (bbio->bio.bi_status) {
828 bitmap_set(&stripe->io_error_bitmap, sector_nr,
829 bio_size >> fs_info->sectorsize_bits);
830 bitmap_set(&stripe->error_bitmap, sector_nr,
831 bio_size >> fs_info->sectorsize_bits);
832 } else {
833 bitmap_clear(&stripe->io_error_bitmap, sector_nr,
834 bio_size >> fs_info->sectorsize_bits);
835 }
836 bio_put(&bbio->bio);
837 if (atomic_dec_and_test(&stripe->pending_io))
838 wake_up(&stripe->io_wait);
839}
840
841static int calc_next_mirror(int mirror, int num_copies)
842{
843 ASSERT(mirror <= num_copies);
844 return (mirror + 1 > num_copies) ? 1 : mirror + 1;
845}
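/*
 * Example: with num_copies = 3 the mirrors cycle 1 -> 2 -> 3 -> 1, so
 * calc_next_mirror(3, 3) wraps back to 1. This is what lets the repair
 * loop below walk every other copy starting from the failed one.
 */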
846
847static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
848 int mirror, int blocksize, bool wait)
849{
850 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
851 struct btrfs_bio *bbio = NULL;
852 const unsigned long old_error_bitmap = stripe->error_bitmap;
853 int i;
854
855 ASSERT(stripe->mirror_num >= 1);
856 ASSERT(atomic_read(&stripe->pending_io) == 0);
857
858 for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
859 struct page *page;
860 int pgoff;
861 int ret;
862
863 page = scrub_stripe_get_page(stripe, i);
864 pgoff = scrub_stripe_get_page_offset(stripe, i);
865
866 /* The current sector cannot be merged, submit the bio. */
867 if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
868 bbio->bio.bi_iter.bi_size >= blocksize)) {
869 ASSERT(bbio->bio.bi_iter.bi_size);
870 atomic_inc(&stripe->pending_io);
871 btrfs_submit_bio(bbio, mirror);
872 if (wait)
873 wait_scrub_stripe_io(stripe);
874 bbio = NULL;
875 }
876
877 if (!bbio) {
878 bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
879 fs_info, scrub_repair_read_endio, stripe);
880 bbio->bio.bi_iter.bi_sector = (stripe->logical +
881 (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
882 }
883
884 ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
885 ASSERT(ret == fs_info->sectorsize);
886 }
887 if (bbio) {
888 ASSERT(bbio->bio.bi_iter.bi_size);
889 atomic_inc(&stripe->pending_io);
890 btrfs_submit_bio(bbio, mirror);
891 if (wait)
892 wait_scrub_stripe_io(stripe);
893 }
894}
895
00965807
QW
896static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
897 struct scrub_stripe *stripe)
898{
899 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
900 DEFAULT_RATELIMIT_BURST);
901 struct btrfs_fs_info *fs_info = sctx->fs_info;
902 struct btrfs_device *dev = NULL;
903 u64 physical = 0;
904 int nr_data_sectors = 0;
905 int nr_meta_sectors = 0;
906 int nr_nodatacsum_sectors = 0;
907 int nr_repaired_sectors = 0;
908 int sector_nr;
909
1009254b
QW
910 if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
911 return;
912
00965807
QW
913 /*
914 * Init the info needed for error reporting.
915 *
916 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
917 * and thus has no need for dev/physical, error reporting still needs dev and physical.
918 */
919 if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
920 u64 mapped_len = fs_info->sectorsize;
921 struct btrfs_io_context *bioc = NULL;
922 int stripe_index = stripe->mirror_num - 1;
923 int ret;
924
925 /* For scrub, our mirror_num should always start at 1. */
926 ASSERT(stripe->mirror_num >= 1);
927 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
928 stripe->logical, &mapped_len, &bioc);
929 /*
930 * If we failed, dev will be NULL, and later detailed reports
931 * will just be skipped.
932 */
933 if (ret < 0)
934 goto skip;
935 physical = bioc->stripes[stripe_index].physical;
936 dev = bioc->stripes[stripe_index].dev;
937 btrfs_put_bioc(bioc);
938 }
939
940skip:
941 for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
942 bool repaired = false;
943
944 if (stripe->sectors[sector_nr].is_metadata) {
945 nr_meta_sectors++;
946 } else {
947 nr_data_sectors++;
948 if (!stripe->sectors[sector_nr].csum)
949 nr_nodatacsum_sectors++;
950 }
951
952 if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
953 !test_bit(sector_nr, &stripe->error_bitmap)) {
954 nr_repaired_sectors++;
955 repaired = true;
956 }
957
958 /* Good sector from the beginning, nothing needs to be done. */
959 if (!test_bit(sector_nr, &stripe->init_error_bitmap))
960 continue;
961
962 /*
963 * Report errors for the corrupted sectors. If repaired, just
964 * output a message saying the sector has been fixed.
965 */
966 if (repaired) {
967 if (dev) {
968 btrfs_err_rl_in_rcu(fs_info,
969 "fixed up error at logical %llu on dev %s physical %llu",
970 stripe->logical, btrfs_dev_name(dev),
971 physical);
972 } else {
973 btrfs_err_rl_in_rcu(fs_info,
974 "fixed up error at logical %llu on mirror %u",
975 stripe->logical, stripe->mirror_num);
976 }
977 continue;
978 }
979
980 /* The remaining cases are all unrepaired sectors. */
981 if (dev) {
982 btrfs_err_rl_in_rcu(fs_info,
983 "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
984 stripe->logical, btrfs_dev_name(dev),
985 physical);
986 } else {
987 btrfs_err_rl_in_rcu(fs_info,
988 "unable to fixup (regular) error at logical %llu on mirror %u",
989 stripe->logical, stripe->mirror_num);
990 }
991
992 if (test_bit(sector_nr, &stripe->io_error_bitmap))
993 if (__ratelimit(&rs) && dev)
994 scrub_print_common_warning("i/o error", dev, false,
995 stripe->logical, physical);
996 if (test_bit(sector_nr, &stripe->csum_error_bitmap))
997 if (__ratelimit(&rs) && dev)
998 scrub_print_common_warning("checksum error", dev, false,
999 stripe->logical, physical);
1000 if (test_bit(sector_nr, &stripe->meta_error_bitmap))
1001 if (__ratelimit(&rs) && dev)
1002 scrub_print_common_warning("header error", dev, false,
1003 stripe->logical, physical);
1004 }
1005
1006 spin_lock(&sctx->stat_lock);
1007 sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
1008 sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
1009 sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
1010 sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
1011 sctx->stat.no_csum += nr_nodatacsum_sectors;
79b8ee70
QW
1012 sctx->stat.read_errors += stripe->init_nr_io_errors;
1013 sctx->stat.csum_errors += stripe->init_nr_csum_errors;
1014 sctx->stat.verify_errors += stripe->init_nr_meta_errors;
00965807
QW
1015 sctx->stat.uncorrectable_errors +=
1016 bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
1017 sctx->stat.corrected_errors += nr_repaired_sectors;
1018 spin_unlock(&sctx->stat_lock);
1019}
1020
9ecb5ef5
QW
1021/*
1022 * The main entry point for all read-related scrub work, including:
1023 *
1024 * - Wait for the initial read to finish
1025 * - Verify and locate any bad sectors
1026 * - Go through the remaining mirrors and try to read as large a blocksize as
1027 * possible
1028 * - Go through all mirrors (including the failed mirror) sector-by-sector
1029 *
1030 * Writeback does not happen here, it needs extra synchronization.
1031 */
1032static void scrub_stripe_read_repair_worker(struct work_struct *work)
1033{
1034 struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
1035 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1036 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
1037 stripe->bg->length);
1038 int mirror;
1039 int i;
1040
1041 ASSERT(stripe->mirror_num > 0);
1042
1043 wait_scrub_stripe_io(stripe);
1044 scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
1045 /* Save the initial failed bitmap for later repair and error reporting. */
1046 stripe->init_error_bitmap = stripe->error_bitmap;
79b8ee70
QW
1047 stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
1048 stripe->nr_sectors);
1049 stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
1050 stripe->nr_sectors);
1051 stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
1052 stripe->nr_sectors);
9ecb5ef5
QW
1053
1054 if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
1055 goto out;
1056
1057 /*
1058 * Try all remaining mirrors.
1059 *
1060 * Here we still try to read as large a block as possible, as this is
1061 * faster and we have extra safety nets to rely on.
1062 */
1063 for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
1064 mirror != stripe->mirror_num;
1065 mirror = calc_next_mirror(mirror, num_copies)) {
1066 const unsigned long old_error_bitmap = stripe->error_bitmap;
1067
1068 scrub_stripe_submit_repair_read(stripe, mirror,
1069 BTRFS_STRIPE_LEN, false);
1070 wait_scrub_stripe_io(stripe);
1071 scrub_verify_one_stripe(stripe, old_error_bitmap);
1072 if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
1073 goto out;
1074 }
1075
1076 /*
1077 * Last safety net, try re-checking all mirrors, including the failed
1078 * one, sector-by-sector.
1079 *
1080 * If one sector fails the drive's internal csum check, the whole read
1081 * containing the offending sector would be marked as an error.
1082 * Thus here we do sector-by-sector reads.
1083 *
1084 * This can be slow, thus we only try it as the last resort.
1085 */
1086
1087 for (i = 0, mirror = stripe->mirror_num;
1088 i < num_copies;
1089 i++, mirror = calc_next_mirror(mirror, num_copies)) {
1090 const unsigned long old_error_bitmap = stripe->error_bitmap;
1091
1092 scrub_stripe_submit_repair_read(stripe, mirror,
1093 fs_info->sectorsize, true);
1094 wait_scrub_stripe_io(stripe);
1095 scrub_verify_one_stripe(stripe, old_error_bitmap);
1096 if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
1097 goto out;
1098 }
1099out:
00965807 1100 scrub_stripe_report_errors(stripe->sctx, stripe);
9ecb5ef5
QW
1101 set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
1102 wake_up(&stripe->repair_wait);
1103}
1104
54765392 1105static void scrub_read_endio(struct btrfs_bio *bbio)
9ecb5ef5
QW
1106{
1107 struct scrub_stripe *stripe = bbio->private;
1108
1109 if (bbio->bio.bi_status) {
1110 bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
1111 bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
1112 } else {
1113 bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
1114 }
1115 bio_put(&bbio->bio);
1116 if (atomic_dec_and_test(&stripe->pending_io)) {
1117 wake_up(&stripe->io_wait);
1118 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
1119 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
1120 }
1121}
1122
058e09e6
QW
1123static void scrub_write_endio(struct btrfs_bio *bbio)
1124{
1125 struct scrub_stripe *stripe = bbio->private;
1126 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1127 struct bio_vec *bvec;
1128 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
1129 u32 bio_size = 0;
1130 int i;
1131
1132 bio_for_each_bvec_all(bvec, &bbio->bio, i)
1133 bio_size += bvec->bv_len;
1134
1135 if (bbio->bio.bi_status) {
1136 unsigned long flags;
1137
1138 spin_lock_irqsave(&stripe->write_error_lock, flags);
1139 bitmap_set(&stripe->write_error_bitmap, sector_nr,
1140 bio_size >> fs_info->sectorsize_bits);
1141 spin_unlock_irqrestore(&stripe->write_error_lock, flags);
1142 }
1143 bio_put(&bbio->bio);
1144
1145 if (atomic_dec_and_test(&stripe->pending_io))
1146 wake_up(&stripe->io_wait);
1147}
1148
b675df02
QW
1149static void scrub_submit_write_bio(struct scrub_ctx *sctx,
1150 struct scrub_stripe *stripe,
1151 struct btrfs_bio *bbio, bool dev_replace)
1152{
1153 struct btrfs_fs_info *fs_info = sctx->fs_info;
1154 u32 bio_len = bbio->bio.bi_iter.bi_size;
1155 u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
1156 stripe->logical;
1157
1158 fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
1159 atomic_inc(&stripe->pending_io);
1160 btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
1161 if (!btrfs_is_zoned(fs_info))
1162 return;
1163 /*
1164 * For zoned writeback, queue depth must be 1, thus we must wait for
1165 * the write to finish before the next write.
1166 */
1167 wait_scrub_stripe_io(stripe);
1168
1169 /*
1170 * We also need to update the write pointer if the write finished
1171 * successfully.
1172 */
1173 if (!test_bit(bio_off >> fs_info->sectorsize_bits,
1174 &stripe->write_error_bitmap))
1175 sctx->write_pointer += bio_len;
1176}
1177
058e09e6
QW
1178/*
1179 * Submit the write bio(s) for the sectors specified by @write_bitmap.
1180 *
1181 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
1182 *
1183 * - Only needs logical bytenr and mirror_num
1184 * Just like the scrub read path
1185 *
1186 * - Would only result in writes to the specified mirror
1187 * Unlike the regular writeback path, which would write back to all stripes
1188 *
1189 * - Handle dev-replace and read-repair writeback differently
1190 */
54765392
QW
1191static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
1192 unsigned long write_bitmap, bool dev_replace)
058e09e6
QW
1193{
1194 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1195 struct btrfs_bio *bbio = NULL;
058e09e6
QW
1196 int sector_nr;
1197
1198 for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
1199 struct page *page = scrub_stripe_get_page(stripe, sector_nr);
1200 unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
1201 int ret;
1202
1203 /* We should only write back sectors covered by an extent. */
1204 ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
1205
1206 /* Cannot merge with previous sector, submit the current one. */
1207 if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
b675df02 1208 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
058e09e6
QW
1209 bbio = NULL;
1210 }
1211 if (!bbio) {
1212 bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
1213 fs_info, scrub_write_endio, stripe);
1214 bbio->bio.bi_iter.bi_sector = (stripe->logical +
1215 (sector_nr << fs_info->sectorsize_bits)) >>
1216 SECTOR_SHIFT;
1217 }
1218 ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
1219 ASSERT(ret == fs_info->sectorsize);
1220 }
b675df02
QW
1221 if (bbio)
1222 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
058e09e6
QW
1223}
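/*
 * Typical usage sketch (mirroring the callers later in this file): for
 * read-repair, write back only the sectors that were bad initially but
 * are good now:
 *
 *   bitmap_andnot(&repaired, &stripe->init_error_bitmap,
 *                 &stripe->error_bitmap, stripe->nr_sectors);
 *   scrub_write_sectors(sctx, stripe, repaired, false);
 *
 * while dev-replace instead writes every good sector covered by an
 * extent to the target device (dev_replace == true).
 */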
1224
13a62fd9
QW
1225/*
1226 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
1227 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
1228 */
e02ee89b
QW
1229static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
1230 unsigned int bio_size)
eb3b5053
DS
1231{
1232 const int time_slice = 1000;
eb3b5053
DS
1233 s64 delta;
1234 ktime_t now;
1235 u32 div;
1236 u64 bwlimit;
1237
eb3b5053
DS
1238 bwlimit = READ_ONCE(device->scrub_speed_max);
1239 if (bwlimit == 0)
1240 return;
1241
1242 /*
1243 * The slice is divided into intervals in which the IO is submitted,
1244 * adjusted by bwlimit, with a maximum of 64 intervals.
1245 */
1246 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1247 div = min_t(u32, 64, div);
1248
1249 /* Start new epoch, set deadline */
1250 now = ktime_get();
1251 if (sctx->throttle_deadline == 0) {
1252 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1253 sctx->throttle_sent = 0;
1254 }
1255
1256 /* Still within the time slice? */
1257 if (ktime_before(now, sctx->throttle_deadline)) {
1258 /* If current bio is within the limit, send it */
e02ee89b 1259 sctx->throttle_sent += bio_size;
eb3b5053
DS
1260 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1261 return;
1262
1263 /* We're over the limit, sleep until the rest of the slice */
1264 delta = ktime_ms_delta(sctx->throttle_deadline, now);
1265 } else {
1266 /* New request after deadline, start new epoch */
1267 delta = 0;
1268 }
1269
1270 if (delta) {
1271 long timeout;
1272
1273 timeout = div_u64(delta * HZ, 1000);
1274 schedule_timeout_interruptible(timeout);
1275 }
1276
1277 /* Next call will start the deadline period */
1278 sctx->throttle_deadline = 0;
1279}
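/*
 * Worked example of the throttling above (hypothetical limit): with
 * scrub_speed_max = 100 MiB/s, div = min(64, 100M / 16M) = 6, so each
 * epoch lasts 1000 / 6 ms (about 166 ms) and allows roughly
 * 100 MiB / 6 (about 17 MiB) of submitted IO before the task sleeps
 * until the deadline.
 */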
1280
3b080b25
WS
1281/*
1282 * Given a physical address, this will calculate its
1283 * logical offset. If this is a parity stripe, it will return
1284 * the left-most data stripe's logical offset.
1285 *
1286 * Return 0 if it is a data stripe, 1 means parity stripe.
1287 */
1288static int get_raid56_logic_offset(u64 physical, int num,
5a6ac9ea
MX
1289 struct map_lookup *map, u64 *offset,
1290 u64 *stripe_start)
3b080b25
WS
1291{
1292 int i;
1293 int j = 0;
3b080b25 1294 u64 last_offset;
cff82672 1295 const int data_stripes = nr_data_stripes(map);
3b080b25 1296
cff82672 1297 last_offset = (physical - map->stripes[num].physical) * data_stripes;
5a6ac9ea
MX
1298 if (stripe_start)
1299 *stripe_start = last_offset;
1300
3b080b25 1301 *offset = last_offset;
cff82672 1302 for (i = 0; i < data_stripes; i++) {
6ded22c1
QW
1303 u32 stripe_nr;
1304 u32 stripe_index;
1305 u32 rot;
1306
a97699d1 1307 *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
3b080b25 1308
6ded22c1 1309 stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
3b080b25
WS
1310
1311 /* Work out the disk rotation on this stripe-set */
6ded22c1
QW
1312 rot = stripe_nr % map->num_stripes;
1313 stripe_nr /= map->num_stripes;
3b080b25
WS
1314 /* Calculate which stripe this data is located on */
1315 rot += i;
e4fbaee2 1316 stripe_index = rot % map->num_stripes;
3b080b25
WS
1317 if (stripe_index == num)
1318 return 0;
1319 if (stripe_index < num)
1320 j++;
1321 }
a97699d1 1322 *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
3b080b25
WS
1323 return 1;
1324}
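/*
 * Worked example (assuming a 3-disk RAID5 and the 64K BTRFS_STRIPE_LEN):
 * for num = 1 and physical equal to the start of that device's chunk
 * stripe, last_offset = 0; the i = 1 iteration hits stripe_index == num,
 * so the function returns 0 (data) with *offset = 64K, i.e. the second
 * data stripe of the first full stripe. When the device holds the parity
 * of that rotation instead, the loop never matches and the function
 * returns 1.
 */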
1325
416bd7e7
QW
1326/*
1327 * Return 0 if the extent item range covers any byte of the range.
1328 * Return <0 if the extent item is before @search_start.
1329 * Return >0 if the extent item is after @search_start + @search_len.
1330 */
1331static int compare_extent_item_range(struct btrfs_path *path,
1332 u64 search_start, u64 search_len)
1333{
1334 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
1335 u64 len;
1336 struct btrfs_key key;
1337
1338 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1339 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
1340 key.type == BTRFS_METADATA_ITEM_KEY);
1341 if (key.type == BTRFS_METADATA_ITEM_KEY)
1342 len = fs_info->nodesize;
1343 else
1344 len = key.offset;
1345
1346 if (key.objectid + len <= search_start)
1347 return -1;
1348 if (key.objectid >= search_start + search_len)
1349 return 1;
1350 return 0;
1351}
1352
1353/*
1354 * Locate one extent item which covers any byte in range
1355 * [@search_start, @search_start + @search_length)
1356 *
1357 * If the path is not initialized, we will initialize the search by doing
1358 * a btrfs_search_slot().
1359 * If the path is already initialized, we will use the path as the initial
1360 * slot, to avoid duplicated btrfs_search_slot() calls.
1361 *
1362 * NOTE: If an extent item starts before @search_start, we will still
1363 * return the extent item. This is for data extents that cross the stripe boundary.
1364 *
1365 * Return 0 if we found such extent item, and @path will point to the extent item.
1366 * Return >0 if no such extent item can be found, and @path will be released.
1367 * Return <0 if hit fatal error, and @path will be released.
1368 */
1369static int find_first_extent_item(struct btrfs_root *extent_root,
1370 struct btrfs_path *path,
1371 u64 search_start, u64 search_len)
1372{
1373 struct btrfs_fs_info *fs_info = extent_root->fs_info;
1374 struct btrfs_key key;
1375 int ret;
1376
1377 /* Continue using the existing path */
1378 if (path->nodes[0])
1379 goto search_forward;
1380
1381 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1382 key.type = BTRFS_METADATA_ITEM_KEY;
1383 else
1384 key.type = BTRFS_EXTENT_ITEM_KEY;
1385 key.objectid = search_start;
1386 key.offset = (u64)-1;
1387
1388 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1389 if (ret < 0)
1390 return ret;
1391
1392 ASSERT(ret > 0);
1393 /*
1394 * Here we intentionally pass 0 as @min_objectid, as there could be
1395 * an extent item starting before @search_start.
1396 */
1397 ret = btrfs_previous_extent_item(extent_root, path, 0);
1398 if (ret < 0)
1399 return ret;
1400 /*
1401 * No matter whether we have found an extent item, the next loop will
1402 * properly do every check on the key.
1403 */
1404search_forward:
1405 while (true) {
1406 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1407 if (key.objectid >= search_start + search_len)
1408 break;
1409 if (key.type != BTRFS_METADATA_ITEM_KEY &&
1410 key.type != BTRFS_EXTENT_ITEM_KEY)
1411 goto next;
1412
1413 ret = compare_extent_item_range(path, search_start, search_len);
1414 if (ret == 0)
1415 return ret;
1416 if (ret > 0)
1417 break;
1418next:
1419 path->slots[0]++;
1420 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
1421 ret = btrfs_next_leaf(extent_root, path);
1422 if (ret) {
1423 /* Either no more item or fatal error */
1424 btrfs_release_path(path);
1425 return ret;
1426 }
1427 }
1428 }
1429 btrfs_release_path(path);
1430 return 1;
1431}
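/*
 * Typical usage sketch (this is essentially how scrub_find_fill_first_stripe()
 * below drives it): keep the path across calls so only the first call pays
 * for a full btrfs_search_slot():
 *
 *   while (cur < end) {
 *           ret = find_first_extent_item(extent_root, &path, cur, end - cur);
 *           if (ret)
 *                   break;          // error or no more extent items
 *           get_extent_info(&path, &start, &len, &flags, &gen);
 *           cur = start + len;
 *   }
 *   btrfs_release_path(&path);
 */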
1432
09022b14
QW
1433static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
1434 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
1435{
1436 struct btrfs_key key;
1437 struct btrfs_extent_item *ei;
1438
1439 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1440 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
1441 key.type == BTRFS_EXTENT_ITEM_KEY);
1442 *extent_start_ret = key.objectid;
1443 if (key.type == BTRFS_METADATA_ITEM_KEY)
1444 *size_ret = path->nodes[0]->fs_info->nodesize;
1445 else
1446 *size_ret = key.offset;
1447 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
1448 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
1449 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
1450}
1451
7db1c5d1
NA
1452static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
1453 u64 physical, u64 physical_end)
1454{
1455 struct btrfs_fs_info *fs_info = sctx->fs_info;
1456 int ret = 0;
1457
1458 if (!btrfs_is_zoned(fs_info))
1459 return 0;
1460
7db1c5d1
NA
1461 mutex_lock(&sctx->wr_lock);
1462 if (sctx->write_pointer < physical_end) {
1463 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
1464 physical,
1465 sctx->write_pointer);
1466 if (ret)
1467 btrfs_err(fs_info,
1468 "zoned: failed to recover write pointer");
1469 }
1470 mutex_unlock(&sctx->wr_lock);
1471 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
1472
1473 return ret;
1474}
1475
b9795475
QW
1476static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
1477 struct scrub_stripe *stripe,
1478 u64 extent_start, u64 extent_len,
1479 u64 extent_flags, u64 extent_gen)
1480{
1481 for (u64 cur_logical = max(stripe->logical, extent_start);
1482 cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
1483 extent_start + extent_len);
1484 cur_logical += fs_info->sectorsize) {
1485 const int nr_sector = (cur_logical - stripe->logical) >>
1486 fs_info->sectorsize_bits;
1487 struct scrub_sector_verification *sector =
1488 &stripe->sectors[nr_sector];
1489
1490 set_bit(nr_sector, &stripe->extent_sector_bitmap);
1491 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1492 sector->is_metadata = true;
1493 sector->generation = extent_gen;
1494 }
1495 }
1496}
1497
1498static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
1499{
1500 stripe->extent_sector_bitmap = 0;
1501 stripe->init_error_bitmap = 0;
79b8ee70
QW
1502 stripe->init_nr_io_errors = 0;
1503 stripe->init_nr_csum_errors = 0;
1504 stripe->init_nr_meta_errors = 0;
b9795475
QW
1505 stripe->error_bitmap = 0;
1506 stripe->io_error_bitmap = 0;
1507 stripe->csum_error_bitmap = 0;
1508 stripe->meta_error_bitmap = 0;
1509}
1510
1511/*
1512 * Locate one stripe which has at least one extent in its range.
1513 *
1514 * Return 0 if found such stripe, and store its info into @stripe.
1515 * Return >0 if there is no such stripe in the specified range.
1516 * Return <0 for error.
1517 */
54765392
QW
1518static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
1519 struct btrfs_device *dev, u64 physical,
1520 int mirror_num, u64 logical_start,
1521 u32 logical_len,
1522 struct scrub_stripe *stripe)
b9795475
QW
1523{
1524 struct btrfs_fs_info *fs_info = bg->fs_info;
1525 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
1526 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
1527 const u64 logical_end = logical_start + logical_len;
1528 struct btrfs_path path = { 0 };
1529 u64 cur_logical = logical_start;
1530 u64 stripe_end;
1531 u64 extent_start;
1532 u64 extent_len;
1533 u64 extent_flags;
1534 u64 extent_gen;
1535 int ret;
1536
1537 memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
1538 stripe->nr_sectors);
1539 scrub_stripe_reset_bitmaps(stripe);
1540
1541 /* The range must be inside the bg. */
1542 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
1543
1544 path.search_commit_root = 1;
1545 path.skip_locking = 1;
1546
1547 ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
1548 /* Either error or not found. */
1549 if (ret)
1550 goto out;
1551 get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
00965807
QW
1552 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1553 stripe->nr_meta_extents++;
1554 if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
1555 stripe->nr_data_extents++;
b9795475
QW
1556 cur_logical = max(extent_start, cur_logical);
1557
1558 /*
1559 * Round down to stripe boundary.
1560 *
1561 * The extra calculation against bg->start is to handle block groups
1562 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
1563 */
1564 stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
1565 bg->start;
1566 stripe->physical = physical + stripe->logical - logical_start;
1567 stripe->dev = dev;
1568 stripe->bg = bg;
1569 stripe->mirror_num = mirror_num;
1570 stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
1571
1572 /* Fill the first extent info into stripe->sectors[] array. */
1573 fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
1574 extent_flags, extent_gen);
1575 cur_logical = extent_start + extent_len;
1576
1577 /* Fill the extent info for the remaining sectors. */
1578 while (cur_logical <= stripe_end) {
1579 ret = find_first_extent_item(extent_root, &path, cur_logical,
1580 stripe_end - cur_logical + 1);
1581 if (ret < 0)
1582 goto out;
1583 if (ret > 0) {
1584 ret = 0;
1585 break;
1586 }
1587 get_extent_info(&path, &extent_start, &extent_len,
1588 &extent_flags, &extent_gen);
00965807
QW
1589 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1590 stripe->nr_meta_extents++;
1591 if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
1592 stripe->nr_data_extents++;
b9795475
QW
1593 fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
1594 extent_flags, extent_gen);
1595 cur_logical = extent_start + extent_len;
1596 }
1597
1598 /* Now fill the data csum. */
1599 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
1600 int sector_nr;
1601 unsigned long csum_bitmap = 0;
1602
1603 /* Csum space should have already been allocated. */
1604 ASSERT(stripe->csums);
1605
1606 /*
1607 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
1608 * should contain at most 16 sectors.
1609 */
1610 ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
1611
1612 ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
1613 stripe_end, stripe->csums,
1614 &csum_bitmap, true);
1615 if (ret < 0)
1616 goto out;
1617 if (ret > 0)
1618 ret = 0;
1619
1620 for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
1621 stripe->sectors[sector_nr].csum = stripe->csums +
1622 sector_nr * fs_info->csum_size;
1623 }
1624 }
1625 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
1626out:
1627 btrfs_release_path(&path);
1628 return ret;
1629}
1630
54765392
QW
1631static void scrub_reset_stripe(struct scrub_stripe *stripe)
1632{
1633 scrub_stripe_reset_bitmaps(stripe);
1634
1635 stripe->nr_meta_extents = 0;
1636 stripe->nr_data_extents = 0;
1637 stripe->state = 0;
1638
1639 for (int i = 0; i < stripe->nr_sectors; i++) {
1640 stripe->sectors[i].is_metadata = false;
1641 stripe->sectors[i].csum = NULL;
1642 stripe->sectors[i].generation = 0;
1643 }
1644}
1645
1646static void scrub_submit_initial_read(struct scrub_ctx *sctx,
1647 struct scrub_stripe *stripe)
1648{
1649 struct btrfs_fs_info *fs_info = sctx->fs_info;
1650 struct btrfs_bio *bbio;
1651 int mirror = stripe->mirror_num;
1652
1653 ASSERT(stripe->bg);
1654 ASSERT(stripe->mirror_num > 0);
1655 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
1656
1657 bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
1658 scrub_read_endio, stripe);
1659
1660 /* Read the whole stripe. */
1661 bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
1662 for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
1663 int ret;
1664
1665 ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
1666 /* We should have allocated enough bio vectors. */
1667 ASSERT(ret == PAGE_SIZE);
1668 }
1669 atomic_inc(&stripe->pending_io);
1670
1671 /*
1672 * For dev-replace, either the user asks to avoid the source dev, or
1673 * the device is missing; in either case, we try the next mirror instead.
1674 */
1675 if (sctx->is_dev_replace &&
1676 (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
1677 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
1678 !stripe->dev->bdev)) {
1679 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
1680 stripe->bg->length);
1681
1682 mirror = calc_next_mirror(mirror, num_copies);
1683 }
1684 btrfs_submit_bio(bbio, mirror);
1685}
1686
8eb3dd17
QW
1687static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
1688{
1689 int i;
1690
1691 for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
1692 if (stripe->sectors[i].is_metadata) {
1693 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1694
1695 btrfs_err(fs_info,
1696 "stripe %llu has unrepaired metadata sector at %llu",
1697 stripe->logical,
1698 stripe->logical + (i << fs_info->sectorsize_bits));
1699 return true;
1700 }
1701 }
1702 return false;
1703}
1704
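/*
 * Submit and process all stripes queued in @sctx: read them, wait for the
 * read-repair to finish, write back the repaired sectors (or queue the block
 * group for relocation on zoned filesystems), handle the dev-replace writes,
 * then reset the stripes for reuse.
 */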
1705static int flush_scrub_stripes(struct scrub_ctx *sctx)
54765392
QW
1706{
1707 struct btrfs_fs_info *fs_info = sctx->fs_info;
1708 struct scrub_stripe *stripe;
1709 const int nr_stripes = sctx->cur_stripe;
8eb3dd17 1710 int ret = 0;
54765392
QW
1711
1712 if (!nr_stripes)
8eb3dd17 1713 return 0;
54765392
QW
1714
1715 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
e02ee89b
QW
1716
1717 scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
1718 nr_stripes << BTRFS_STRIPE_LEN_SHIFT);
54765392
QW
1719 for (int i = 0; i < nr_stripes; i++) {
1720 stripe = &sctx->stripes[i];
1721 scrub_submit_initial_read(sctx, stripe);
1722 }
1723
1724 for (int i = 0; i < nr_stripes; i++) {
1725 stripe = &sctx->stripes[i];
1726
1727 wait_event(stripe->repair_wait,
1728 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
1729 }
1730
1731 /*
1732	 * Submit the repaired sectors. For the zoned case, we cannot do the
1733	 * repair in-place, but queue the bg to be relocated instead.
1734 */
1735 if (btrfs_is_zoned(fs_info)) {
1736 for (int i = 0; i < nr_stripes; i++) {
1737 stripe = &sctx->stripes[i];
1738
1739 if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
1740 btrfs_repair_one_zone(fs_info,
1741 sctx->stripes[0].bg->start);
1742 break;
1743 }
1744 }
1f2030ff 1745 } else if (!sctx->readonly) {
54765392
QW
1746 for (int i = 0; i < nr_stripes; i++) {
1747 unsigned long repaired;
1748
1749 stripe = &sctx->stripes[i];
1750
1751 bitmap_andnot(&repaired, &stripe->init_error_bitmap,
1752 &stripe->error_bitmap, stripe->nr_sectors);
1753 scrub_write_sectors(sctx, stripe, repaired, false);
1754 }
1755 }
1756
1757 /* Submit for dev-replace. */
1758 if (sctx->is_dev_replace) {
8eb3dd17
QW
1759 /*
1760 * For dev-replace, if we know there is something wrong with
1761	 * metadata, we should immediately abort.
1762 */
1763 for (int i = 0; i < nr_stripes; i++) {
1764 if (stripe_has_metadata_error(&sctx->stripes[i])) {
1765 ret = -EIO;
1766 goto out;
1767 }
1768 }
54765392
QW
1769 for (int i = 0; i < nr_stripes; i++) {
1770 unsigned long good;
1771
1772 stripe = &sctx->stripes[i];
1773
1774 ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
1775
1776 bitmap_andnot(&good, &stripe->extent_sector_bitmap,
1777 &stripe->error_bitmap, stripe->nr_sectors);
1778 scrub_write_sectors(sctx, stripe, good, true);
1779 }
1780 }
1781
1782 /* Wait for the above writebacks to finish. */
1783 for (int i = 0; i < nr_stripes; i++) {
1784 stripe = &sctx->stripes[i];
1785
1786 wait_scrub_stripe_io(stripe);
1787 scrub_reset_stripe(stripe);
1788 }
8eb3dd17 1789out:
54765392 1790 sctx->cur_stripe = 0;
8eb3dd17 1791 return ret;
54765392
QW
1792}
1793
1009254b
QW
1794static void raid56_scrub_wait_endio(struct bio *bio)
1795{
1796 complete(bio->bi_private);
1797}
1798
e02ee89b
QW
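/*
 * Queue one stripe for the range starting at @logical. If all slots are
 * already in use, flush the queued stripes first.
 *
 * Return 0 if a stripe was queued, >0 if there is no more extent in the
 * range, or <0 on error.
 */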
1799static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
1800 struct btrfs_device *dev, int mirror_num,
1801 u64 logical, u32 length, u64 physical)
54765392
QW
1802{
1803 struct scrub_stripe *stripe;
1804 int ret;
1805
1806 /* No available slot, submit all stripes and wait for them. */
8eb3dd17
QW
1807 if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
1808 ret = flush_scrub_stripes(sctx);
1809 if (ret < 0)
1810 return ret;
1811 }
54765392
QW
1812
1813 stripe = &sctx->stripes[sctx->cur_stripe];
1814
1815 /* We can queue one stripe using the remaining slot. */
1816 scrub_reset_stripe(stripe);
1817 ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
1818 logical, length, stripe);
1819	 /* Either >0 for no more extents or <0 for error. */
1820 if (ret)
1821 return ret;
1822 sctx->cur_stripe++;
1823 return 0;
1824}
1825
1009254b
QW
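/*
 * Scrub one RAID56 full stripe starting at @full_stripe_start, from the
 * P/Q stripe's point of view: read and repair all data stripes first, and
 * only then verify/rebuild the P/Q stripe through a scrub rbio.
 */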
1826static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
1827 struct btrfs_device *scrub_dev,
1828 struct btrfs_block_group *bg,
1829 struct map_lookup *map,
1830 u64 full_stripe_start)
1831{
1832 DECLARE_COMPLETION_ONSTACK(io_done);
1833 struct btrfs_fs_info *fs_info = sctx->fs_info;
1834 struct btrfs_raid_bio *rbio;
1835 struct btrfs_io_context *bioc = NULL;
1836 struct bio *bio;
1837 struct scrub_stripe *stripe;
1838 bool all_empty = true;
1839 const int data_stripes = nr_data_stripes(map);
1840 unsigned long extent_bitmap = 0;
1841 u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT;
1842 int ret;
1843
1844 ASSERT(sctx->raid56_data_stripes);
1845
1846 for (int i = 0; i < data_stripes; i++) {
1847 int stripe_index;
1848 int rot;
1849 u64 physical;
1850
1851 stripe = &sctx->raid56_data_stripes[i];
1852 rot = div_u64(full_stripe_start - bg->start,
1853 data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
1854 stripe_index = (i + rot) % map->num_stripes;
1855 physical = map->stripes[stripe_index].physical +
1856 (rot << BTRFS_STRIPE_LEN_SHIFT);
1857
1858 scrub_reset_stripe(stripe);
1859 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
1860 ret = scrub_find_fill_first_stripe(bg,
1861 map->stripes[stripe_index].dev, physical, 1,
1862 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT),
1863 BTRFS_STRIPE_LEN, stripe);
1864 if (ret < 0)
1865 goto out;
1866 /*
1867	 * No extent in this data stripe; manually mark it initialized
1868	 * so that the later read submission is happy.
1869 */
1870 if (ret > 0) {
1871 stripe->logical = full_stripe_start +
1872 (i << BTRFS_STRIPE_LEN_SHIFT);
1873 stripe->dev = map->stripes[stripe_index].dev;
1874 stripe->mirror_num = 1;
1875 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
1876 }
1877 }
1878
1879 /* Check if all data stripes are empty. */
1880 for (int i = 0; i < data_stripes; i++) {
1881 stripe = &sctx->raid56_data_stripes[i];
1882 if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
1883 all_empty = false;
1884 break;
1885 }
1886 }
1887 if (all_empty) {
1888 ret = 0;
1889 goto out;
1890 }
1891
1892 for (int i = 0; i < data_stripes; i++) {
1893 stripe = &sctx->raid56_data_stripes[i];
1894 scrub_submit_initial_read(sctx, stripe);
1895 }
1896 for (int i = 0; i < data_stripes; i++) {
1897 stripe = &sctx->raid56_data_stripes[i];
1898
1899 wait_event(stripe->repair_wait,
1900 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
1901 }
1902 /* For now, no zoned support for RAID56. */
1903 ASSERT(!btrfs_is_zoned(sctx->fs_info));
1904
1905 /* Writeback for the repaired sectors. */
1906 for (int i = 0; i < data_stripes; i++) {
1907 unsigned long repaired;
1908
1909 stripe = &sctx->raid56_data_stripes[i];
1910
1911 bitmap_andnot(&repaired, &stripe->init_error_bitmap,
1912 &stripe->error_bitmap, stripe->nr_sectors);
1913 scrub_write_sectors(sctx, stripe, repaired, false);
1914 }
1915
1916 /* Wait for the above writebacks to finish. */
1917 for (int i = 0; i < data_stripes; i++) {
1918 stripe = &sctx->raid56_data_stripes[i];
1919
1920 wait_scrub_stripe_io(stripe);
1921 }
1922
1923 /*
1924	 * Now all data stripes are properly verified. Check if we have any
1925	 * unrepaired sectors, and if so abort immediately or we could
1926	 * further corrupt the P/Q stripes.
1927 *
1928 * During the loop, also populate extent_bitmap.
1929 */
1930 for (int i = 0; i < data_stripes; i++) {
1931 unsigned long error;
1932
1933 stripe = &sctx->raid56_data_stripes[i];
1934
1935 /*
1936	 * We should only check the errors where there is an extent, as we
1937	 * may hit an empty data stripe on a missing device.
1938 */
1939 bitmap_and(&error, &stripe->error_bitmap,
1940 &stripe->extent_sector_bitmap, stripe->nr_sectors);
1941 if (!bitmap_empty(&error, stripe->nr_sectors)) {
1942 btrfs_err(fs_info,
1943"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
1944 full_stripe_start, i, stripe->nr_sectors,
1945 &error);
1946 ret = -EIO;
1947 goto out;
1948 }
1949 bitmap_or(&extent_bitmap, &extent_bitmap,
1950 &stripe->extent_sector_bitmap, stripe->nr_sectors);
1951 }
1952
1953 /* Now we can check and regenerate the P/Q stripe. */
1954 bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
1955 bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
1956 bio->bi_private = &io_done;
1957 bio->bi_end_io = raid56_scrub_wait_endio;
1958
1959 btrfs_bio_counter_inc_blocked(fs_info);
1960 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
1961 &length, &bioc);
1962 if (ret < 0) {
1963 btrfs_put_bioc(bioc);
1964 btrfs_bio_counter_dec(fs_info);
1965 goto out;
1966 }
1967 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
1968 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
1969 btrfs_put_bioc(bioc);
1970 if (!rbio) {
1971 ret = -ENOMEM;
1972 btrfs_bio_counter_dec(fs_info);
1973 goto out;
1974 }
1975 raid56_parity_submit_scrub_rbio(rbio);
1976 wait_for_completion_io(&io_done);
1977 ret = blk_status_to_errno(bio->bi_status);
1978 bio_put(bio);
1979 btrfs_bio_counter_dec(fs_info);
1980
1981out:
1982 return ret;
1983}
1984
09022b14
QW
1985/*
1986 * Scrub one range which only has a simple mirror based profile.
1987 * (This includes all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
1988 * RAID0/RAID10.)
1989 *
1990 * Since we may need to handle a subset of a block group, we need the
1991 * @logical_start and @logical_length parameters.
1992 */
1993static int scrub_simple_mirror(struct scrub_ctx *sctx,
09022b14
QW
1994 struct btrfs_block_group *bg,
1995 struct map_lookup *map,
1996 u64 logical_start, u64 logical_length,
1997 struct btrfs_device *device,
1998 u64 physical, int mirror_num)
1999{
2000 struct btrfs_fs_info *fs_info = sctx->fs_info;
2001 const u64 logical_end = logical_start + logical_length;
2002	 /* An artificial limit, inherited from the old scrub behavior */
09022b14
QW
2003 struct btrfs_path path = { 0 };
2004 u64 cur_logical = logical_start;
2005 int ret;
2006
2007 /* The range must be inside the bg */
2008 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
2009
2010 path.search_commit_root = 1;
2011 path.skip_locking = 1;
2012	 /* Go through each extent item inside the logical range */
2013 while (cur_logical < logical_end) {
e02ee89b 2014 u64 cur_physical = physical + cur_logical - logical_start;
09022b14
QW
2015
2016 /* Canceled? */
2017 if (atomic_read(&fs_info->scrub_cancel_req) ||
2018 atomic_read(&sctx->cancel_req)) {
2019 ret = -ECANCELED;
2020 break;
2021 }
2022 /* Paused? */
2023 if (atomic_read(&fs_info->scrub_pause_req)) {
2024 /* Push queued extents */
09022b14
QW
2025 scrub_blocked_if_needed(fs_info);
2026 }
2027 /* Block group removed? */
2028 spin_lock(&bg->lock);
3349b57f 2029 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
09022b14
QW
2030 spin_unlock(&bg->lock);
2031 ret = 0;
2032 break;
2033 }
2034 spin_unlock(&bg->lock);
2035
e02ee89b
QW
2036 ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
2037 cur_logical, logical_end - cur_logical,
2038 cur_physical);
09022b14
QW
2039 if (ret > 0) {
2040	 /* No more extents, just update the accounting */
2041 sctx->stat.last_physical = physical + logical_length;
2042 ret = 0;
2043 break;
2044 }
2045 if (ret < 0)
2046 break;
09022b14 2047
e02ee89b
QW
2048 ASSERT(sctx->cur_stripe > 0);
2049 cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
2050 + BTRFS_STRIPE_LEN;
2051
09022b14
QW
2052	 /* Don't hold the CPU for too long */
2053 cond_resched();
2054 }
2055 btrfs_release_path(&path);
2056 return ret;
2057}
2058
8557635e
QW
2059/* Calculate the full stripe length for simple stripe based profiles */
2060static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
2061{
2062 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2063 BTRFS_BLOCK_GROUP_RAID10));
2064
a97699d1 2065 return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
8557635e
QW
2066}
2067
2068/* Get the logical bytenr for the stripe */
2069static u64 simple_stripe_get_logical(struct map_lookup *map,
2070 struct btrfs_block_group *bg,
2071 int stripe_index)
2072{
2073 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2074 BTRFS_BLOCK_GROUP_RAID10));
2075 ASSERT(stripe_index < map->num_stripes);
2076
2077 /*
2078 * (stripe_index / sub_stripes) gives how many data stripes we need to
2079 * skip.
2080 */
a97699d1
QW
2081 return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
2082 bg->start;
8557635e
QW
2083}
2084
2085/* Get the mirror number for the stripe */
2086static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
2087{
2088 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2089 BTRFS_BLOCK_GROUP_RAID10));
2090 ASSERT(stripe_index < map->num_stripes);
2091
2092 /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
2093 return stripe_index % map->sub_stripes + 1;
2094}
2095
2096static int scrub_simple_stripe(struct scrub_ctx *sctx,
8557635e
QW
2097 struct btrfs_block_group *bg,
2098 struct map_lookup *map,
2099 struct btrfs_device *device,
2100 int stripe_index)
2101{
2102 const u64 logical_increment = simple_stripe_full_stripe_len(map);
2103 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
2104 const u64 orig_physical = map->stripes[stripe_index].physical;
2105 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
2106 u64 cur_logical = orig_logical;
2107 u64 cur_physical = orig_physical;
2108 int ret = 0;
2109
2110 while (cur_logical < bg->start + bg->length) {
2111 /*
2112 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
2113 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
2114 * this stripe.
2115 */
6b4d375a
QW
2116 ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
2117 BTRFS_STRIPE_LEN, device, cur_physical,
2118 mirror_num);
8557635e
QW
2119 if (ret)
2120 return ret;
2121 /* Skip to next stripe which belongs to the target device */
2122 cur_logical += logical_increment;
2123 /* For physical offset, we just go to next stripe */
a97699d1 2124 cur_physical += BTRFS_STRIPE_LEN;
8557635e
QW
2125 }
2126 return ret;
2127}
2128
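/*
 * Scrub one device stripe of a chunk, dispatching to the simple mirror,
 * simple stripe or RAID56 handlers depending on the chunk profile.
 */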
d9d181c1 2129static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2ae8ae3d 2130 struct btrfs_block_group *bg,
bc88b486 2131 struct extent_map *em,
a36cf8b8 2132 struct btrfs_device *scrub_dev,
bc88b486 2133 int stripe_index)
a2de733c 2134{
fb456252 2135 struct btrfs_fs_info *fs_info = sctx->fs_info;
bc88b486 2136 struct map_lookup *map = em->map_lookup;
09022b14 2137 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
2ae8ae3d 2138 const u64 chunk_logical = bg->start;
a2de733c 2139 int ret;
8eb3dd17 2140 int ret2;
1194a824 2141 u64 physical = map->stripes[stripe_index].physical;
bc88b486
QW
2142 const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
2143 const u64 physical_end = physical + dev_stripe_len;
a2de733c 2144 u64 logical;
625f1c8d 2145 u64 logic_end;
18d30ab9 2146 /* The logical increment after finishing one stripe */
5c07c53f 2147 u64 increment;
18d30ab9 2148 /* Offset inside the chunk */
a2de733c 2149 u64 offset;
5a6ac9ea 2150 u64 stripe_logical;
3b080b25 2151 int stop_loop = 0;
53b381b3 2152
cb7ab021 2153 scrub_blocked_if_needed(fs_info);
7a26285e 2154
de17addc
NA
2155 if (sctx->is_dev_replace &&
2156 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
2157 mutex_lock(&sctx->wr_lock);
2158 sctx->write_pointer = physical;
2159 mutex_unlock(&sctx->wr_lock);
de17addc
NA
2160 }
2161
1009254b
QW
2162 /* Prepare the extra data stripes used by RAID56. */
2163 if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
2164 ASSERT(sctx->raid56_data_stripes == NULL);
2165
2166 sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
2167 sizeof(struct scrub_stripe),
2168 GFP_KERNEL);
2169 if (!sctx->raid56_data_stripes) {
2170 ret = -ENOMEM;
2171 goto out;
2172 }
2173 for (int i = 0; i < nr_data_stripes(map); i++) {
2174 ret = init_scrub_stripe(fs_info,
2175 &sctx->raid56_data_stripes[i]);
2176 if (ret < 0)
2177 goto out;
2178 sctx->raid56_data_stripes[i].bg = bg;
2179 sctx->raid56_data_stripes[i].sctx = sctx;
2180 }
2181 }
09022b14
QW
2182 /*
2183 * There used to be a big double loop to handle all profiles using the
2184	 * same routine, which grew larger and messier over time.
2185	 *
2186	 * So here we handle each profile differently, so that simpler profiles
2187	 * have a simpler scrubbing function.
2188 */
2189 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
2190 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2191 /*
2192	 * The above check rules out all complex profiles; the remaining
2193	 * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
2194	 * mirrored duplication without striping.
2195	 *
2196	 * Only @physical and @mirror_num need to be calculated using
2197	 * @stripe_index.
2198 */
6b4d375a
QW
2199 ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
2200 scrub_dev, map->stripes[stripe_index].physical,
09022b14 2201 stripe_index + 1);
e430c428 2202 offset = 0;
09022b14
QW
2203 goto out;
2204 }
8557635e 2205 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
6b4d375a 2206 ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
a97699d1 2207 offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
8557635e
QW
2208 goto out;
2209 }
2210
2211 /* Only RAID56 goes through the old code */
2212 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
a2de733c 2213 ret = 0;
e430c428
QW
2214
2215 /* Calculate the logical end of the stripe */
2216 get_raid56_logic_offset(physical_end, stripe_index,
2217 map, &logic_end, NULL);
2218 logic_end += chunk_logical;
2219
2220	 /* Initialize @offset in case we need to jump to the out: label */
2221 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
a97699d1 2222 increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
e430c428 2223
18d30ab9
QW
2224 /*
2225 * Due to the rotation, for RAID56 it's better to iterate each stripe
2226	 * using its physical offset.
2227 */
3b080b25 2228 while (physical < physical_end) {
18d30ab9
QW
2229 ret = get_raid56_logic_offset(physical, stripe_index, map,
2230 &logical, &stripe_logical);
e430c428
QW
2231 logical += chunk_logical;
2232 if (ret) {
2233	 /* It is a parity stripe */
2234 stripe_logical += chunk_logical;
1009254b
QW
2235 ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
2236 map, stripe_logical);
e430c428
QW
2237 if (ret)
2238 goto out;
18d30ab9 2239 goto next;
f2f66a2f
ZL
2240 }
2241
18d30ab9
QW
2242 /*
2243	 * Now we're at a data stripe, scrub each extent in the range.
2244	 *
2245	 * At this stage, if we ignore the repair part, each data
2246	 * stripe is no different from the SINGLE profile.
2247 * We can reuse scrub_simple_mirror() here, as the repair part
2248 * is still based on @mirror_num.
2249 */
6b4d375a 2250 ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
18d30ab9 2251 scrub_dev, physical, 1);
a2de733c
AJ
2252 if (ret < 0)
2253 goto out;
a2de733c 2254next:
a2de733c 2255 logical += increment;
a97699d1 2256 physical += BTRFS_STRIPE_LEN;
d9d181c1 2257 spin_lock(&sctx->stat_lock);
625f1c8d 2258 if (stop_loop)
bc88b486
QW
2259 sctx->stat.last_physical =
2260 map->stripes[stripe_index].physical + dev_stripe_len;
625f1c8d
LB
2261 else
2262 sctx->stat.last_physical = physical;
d9d181c1 2263 spin_unlock(&sctx->stat_lock);
625f1c8d
LB
2264 if (stop_loop)
2265 break;
a2de733c 2266 }
ff023aac 2267out:
8eb3dd17 2268 ret2 = flush_scrub_stripes(sctx);
b50f2d04 2269 if (!ret)
8eb3dd17 2270 ret = ret2;
1009254b
QW
2271 if (sctx->raid56_data_stripes) {
2272 for (int i = 0; i < nr_data_stripes(map); i++)
2273 release_scrub_stripe(&sctx->raid56_data_stripes[i]);
2274 kfree(sctx->raid56_data_stripes);
2275 sctx->raid56_data_stripes = NULL;
2276 }
7db1c5d1
NA
2277
2278 if (sctx->is_dev_replace && ret >= 0) {
2279 int ret2;
2280
2ae8ae3d
QW
2281 ret2 = sync_write_pointer_for_zoned(sctx,
2282 chunk_logical + offset,
2283 map->stripes[stripe_index].physical,
2284 physical_end);
7db1c5d1
NA
2285 if (ret2)
2286 ret = ret2;
2287 }
2288
a2de733c
AJ
2289 return ret < 0 ? ret : 0;
2290}
2291
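/*
 * Scrub the device extent of a chunk: look up the chunk mapping and scrub
 * the stripe(s) of the chunk that live on @scrub_dev at @dev_offset.
 */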
d9d181c1 2292static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
d04fbe19 2293 struct btrfs_block_group *bg,
a36cf8b8 2294 struct btrfs_device *scrub_dev,
020d5b73 2295 u64 dev_offset,
d04fbe19 2296 u64 dev_extent_len)
a2de733c 2297{
fb456252 2298 struct btrfs_fs_info *fs_info = sctx->fs_info;
c8bf1b67 2299 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
a2de733c
AJ
2300 struct map_lookup *map;
2301 struct extent_map *em;
2302 int i;
ff023aac 2303 int ret = 0;
a2de733c 2304
c8bf1b67 2305 read_lock(&map_tree->lock);
d04fbe19 2306 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
c8bf1b67 2307 read_unlock(&map_tree->lock);
a2de733c 2308
020d5b73
FM
2309 if (!em) {
2310 /*
2311 * Might have been an unused block group deleted by the cleaner
2312 * kthread or relocation.
2313 */
d04fbe19 2314 spin_lock(&bg->lock);
3349b57f 2315 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
020d5b73 2316 ret = -EINVAL;
d04fbe19 2317 spin_unlock(&bg->lock);
020d5b73
FM
2318
2319 return ret;
2320 }
d04fbe19 2321 if (em->start != bg->start)
a2de733c 2322 goto out;
d04fbe19 2323 if (em->len < dev_extent_len)
a2de733c
AJ
2324 goto out;
2325
d04fbe19 2326 map = em->map_lookup;
a2de733c 2327 for (i = 0; i < map->num_stripes; ++i) {
a36cf8b8 2328 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
859acaf1 2329 map->stripes[i].physical == dev_offset) {
bc88b486 2330 ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
a2de733c
AJ
2331 if (ret)
2332 goto out;
2333 }
2334 }
2335out:
2336 free_extent_map(em);
2337
2338 return ret;
2339}
2340
de17addc
NA
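/*
 * On zoned filesystems, make sure all outstanding extent writes into the
 * block group have finished and are committed before dev-replace processes it.
 */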
2341static int finish_extent_writes_for_zoned(struct btrfs_root *root,
2342 struct btrfs_block_group *cache)
2343{
2344 struct btrfs_fs_info *fs_info = cache->fs_info;
2345 struct btrfs_trans_handle *trans;
2346
2347 if (!btrfs_is_zoned(fs_info))
2348 return 0;
2349
2350 btrfs_wait_block_group_reservations(cache);
2351 btrfs_wait_nocow_writers(cache);
2352 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
2353
2354 trans = btrfs_join_transaction(root);
2355 if (IS_ERR(trans))
2356 return PTR_ERR(trans);
2357 return btrfs_commit_transaction(trans);
2358}
2359
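/*
 * Walk all device extents of @scrub_dev within [start, end) and scrub the
 * block groups they belong to, one at a time.
 */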
a2de733c 2360static noinline_for_stack
a36cf8b8 2361int scrub_enumerate_chunks(struct scrub_ctx *sctx,
32934280 2362 struct btrfs_device *scrub_dev, u64 start, u64 end)
a2de733c
AJ
2363{
2364 struct btrfs_dev_extent *dev_extent = NULL;
2365 struct btrfs_path *path;
0b246afa
JM
2366 struct btrfs_fs_info *fs_info = sctx->fs_info;
2367 struct btrfs_root *root = fs_info->dev_root;
a2de733c 2368 u64 chunk_offset;
55e3a601 2369 int ret = 0;
76a8efa1 2370 int ro_set;
a2de733c
AJ
2371 int slot;
2372 struct extent_buffer *l;
2373 struct btrfs_key key;
2374 struct btrfs_key found_key;
32da5386 2375 struct btrfs_block_group *cache;
ff023aac 2376 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
a2de733c
AJ
2377
2378 path = btrfs_alloc_path();
2379 if (!path)
2380 return -ENOMEM;
2381
e4058b54 2382 path->reada = READA_FORWARD;
a2de733c
AJ
2383 path->search_commit_root = 1;
2384 path->skip_locking = 1;
2385
a36cf8b8 2386 key.objectid = scrub_dev->devid;
a2de733c
AJ
2387 key.offset = 0ull;
2388 key.type = BTRFS_DEV_EXTENT_KEY;
2389
a2de733c 2390 while (1) {
d04fbe19
QW
2391 u64 dev_extent_len;
2392
a2de733c
AJ
2393 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2394 if (ret < 0)
8c51032f
AJ
2395 break;
2396 if (ret > 0) {
2397 if (path->slots[0] >=
2398 btrfs_header_nritems(path->nodes[0])) {
2399 ret = btrfs_next_leaf(root, path);
55e3a601
Z
2400 if (ret < 0)
2401 break;
2402 if (ret > 0) {
2403 ret = 0;
8c51032f 2404 break;
55e3a601
Z
2405 }
2406 } else {
2407 ret = 0;
8c51032f
AJ
2408 }
2409 }
a2de733c
AJ
2410
2411 l = path->nodes[0];
2412 slot = path->slots[0];
2413
2414 btrfs_item_key_to_cpu(l, &found_key, slot);
2415
a36cf8b8 2416 if (found_key.objectid != scrub_dev->devid)
a2de733c
AJ
2417 break;
2418
962a298f 2419 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
a2de733c
AJ
2420 break;
2421
2422 if (found_key.offset >= end)
2423 break;
2424
2425 if (found_key.offset < key.offset)
2426 break;
2427
2428 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
d04fbe19 2429 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
a2de733c 2430
d04fbe19 2431 if (found_key.offset + dev_extent_len <= start)
ced96edc 2432 goto skip;
a2de733c 2433
a2de733c
AJ
2434 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2435
2436 /*
2437 * get a reference on the corresponding block group to prevent
2438 * the chunk from going away while we scrub it
2439 */
2440 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
ced96edc
QW
2441
2442	 /* Some chunks are removed but not committed to disk yet,
2443	 * continue scrubbing. */
2444 if (!cache)
2445 goto skip;
2446
a692e13d
FM
2447 ASSERT(cache->start <= chunk_offset);
2448 /*
2449 * We are using the commit root to search for device extents, so
2450 * that means we could have found a device extent item from a
2451 * block group that was deleted in the current transaction. The
2452 * logical start offset of the deleted block group, stored at
2453 * @chunk_offset, might be part of the logical address range of
2454 * a new block group (which uses different physical extents).
2455 * In this case btrfs_lookup_block_group() has returned the new
2456 * block group, and its start address is less than @chunk_offset.
2457 *
2458 * We skip such new block groups, because it's pointless to
2459 * process them, as we won't find their extents because we search
2460 * for them using the commit root of the extent tree. For a device
2461 * replace it's also fine to skip it, we won't miss copying them
2462 * to the target device because we have the write duplication
2463 * setup through the regular write path (by btrfs_map_block()),
2464 * and we have committed a transaction when we started the device
2465 * replace, right after setting up the device replace state.
2466 */
2467 if (cache->start < chunk_offset) {
2468 btrfs_put_block_group(cache);
2469 goto skip;
2470 }
2471
78ce9fc2 2472 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3349b57f 2473 if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
0dc16ef4
FM
2474 btrfs_put_block_group(cache);
2475 goto skip;
78ce9fc2 2476 }
78ce9fc2
NA
2477 }
2478
2473d24f
FM
2479 /*
2480 * Make sure that while we are scrubbing the corresponding block
2481 * group doesn't get its logical address and its device extents
2482 * reused for another block group, which can possibly be of a
2483 * different type and different profile. We do this to prevent
2484 * false error detections and crashes due to bogus attempts to
2485 * repair extents.
2486 */
2487 spin_lock(&cache->lock);
3349b57f 2488 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
2473d24f
FM
2489 spin_unlock(&cache->lock);
2490 btrfs_put_block_group(cache);
2491 goto skip;
2492 }
6b7304af 2493 btrfs_freeze_block_group(cache);
2473d24f
FM
2494 spin_unlock(&cache->lock);
2495
55e3a601
Z
2496 /*
2497	 * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
2498 * to avoid deadlock caused by:
2499 * btrfs_inc_block_group_ro()
2500 * -> btrfs_wait_for_commit()
2501 * -> btrfs_commit_transaction()
2502 * -> btrfs_scrub_pause()
2503 */
2504 scrub_pause_on(fs_info);
b12de528
QW
2505
2506 /*
2507 * Don't do chunk preallocation for scrub.
2508 *
2509 * This is especially important for SYSTEM bgs, or we can hit
2510 * -EFBIG from btrfs_finish_chunk_alloc() like:
2511 * 1. The only SYSTEM bg is marked RO.
2512 * Since SYSTEM bg is small, that's pretty common.
2513 * 2. New SYSTEM bg will be allocated
2514	 * Because the regular path will allocate a new chunk.
2515	 * 3. New SYSTEM bg is empty and will get cleaned up
2516	 * Before cleanup really happens, it's marked RO again.
2517	 * 4. The empty SYSTEM bg gets scrubbed
2518 * We go back to 2.
2519 *
2520 * This can easily boost the amount of SYSTEM chunks if cleaner
2521 * thread can't be triggered fast enough, and use up all space
2522 * of btrfs_super_block::sys_chunk_array
1bbb97b8
QW
2523 *
2524 * While for dev replace, we need to try our best to mark block
2525 * group RO, to prevent race between:
2526 * - Write duplication
2527 * Contains latest data
2528 * - Scrub copy
2529 * Contains data from commit tree
2530 *
2531 * If target block group is not marked RO, nocow writes can
2532 * be overwritten by scrub copy, causing data corruption.
2533 * So for dev-replace, it's not allowed to continue if a block
2534 * group is not RO.
b12de528 2535 */
1bbb97b8 2536 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
de17addc
NA
2537 if (!ret && sctx->is_dev_replace) {
2538 ret = finish_extent_writes_for_zoned(root, cache);
2539 if (ret) {
2540 btrfs_dec_block_group_ro(cache);
2541 scrub_pause_off(fs_info);
2542 btrfs_put_block_group(cache);
2543 break;
2544 }
2545 }
2546
76a8efa1
Z
2547 if (ret == 0) {
2548 ro_set = 1;
7561551e
QW
2549 } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
2550 !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
76a8efa1
Z
2551 /*
2552 * btrfs_inc_block_group_ro return -ENOSPC when it
2553 * failed in creating new chunk for metadata.
1bbb97b8 2554 * It is not a problem for scrub, because
76a8efa1
Z
2555 * metadata are always cowed, and our scrub paused
2556 * commit_transactions.
7561551e
QW
2557 *
2558 * For RAID56 chunks, we have to mark them read-only
2559	 * for scrub, as later we would use our own cache
2560	 * outside of the RAID56 realm.
2561	 * Thus we want the RAID56 bg to be marked RO to
2562	 * prevent RMW from screwing up our cache.
76a8efa1
Z
2563 */
2564 ro_set = 0;
195a49ea
FM
2565 } else if (ret == -ETXTBSY) {
2566 btrfs_warn(fs_info,
2567 "skipping scrub of block group %llu due to active swapfile",
2568 cache->start);
2569 scrub_pause_off(fs_info);
2570 ret = 0;
2571 goto skip_unfreeze;
76a8efa1 2572 } else {
5d163e0e 2573 btrfs_warn(fs_info,
913e1535 2574 "failed setting block group ro: %d", ret);
6b7304af 2575 btrfs_unfreeze_block_group(cache);
55e3a601 2576 btrfs_put_block_group(cache);
1bbb97b8 2577 scrub_pause_off(fs_info);
55e3a601
Z
2578 break;
2579 }
2580
1bbb97b8
QW
2581 /*
2582	 * Now the target block group is marked RO, wait for nocow writes to
2583	 * finish before dev-replace.
2584	 * COW is fine, as COW never overwrites extents in the commit tree.
2585 */
2586 if (sctx->is_dev_replace) {
2587 btrfs_wait_nocow_writers(cache);
2588 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
2589 cache->length);
2590 }
2591
2592 scrub_pause_off(fs_info);
3ec17a67 2593 down_write(&dev_replace->rwsem);
d04fbe19 2594 dev_replace->cursor_right = found_key.offset + dev_extent_len;
ff023aac
SB
2595 dev_replace->cursor_left = found_key.offset;
2596 dev_replace->item_needs_writeback = 1;
cb5583dd
DS
2597 up_write(&dev_replace->rwsem);
2598
d04fbe19
QW
2599 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
2600 dev_extent_len);
78ce9fc2
NA
2601 if (sctx->is_dev_replace &&
2602 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
2603 cache, found_key.offset))
2604 ro_set = 0;
2605
3ec17a67 2606 down_write(&dev_replace->rwsem);
1a1a8b73
FM
2607 dev_replace->cursor_left = dev_replace->cursor_right;
2608 dev_replace->item_needs_writeback = 1;
3ec17a67 2609 up_write(&dev_replace->rwsem);
1a1a8b73 2610
76a8efa1 2611 if (ro_set)
2ff7e61e 2612 btrfs_dec_block_group_ro(cache);
ff023aac 2613
758f2dfc
FM
2614 /*
2615 * We might have prevented the cleaner kthread from deleting
2616 * this block group if it was already unused because we raced
2617 * and set it to RO mode first. So add it back to the unused
2618 * list, otherwise it might not ever be deleted unless a manual
2619 * balance is triggered or it becomes used and unused again.
2620 */
2621 spin_lock(&cache->lock);
3349b57f
JB
2622 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
2623 !cache->ro && cache->reserved == 0 && cache->used == 0) {
758f2dfc 2624 spin_unlock(&cache->lock);
6e80d4f8
DZ
2625 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
2626 btrfs_discard_queue_work(&fs_info->discard_ctl,
2627 cache);
2628 else
2629 btrfs_mark_bg_unused(cache);
758f2dfc
FM
2630 } else {
2631 spin_unlock(&cache->lock);
2632 }
195a49ea 2633skip_unfreeze:
6b7304af 2634 btrfs_unfreeze_block_group(cache);
a2de733c
AJ
2635 btrfs_put_block_group(cache);
2636 if (ret)
2637 break;
32934280 2638 if (sctx->is_dev_replace &&
af1be4f8 2639 atomic64_read(&dev_replace->num_write_errors) > 0) {
ff023aac
SB
2640 ret = -EIO;
2641 break;
2642 }
2643 if (sctx->stat.malloc_errors > 0) {
2644 ret = -ENOMEM;
2645 break;
2646 }
ced96edc 2647skip:
d04fbe19 2648 key.offset = found_key.offset + dev_extent_len;
71267333 2649 btrfs_release_path(path);
a2de733c
AJ
2650 }
2651
a2de733c 2652 btrfs_free_path(path);
8c51032f 2653
55e3a601 2654 return ret;
a2de733c
AJ
2655}
2656
2a2dc22f
QW
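/*
 * Read one super block copy at @physical on @dev and verify its checksum,
 * generation and overall validity.
 */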
2657static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
2658 struct page *page, u64 physical, u64 generation)
2659{
2660 struct btrfs_fs_info *fs_info = sctx->fs_info;
2661 struct bio_vec bvec;
2662 struct bio bio;
2663 struct btrfs_super_block *sb = page_address(page);
2664 int ret;
2665
2666 bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
2667 bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
2668 __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
2669 ret = submit_bio_wait(&bio);
2670 bio_uninit(&bio);
2671
2672 if (ret < 0)
2673 return ret;
2674 ret = btrfs_check_super_csum(fs_info, sb);
2675 if (ret != 0) {
2676 btrfs_err_rl(fs_info,
2677 "super block at physical %llu devid %llu has bad csum",
2678 physical, dev->devid);
2679 return -EIO;
2680 }
2681 if (btrfs_super_generation(sb) != generation) {
2682 btrfs_err_rl(fs_info,
2683"super block at physical %llu devid %llu has bad generation %llu expect %llu",
2684 physical, dev->devid,
2685 btrfs_super_generation(sb), generation);
2686 return -EUCLEAN;
2687 }
2688
2689 return btrfs_validate_super(fs_info, sb, -1);
2690}
2691
a36cf8b8
SB
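/* Verify all super block copies of @scrub_dev that fit inside the device. */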
2692static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2693 struct btrfs_device *scrub_dev)
a2de733c
AJ
2694{
2695 int i;
2696 u64 bytenr;
2697 u64 gen;
2a2dc22f
QW
2698 int ret = 0;
2699 struct page *page;
0b246afa 2700 struct btrfs_fs_info *fs_info = sctx->fs_info;
a2de733c 2701
84961539 2702 if (BTRFS_FS_ERROR(fs_info))
fbabd4a3 2703 return -EROFS;
79787eaa 2704
2a2dc22f
QW
2705 page = alloc_page(GFP_KERNEL);
2706 if (!page) {
2707 spin_lock(&sctx->stat_lock);
2708 sctx->stat.malloc_errors++;
2709 spin_unlock(&sctx->stat_lock);
2710 return -ENOMEM;
2711 }
2712
5f546063 2713	/* Seed devices of a new filesystem have their own generation. */
0b246afa 2714 if (scrub_dev->fs_devices != fs_info->fs_devices)
5f546063
MX
2715 gen = scrub_dev->generation;
2716 else
0b246afa 2717 gen = fs_info->last_trans_committed;
a2de733c
AJ
2718
2719 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2720 bytenr = btrfs_sb_offset(i);
935e5cc9
MX
2721 if (bytenr + BTRFS_SUPER_INFO_SIZE >
2722 scrub_dev->commit_total_bytes)
a2de733c 2723 break;
12659251
NA
2724 if (!btrfs_check_super_location(scrub_dev, bytenr))
2725 continue;
a2de733c 2726
2a2dc22f
QW
2727 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
2728 if (ret) {
2729 spin_lock(&sctx->stat_lock);
2730 sctx->stat.super_errors++;
2731 spin_unlock(&sctx->stat_lock);
2732 }
a2de733c 2733 }
2a2dc22f 2734 __free_page(page);
a2de733c
AJ
2735 return 0;
2736}
2737
e89c4a9c
JB
2738static void scrub_workers_put(struct btrfs_fs_info *fs_info)
2739{
2740 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
2741 &fs_info->scrub_lock)) {
be539518
CH
2742 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
2743 struct workqueue_struct *scrub_wr_comp =
2744 fs_info->scrub_wr_completion_workers;
e89c4a9c
JB
2745
2746 fs_info->scrub_workers = NULL;
2747 fs_info->scrub_wr_completion_workers = NULL;
e89c4a9c
JB
2748 mutex_unlock(&fs_info->scrub_lock);
2749
be539518
CH
2750 if (scrub_workers)
2751 destroy_workqueue(scrub_workers);
2752 if (scrub_wr_comp)
2753 destroy_workqueue(scrub_wr_comp);
e89c4a9c
JB
2754 }
2755}
2756
a2de733c
AJ
2757/*
2758 * Get a reference count on fs_info->scrub_workers. Start workers if necessary.
2759 */
ff023aac
SB
2760static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2761 int is_dev_replace)
a2de733c 2762{
be539518
CH
2763 struct workqueue_struct *scrub_workers = NULL;
2764 struct workqueue_struct *scrub_wr_comp = NULL;
6f011058 2765 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
0339ef2f 2766 int max_active = fs_info->thread_pool_size;
e89c4a9c 2767 int ret = -ENOMEM;
a2de733c 2768
e89c4a9c
JB
2769 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
2770 return 0;
eb4318e5 2771
be539518
CH
2772 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
2773 is_dev_replace ? 1 : max_active);
e89c4a9c
JB
2774 if (!scrub_workers)
2775 goto fail_scrub_workers;
e82afc52 2776
be539518 2777 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
e89c4a9c
JB
2778 if (!scrub_wr_comp)
2779 goto fail_scrub_wr_completion_workers;
ff09c4ca 2780
e89c4a9c
JB
2781 mutex_lock(&fs_info->scrub_lock);
2782 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
2783 ASSERT(fs_info->scrub_workers == NULL &&
5dc96f8d 2784 fs_info->scrub_wr_completion_workers == NULL);
e89c4a9c
JB
2785 fs_info->scrub_workers = scrub_workers;
2786 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
ff09c4ca 2787 refcount_set(&fs_info->scrub_workers_refcnt, 1);
e89c4a9c
JB
2788 mutex_unlock(&fs_info->scrub_lock);
2789 return 0;
632dd772 2790 }
e89c4a9c
JB
2791 /* Other thread raced in and created the workers for us */
2792 refcount_inc(&fs_info->scrub_workers_refcnt);
2793 mutex_unlock(&fs_info->scrub_lock);
e82afc52 2794
e89c4a9c 2795 ret = 0;
5dc96f8d 2796
be539518 2797 destroy_workqueue(scrub_wr_comp);
e82afc52 2798fail_scrub_wr_completion_workers:
be539518 2799 destroy_workqueue(scrub_workers);
e82afc52 2800fail_scrub_workers:
e89c4a9c 2801 return ret;
a2de733c
AJ
2802}
2803
aa1b8cd4
SB
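/*
 * The entry point of scrub (and the scrub side of dev-replace): scrub all
 * allocated device extents of @devid within [start, end) and report the
 * result through @progress.
 */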
2804int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2805 u64 end, struct btrfs_scrub_progress *progress,
63a212ab 2806 int readonly, int is_dev_replace)
a2de733c 2807{
562d7b15 2808 struct btrfs_dev_lookup_args args = { .devid = devid };
d9d181c1 2809 struct scrub_ctx *sctx;
a2de733c
AJ
2810 int ret;
2811 struct btrfs_device *dev;
a5fb1142 2812 unsigned int nofs_flag;
f9eab5f0 2813 bool need_commit = false;
a2de733c 2814
aa1b8cd4 2815 if (btrfs_fs_closing(fs_info))
6c3abeda 2816 return -EAGAIN;
a2de733c 2817
fc65bb53
QW
2818 /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
2819 ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
b5d67f64 2820
fc65bb53
QW
2821 /*
2822 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
2823 * value (max nodesize / min sectorsize), thus nodesize should always
2824 * be fine.
2825 */
2826 ASSERT(fs_info->nodesize <=
2827 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
7a9e9987 2828
0e94c4f4
DS
2829 /* Allocate outside of device_list_mutex */
2830 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
2831 if (IS_ERR(sctx))
2832 return PTR_ERR(sctx);
a2de733c 2833
e89c4a9c
JB
2834 ret = scrub_workers_get(fs_info, is_dev_replace);
2835 if (ret)
2836 goto out_free_ctx;
2837
aa1b8cd4 2838 mutex_lock(&fs_info->fs_devices->device_list_mutex);
562d7b15 2839 dev = btrfs_find_device(fs_info->fs_devices, &args);
e6e674bd
AJ
2840 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
2841 !is_dev_replace)) {
aa1b8cd4 2842 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0e94c4f4 2843 ret = -ENODEV;
e89c4a9c 2844 goto out;
a2de733c 2845 }
a2de733c 2846
ebbede42
AJ
2847 if (!is_dev_replace && !readonly &&
2848 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
5d68da3b 2849 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a4852cf2
DS
2850 btrfs_err_in_rcu(fs_info,
2851 "scrub on devid %llu: filesystem on %s is not writable",
cb3e217b 2852 devid, btrfs_dev_name(dev));
0e94c4f4 2853 ret = -EROFS;
e89c4a9c 2854 goto out;
5d68da3b
MX
2855 }
2856
3b7a016f 2857 mutex_lock(&fs_info->scrub_lock);
e12c9621 2858 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
401e29c1 2859 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
a2de733c 2860 mutex_unlock(&fs_info->scrub_lock);
aa1b8cd4 2861 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0e94c4f4 2862 ret = -EIO;
e89c4a9c 2863 goto out;
a2de733c
AJ
2864 }
2865
cb5583dd 2866 down_read(&fs_info->dev_replace.rwsem);
cadbc0a0 2867 if (dev->scrub_ctx ||
8dabb742
SB
2868 (!is_dev_replace &&
2869 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
cb5583dd 2870 up_read(&fs_info->dev_replace.rwsem);
a2de733c 2871 mutex_unlock(&fs_info->scrub_lock);
aa1b8cd4 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0e94c4f4 2873 ret = -EINPROGRESS;
e89c4a9c 2874 goto out;
a2de733c 2875 }
cb5583dd 2876 up_read(&fs_info->dev_replace.rwsem);
3b7a016f 2877
d9d181c1 2878 sctx->readonly = readonly;
cadbc0a0 2879 dev->scrub_ctx = sctx;
3cb0929a 2880 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c 2881
3cb0929a
WS
2882 /*
2883	 * By checking @scrub_pause_req here, we can avoid a
2884	 * race between committing a transaction and scrubbing.
2885 */
cb7ab021 2886 __scrub_blocked_if_needed(fs_info);
a2de733c
AJ
2887 atomic_inc(&fs_info->scrubs_running);
2888 mutex_unlock(&fs_info->scrub_lock);
a2de733c 2889
a5fb1142
FM
2890 /*
2891 * In order to avoid deadlock with reclaim when there is a transaction
2892 * trying to pause scrub, make sure we use GFP_NOFS for all the
46343501 2893 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
a5fb1142
FM
2894 * invoked by our callees. The pausing request is done when the
2895 * transaction commit starts, and it blocks the transaction until scrub
2896 * is paused (done at specific points at scrub_stripe() or right above
2897 * before incrementing fs_info->scrubs_running).
2898 */
2899 nofs_flag = memalloc_nofs_save();
ff023aac 2900 if (!is_dev_replace) {
f9eab5f0
QW
2901 u64 old_super_errors;
2902
2903 spin_lock(&sctx->stat_lock);
2904 old_super_errors = sctx->stat.super_errors;
2905 spin_unlock(&sctx->stat_lock);
2906
d1e14420 2907 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
9b011adf
WS
2908 /*
2909	 * By holding the device list mutex, we can
2910	 * kick off writing the super blocks during a log tree sync.
2911 */
3cb0929a 2912 mutex_lock(&fs_info->fs_devices->device_list_mutex);
ff023aac 2913 ret = scrub_supers(sctx, dev);
3cb0929a 2914 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
f9eab5f0
QW
2915
2916 spin_lock(&sctx->stat_lock);
2917 /*
2918 * Super block errors found, but we can not commit transaction
2919 * at current context, since btrfs_commit_transaction() needs
2920 * to pause the current running scrub (hold by ourselves).
2921 */
2922 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
2923 need_commit = true;
2924 spin_unlock(&sctx->stat_lock);
ff023aac 2925 }
a2de733c
AJ
2926
2927 if (!ret)
32934280 2928 ret = scrub_enumerate_chunks(sctx, dev, start, end);
a5fb1142 2929 memalloc_nofs_restore(nofs_flag);
a2de733c 2930
a2de733c
AJ
2931 atomic_dec(&fs_info->scrubs_running);
2932 wake_up(&fs_info->scrub_pause_wait);
2933
2934 if (progress)
d9d181c1 2935 memcpy(progress, &sctx->stat, sizeof(*progress));
a2de733c 2936
d1e14420
AJ
2937 if (!is_dev_replace)
2938 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
2939 ret ? "not finished" : "finished", devid, ret);
2940
a2de733c 2941 mutex_lock(&fs_info->scrub_lock);
cadbc0a0 2942 dev->scrub_ctx = NULL;
a2de733c
AJ
2943 mutex_unlock(&fs_info->scrub_lock);
2944
e89c4a9c 2945 scrub_workers_put(fs_info);
f55985f4 2946 scrub_put_ctx(sctx);
a2de733c 2947
f9eab5f0
QW
2948 /*
2949 * We found some super block errors before, now try to force a
2950 * transaction commit, as scrub has finished.
2951 */
2952 if (need_commit) {
2953 struct btrfs_trans_handle *trans;
2954
2955 trans = btrfs_start_transaction(fs_info->tree_root, 0);
2956 if (IS_ERR(trans)) {
2957 ret = PTR_ERR(trans);
2958 btrfs_err(fs_info,
2959 "scrub: failed to start transaction to fix super block errors: %d", ret);
2960 return ret;
2961 }
2962 ret = btrfs_commit_transaction(trans);
2963 if (ret < 0)
2964 btrfs_err(fs_info,
2965 "scrub: failed to commit transaction to fix super block errors: %d", ret);
2966 }
0e94c4f4 2967 return ret;
e89c4a9c
JB
2968out:
2969 scrub_workers_put(fs_info);
0e94c4f4
DS
2970out_free_ctx:
2971 scrub_free_ctx(sctx);
2972
a2de733c
AJ
2973 return ret;
2974}
2975
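/* Request all running scrubs to pause and wait until they have done so. */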
2ff7e61e 2976void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
a2de733c 2977{
a2de733c
AJ
2978 mutex_lock(&fs_info->scrub_lock);
2979 atomic_inc(&fs_info->scrub_pause_req);
2980 while (atomic_read(&fs_info->scrubs_paused) !=
2981 atomic_read(&fs_info->scrubs_running)) {
2982 mutex_unlock(&fs_info->scrub_lock);
2983 wait_event(fs_info->scrub_pause_wait,
2984 atomic_read(&fs_info->scrubs_paused) ==
2985 atomic_read(&fs_info->scrubs_running));
2986 mutex_lock(&fs_info->scrub_lock);
2987 }
2988 mutex_unlock(&fs_info->scrub_lock);
a2de733c
AJ
2989}
2990
2ff7e61e 2991void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
a2de733c 2992{
a2de733c
AJ
2993 atomic_dec(&fs_info->scrub_pause_req);
2994 wake_up(&fs_info->scrub_pause_wait);
a2de733c
AJ
2995}
2996
aa1b8cd4 2997int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
a2de733c 2998{
a2de733c
AJ
2999 mutex_lock(&fs_info->scrub_lock);
3000 if (!atomic_read(&fs_info->scrubs_running)) {
3001 mutex_unlock(&fs_info->scrub_lock);
3002 return -ENOTCONN;
3003 }
3004
3005 atomic_inc(&fs_info->scrub_cancel_req);
3006 while (atomic_read(&fs_info->scrubs_running)) {
3007 mutex_unlock(&fs_info->scrub_lock);
3008 wait_event(fs_info->scrub_pause_wait,
3009 atomic_read(&fs_info->scrubs_running) == 0);
3010 mutex_lock(&fs_info->scrub_lock);
3011 }
3012 atomic_dec(&fs_info->scrub_cancel_req);
3013 mutex_unlock(&fs_info->scrub_lock);
3014
3015 return 0;
3016}
3017
163e97ee 3018int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
49b25e05 3019{
163e97ee 3020 struct btrfs_fs_info *fs_info = dev->fs_info;
d9d181c1 3021 struct scrub_ctx *sctx;
a2de733c
AJ
3022
3023 mutex_lock(&fs_info->scrub_lock);
cadbc0a0 3024 sctx = dev->scrub_ctx;
d9d181c1 3025 if (!sctx) {
a2de733c
AJ
3026 mutex_unlock(&fs_info->scrub_lock);
3027 return -ENOTCONN;
3028 }
d9d181c1 3029 atomic_inc(&sctx->cancel_req);
cadbc0a0 3030 while (dev->scrub_ctx) {
a2de733c
AJ
3031 mutex_unlock(&fs_info->scrub_lock);
3032 wait_event(fs_info->scrub_pause_wait,
cadbc0a0 3033 dev->scrub_ctx == NULL);
a2de733c
AJ
3034 mutex_lock(&fs_info->scrub_lock);
3035 }
3036 mutex_unlock(&fs_info->scrub_lock);
3037
3038 return 0;
3039}
1623edeb 3040
2ff7e61e 3041int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
a2de733c
AJ
3042 struct btrfs_scrub_progress *progress)
3043{
562d7b15 3044 struct btrfs_dev_lookup_args args = { .devid = devid };
a2de733c 3045 struct btrfs_device *dev;
d9d181c1 3046 struct scrub_ctx *sctx = NULL;
a2de733c 3047
0b246afa 3048 mutex_lock(&fs_info->fs_devices->device_list_mutex);
562d7b15 3049 dev = btrfs_find_device(fs_info->fs_devices, &args);
a2de733c 3050 if (dev)
cadbc0a0 3051 sctx = dev->scrub_ctx;
d9d181c1
SB
3052 if (sctx)
3053 memcpy(progress, &sctx->stat, sizeof(*progress));
0b246afa 3054 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c 3055
d9d181c1 3056 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
a2de733c 3057}