fs/btrfs/scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "raid56.h"
20 #include "block-group.h"
21 #include "zoned.h"
22 #include "fs.h"
23 #include "accessors.h"
24 #include "file-item.h"
25 #include "scrub.h"
26 #include "raid-stripe-tree.h"
27
28 /*
29  * This is only the first step towards a full-featured scrub. It reads all
30  * extents and super blocks and verifies the checksums. In case a bad checksum
31  * is found or the extent cannot be read, good data will be written back if
32  * any can be found.
33  *
34  * Future enhancements:
35  *  - In case an unrepairable extent is encountered, track which files are
36  *    affected and report them
37  *  - track and record media errors, throw out bad devices
38  *  - add a mode to also read unallocated space
39  */
40
41 struct scrub_ctx;
42
43 /*
44  * The following value only influences the performance.
45  *
46  * This determines how many stripes would be submitted in one go,
47  * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
48  */
49 #define SCRUB_STRIPES_PER_GROUP         8
50
51 /*
52  * How many groups we have for each sctx.
53  *
54  * This would be 8M per device, the same value as the old scrub in-flight bios
55  * size limit.
56  */
57 #define SCRUB_GROUPS_PER_SCTX           16
58
59 #define SCRUB_TOTAL_STRIPES             (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)
60
61 /*
62  * This value times the minimum block size (4K) needs to be large enough to
63  * match the largest node/leaf/sector size that shall be supported.
64  */
65 #define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
66
67 /* Represents one sector and the info needed to verify its content. */
68 struct scrub_sector_verification {
69         union {
70                 /*
71                  * Csum pointer for data csum verification.  Should point to a
72                  * sector csum inside scrub_stripe::csums.
73                  *
74                  * NULL if this data sector has no csum.
75                  */
76                 u8 *csum;
77
78                 /*
79                  * Extra info for metadata verification.  All sectors inside a
80                  * tree block share the same generation.
81                  */
82                 u64 generation;
83         };
84 };
85
86 enum scrub_stripe_flags {
87         /* Set when @mirror_num, @dev, @physical and @logical are set. */
88         SCRUB_STRIPE_FLAG_INITIALIZED,
89
90         /* Set when the read-repair is finished. */
91         SCRUB_STRIPE_FLAG_REPAIR_DONE,
92
93         /*
94  * Set for data stripes if the scrub is triggered from a P/Q stripe.
95  * During such a scrub, we should not report errors in data stripes, nor
96  * update the accounting.
97          */
98         SCRUB_STRIPE_FLAG_NO_REPORT,
99 };
100
101 /*
102  * We have multiple bitmaps for one scrub_stripe.
103  * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
104  * which is normally 16, much smaller than BITS_PER_LONG (32 or 64).
105  *
106  * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
107  * into a larger one.
108  *
109  * This enum records where each sub-bitmap sits inside the larger one.  Each
110  * sub-bitmap starts at bit scrub_bitmap_nr_##name * nr_sectors (example below).
111  */
112 enum {
113         /* Which blocks are covered by extent items. */
114         scrub_bitmap_nr_has_extent = 0,
115
116         /* Which blocks are metadata. */
117         scrub_bitmap_nr_is_metadata,
118
119         /*
120          * Which blocks have errors, including IO, csum, and metadata
121          * errors.
122          * This sub-bitmap is the OR results of the next few error related
123          * sub-bitmaps.
124          */
125         scrub_bitmap_nr_error,
126         scrub_bitmap_nr_io_error,
127         scrub_bitmap_nr_csum_error,
128         scrub_bitmap_nr_meta_error,
129         scrub_bitmap_nr_meta_gen_error,
130         scrub_bitmap_nr_last,
131 };
132
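/*
 * For example, with a 4K block size (nr_sectors == 16) the sub-bitmaps are
 * packed as follows:
 *
 *   bits   0..15   has_extent
 *   bits  16..31   is_metadata
 *   bits  32..47   error
 *   bits  48..63   io_error
 *   bits  64..79   csum_error
 *   bits  80..95   meta_error
 *   bits  96..111  meta_gen_error
 */
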
133 #define SCRUB_STRIPE_PAGES              (BTRFS_STRIPE_LEN / PAGE_SIZE)
134
135 /*
136  * Represents one contiguous range with a length of BTRFS_STRIPE_LEN.
137  */
138 struct scrub_stripe {
139         struct scrub_ctx *sctx;
140         struct btrfs_block_group *bg;
141
142         struct page *pages[SCRUB_STRIPE_PAGES];
143         struct scrub_sector_verification *sectors;
144
145         struct btrfs_device *dev;
146         u64 logical;
147         u64 physical;
148
149         u16 mirror_num;
150
151         /* Should be BTRFS_STRIPE_LEN / sectorsize. */
152         u16 nr_sectors;
153
154         /*
155          * How many data/meta extents are in this stripe.  Only for scrub status
156          * reporting purposes.
157          */
158         u16 nr_data_extents;
159         u16 nr_meta_extents;
160
161         atomic_t pending_io;
162         wait_queue_head_t io_wait;
163         wait_queue_head_t repair_wait;
164
165         /*
166          * Indicate the states of the stripe.  Bits are defined in
167          * scrub_stripe_flags enum.
168          */
169         unsigned long state;
170
171         /* The large bitmap contains all the sub-bitmaps. */
172         unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
173                                             (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];
174
175         /*
176          * For writeback (repair or replace) error reporting.
177          * This one is protected by a spinlock, thus it cannot be packed into
178          * the larger bitmap.
179          */
180         unsigned long write_error_bitmap;
181
182         /* Writeback can be concurrent, thus we need to protect the bitmap. */
183         spinlock_t write_error_lock;
184
185         /*
186          * Checksum for the whole stripe if this stripe is inside a data block
187          * group.
188          */
189         u8 *csums;
190
191         struct work_struct work;
192 };
193
194 struct scrub_ctx {
195         struct scrub_stripe     stripes[SCRUB_TOTAL_STRIPES];
196         struct scrub_stripe     *raid56_data_stripes;
197         struct btrfs_fs_info    *fs_info;
198         struct btrfs_path       extent_path;
199         struct btrfs_path       csum_path;
200         int                     first_free;
201         int                     cur_stripe;
202         atomic_t                cancel_req;
203         int                     readonly;
204
205         /* State of IO submission throttling affecting the associated device */
206         ktime_t                 throttle_deadline;
207         u64                     throttle_sent;
208
209         int                     is_dev_replace;
210         u64                     write_pointer;
211
212         struct mutex            wr_lock;
213         struct btrfs_device     *wr_tgtdev;
214
215         /*
216          * statistics
217          */
218         struct btrfs_scrub_progress stat;
219         spinlock_t              stat_lock;
220
221         /*
222          * Use a ref counter to avoid use-after-free issues.  The scrub
223          * context must only be freed once the last reference is dropped,
224          * so that the main scrub task does not free it while scrub workers
225          * may still be accessing it, for example while they are doing a
226          * wakeup() call.
227          */
228         refcount_t              refs;
229 };
230
231 #define scrub_calc_start_bit(stripe, name, block_nr)                    \
232 ({                                                                      \
233         unsigned int __start_bit;                                       \
234                                                                         \
235         ASSERT(block_nr < stripe->nr_sectors,                           \
236                 "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
237         __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
238         __start_bit;                                                    \
239 })
240
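/*
 * Generate the helpers to manipulate one sub-bitmap.
 *
 * E.g. IMPLEMENT_SCRUB_BITMAP_OPS(error) generates:
 *
 *   scrub_bitmap_set_error()        set a range of bits
 *   scrub_bitmap_clear_error()      clear a range of bits
 *   scrub_bitmap_set_bit_error()    set a single bit
 *   scrub_bitmap_clear_bit_error()  clear a single bit
 *   scrub_bitmap_test_bit_error()   test a single bit
 *   scrub_bitmap_read_error()       return the whole sub-bitmap as one long
 *   scrub_bitmap_empty_error()      check whether the sub-bitmap is empty
 *   scrub_bitmap_weight_error()     count the set bits in the sub-bitmap
 */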
241 #define IMPLEMENT_SCRUB_BITMAP_OPS(name)                                \
242 static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \
243                                     unsigned int block_nr,              \
244                                     unsigned int nr_blocks)             \
245 {                                                                       \
246         const unsigned int start_bit = scrub_calc_start_bit(stripe,     \
247                                                             name, block_nr); \
248                                                                         \
249         bitmap_set(stripe->bitmaps, start_bit, nr_blocks);              \
250 }                                                                       \
251 static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
252                                       unsigned int block_nr,            \
253                                       unsigned int nr_blocks)           \
254 {                                                                       \
255         const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
256                                                             block_nr);  \
257                                                                         \
258         bitmap_clear(stripe->bitmaps, start_bit, nr_blocks);            \
259 }                                                                       \
260 static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
261                                      unsigned int block_nr)             \
262 {                                                                       \
263         const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
264                                                             block_nr);  \
265                                                                         \
266         return test_bit(start_bit, stripe->bitmaps);                    \
267 }                                                                       \
268 static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
269                                      unsigned int block_nr)             \
270 {                                                                       \
271         const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
272                                                             block_nr);  \
273                                                                         \
274         set_bit(start_bit, stripe->bitmaps);                            \
275 }                                                                       \
276 static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
277                                      unsigned int block_nr)             \
278 {                                                                       \
279         const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
280                                                             block_nr);  \
281                                                                         \
282         clear_bit(start_bit, stripe->bitmaps);                          \
283 }                                                                       \
284 static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
285 {                                                                       \
286         const unsigned int nr_blocks = stripe->nr_sectors;              \
287                                                                         \
288         ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG,             \
289                "nr_blocks=%u BITS_PER_LONG=%u",                         \
290                nr_blocks, BITS_PER_LONG);                               \
291                                                                         \
292         return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
293                            stripe->nr_sectors);                         \
294 }                                                                       \
295 static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
296 {                                                                       \
297         unsigned long bitmap = scrub_bitmap_read_##name(stripe);        \
298                                                                         \
299         return bitmap_empty(&bitmap, stripe->nr_sectors);               \
300 }                                                                       \
301 static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
302 {                                                                       \
303         unsigned long bitmap = scrub_bitmap_read_##name(stripe);        \
304                                                                         \
305         return bitmap_weight(&bitmap, stripe->nr_sectors);              \
306 }
307 IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
308 IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
309 IMPLEMENT_SCRUB_BITMAP_OPS(error);
310 IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
311 IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
312 IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
313 IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);
314
315 struct scrub_warning {
316         struct btrfs_path       *path;
317         u64                     extent_item_size;
318         const char              *errstr;
319         u64                     physical;
320         u64                     logical;
321         struct btrfs_device     *dev;
322 };
323
324 struct scrub_error_records {
325         /*
326          * Bitmap recording which blocks hit errors (IO/csum/...) during the
327          * initial read.
328          */
329         unsigned long init_error_bitmap;
330
331         unsigned int nr_io_errors;
332         unsigned int nr_csum_errors;
333         unsigned int nr_meta_errors;
334         unsigned int nr_meta_gen_errors;
335 };
336
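/*
 * Free everything attached to a stripe (pages, the per-sector verification
 * array and the csum buffer).  Safe to call on a partially initialized stripe.
 */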
337 static void release_scrub_stripe(struct scrub_stripe *stripe)
338 {
339         if (!stripe)
340                 return;
341
342         for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
343                 if (stripe->pages[i])
344                         __free_page(stripe->pages[i]);
345                 stripe->pages[i] = NULL;
346         }
347         kfree(stripe->sectors);
348         kfree(stripe->csums);
349         stripe->sectors = NULL;
350         stripe->csums = NULL;
351         stripe->sctx = NULL;
352         stripe->state = 0;
353 }
354
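/*
 * Initialize one stripe: allocate its pages, the per-sector verification array
 * and the csum buffer.  Returns 0 on success or -ENOMEM, releasing any partial
 * allocation on failure.
 */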
355 static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
356                              struct scrub_stripe *stripe)
357 {
358         int ret;
359
360         memset(stripe, 0, sizeof(*stripe));
361
362         stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
363         stripe->state = 0;
364
365         init_waitqueue_head(&stripe->io_wait);
366         init_waitqueue_head(&stripe->repair_wait);
367         atomic_set(&stripe->pending_io, 0);
368         spin_lock_init(&stripe->write_error_lock);
369
370         ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
371         if (ret < 0)
372                 goto error;
373
374         stripe->sectors = kcalloc(stripe->nr_sectors,
375                                   sizeof(struct scrub_sector_verification),
376                                   GFP_KERNEL);
377         if (!stripe->sectors)
378                 goto error;
379
380         stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
381                                 fs_info->csum_size, GFP_KERNEL);
382         if (!stripe->csums)
383                 goto error;
384         return 0;
385 error:
386         release_scrub_stripe(stripe);
387         return -ENOMEM;
388 }
389
390 static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
391 {
392         wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
393 }
394
395 static void scrub_put_ctx(struct scrub_ctx *sctx);
396
397 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
398 {
399         while (atomic_read(&fs_info->scrub_pause_req)) {
400                 mutex_unlock(&fs_info->scrub_lock);
401                 wait_event(fs_info->scrub_pause_wait,
402                    atomic_read(&fs_info->scrub_pause_req) == 0);
403                 mutex_lock(&fs_info->scrub_lock);
404         }
405 }
406
407 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
408 {
409         atomic_inc(&fs_info->scrubs_paused);
410         wake_up(&fs_info->scrub_pause_wait);
411 }
412
413 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
414 {
415         mutex_lock(&fs_info->scrub_lock);
416         __scrub_blocked_if_needed(fs_info);
417         atomic_dec(&fs_info->scrubs_paused);
418         mutex_unlock(&fs_info->scrub_lock);
419
420         wake_up(&fs_info->scrub_pause_wait);
421 }
422
423 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
424 {
425         scrub_pause_on(fs_info);
426         scrub_pause_off(fs_info);
427 }
428
429 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
430 {
431         int i;
432
433         if (!sctx)
434                 return;
435
436         for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
437                 release_scrub_stripe(&sctx->stripes[i]);
438
439         kvfree(sctx);
440 }
441
442 static void scrub_put_ctx(struct scrub_ctx *sctx)
443 {
444         if (refcount_dec_and_test(&sctx->refs))
445                 scrub_free_ctx(sctx);
446 }
447
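/*
 * Allocate and initialize a scrub context, including all of its inline
 * stripes.  Returns ERR_PTR(-ENOMEM) on allocation failure.
 */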
448 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
449                 struct btrfs_fs_info *fs_info, int is_dev_replace)
450 {
451         struct scrub_ctx *sctx;
452         int             i;
453
454         /* Since sctx has inline 128 stripes, it can go beyond 64K easily.  Use
455          * kvzalloc().
456          */
457         sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
458         if (!sctx)
459                 goto nomem;
460         refcount_set(&sctx->refs, 1);
461         sctx->is_dev_replace = is_dev_replace;
462         sctx->fs_info = fs_info;
463         sctx->extent_path.search_commit_root = 1;
464         sctx->extent_path.skip_locking = 1;
465         sctx->csum_path.search_commit_root = 1;
466         sctx->csum_path.skip_locking = 1;
467         for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
468                 int ret;
469
470                 ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
471                 if (ret < 0)
472                         goto nomem;
473                 sctx->stripes[i].sctx = sctx;
474         }
475         sctx->first_free = 0;
476         atomic_set(&sctx->cancel_req, 0);
477
478         spin_lock_init(&sctx->stat_lock);
479         sctx->throttle_deadline = 0;
480
481         mutex_init(&sctx->wr_lock);
482         if (is_dev_replace) {
483                 WARN_ON(!fs_info->dev_replace.tgtdev);
484                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
485         }
486
487         return sctx;
488
489 nomem:
490         scrub_free_ctx(sctx);
491         return ERR_PTR(-ENOMEM);
492 }
493
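/*
 * Callback for iterate_extent_inodes(): resolve all file paths of the inode
 * referencing the corrupted extent and print a warning for each of them.
 */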
494 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
495                                      u64 root, void *warn_ctx)
496 {
497         u32 nlink;
498         int ret;
499         int i;
500         unsigned nofs_flag;
501         struct extent_buffer *eb;
502         struct btrfs_inode_item *inode_item;
503         struct scrub_warning *swarn = warn_ctx;
504         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
505         struct inode_fs_paths *ipath = NULL;
506         struct btrfs_root *local_root;
507         struct btrfs_key key;
508
509         local_root = btrfs_get_fs_root(fs_info, root, true);
510         if (IS_ERR(local_root)) {
511                 ret = PTR_ERR(local_root);
512                 goto err;
513         }
514
515         /*
516          * this makes the path point to (inum INODE_ITEM ioff)
517          */
518         key.objectid = inum;
519         key.type = BTRFS_INODE_ITEM_KEY;
520         key.offset = 0;
521
522         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
523         if (ret) {
524                 btrfs_put_root(local_root);
525                 btrfs_release_path(swarn->path);
526                 goto err;
527         }
528
529         eb = swarn->path->nodes[0];
530         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
531                                         struct btrfs_inode_item);
532         nlink = btrfs_inode_nlink(eb, inode_item);
533         btrfs_release_path(swarn->path);
534
535         /*
536          * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
537          * uses GFP_NOFS in this context, so we keep it consistent but it does
538          * not seem to be strictly necessary.
539          */
540         nofs_flag = memalloc_nofs_save();
541         ipath = init_ipath(4096, local_root, swarn->path);
542         memalloc_nofs_restore(nofs_flag);
543         if (IS_ERR(ipath)) {
544                 btrfs_put_root(local_root);
545                 ret = PTR_ERR(ipath);
546                 ipath = NULL;
547                 goto err;
548         }
549         ret = paths_from_inode(inum, ipath);
550
551         if (ret < 0)
552                 goto err;
553
554         /*
555          * We deliberately ignore the fact that ipath might have been too
556          * small to hold all of the paths here.
557          */
558         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
559                 btrfs_warn_in_rcu(fs_info,
560 "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
561                                   swarn->errstr, swarn->logical,
562                                   btrfs_dev_name(swarn->dev),
563                                   swarn->physical,
564                                   root, inum, offset,
565                                   fs_info->sectorsize, nlink,
566                                   (char *)(unsigned long)ipath->fspath->val[i]);
567
568         btrfs_put_root(local_root);
569         free_ipath(ipath);
570         return 0;
571
572 err:
573         btrfs_warn_in_rcu(fs_info,
574                           "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
575                           swarn->errstr, swarn->logical,
576                           btrfs_dev_name(swarn->dev),
577                           swarn->physical,
578                           root, inum, offset, ret);
579
580         free_ipath(ipath);
581         return 0;
582 }
583
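/*
 * Print warnings describing what the corrupted range at @logical belongs to:
 * the owning tree for tree blocks, or the referencing inodes and file paths
 * for data extents.  Super block errors only report the device and physical
 * offset.
 */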
584 static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
585                                        bool is_super, u64 logical, u64 physical)
586 {
587         struct btrfs_fs_info *fs_info = dev->fs_info;
588         struct btrfs_path *path;
589         struct btrfs_key found_key;
590         struct extent_buffer *eb;
591         struct btrfs_extent_item *ei;
592         struct scrub_warning swarn;
593         u64 flags = 0;
594         u32 item_size;
595         int ret;
596
597         /* Super block error, no need to search extent tree. */
598         if (is_super) {
599                 btrfs_warn_in_rcu(fs_info, "scrub: %s on device %s, physical %llu",
600                                   errstr, btrfs_dev_name(dev), physical);
601                 return;
602         }
603         path = btrfs_alloc_path();
604         if (!path)
605                 return;
606
607         swarn.physical = physical;
608         swarn.logical = logical;
609         swarn.errstr = errstr;
610         swarn.dev = NULL;
611
612         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
613                                   &flags);
614         if (ret < 0)
615                 goto out;
616
617         swarn.extent_item_size = found_key.offset;
618
619         eb = path->nodes[0];
620         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
621         item_size = btrfs_item_size(eb, path->slots[0]);
622
623         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
624                 unsigned long ptr = 0;
625                 u8 ref_level;
626                 u64 ref_root;
627
628                 while (true) {
629                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
630                                                       item_size, &ref_root,
631                                                       &ref_level);
632                         if (ret < 0) {
633                                 btrfs_warn(fs_info,
634                    "scrub: failed to resolve tree backref for logical %llu: %d",
635                                            swarn.logical, ret);
636                                 break;
637                         }
638                         if (ret > 0)
639                                 break;
640                         btrfs_warn_in_rcu(fs_info,
641 "scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
642                                 errstr, swarn.logical, btrfs_dev_name(dev),
643                                 swarn.physical, (ref_level ? "node" : "leaf"),
644                                 ref_level, ref_root);
645                 }
646                 btrfs_release_path(path);
647         } else {
648                 struct btrfs_backref_walk_ctx ctx = { 0 };
649
650                 btrfs_release_path(path);
651
652                 ctx.bytenr = found_key.objectid;
653                 ctx.extent_item_pos = swarn.logical - found_key.objectid;
654                 ctx.fs_info = fs_info;
655
656                 swarn.path = path;
657                 swarn.dev = dev;
658
659                 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
660         }
661
662 out:
663         btrfs_free_path(path);
664 }
665
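/*
 * On zoned dev-replace targets writes must be sequential.  If the next write
 * would land beyond the current write pointer, zero out the gap first so the
 * write pointer catches up to @physical.
 */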
666 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
667 {
668         int ret = 0;
669         u64 length;
670
671         if (!btrfs_is_zoned(sctx->fs_info))
672                 return 0;
673
674         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
675                 return 0;
676
677         if (sctx->write_pointer < physical) {
678                 length = physical - sctx->write_pointer;
679
680                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
681                                                 sctx->write_pointer, length);
682                 if (!ret)
683                         sctx->write_pointer = physical;
684         }
685         return ret;
686 }
687
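/* Return the kernel address of the sector at @sector_nr inside the stripe. */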
688 static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
689 {
690         u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits);
691         const struct page *page = stripe->pages[offset >> PAGE_SHIFT];
692
693         /* stripe->pages[] is allocated by us and no highmem is allowed. */
694         ASSERT(page);
695         ASSERT(!PageHighMem(page));
696         return page_address(page) + offset_in_page(offset);
697 }
698
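/*
 * Verify the tree block starting at @sector_nr: check the bytenr, fsid, chunk
 * tree uuid, checksum and generation, and update the error bitmaps of the
 * stripe accordingly.
 */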
699 static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
700 {
701         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
702         const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
703         const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
704         void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
705         struct btrfs_header *header = first_kaddr;
706         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
707         u8 on_disk_csum[BTRFS_CSUM_SIZE];
708         u8 calculated_csum[BTRFS_CSUM_SIZE];
709
710         /*
711          * Here we don't have a good way to attach the pages (and subpages)
712          * to a dummy extent buffer, thus we have to read the header members
713          * directly from the pages.
714          */
715         memcpy(on_disk_csum, header->csum, fs_info->csum_size);
716
717         if (logical != btrfs_stack_header_bytenr(header)) {
718                 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
719                 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
720                 btrfs_warn_rl(fs_info,
721           "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
722                               logical, stripe->mirror_num,
723                               btrfs_stack_header_bytenr(header), logical);
724                 return;
725         }
726         if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
727                    BTRFS_FSID_SIZE) != 0) {
728                 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
729                 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
730                 btrfs_warn_rl(fs_info,
731               "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
732                               logical, stripe->mirror_num,
733                               header->fsid, fs_info->fs_devices->fsid);
734                 return;
735         }
736         if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
737                    BTRFS_UUID_SIZE) != 0) {
738                 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
739                 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
740                 btrfs_warn_rl(fs_info,
741    "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
742                               logical, stripe->mirror_num,
743                               header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
744                 return;
745         }
746
747         /* Now check tree block csum. */
748         shash->tfm = fs_info->csum_shash;
749         crypto_shash_init(shash);
750         crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE,
751                             fs_info->sectorsize - BTRFS_CSUM_SIZE);
752
753         for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
754                 crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i),
755                                     fs_info->sectorsize);
756         }
757
758         crypto_shash_final(shash, calculated_csum);
759         if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
760                 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
761                 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
762                 btrfs_warn_rl(fs_info,
763 "scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
764                               logical, stripe->mirror_num,
765                               CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
766                               CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
767                 return;
768         }
769         if (stripe->sectors[sector_nr].generation !=
770             btrfs_stack_header_generation(header)) {
771                 scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
772                 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
773                 btrfs_warn_rl(fs_info,
774       "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
775                               logical, stripe->mirror_num,
776                               btrfs_stack_header_generation(header),
777                               stripe->sectors[sector_nr].generation);
778                 return;
779         }
780         scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
781         scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
782         scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
783         scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
784 }
785
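/*
 * Verify one sector of the stripe.  Metadata sectors trigger a full tree block
 * verification, data sectors are checked against their csum (if any).  Sectors
 * not covered by an extent or already marked with an IO error are skipped.
 */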
786 static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
787 {
788         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
789         struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
790         const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
791         void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
792         u8 csum_buf[BTRFS_CSUM_SIZE];
793         int ret;
794
795         ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
796
797         /* Sector not utilized, skip it. */
798         if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
799                 return;
800
801         /* IO error, no need to check. */
802         if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
803                 return;
804
805         /* Metadata, verify the full tree block. */
806         if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
807                 /*
808                  * Check if the tree block crosses the stripe boundary.  If it
809                  * crosses the boundary, we cannot verify it and only give a
810                  * warning.
811                  *
812                  * This can only happen on a very old filesystem where chunks
813                  * are not ensured to be stripe aligned.
814                  */
815                 if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
816                         btrfs_warn_rl(fs_info,
817                         "scrub: tree block at %llu crosses stripe boundary %llu",
818                                       stripe->logical +
819                                       (sector_nr << fs_info->sectorsize_bits),
820                                       stripe->logical);
821                         return;
822                 }
823                 scrub_verify_one_metadata(stripe, sector_nr);
824                 return;
825         }
826
827         /*
828          * Data is easier, we just verify the data csum (if we have it).  For
829          * cases without csum, we have no other choice but to trust it.
830          */
831         if (!sector->csum) {
832                 scrub_bitmap_clear_bit_error(stripe, sector_nr);
833                 return;
834         }
835
836         ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum);
837         if (ret < 0) {
838                 scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
839                 scrub_bitmap_set_bit_error(stripe, sector_nr);
840         } else {
841                 scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
842                 scrub_bitmap_clear_bit_error(stripe, sector_nr);
843         }
844 }
845
846 /* Verify specified sectors of a stripe. */
847 static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
848 {
849         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
850         const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
851         int sector_nr;
852
853         for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
854                 scrub_verify_one_sector(stripe, sector_nr);
855                 if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
856                         sector_nr += sectors_per_tree - 1;
857         }
858 }
859
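/*
 * Map the first bvec of a bio back to the sector number inside the stripe by
 * matching its virtual address against the stripe's pages.
 */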
860 static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
861 {
862         int i;
863
864         for (i = 0; i < stripe->nr_sectors; i++) {
865                 if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
866                         break;
867         }
868         ASSERT(i < stripe->nr_sectors);
869         return i;
870 }
871
872 /*
873  * Repair read is different from the regular read:
874  *
875  * - Only reads the failed sectors
876  * - May have extra blocksize limits
877  */
878 static void scrub_repair_read_endio(struct btrfs_bio *bbio)
879 {
880         struct scrub_stripe *stripe = bbio->private;
881         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
882         struct bio_vec *bvec;
883         int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
884         u32 bio_size = 0;
885         int i;
886
887         ASSERT(sector_nr < stripe->nr_sectors);
888
889         bio_for_each_bvec_all(bvec, &bbio->bio, i)
890                 bio_size += bvec->bv_len;
891
892         if (bbio->bio.bi_status) {
893                 scrub_bitmap_set_io_error(stripe, sector_nr,
894                                           bio_size >> fs_info->sectorsize_bits);
895                 scrub_bitmap_set_error(stripe, sector_nr,
896                                        bio_size >> fs_info->sectorsize_bits);
897         } else {
898                 scrub_bitmap_clear_io_error(stripe, sector_nr,
899                                           bio_size >> fs_info->sectorsize_bits);
900         }
901         bio_put(&bbio->bio);
902         if (atomic_dec_and_test(&stripe->pending_io))
903                 wake_up(&stripe->io_wait);
904 }
905
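/*
 * Return the next mirror to try, wrapping around to 1 after the last copy.
 * E.g. with num_copies == 3: 1 -> 2 -> 3 -> 1.
 */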
906 static int calc_next_mirror(int mirror, int num_copies)
907 {
908         ASSERT(mirror <= num_copies);
909         return (mirror + 1 > num_copies) ? 1 : mirror + 1;
910 }
911
912 static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
913                                  int sector_nr)
914 {
915         void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
916         int ret;
917
918         ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize,
919                            offset_in_page(kaddr));
920         /*
921          * The caller should ensure the bbio has enough room.
922          * And we cannot use __bio_add_page(), which doesn't do any merging.
923          *
924          * Meanwhile for scrub_submit_initial_read() we fully rely on the merging
925          * to create the minimal number of bio vectors, for fs block size < page
926          * size cases.
927          */
928         ASSERT(ret == bbio->fs_info->sectorsize);
929 }
930
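/*
 * Submit repair reads from @mirror for all sectors currently marked as errors.
 * Consecutive bad sectors are merged into bios of up to @blocksize bytes; if
 * @wait is set, each bio is waited for before the next one is submitted.
 */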
931 static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
932                                             int mirror, int blocksize, bool wait)
933 {
934         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
935         struct btrfs_bio *bbio = NULL;
936         const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
937         int i;
938
939         ASSERT(stripe->mirror_num >= 1);
940         ASSERT(atomic_read(&stripe->pending_io) == 0);
941
942         for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
943                 /* The current sector cannot be merged, submit the bio. */
944                 if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
945                              bbio->bio.bi_iter.bi_size >= blocksize)) {
946                         ASSERT(bbio->bio.bi_iter.bi_size);
947                         atomic_inc(&stripe->pending_io);
948                         btrfs_submit_bbio(bbio, mirror);
949                         if (wait)
950                                 wait_scrub_stripe_io(stripe);
951                         bbio = NULL;
952                 }
953
954                 if (!bbio) {
955                         bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
956                                 fs_info, scrub_repair_read_endio, stripe);
957                         bbio->bio.bi_iter.bi_sector = (stripe->logical +
958                                 (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
959                 }
960
961                 scrub_bio_add_sector(bbio, stripe, i);
962         }
963         if (bbio) {
964                 ASSERT(bbio->bio.bi_iter.bi_size);
965                 atomic_inc(&stripe->pending_io);
966                 btrfs_submit_bbio(bbio, mirror);
967                 if (wait)
968                         wait_scrub_stripe_io(stripe);
969         }
970 }
971
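/*
 * Report the errors of one stripe: print the per-sector messages, update the
 * device error statistics and the scrub progress counters.  Stripes with the
 * NO_REPORT flag (data stripes read as part of a P/Q stripe scrub) are
 * skipped.
 */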
972 static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
973                                        struct scrub_stripe *stripe,
974                                        const struct scrub_error_records *errors)
975 {
976         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
977                                       DEFAULT_RATELIMIT_BURST);
978         struct btrfs_fs_info *fs_info = sctx->fs_info;
979         struct btrfs_device *dev = NULL;
980         const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
981         const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
982         u64 physical = 0;
983         int nr_data_sectors = 0;
984         int nr_meta_sectors = 0;
985         int nr_nodatacsum_sectors = 0;
986         int nr_repaired_sectors = 0;
987         int sector_nr;
988
989         if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
990                 return;
991
992         /*
993          * Initialize the info needed for error reporting.
994          *
995          * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
996          * and thus does not need dev/physical itself, error reporting still does.
997          */
998         if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
999                 u64 mapped_len = fs_info->sectorsize;
1000                 struct btrfs_io_context *bioc = NULL;
1001                 int stripe_index = stripe->mirror_num - 1;
1002                 int ret;
1003
1004                 /* For scrub, our mirror_num should always start at 1. */
1005                 ASSERT(stripe->mirror_num >= 1);
1006                 ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1007                                       stripe->logical, &mapped_len, &bioc,
1008                                       NULL, NULL);
1009                 /*
1010                  * If we failed, dev will be NULL, and later detailed reports
1011                  * will just be skipped.
1012                  */
1013                 if (ret < 0)
1014                         goto skip;
1015                 physical = bioc->stripes[stripe_index].physical;
1016                 dev = bioc->stripes[stripe_index].dev;
1017                 btrfs_put_bioc(bioc);
1018         }
1019
1020 skip:
1021         for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
1022                 bool repaired = false;
1023
1024                 if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
1025                         nr_meta_sectors++;
1026                 } else {
1027                         nr_data_sectors++;
1028                         if (!stripe->sectors[sector_nr].csum)
1029                                 nr_nodatacsum_sectors++;
1030                 }
1031
1032                 if (test_bit(sector_nr, &errors->init_error_bitmap) &&
1033                     !test_bit(sector_nr, &error_bitmap)) {
1034                         nr_repaired_sectors++;
1035                         repaired = true;
1036                 }
1037
1038                 /* Good sector from the beginning, nothing needs to be done. */
1039                 if (!test_bit(sector_nr, &errors->init_error_bitmap))
1040                         continue;
1041
1042                 /*
1043                  * Report errors for the corrupted sectors.  If a sector has been
1044                  * repaired, just report that it has been fixed up.
1045                  */
1046                 if (repaired) {
1047                         if (dev) {
1048                                 btrfs_err_rl_in_rcu(fs_info,
1049                 "scrub: fixed up error at logical %llu on dev %s physical %llu",
1050                                             stripe->logical, btrfs_dev_name(dev),
1051                                             physical);
1052                         } else {
1053                                 btrfs_err_rl_in_rcu(fs_info,
1054                            "scrub: fixed up error at logical %llu on mirror %u",
1055                                             stripe->logical, stripe->mirror_num);
1056                         }
1057                         continue;
1058                 }
1059
1060                 /* The remaining cases are all unrepaired sectors. */
1061                 if (dev) {
1062                         btrfs_err_rl_in_rcu(fs_info,
1063 "scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
1064                                             stripe->logical, btrfs_dev_name(dev),
1065                                             physical);
1066                 } else {
1067                         btrfs_err_rl_in_rcu(fs_info,
1068           "scrub: unable to fixup (regular) error at logical %llu on mirror %u",
1069                                             stripe->logical, stripe->mirror_num);
1070                 }
1071
1072                 if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
1073                         if (__ratelimit(&rs) && dev)
1074                                 scrub_print_common_warning("i/o error", dev, false,
1075                                                      stripe->logical, physical);
1076                 if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
1077                         if (__ratelimit(&rs) && dev)
1078                                 scrub_print_common_warning("checksum error", dev, false,
1079                                                      stripe->logical, physical);
1080                 if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
1081                         if (__ratelimit(&rs) && dev)
1082                                 scrub_print_common_warning("header error", dev, false,
1083                                                      stripe->logical, physical);
1084                 if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
1085                         if (__ratelimit(&rs) && dev)
1086                                 scrub_print_common_warning("generation error", dev, false,
1087                                                      stripe->logical, physical);
1088         }
1089
1090         /* Update the device stats. */
1091         for (int i = 0; i < errors->nr_io_errors; i++)
1092                 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
1093         for (int i = 0; i < errors->nr_csum_errors; i++)
1094                 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
1095         /* Generation mismatch errors are counted per tree block, not per sector. */
1096         for (int i = 0; i < errors->nr_meta_gen_errors;
1097              i += (fs_info->nodesize >> fs_info->sectorsize_bits))
1098                 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
1099
1100         spin_lock(&sctx->stat_lock);
1101         sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
1102         sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
1103         sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
1104         sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
1105         sctx->stat.no_csum += nr_nodatacsum_sectors;
1106         sctx->stat.read_errors += errors->nr_io_errors;
1107         sctx->stat.csum_errors += errors->nr_csum_errors;
1108         sctx->stat.verify_errors += errors->nr_meta_errors +
1109                                     errors->nr_meta_gen_errors;
1110         sctx->stat.uncorrectable_errors +=
1111                 bitmap_weight(&error_bitmap, stripe->nr_sectors);
1112         sctx->stat.corrected_errors += nr_repaired_sectors;
1113         spin_unlock(&sctx->stat_lock);
1114 }
1115
1116 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
1117                                 unsigned long write_bitmap, bool dev_replace);
1118
1119 /*
1120  * The main entry point for all read related scrub work, including:
1121  *
1122  * - Wait for the initial read to finish
1123  * - Verify and locate any bad sectors
1124  * - Go through the remaining mirrors and try to read as large blocksize as
1125  *   possible
1126  * - Go through all mirrors (including the failed mirror) sector-by-sector
1127  * - Submit writeback for repaired sectors
1128  *
1129  * Writeback for dev-replace does not happen here, it needs extra
1130  * synchronization for zoned devices.
1131  */
1132 static void scrub_stripe_read_repair_worker(struct work_struct *work)
1133 {
1134         struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
1135         struct scrub_ctx *sctx = stripe->sctx;
1136         struct btrfs_fs_info *fs_info = sctx->fs_info;
1137         struct scrub_error_records errors = { 0 };
1138         int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
1139                                           stripe->bg->length);
1140         unsigned long repaired;
1141         unsigned long error;
1142         int mirror;
1143         int i;
1144
1145         ASSERT(stripe->mirror_num > 0);
1146
1147         wait_scrub_stripe_io(stripe);
1148         scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe));
1149         /* Save the initial failed bitmap for later repair and report usage. */
1150         errors.init_error_bitmap = scrub_bitmap_read_error(stripe);
1151         errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe);
1152         errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe);
1153         errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe);
1154         errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe);
1155
1156         if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors))
1157                 goto out;
1158
1159         /*
1160          * Try all remaining mirrors.
1161          *
1162          * Here we still try to read as large a block as possible, as this is
1163          * faster and we have extra safety nets to rely on.
1164          */
1165         for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
1166              mirror != stripe->mirror_num;
1167              mirror = calc_next_mirror(mirror, num_copies)) {
1168                 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
1169
1170                 scrub_stripe_submit_repair_read(stripe, mirror,
1171                                                 BTRFS_STRIPE_LEN, false);
1172                 wait_scrub_stripe_io(stripe);
1173                 scrub_verify_one_stripe(stripe, old_error_bitmap);
1174                 if (scrub_bitmap_empty_error(stripe))
1175                         goto out;
1176         }
1177
1178         /*
1179          * Last safety net, try re-checking all mirrors, including the failed
1180          * one, sector-by-sector.
1181          *
1182          * If one sector fails the drive's internal csum, the whole read
1183          * containing the offending sector would be marked as an error.
1184          * Thus here we read sector-by-sector.
1185          *
1186          * This can be slow, thus we only try it as the last resort.
1187          */
1188
1189         for (i = 0, mirror = stripe->mirror_num;
1190              i < num_copies;
1191              i++, mirror = calc_next_mirror(mirror, num_copies)) {
1192                 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
1193
1194                 scrub_stripe_submit_repair_read(stripe, mirror,
1195                                                 fs_info->sectorsize, true);
1196                 wait_scrub_stripe_io(stripe);
1197                 scrub_verify_one_stripe(stripe, old_error_bitmap);
1198                 if (scrub_bitmap_empty_error(stripe))
1199                         goto out;
1200         }
1201 out:
1202         error = scrub_bitmap_read_error(stripe);
1203         /*
1204          * Submit the repaired sectors.  For the zoned case, we cannot do the
1205          * repair in-place, but queue the block group to be relocated instead.
1206          */
1207         bitmap_andnot(&repaired, &errors.init_error_bitmap, &error,
1208                       stripe->nr_sectors);
1209         if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
1210                 if (btrfs_is_zoned(fs_info)) {
1211                         btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
1212                 } else {
1213                         scrub_write_sectors(sctx, stripe, repaired, false);
1214                         wait_scrub_stripe_io(stripe);
1215                 }
1216         }
1217
1218         scrub_stripe_report_errors(sctx, stripe, &errors);
1219         set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
1220         wake_up(&stripe->repair_wait);
1221 }
1222
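/*
 * Endio for the initial read of a stripe.  Record any IO errors in the
 * bitmaps and, once all pending bios of the stripe have completed, queue the
 * read-repair worker.
 */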
1223 static void scrub_read_endio(struct btrfs_bio *bbio)
1224 {
1225         struct scrub_stripe *stripe = bbio->private;
1226         struct bio_vec *bvec;
1227         int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
1228         int num_sectors;
1229         u32 bio_size = 0;
1230         int i;
1231
1232         ASSERT(sector_nr < stripe->nr_sectors);
1233         bio_for_each_bvec_all(bvec, &bbio->bio, i)
1234                 bio_size += bvec->bv_len;
1235         num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
1236
1237         if (bbio->bio.bi_status) {
1238                 scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
1239                 scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
1240         } else {
1241                 scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
1242         }
1243         bio_put(&bbio->bio);
1244         if (atomic_dec_and_test(&stripe->pending_io)) {
1245                 wake_up(&stripe->io_wait);
1246                 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
1247                 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
1248         }
1249 }
1250
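/*
 * Endio for repair/replace writes.  On failure, record the affected sectors in
 * the write error bitmap and bump the device write error statistics.
 */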
1251 static void scrub_write_endio(struct btrfs_bio *bbio)
1252 {
1253         struct scrub_stripe *stripe = bbio->private;
1254         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1255         struct bio_vec *bvec;
1256         int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
1257         u32 bio_size = 0;
1258         int i;
1259
1260         bio_for_each_bvec_all(bvec, &bbio->bio, i)
1261                 bio_size += bvec->bv_len;
1262
1263         if (bbio->bio.bi_status) {
1264                 unsigned long flags;
1265
1266                 spin_lock_irqsave(&stripe->write_error_lock, flags);
1267                 bitmap_set(&stripe->write_error_bitmap, sector_nr,
1268                            bio_size >> fs_info->sectorsize_bits);
1269                 spin_unlock_irqrestore(&stripe->write_error_lock, flags);
1270                 for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
1271                         btrfs_dev_stat_inc_and_print(stripe->dev,
1272                                                      BTRFS_DEV_STAT_WRITE_ERRS);
1273         }
1274         bio_put(&bbio->bio);
1275
1276         if (atomic_dec_and_test(&stripe->pending_io))
1277                 wake_up(&stripe->io_wait);
1278 }
1279
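/*
 * Submit one repair/replace write bio.  On zoned filesystems this also fills
 * the write pointer gap, waits for the write to finish (queue depth must be 1)
 * and advances the write pointer if the write succeeded.
 */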
1280 static void scrub_submit_write_bio(struct scrub_ctx *sctx,
1281                                    struct scrub_stripe *stripe,
1282                                    struct btrfs_bio *bbio, bool dev_replace)
1283 {
1284         struct btrfs_fs_info *fs_info = sctx->fs_info;
1285         u32 bio_len = bbio->bio.bi_iter.bi_size;
1286         u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
1287                       stripe->logical;
1288
1289         fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
1290         atomic_inc(&stripe->pending_io);
1291         btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
1292         if (!btrfs_is_zoned(fs_info))
1293                 return;
1294         /*
1295          * For zoned writeback, queue depth must be 1, thus we must wait for
1296          * the write to finish before the next write.
1297          */
1298         wait_scrub_stripe_io(stripe);
1299
1300         /*
1301          * We also need to update the write pointer if the write finished
1302          * successfully.
1303          */
1304         if (!test_bit(bio_off >> fs_info->sectorsize_bits,
1305                       &stripe->write_error_bitmap))
1306                 sctx->write_pointer += bio_len;
1307 }
1308
1309 /*
1310  * Submit the write bio(s) for the sectors specified by @write_bitmap.
1311  *
1312  * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
1313  *
1314  * - Only needs logical bytenr and mirror_num
1315  *   Just like the scrub read path
1316  *
1317  * - Would only result in writes to the specified mirror
1318  *   Unlike the regular writeback path, which would write back to all stripes
1319  *
1320  * - Handles dev-replace and read-repair writeback differently
1321  */
1322 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
1323                                 unsigned long write_bitmap, bool dev_replace)
1324 {
1325         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1326         struct btrfs_bio *bbio = NULL;
1327         int sector_nr;
1328
1329         for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
1330                 /* We should only write back sectors covered by an extent. */
1331                 ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));
1332
1333                 /* Cannot merge with previous sector, submit the current one. */
1334                 if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
1335                         scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
1336                         bbio = NULL;
1337                 }
1338                 if (!bbio) {
1339                         bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
1340                                                fs_info, scrub_write_endio, stripe);
1341                         bbio->bio.bi_iter.bi_sector = (stripe->logical +
1342                                 (sector_nr << fs_info->sectorsize_bits)) >>
1343                                 SECTOR_SHIFT;
1344                 }
1345                 scrub_bio_add_sector(bbio, stripe, sector_nr);
1346         }
1347         if (bbio)
1348                 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
1349 }
1350
1351 /*
1352  * Bandwidth-limit based throttling of IO submission; the timeslice is 1
1353  * second.  The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1354  */
1355 static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
1356                                   unsigned int bio_size)
1357 {
1358         const int time_slice = 1000;
1359         s64 delta;
1360         ktime_t now;
1361         u32 div;
1362         u64 bwlimit;
1363
1364         bwlimit = READ_ONCE(device->scrub_speed_max);
1365         if (bwlimit == 0)
1366                 return;
1367
1368         /*
1369          * The slice is divided into intervals as the IO is submitted; the
1370          * number of intervals scales with bwlimit, capped at 64 intervals.
1371          */
1372         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1373         div = min_t(u32, 64, div);
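        /*
         * Example with assumed numbers: for a scrub_speed_max of 100 MiB/s,
         * div = min(64, 100M / 16M) = 6, so each interval lasts about
         * 1000 / 6 ms and may send up to bwlimit / 6 (roughly 17 MiB)
         * before sleeping for the remainder of the interval.
         */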
1374
1375         /* Start new epoch, set deadline */
1376         now = ktime_get();
1377         if (sctx->throttle_deadline == 0) {
1378                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1379                 sctx->throttle_sent = 0;
1380         }
1381
1382         /* Still within the time slice? */
1383         if (ktime_before(now, sctx->throttle_deadline)) {
1384                 /* If current bio is within the limit, send it */
1385                 sctx->throttle_sent += bio_size;
1386                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1387                         return;
1388
1389                 /* We're over the limit, sleep for the rest of the slice. */
1390                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
1391         } else {
1392                 /* New request after deadline, start new epoch */
1393                 delta = 0;
1394         }
1395
1396         if (delta) {
1397                 long timeout;
1398
1399                 timeout = div_u64(delta * HZ, 1000);
1400                 schedule_timeout_interruptible(timeout);
1401         }
1402
1403         /* Next call will start the deadline period */
1404         sctx->throttle_deadline = 0;
1405 }
1406
1407 /*
1408  * Given a physical address, this will calculate its
1409  * logical offset. If this is a parity stripe, it will return
1410  * the leftmost data stripe's logical offset.
1411  *
1412  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
1413  */
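/*
 * Worked example (hypothetical RAID5 layout, 3 devices, 2 data stripes):
 * for device num = 0 at physical offset 64K into its dev extent,
 * last_offset = 64K * 2 = 128K.  Neither i = 0 nor i = 1 rotates back to
 * device 0, so the function returns 1 (parity) with *stripe_start = 128K,
 * i.e. the full stripe covering chunk-relative logical [128K, 256K) has its
 * parity on this device.
 */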
1414 static int get_raid56_logic_offset(u64 physical, int num,
1415                                    struct btrfs_chunk_map *map, u64 *offset,
1416                                    u64 *stripe_start)
1417 {
1418         int i;
1419         int j = 0;
1420         u64 last_offset;
1421         const int data_stripes = nr_data_stripes(map);
1422
1423         last_offset = (physical - map->stripes[num].physical) * data_stripes;
1424         if (stripe_start)
1425                 *stripe_start = last_offset;
1426
1427         *offset = last_offset;
1428         for (i = 0; i < data_stripes; i++) {
1429                 u32 stripe_nr;
1430                 u32 stripe_index;
1431                 u32 rot;
1432
1433                 *offset = last_offset + btrfs_stripe_nr_to_offset(i);
1434
1435                 stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
1436
1437                 /* Work out the disk rotation on this stripe-set */
1438                 rot = stripe_nr % map->num_stripes;
1439                 /* Calculate which stripe this data is located on. */
1440                 rot += i;
1441                 stripe_index = rot % map->num_stripes;
1442                 if (stripe_index == num)
1443                         return 0;
1444                 if (stripe_index < num)
1445                         j++;
1446         }
1447         *offset = last_offset + btrfs_stripe_nr_to_offset(j);
1448         return 1;
1449 }
1450
1451 /*
1452  * Return 0 if the extent item range covers any byte of the range.
1453  * Return <0 if the extent item is before @search_start.
1454  * Return >0 if the extent item is after @search_start + @search_len.
1455  */
1456 static int compare_extent_item_range(struct btrfs_path *path,
1457                                      u64 search_start, u64 search_len)
1458 {
1459         struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
1460         u64 len;
1461         struct btrfs_key key;
1462
1463         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1464         ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
1465                key.type == BTRFS_METADATA_ITEM_KEY);
1466         if (key.type == BTRFS_METADATA_ITEM_KEY)
1467                 len = fs_info->nodesize;
1468         else
1469                 len = key.offset;
1470
1471         if (key.objectid + len <= search_start)
1472                 return -1;
1473         if (key.objectid >= search_start + search_len)
1474                 return 1;
1475         return 0;
1476 }
1477
1478 /*
1479  * Locate one extent item which covers any byte in range
1480  * [@search_start, @search_start + @search_length)
1481  *
1482  * If the path is not initialized, we will initialize the search by doing
1483  * a btrfs_search_slot().
1484  * If the path is already initialized, we will use the path as the initial
1485  * slot, to avoid duplicated btrfs_search_slot() calls.
1486  *
1487  * NOTE: If an extent item starts before @search_start, we will still
1488  * return the extent item. This is for data extents crossing the stripe boundary.
1489  *
1490  * Return 0 if we found such extent item, and @path will point to the extent item.
1491  * Return >0 if no such extent item can be found, and @path will be released.
1492  * Return <0 if we hit a fatal error, and @path will be released.
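 *
 * Typical usage (see scrub_find_fill_first_stripe()): call this in a loop,
 * advancing @search_start past each returned extent, until >0 is returned.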
1493  */
1494 static int find_first_extent_item(struct btrfs_root *extent_root,
1495                                   struct btrfs_path *path,
1496                                   u64 search_start, u64 search_len)
1497 {
1498         struct btrfs_fs_info *fs_info = extent_root->fs_info;
1499         struct btrfs_key key;
1500         int ret;
1501
1502         /* Continue using the existing path */
1503         if (path->nodes[0])
1504                 goto search_forward;
1505
1506         key.objectid = search_start;
1507         if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1508                 key.type = BTRFS_METADATA_ITEM_KEY;
1509         else
1510                 key.type = BTRFS_EXTENT_ITEM_KEY;
1511         key.offset = (u64)-1;
1512
1513         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1514         if (ret < 0)
1515                 return ret;
1516         if (ret == 0) {
1517                 /*
1518                  * Key with offset -1 found, there would have to exist an extent
1519                  * item with such offset, but this is out of the valid range.
1520                  */
1521                 btrfs_release_path(path);
1522                 return -EUCLEAN;
1523         }
1524
1525         /*
1526          * Here we intentionally pass 0 as @min_objectid, as there could be
1527          * an extent item starting before @search_start.
1528          */
1529         ret = btrfs_previous_extent_item(extent_root, path, 0);
1530         if (ret < 0)
1531                 return ret;
1532         /*
1533          * No matter whether we have found an extent item, the next loop will
1534          * properly do every check on the key.
1535          */
1536 search_forward:
1537         while (true) {
1538                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1539                 if (key.objectid >= search_start + search_len)
1540                         break;
1541                 if (key.type != BTRFS_METADATA_ITEM_KEY &&
1542                     key.type != BTRFS_EXTENT_ITEM_KEY)
1543                         goto next;
1544
1545                 ret = compare_extent_item_range(path, search_start, search_len);
1546                 if (ret == 0)
1547                         return ret;
1548                 if (ret > 0)
1549                         break;
1550 next:
1551                 ret = btrfs_next_item(extent_root, path);
1552                 if (ret) {
1553                         /* Either no more items or a fatal error. */
1554                         btrfs_release_path(path);
1555                         return ret;
1556                 }
1557         }
1558         btrfs_release_path(path);
1559         return 1;
1560 }
1561
1562 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
1563                             u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
1564 {
1565         struct btrfs_key key;
1566         struct btrfs_extent_item *ei;
1567
1568         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1569         ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
1570                key.type == BTRFS_EXTENT_ITEM_KEY);
1571         *extent_start_ret = key.objectid;
1572         if (key.type == BTRFS_METADATA_ITEM_KEY)
1573                 *size_ret = path->nodes[0]->fs_info->nodesize;
1574         else
1575                 *size_ret = key.offset;
1576         ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
1577         *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
1578         *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
1579 }
1580
1581 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
1582                                         u64 physical, u64 physical_end)
1583 {
1584         struct btrfs_fs_info *fs_info = sctx->fs_info;
1585         int ret = 0;
1586
1587         if (!btrfs_is_zoned(fs_info))
1588                 return 0;
1589
1590         mutex_lock(&sctx->wr_lock);
1591         if (sctx->write_pointer < physical_end) {
1592                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
1593                                                     physical,
1594                                                     sctx->write_pointer);
1595                 if (ret)
1596                         btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
1597         }
1598         mutex_unlock(&sctx->wr_lock);
1599         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
1600
1601         return ret;
1602 }
1603
1604 static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
1605                                  struct scrub_stripe *stripe,
1606                                  u64 extent_start, u64 extent_len,
1607                                  u64 extent_flags, u64 extent_gen)
1608 {
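        /*
         * Walk the part of [extent_start, extent_start + extent_len) that
         * falls inside this stripe, marking each sector as having an extent
         * and, for tree blocks, recording the expected generation.
         */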
1609         for (u64 cur_logical = max(stripe->logical, extent_start);
1610              cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
1611                                extent_start + extent_len);
1612              cur_logical += fs_info->sectorsize) {
1613                 const int nr_sector = (cur_logical - stripe->logical) >>
1614                                       fs_info->sectorsize_bits;
1615                 struct scrub_sector_verification *sector =
1616                                                 &stripe->sectors[nr_sector];
1617
1618                 scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
1619                 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1620                         scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
1621                         sector->generation = extent_gen;
1622                 }
1623         }
1624 }
1625
1626 static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
1627 {
1628         ASSERT(stripe->nr_sectors);
1629         bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
1630 }
1631
1632 /*
1633  * Locate one stripe which has at least one extent in its range.
1634  *
1635  * Return 0 if found such stripe, and store its info into @stripe.
1636  * Return >0 if there is no such stripe in the specified range.
1637  * Return <0 for error.
1638  */
1639 static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
1640                                         struct btrfs_path *extent_path,
1641                                         struct btrfs_path *csum_path,
1642                                         struct btrfs_device *dev, u64 physical,
1643                                         int mirror_num, u64 logical_start,
1644                                         u32 logical_len,
1645                                         struct scrub_stripe *stripe)
1646 {
1647         struct btrfs_fs_info *fs_info = bg->fs_info;
1648         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
1649         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
1650         const u64 logical_end = logical_start + logical_len;
1651         u64 cur_logical = logical_start;
1652         u64 stripe_end;
1653         u64 extent_start;
1654         u64 extent_len;
1655         u64 extent_flags;
1656         u64 extent_gen;
1657         int ret;
1658
1659         if (unlikely(!extent_root || !csum_root)) {
1660                 btrfs_err(fs_info, "scrub: no valid extent or csum root found");
1661                 return -EUCLEAN;
1662         }
1663         memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
1664                                    stripe->nr_sectors);
1665         scrub_stripe_reset_bitmaps(stripe);
1666
1667         /* The range must be inside the bg. */
1668         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
1669
1670         ret = find_first_extent_item(extent_root, extent_path, logical_start,
1671                                      logical_len);
1672         /* Either error or not found. */
1673         if (ret)
1674                 goto out;
1675         get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
1676                         &extent_gen);
1677         if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1678                 stripe->nr_meta_extents++;
1679         if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
1680                 stripe->nr_data_extents++;
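        /*
         * The returned extent may start before @logical_start; make sure
         * cur_logical does not move backwards out of our search range.
         */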
1681         cur_logical = max(extent_start, cur_logical);
1682
1683         /*
1684          * Round down to stripe boundary.
1685          *
1686          * The extra calculation against bg->start is to handle block groups
1687          * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
1688          */
1689         stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
1690                           bg->start;
1691         stripe->physical = physical + stripe->logical - logical_start;
1692         stripe->dev = dev;
1693         stripe->bg = bg;
1694         stripe->mirror_num = mirror_num;
1695         stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
1696
1697         /* Fill the first extent info into stripe->sectors[] array. */
1698         fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
1699                              extent_flags, extent_gen);
1700         cur_logical = extent_start + extent_len;
1701
1702         /* Fill the extent info for the remaining sectors. */
1703         while (cur_logical <= stripe_end) {
1704                 ret = find_first_extent_item(extent_root, extent_path, cur_logical,
1705                                              stripe_end - cur_logical + 1);
1706                 if (ret < 0)
1707                         goto out;
1708                 if (ret > 0) {
1709                         ret = 0;
1710                         break;
1711                 }
1712                 get_extent_info(extent_path, &extent_start, &extent_len,
1713                                 &extent_flags, &extent_gen);
1714                 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1715                         stripe->nr_meta_extents++;
1716                 if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
1717                         stripe->nr_data_extents++;
1718                 fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
1719                                      extent_flags, extent_gen);
1720                 cur_logical = extent_start + extent_len;
1721         }
1722
1723         /* Now fill the data csum. */
1724         if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
1725                 int sector_nr;
1726                 unsigned long csum_bitmap = 0;
1727
1728                 /* Csum space should have already been allocated. */
1729                 ASSERT(stripe->csums);
1730
1731                 /*
1732                  * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
1733                  * should contain at most 16 sectors.
1734                  */
1735                 ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
1736
1737                 ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
1738                                                 stripe->logical, stripe_end,
1739                                                 stripe->csums, &csum_bitmap);
1740                 if (ret < 0)
1741                         goto out;
1742                 if (ret > 0)
1743                         ret = 0;
1744
1745                 for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
1746                         stripe->sectors[sector_nr].csum = stripe->csums +
1747                                 sector_nr * fs_info->csum_size;
1748                 }
1749         }
1750         set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
1751 out:
1752         return ret;
1753 }
1754
1755 static void scrub_reset_stripe(struct scrub_stripe *stripe)
1756 {
1757         scrub_stripe_reset_bitmaps(stripe);
1758
1759         stripe->nr_meta_extents = 0;
1760         stripe->nr_data_extents = 0;
1761         stripe->state = 0;
1762
1763         for (int i = 0; i < stripe->nr_sectors; i++) {
1764                 stripe->sectors[i].csum = NULL;
1765                 stripe->sectors[i].generation = 0;
1766         }
1767 }
1768
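/*
 * Return the length of the stripe that falls inside its block group; the
 * last stripe of a block group can be shorter than BTRFS_STRIPE_LEN.
 */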
1769 static u32 stripe_length(const struct scrub_stripe *stripe)
1770 {
1771         ASSERT(stripe->bg);
1772
1773         return min(BTRFS_STRIPE_LEN,
1774                    stripe->bg->start + stripe->bg->length - stripe->logical);
1775 }
1776
1777 static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
1778 {
1779         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1780         struct btrfs_bio *bbio = NULL;
1781         unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
1782         const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe);
1783         u64 stripe_len = BTRFS_STRIPE_LEN;
1784         int mirror = stripe->mirror_num;
1785         int i;
1786
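        /*
         * Hold an extra pending_io reference so the endio handler cannot see
         * the counter reach zero before all bios below are submitted; it is
         * dropped by the atomic_dec_and_test() at the end of this function.
         */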
1787         atomic_inc(&stripe->pending_io);
1788
1789         for_each_set_bit(i, &has_extent, stripe->nr_sectors) {
1790                 /* We're beyond the chunk boundary, no need to read anymore. */
1791                 if (i >= nr_sectors)
1792                         break;
1793
1794                 /* The current sector cannot be merged, submit the bio. */
1795                 if (bbio &&
1796                     ((i > 0 && !test_bit(i - 1, &has_extent)) ||
1797                      bbio->bio.bi_iter.bi_size >= stripe_len)) {
1798                         ASSERT(bbio->bio.bi_iter.bi_size);
1799                         atomic_inc(&stripe->pending_io);
1800                         btrfs_submit_bbio(bbio, mirror);
1801                         bbio = NULL;
1802                 }
1803
1804                 if (!bbio) {
1805                         struct btrfs_io_stripe io_stripe = {};
1806                         struct btrfs_io_context *bioc = NULL;
1807                         const u64 logical = stripe->logical +
1808                                             (i << fs_info->sectorsize_bits);
1809                         int err;
1810
1811                         io_stripe.rst_search_commit_root = true;
1812                         stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
1813                         /*
1814                          * For RST cases, we need to manually split the bbio to
1815                          * follow the RST boundary.
1816                          */
1817                         err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
1818                                               &stripe_len, &bioc, &io_stripe, &mirror);
1819                         btrfs_put_bioc(bioc);
1820                         if (err < 0) {
1821                                 if (err != -ENODATA) {
1822                                         /*
1823                                          * An -ENODATA from the earlier
1824                                          * btrfs_get_raid_extent_offset() means
1825                                          * there is no entry for this range in the
1826                                          * stripe tree.  If it is still in the
1827                                          * extent tree it is a preallocated extent,
1828                                          * not an error, so only flag other failures.
1829                                          */
1830                                         scrub_bitmap_set_bit_io_error(stripe, i);
1831                                         scrub_bitmap_set_bit_error(stripe, i);
1832                                 }
1833                                 continue;
1834                         }
1835
1836                         bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
1837                                                fs_info, scrub_read_endio, stripe);
1838                         bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
1839                 }
1840
1841                 scrub_bio_add_sector(bbio, stripe, i);
1842         }
1843
1844         if (bbio) {
1845                 ASSERT(bbio->bio.bi_iter.bi_size);
1846                 atomic_inc(&stripe->pending_io);
1847                 btrfs_submit_bbio(bbio, mirror);
1848         }
1849
1850         if (atomic_dec_and_test(&stripe->pending_io)) {
1851                 wake_up(&stripe->io_wait);
1852                 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
1853                 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
1854         }
1855 }
1856
1857 static void scrub_submit_initial_read(struct scrub_ctx *sctx,
1858                                       struct scrub_stripe *stripe)
1859 {
1860         struct btrfs_fs_info *fs_info = sctx->fs_info;
1861         struct btrfs_bio *bbio;
1862         unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
1863         int mirror = stripe->mirror_num;
1864
1865         ASSERT(stripe->bg);
1866         ASSERT(stripe->mirror_num > 0);
1867         ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
1868
1869         if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
1870                 scrub_submit_extent_sector_read(stripe);
1871                 return;
1872         }
1873
1874         bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
1875                                scrub_read_endio, stripe);
1876
1877         bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
1878         /* Read the whole range inside the chunk boundary. */
1879         for (unsigned int cur = 0; cur < nr_sectors; cur++)
1880                 scrub_bio_add_sector(bbio, stripe, cur);
1881         atomic_inc(&stripe->pending_io);
1882
1883         /*
1884          * For dev-replace, if either the user asks to avoid the source dev
1885          * or the device is missing, we try the next mirror instead.
1886          */
1887         if (sctx->is_dev_replace &&
1888             (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
1889              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
1890              !stripe->dev->bdev)) {
1891                 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
1892                                                   stripe->bg->length);
1893
1894                 mirror = calc_next_mirror(mirror, num_copies);
1895         }
1896         btrfs_submit_bbio(bbio, mirror);
1897 }
1898
1899 static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
1900 {
1901         const unsigned long error = scrub_bitmap_read_error(stripe);
1902         int i;
1903
1904         for_each_set_bit(i, &error, stripe->nr_sectors) {
1905                 if (scrub_bitmap_test_bit_is_metadata(stripe, i)) {
1906                         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1907
1908                         btrfs_err(fs_info,
1909                     "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
1910                                   stripe->logical,
1911                                   stripe->logical + (i << fs_info->sectorsize_bits));
1912                         return true;
1913                 }
1914         }
1915         return false;
1916 }
1917
1918 static void submit_initial_group_read(struct scrub_ctx *sctx,
1919                                       unsigned int first_slot,
1920                                       unsigned int nr_stripes)
1921 {
1922         struct blk_plug plug;
1923
1924         ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
1925         ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);
1926
1927         scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
1928                               btrfs_stripe_nr_to_offset(nr_stripes));
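        /*
         * Plug the block layer while queueing the whole group so the read
         * bios can be batched and merged before submission.
         */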
1929         blk_start_plug(&plug);
1930         for (int i = 0; i < nr_stripes; i++) {
1931                 struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];
1932
1933                 /* Those stripes should be initialized. */
1934                 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
1935                 scrub_submit_initial_read(sctx, stripe);
1936         }
1937         blk_finish_plug(&plug);
1938 }
1939
1940 static int flush_scrub_stripes(struct scrub_ctx *sctx)
1941 {
1942         struct btrfs_fs_info *fs_info = sctx->fs_info;
1943         struct scrub_stripe *stripe;
1944         const int nr_stripes = sctx->cur_stripe;
1945         int ret = 0;
1946
1947         if (!nr_stripes)
1948                 return 0;
1949
1950         ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
1951
1952         /* Submit the stripes which are populated but not submitted. */
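        /*
         * For example, with a group size of 8 and cur_stripe at 20, groups
         * [0, 8) and [8, 16) were already submitted as they got filled, so
         * only stripes [16, 20) are submitted here.
         */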
1953         if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
1954                 const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);
1955
1956                 submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
1957         }
1958
1959         for (int i = 0; i < nr_stripes; i++) {
1960                 stripe = &sctx->stripes[i];
1961
1962                 wait_event(stripe->repair_wait,
1963                            test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
1964         }
1965
1966         /* Submit for dev-replace. */
1967         if (sctx->is_dev_replace) {
1968                 /*
1969                  * For dev-replace, if we know there is something wrong with
1970                  * metadata, we should immediately abort.
1971                  */
1972                 for (int i = 0; i < nr_stripes; i++) {
1973                         if (stripe_has_metadata_error(&sctx->stripes[i])) {
1974                                 ret = -EIO;
1975                                 goto out;
1976                         }
1977                 }
1978                 for (int i = 0; i < nr_stripes; i++) {
1979                         unsigned long good;
1980                         unsigned long has_extent;
1981                         unsigned long error;
1982
1983                         stripe = &sctx->stripes[i];
1984
1985                         ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
1986
1987                         has_extent = scrub_bitmap_read_has_extent(stripe);
1988                         error = scrub_bitmap_read_error(stripe);
1989                         bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
1990                         scrub_write_sectors(sctx, stripe, good, true);
1991                 }
1992         }
1993
1994         /* Wait for the above writebacks to finish. */
1995         for (int i = 0; i < nr_stripes; i++) {
1996                 stripe = &sctx->stripes[i];
1997
1998                 wait_scrub_stripe_io(stripe);
1999                 spin_lock(&sctx->stat_lock);
2000                 sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
2001                 spin_unlock(&sctx->stat_lock);
2002                 scrub_reset_stripe(stripe);
2003         }
2004 out:
2005         sctx->cur_stripe = 0;
2006         return ret;
2007 }
2008
2009 static void raid56_scrub_wait_endio(struct bio *bio)
2010 {
2011         complete(bio->bi_private);
2012 }
2013
2014 static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
2015                               struct btrfs_device *dev, int mirror_num,
2016                               u64 logical, u32 length, u64 physical,
2017                               u64 *found_logical_ret)
2018 {
2019         struct scrub_stripe *stripe;
2020         int ret;
2021
2022         /*
2023          * There should always be one slot left, as the caller filling the
2024          * last slot should have flushed them all.
2025          */
2026         ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
2027
2028         /* @found_logical_ret must be specified. */
2029         ASSERT(found_logical_ret);
2030
2031         stripe = &sctx->stripes[sctx->cur_stripe];
2032         scrub_reset_stripe(stripe);
2033         ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
2034                                            &sctx->csum_path, dev, physical,
2035                                            mirror_num, logical, length, stripe);
2036         /* Either >0 as no more extents or <0 for error. */
2037         if (ret)
2038                 return ret;
2039         *found_logical_ret = stripe->logical;
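        /*
         * Report back where the stripe actually starts, so the caller can
         * advance by BTRFS_STRIPE_LEN from there (see scrub_simple_mirror()).
         */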
2040         sctx->cur_stripe++;
2041
2042         /* We filled one group, submit it. */
2043         if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
2044                 const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;
2045
2046                 submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
2047         }
2048
2049         /* Last slot used, flush them all. */
2050         if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
2051                 return flush_scrub_stripes(sctx);
2052         return 0;
2053 }
2054
2055 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
2056                                       struct btrfs_device *scrub_dev,
2057                                       struct btrfs_block_group *bg,
2058                                       struct btrfs_chunk_map *map,
2059                                       u64 full_stripe_start)
2060 {
2061         DECLARE_COMPLETION_ONSTACK(io_done);
2062         struct btrfs_fs_info *fs_info = sctx->fs_info;
2063         struct btrfs_raid_bio *rbio;
2064         struct btrfs_io_context *bioc = NULL;
2065         struct btrfs_path extent_path = { 0 };
2066         struct btrfs_path csum_path = { 0 };
2067         struct bio *bio;
2068         struct scrub_stripe *stripe;
2069         bool all_empty = true;
2070         const int data_stripes = nr_data_stripes(map);
2071         unsigned long extent_bitmap = 0;
2072         u64 length = btrfs_stripe_nr_to_offset(data_stripes);
2073         int ret;
2074
2075         ASSERT(sctx->raid56_data_stripes);
2076
2077         /*
2078          * For data stripe search, we cannot reuse the same extent/csum paths,
2079          * as the data stripe bytenr may be smaller than that of the previous
2080          * extent.  Thus we have to use our own extent/csum paths.
2080          * we have to use our own extent/csum paths.
2081          */
2082         extent_path.search_commit_root = 1;
2083         extent_path.skip_locking = 1;
2084         csum_path.search_commit_root = 1;
2085         csum_path.skip_locking = 1;
2086
2087         for (int i = 0; i < data_stripes; i++) {
2088                 int stripe_index;
2089                 int rot;
2090                 u64 physical;
2091
2092                 stripe = &sctx->raid56_data_stripes[i];
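                /*
                 * @rot is the index of this full stripe inside the chunk;
                 * data stripe i then lives on device (i + rot) % num_stripes,
                 * in the rot-th BTRFS_STRIPE_LEN slot of that dev extent.
                 */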
2093                 rot = div_u64(full_stripe_start - bg->start,
2094                               data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
2095                 stripe_index = (i + rot) % map->num_stripes;
2096                 physical = map->stripes[stripe_index].physical +
2097                            btrfs_stripe_nr_to_offset(rot);
2098
2099                 scrub_reset_stripe(stripe);
2100                 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
2101                 ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
2102                                 map->stripes[stripe_index].dev, physical, 1,
2103                                 full_stripe_start + btrfs_stripe_nr_to_offset(i),
2104                                 BTRFS_STRIPE_LEN, stripe);
2105                 if (ret < 0)
2106                         goto out;
2107                 /*
2108                  * No extent in this data stripe, so manually mark it
2109                  * initialized to make later read submission happy.
2110                  */
2111                 if (ret > 0) {
2112                         stripe->logical = full_stripe_start +
2113                                           btrfs_stripe_nr_to_offset(i);
2114                         stripe->dev = map->stripes[stripe_index].dev;
2115                         stripe->mirror_num = 1;
2116                         set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
2117                 }
2118         }
2119
2120         /* Check if all data stripes are empty. */
2121         for (int i = 0; i < data_stripes; i++) {
2122                 stripe = &sctx->raid56_data_stripes[i];
2123                 if (!scrub_bitmap_empty_has_extent(stripe)) {
2124                         all_empty = false;
2125                         break;
2126                 }
2127         }
2128         if (all_empty) {
2129                 ret = 0;
2130                 goto out;
2131         }
2132
2133         for (int i = 0; i < data_stripes; i++) {
2134                 stripe = &sctx->raid56_data_stripes[i];
2135                 scrub_submit_initial_read(sctx, stripe);
2136         }
2137         for (int i = 0; i < data_stripes; i++) {
2138                 stripe = &sctx->raid56_data_stripes[i];
2139
2140                 wait_event(stripe->repair_wait,
2141                            test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
2142         }
2143         /* For now, no zoned support for RAID56. */
2144         ASSERT(!btrfs_is_zoned(sctx->fs_info));
2145
2146         /*
2147          * Now all data stripes are properly verified. Check if we have any
2148                  * unrepaired sectors; if so, abort immediately or we could further
2149                  * corrupt the P/Q stripes.
2150          *
2151          * During the loop, also populate extent_bitmap.
2152          */
2153         for (int i = 0; i < data_stripes; i++) {
2154                 unsigned long error;
2155                 unsigned long has_extent;
2156
2157                 stripe = &sctx->raid56_data_stripes[i];
2158
2159                 error = scrub_bitmap_read_error(stripe);
2160                 has_extent = scrub_bitmap_read_has_extent(stripe);
2161
2162                 /*
2163                  * Only check the errors where there is an extent, as we may
2164                  * hit an empty data stripe on a missing device.
2165                  */
2166                 bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
2167                 if (!bitmap_empty(&error, stripe->nr_sectors)) {
2168                         btrfs_err(fs_info,
2169 "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
2170                                   full_stripe_start, i, stripe->nr_sectors,
2171                                   &error);
2172                         ret = -EIO;
2173                         goto out;
2174                 }
2175                 bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent,
2176                           stripe->nr_sectors);
2177         }
2178
2179         /* Now we can check and regenerate the P/Q stripe. */
2180         bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
2181         bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
2182         bio->bi_private = &io_done;
2183         bio->bi_end_io = raid56_scrub_wait_endio;
2184
2185         btrfs_bio_counter_inc_blocked(fs_info);
2186         ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
2187                               &length, &bioc, NULL, NULL);
2188         if (ret < 0) {
2189                 btrfs_put_bioc(bioc);
2190                 btrfs_bio_counter_dec(fs_info);
2191                 goto out;
2192         }
2193         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
2194                                 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
2195         btrfs_put_bioc(bioc);
2196         if (!rbio) {
2197                 ret = -ENOMEM;
2198                 btrfs_bio_counter_dec(fs_info);
2199                 goto out;
2200         }
2201         /* Use the recovered stripes as a cache to avoid reading them from disk again. */
2202         for (int i = 0; i < data_stripes; i++) {
2203                 stripe = &sctx->raid56_data_stripes[i];
2204
2205                 raid56_parity_cache_data_pages(rbio, stripe->pages,
2206                                 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
2207         }
2208         raid56_parity_submit_scrub_rbio(rbio);
2209         wait_for_completion_io(&io_done);
2210         ret = blk_status_to_errno(bio->bi_status);
2211         bio_put(bio);
2212         btrfs_bio_counter_dec(fs_info);
2213
2214         btrfs_release_path(&extent_path);
2215         btrfs_release_path(&csum_path);
2216 out:
2217         return ret;
2218 }
2219
2220 /*
2221  * Scrub one range which can only have a simple mirror based profile
2222  * (including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
2223  *  RAID0/RAID10).
2224  *
2225  * Since we may need to handle a subset of a block group, we need the
2226  * @logical_start and @logical_length parameters.
2227  */
2228 static int scrub_simple_mirror(struct scrub_ctx *sctx,
2229                                struct btrfs_block_group *bg,
2230                                u64 logical_start, u64 logical_length,
2231                                struct btrfs_device *device,
2232                                u64 physical, int mirror_num)
2233 {
2234         struct btrfs_fs_info *fs_info = sctx->fs_info;
2235         const u64 logical_end = logical_start + logical_length;
2236         u64 cur_logical = logical_start;
2237         int ret = 0;
2238
2239         /* The range must be inside the bg */
2240         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
2241
2242         /* Go through each extent item inside the logical range. */
2243         while (cur_logical < logical_end) {
2244                 u64 found_logical = U64_MAX;
2245                 u64 cur_physical = physical + cur_logical - logical_start;
2246
2247                 /* Canceled? */
2248                 if (atomic_read(&fs_info->scrub_cancel_req) ||
2249                     atomic_read(&sctx->cancel_req)) {
2250                         ret = -ECANCELED;
2251                         break;
2252                 }
2253                 /* Paused? */
2254                 if (atomic_read(&fs_info->scrub_pause_req)) {
2255                         /* Push queued extents */
2256                         scrub_blocked_if_needed(fs_info);
2257                 }
2258                 /* Block group removed? */
2259                 spin_lock(&bg->lock);
2260                 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
2261                         spin_unlock(&bg->lock);
2262                         ret = 0;
2263                         break;
2264                 }
2265                 spin_unlock(&bg->lock);
2266
2267                 ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
2268                                          cur_logical, logical_end - cur_logical,
2269                                          cur_physical, &found_logical);
2270                 if (ret > 0) {
2271                         /* No more extents, just update the accounting. */
2272                         spin_lock(&sctx->stat_lock);
2273                         sctx->stat.last_physical = physical + logical_length;
2274                         spin_unlock(&sctx->stat_lock);
2275                         ret = 0;
2276                         break;
2277                 }
2278                 if (ret < 0)
2279                         break;
2280
2281                 /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
2282                 ASSERT(found_logical != U64_MAX);
2283                 cur_logical = found_logical + BTRFS_STRIPE_LEN;
2284
2285                 /* Don't hold the CPU for too long. */
2286                 cond_resched();
2287         }
2288         return ret;
2289 }
2290
2291 /* Calculate the full stripe length for simple stripe based profiles */
2292 static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
2293 {
2294         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2295                             BTRFS_BLOCK_GROUP_RAID10));
2296
2297         return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
2298 }
2299
2300 /* Get the logical bytenr for the stripe */
2301 static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
2302                                      struct btrfs_block_group *bg,
2303                                      int stripe_index)
2304 {
2305         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2306                             BTRFS_BLOCK_GROUP_RAID10));
2307         ASSERT(stripe_index < map->num_stripes);
2308
2309         /*
2310          * (stripe_index / sub_stripes) gives how many data stripes we need to
2311          * skip.
2312          */
2313         return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
2314                bg->start;
2315 }
2316
2317 /* Get the mirror number for the stripe */
2318 static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
2319 {
2320         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2321                             BTRFS_BLOCK_GROUP_RAID10));
2322         ASSERT(stripe_index < map->num_stripes);
2323
2324         /* For RAID0 it's fixed to 1; for RAID10 the result alternates 1,2,1,2,... */
2325         return stripe_index % map->sub_stripes + 1;
2326 }
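/*
 * Example of the three helpers above, with an assumed RAID10 geometry of
 * num_stripes = 4 and sub_stripes = 2: the full stripe length is
 * 2 * BTRFS_STRIPE_LEN, stripes 0 and 1 are the two copies of
 * [bg->start, bg->start + BTRFS_STRIPE_LEN) with mirror_num 1 and 2, and
 * stripes 2 and 3 are the copies of the following BTRFS_STRIPE_LEN range.
 */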
2327
2328 static int scrub_simple_stripe(struct scrub_ctx *sctx,
2329                                struct btrfs_block_group *bg,
2330                                struct btrfs_chunk_map *map,
2331                                struct btrfs_device *device,
2332                                int stripe_index)
2333 {
2334         const u64 logical_increment = simple_stripe_full_stripe_len(map);
2335         const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
2336         const u64 orig_physical = map->stripes[stripe_index].physical;
2337         const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
2338         u64 cur_logical = orig_logical;
2339         u64 cur_physical = orig_physical;
2340         int ret = 0;
2341
2342         while (cur_logical < bg->start + bg->length) {
2343                 /*
2344                  * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
2345                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
2346                  * this stripe.
2347                  */
2348                 ret = scrub_simple_mirror(sctx, bg, cur_logical,
2349                                           BTRFS_STRIPE_LEN, device, cur_physical,
2350                                           mirror_num);
2351                 if (ret)
2352                         return ret;
2353                 /* Skip to next stripe which belongs to the target device */
2354                 cur_logical += logical_increment;
2355                 /* For physical offset, we just go to next stripe */
2356                 cur_physical += BTRFS_STRIPE_LEN;
2357         }
2358         return ret;
2359 }
2360
2361 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2362                                            struct btrfs_block_group *bg,
2363                                            struct btrfs_chunk_map *map,
2364                                            struct btrfs_device *scrub_dev,
2365                                            int stripe_index)
2366 {
2367         struct btrfs_fs_info *fs_info = sctx->fs_info;
2368         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
2369         const u64 chunk_logical = bg->start;
2370         int ret;
2371         int ret2;
2372         u64 physical = map->stripes[stripe_index].physical;
2373         const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
2374         const u64 physical_end = physical + dev_stripe_len;
2375         u64 logical;
2376         u64 logic_end;
2377         /* The logical increment after finishing one stripe */
2378         u64 increment;
2379         /* Offset inside the chunk */
2380         u64 offset;
2381         u64 stripe_logical;
2382
2383         /* Extent_path should be released by now. */
2384         ASSERT(sctx->extent_path.nodes[0] == NULL);
2385
2386         scrub_blocked_if_needed(fs_info);
2387
2388         if (sctx->is_dev_replace &&
2389             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
2390                 mutex_lock(&sctx->wr_lock);
2391                 sctx->write_pointer = physical;
2392                 mutex_unlock(&sctx->wr_lock);
2393         }
2394
2395         /* Prepare the extra data stripes used by RAID56. */
2396         if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
2397                 ASSERT(sctx->raid56_data_stripes == NULL);
2398
2399                 sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
2400                                                     sizeof(struct scrub_stripe),
2401                                                     GFP_KERNEL);
2402                 if (!sctx->raid56_data_stripes) {
2403                         ret = -ENOMEM;
2404                         goto out;
2405                 }
2406                 for (int i = 0; i < nr_data_stripes(map); i++) {
2407                         ret = init_scrub_stripe(fs_info,
2408                                                 &sctx->raid56_data_stripes[i]);
2409                         if (ret < 0)
2410                                 goto out;
2411                         sctx->raid56_data_stripes[i].bg = bg;
2412                         sctx->raid56_data_stripes[i].sctx = sctx;
2413                 }
2414         }
2415         /*
2416          * There used to be a big double loop to handle all profiles using the
2417          * same routine, which grew larger and uglier over time.
2418          *
2419          * So here we handle each profile differently, so that simpler
2420          * profiles have a simpler scrubbing function.
2421          */
2422         if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
2423                          BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2424                 /*
2425                  * The above check rules out all complex profiles; the remaining
2426                  * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
2427                  * mirrored duplication without striping.
2428                  *
2429                  * Only @physical and @mirror_num need to be calculated using
2430                  * @stripe_index.
2431                  */
2432                 ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
2433                                 scrub_dev, map->stripes[stripe_index].physical,
2434                                 stripe_index + 1);
2435                 offset = 0;
2436                 goto out;
2437         }
2438         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
2439                 ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
2440                 offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
2441                 goto out;
2442         }
2443
2444         /* Only RAID56 goes through the old code */
2445         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
2446         ret = 0;
2447
2448         /* Calculate the logical end of the stripe */
2449         get_raid56_logic_offset(physical_end, stripe_index,
2450                                 map, &logic_end, NULL);
2451         logic_end += chunk_logical;
2452
2453         /* Initialize @offset in case we need to go to out: label */
2454         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
2455         increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
2456
2457         /*
2458          * Due to the rotation, for RAID56 it's better to iterate each stripe
2459          * using its physical offset.
2460          */
2461         while (physical < physical_end) {
2462                 ret = get_raid56_logic_offset(physical, stripe_index, map,
2463                                               &logical, &stripe_logical);
2464                 logical += chunk_logical;
2465                 if (ret) {
2466                         /* It is a parity stripe. */
2467                         stripe_logical += chunk_logical;
2468                         ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
2469                                                          map, stripe_logical);
2470                         spin_lock(&sctx->stat_lock);
2471                         sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
2472                                                        physical_end);
2473                         spin_unlock(&sctx->stat_lock);
2474                         if (ret)
2475                                 goto out;
2476                         goto next;
2477                 }
2478
2479                 /*
2480                  * Now we're at a data stripe, scrub each extent in the range.
2481                  *
2482                  * At this stage, if we ignore the repair part, inside each data
2483                  * stripe it is no different from the SINGLE profile.
2484                  * We can reuse scrub_simple_mirror() here, as the repair part
2485                  * is still based on @mirror_num.
2486                  */
2487                 ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
2488                                           scrub_dev, physical, 1);
2489                 if (ret < 0)
2490                         goto out;
2491 next:
2492                 logical += increment;
2493                 physical += BTRFS_STRIPE_LEN;
2494                 spin_lock(&sctx->stat_lock);
2495                 sctx->stat.last_physical = physical;
2496                 spin_unlock(&sctx->stat_lock);
2497         }
2498 out:
2499         ret2 = flush_scrub_stripes(sctx);
2500         if (!ret)
2501                 ret = ret2;
2502         btrfs_release_path(&sctx->extent_path);
2503         btrfs_release_path(&sctx->csum_path);
2504
2505         if (sctx->raid56_data_stripes) {
2506                 for (int i = 0; i < nr_data_stripes(map); i++)
2507                         release_scrub_stripe(&sctx->raid56_data_stripes[i]);
2508                 kfree(sctx->raid56_data_stripes);
2509                 sctx->raid56_data_stripes = NULL;
2510         }
2511
2512         if (sctx->is_dev_replace && ret >= 0) {
2513                 int ret2;
2514
2515                 ret2 = sync_write_pointer_for_zoned(sctx,
2516                                 chunk_logical + offset,
2517                                 map->stripes[stripe_index].physical,
2518                                 physical_end);
2519                 if (ret2)
2520                         ret = ret2;
2521         }
2522
2523         return ret < 0 ? ret : 0;
2524 }
2525
2526 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2527                                           struct btrfs_block_group *bg,
2528                                           struct btrfs_device *scrub_dev,
2529                                           u64 dev_offset,
2530                                           u64 dev_extent_len)
2531 {
2532         struct btrfs_fs_info *fs_info = sctx->fs_info;
2533         struct btrfs_chunk_map *map;
2534         int i;
2535         int ret = 0;
2536
2537         map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
2538         if (!map) {
2539                 /*
2540                  * Might have been an unused block group deleted by the cleaner
2541                  * kthread or relocation.
2542                  */
2543                 spin_lock(&bg->lock);
2544                 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
2545                         ret = -EINVAL;
2546                 spin_unlock(&bg->lock);
2547
2548                 return ret;
2549         }
2550         if (map->start != bg->start)
2551                 goto out;
2552         if (map->chunk_len < dev_extent_len)
2553                 goto out;
2554
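             /*
              * Find the stripe(s) of this chunk that sit on the device being
              * scrubbed at the given physical offset, and scrub them.
              */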
2555         for (i = 0; i < map->num_stripes; ++i) {
2556                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2557                     map->stripes[i].physical == dev_offset) {
2558                         ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
2559                         if (ret)
2560                                 goto out;
2561                 }
2562         }
2563 out:
2564         btrfs_free_chunk_map(map);
2565
2566         return ret;
2567 }
2568
2569 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
2570                                           struct btrfs_block_group *cache)
2571 {
2572         struct btrfs_fs_info *fs_info = cache->fs_info;
2573
2574         if (!btrfs_is_zoned(fs_info))
2575                 return 0;
2576
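             /*
              * Wait for any in-flight writes into this block group and commit
              * the transaction, so its content is stable before the zoned
              * dev-replace copies it.
              */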
2577         btrfs_wait_block_group_reservations(cache);
2578         btrfs_wait_nocow_writers(cache);
2579         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
2580
2581         return btrfs_commit_current_transaction(root);
2582 }
2583
2584 static noinline_for_stack
2585 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2586                            struct btrfs_device *scrub_dev, u64 start, u64 end)
2587 {
2588         struct btrfs_dev_extent *dev_extent = NULL;
2589         struct btrfs_path *path;
2590         struct btrfs_fs_info *fs_info = sctx->fs_info;
2591         struct btrfs_root *root = fs_info->dev_root;
2592         u64 chunk_offset;
2593         int ret = 0;
2594         int ro_set;
2595         int slot;
2596         struct extent_buffer *l;
2597         struct btrfs_key key;
2598         struct btrfs_key found_key;
2599         struct btrfs_block_group *cache;
2600         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2601
2602         path = btrfs_alloc_path();
2603         if (!path)
2604                 return -ENOMEM;
2605
2606         path->reada = READA_FORWARD;
2607         path->search_commit_root = 1;
2608         path->skip_locking = 1;
2609
2610         key.objectid = scrub_dev->devid;
2611         key.type = BTRFS_DEV_EXTENT_KEY;
2612         key.offset = 0ull;
2613
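             /*
              * Walk every DEV_EXTENT item of the device (from the commit root)
              * and scrub the block group each extent belongs to.
              */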
2614         while (1) {
2615                 u64 dev_extent_len;
2616
2617                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2618                 if (ret < 0)
2619                         break;
2620                 if (ret > 0) {
2621                         if (path->slots[0] >=
2622                             btrfs_header_nritems(path->nodes[0])) {
2623                                 ret = btrfs_next_leaf(root, path);
2624                                 if (ret < 0)
2625                                         break;
2626                                 if (ret > 0) {
2627                                         ret = 0;
2628                                         break;
2629                                 }
2630                         } else {
2631                                 ret = 0;
2632                         }
2633                 }
2634
2635                 l = path->nodes[0];
2636                 slot = path->slots[0];
2637
2638                 btrfs_item_key_to_cpu(l, &found_key, slot);
2639
2640                 if (found_key.objectid != scrub_dev->devid)
2641                         break;
2642
2643                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2644                         break;
2645
2646                 if (found_key.offset >= end)
2647                         break;
2648
2649                 if (found_key.offset < key.offset)
2650                         break;
2651
2652                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2653                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
2654
2655                 if (found_key.offset + dev_extent_len <= start)
2656                         goto skip;
2657
2658                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2659
2660                 /*
2661                  * Get a reference on the corresponding block group to prevent
2662                  * the chunk from going away while we scrub it.
2663                  */
2664                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2665
2666                 /* Some chunks are removed but not committed to disk yet,
2667                  * continue scrubbing. */
2668                 if (!cache)
2669                         goto skip;
2670
2671                 ASSERT(cache->start <= chunk_offset);
2672                 /*
2673                  * We are using the commit root to search for device extents, so
2674                  * that means we could have found a device extent item from a
2675                  * block group that was deleted in the current transaction. The
2676                  * logical start offset of the deleted block group, stored at
2677                  * @chunk_offset, might be part of the logical address range of
2678                  * a new block group (which uses different physical extents).
2679                  * In this case btrfs_lookup_block_group() has returned the new
2680                  * block group, and its start address is less than @chunk_offset.
2681                  *
2682                  * We skip such new block groups, because it's pointless to
2683                  * process them, as we won't find their extents because we search
2684                  * for them using the commit root of the extent tree. For a device
2685                  * replace it's also fine to skip it, we won't miss copying them
2686                  * to the target device because we have the write duplication
2687                  * setup through the regular write path (by btrfs_map_block()),
2688                  * and we have committed a transaction when we started the device
2689                  * replace, right after setting up the device replace state.
2690                  */
2691                 if (cache->start < chunk_offset) {
2692                         btrfs_put_block_group(cache);
2693                         goto skip;
2694                 }
2695
2696                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
2697                         if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
2698                                 btrfs_put_block_group(cache);
2699                                 goto skip;
2700                         }
2701                 }
2702
2703                 /*
2704                  * Make sure that while we are scrubbing the corresponding block
2705                  * group doesn't get its logical address and its device extents
2706                  * reused for another block group, which can possibly be of a
2707                  * different type and different profile. We do this to prevent
2708                  * false error detections and crashes due to bogus attempts to
2709                  * repair extents.
2710                  */
2711                 spin_lock(&cache->lock);
2712                 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
2713                         spin_unlock(&cache->lock);
2714                         btrfs_put_block_group(cache);
2715                         goto skip;
2716                 }
2717                 btrfs_freeze_block_group(cache);
2718                 spin_unlock(&cache->lock);
2719
2720                 /*
2721                  * We need to call btrfs_inc_block_group_ro() with the scrub paused,
2722                  * to avoid a deadlock caused by:
2723                  * btrfs_inc_block_group_ro()
2724                  * -> btrfs_wait_for_commit()
2725                  * -> btrfs_commit_transaction()
2726                  * -> btrfs_scrub_pause()
2727                  */
2728                 scrub_pause_on(fs_info);
2729
2730                 /*
2731                  * Don't do chunk preallocation for scrub.
2732                  *
2733                  * This is especially important for SYSTEM bgs, or we can hit
2734                  * -EFBIG from btrfs_finish_chunk_alloc() like:
2735                  * 1. The only SYSTEM bg is marked RO.
2736                  *    Since the SYSTEM bg is small, that's pretty common.
2737                  * 2. A new SYSTEM bg will be allocated
2738                  *    Because the regular version (with chunk preallocation) would allocate a new chunk.
2739                  * 3. The new SYSTEM bg is empty and will get cleaned up
2740                  *    Before the cleanup really happens, it's marked RO again.
2741                  * 4. The empty SYSTEM bg gets scrubbed
2742                  *    We go back to step 2.
2743                  *
2744                  * This can easily boost the number of SYSTEM chunks if the
2745                  * cleaner thread can't be triggered fast enough, and use up all
2746                  * the space of btrfs_super_block::sys_chunk_array.
2747                  *
2748                  * For dev-replace however, we need to try our best to mark the
2749                  * block group RO, to prevent a race between:
2750                  * - Write duplication
2751                  *   Contains the latest data
2752                  * - Scrub copy
2753                  *   Contains data from the commit tree
2754                  *
2755                  * If the target block group is not marked RO, nocow writes can
2756                  * be overwritten by the scrub copy, causing data corruption.
2757                  * So for dev-replace, it's not allowed to continue if a block
2758                  * group is not RO.
2759                  */
2760                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
2761                 if (!ret && sctx->is_dev_replace) {
2762                         ret = finish_extent_writes_for_zoned(root, cache);
2763                         if (ret) {
2764                                 btrfs_dec_block_group_ro(cache);
2765                                 scrub_pause_off(fs_info);
2766                                 btrfs_put_block_group(cache);
2767                                 break;
2768                         }
2769                 }
2770
2771                 if (ret == 0) {
2772                         ro_set = 1;
2773                 } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
2774                            !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
2775                         /*
2776                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
2777                          * fails to create a new chunk for metadata.
2778                          * It is not a problem for scrub, because metadata
2779                          * is always COWed, and our scrub has paused
2780                          * transaction commits.
2781                          *
2782                          * For RAID56 chunks, we have to mark them read-only
2783                          * for scrub, as later we would use our own cache
2784                          * outside the RAID56 realm.
2785                          * Thus we want the RAID56 bg to be marked RO to
2786                          * prevent RMW from screwing up our cache.
2787                          */
2788                         ro_set = 0;
2789                 } else if (ret == -ETXTBSY) {
2790                         btrfs_warn(fs_info,
2791              "scrub: skipping scrub of block group %llu due to active swapfile",
2792                                    cache->start);
2793                         scrub_pause_off(fs_info);
2794                         ret = 0;
2795                         goto skip_unfreeze;
2796                 } else {
2797                         btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
2798                                    ret);
2799                         btrfs_unfreeze_block_group(cache);
2800                         btrfs_put_block_group(cache);
2801                         scrub_pause_off(fs_info);
2802                         break;
2803                 }
2804
2805                 /*
2806                  * Now the target block group is marked RO, wait for nocow writes to
2807                  * finish before dev-replace.
2808                  * COW is fine, as COW never overwrites extents in the commit tree.
2809                  */
2810                 if (sctx->is_dev_replace) {
2811                         btrfs_wait_nocow_writers(cache);
2812                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
2813                 }
2814
2815                 scrub_pause_off(fs_info);
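                     /*
                      * Record the device extent range being processed in the
                      * dev-replace cursors and request the dev-replace item to
                      * be written back.
                      */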
2816                 down_write(&dev_replace->rwsem);
2817                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
2818                 dev_replace->cursor_left = found_key.offset;
2819                 dev_replace->item_needs_writeback = 1;
2820                 up_write(&dev_replace->rwsem);
2821
2822                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
2823                                   dev_extent_len);
2824                 if (sctx->is_dev_replace &&
2825                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
2826                                                       cache, found_key.offset))
2827                         ro_set = 0;
2828
2829                 down_write(&dev_replace->rwsem);
2830                 dev_replace->cursor_left = dev_replace->cursor_right;
2831                 dev_replace->item_needs_writeback = 1;
2832                 up_write(&dev_replace->rwsem);
2833
2834                 if (ro_set)
2835                         btrfs_dec_block_group_ro(cache);
2836
2837                 /*
2838                  * We might have prevented the cleaner kthread from deleting
2839                  * this block group if it was already unused because we raced
2840                  * and set it to RO mode first. So add it back to the unused
2841                  * list, otherwise it might not ever be deleted unless a manual
2842                  * balance is triggered or it becomes used and unused again.
2843                  */
2844                 spin_lock(&cache->lock);
2845                 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
2846                     !cache->ro && cache->reserved == 0 && cache->used == 0) {
2847                         spin_unlock(&cache->lock);
2848                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
2849                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
2850                                                          cache);
2851                         else
2852                                 btrfs_mark_bg_unused(cache);
2853                 } else {
2854                         spin_unlock(&cache->lock);
2855                 }
2856 skip_unfreeze:
2857                 btrfs_unfreeze_block_group(cache);
2858                 btrfs_put_block_group(cache);
2859                 if (ret)
2860                         break;
2861                 if (sctx->is_dev_replace &&
2862                     atomic64_read(&dev_replace->num_write_errors) > 0) {
2863                         ret = -EIO;
2864                         break;
2865                 }
2866                 if (sctx->stat.malloc_errors > 0) {
2867                         ret = -ENOMEM;
2868                         break;
2869                 }
2870 skip:
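                     /* Continue the dev extent search right after this extent. */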
2871                 key.offset = found_key.offset + dev_extent_len;
2872                 btrfs_release_path(path);
2873         }
2874
2875         btrfs_free_path(path);
2876
2877         return ret;
2878 }
2879
2880 static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
2881                            struct page *page, u64 physical, u64 generation)
2882 {
2883         struct btrfs_fs_info *fs_info = sctx->fs_info;
2884         struct btrfs_super_block *sb = page_address(page);
2885         int ret;
2886
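             /*
              * Read this super block copy synchronously from the device, then
              * verify its checksum, generation and overall sanity.
              */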
2887         ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
2888                         BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
2889         if (ret < 0)
2890                 return ret;
2891         ret = btrfs_check_super_csum(fs_info, sb);
2892         if (ret != 0) {
2893                 btrfs_err_rl(fs_info,
2894                   "scrub: super block at physical %llu devid %llu has bad csum",
2895                         physical, dev->devid);
2896                 return -EIO;
2897         }
2898         if (btrfs_super_generation(sb) != generation) {
2899                 btrfs_err_rl(fs_info,
2900 "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
2901                              physical, dev->devid,
2902                              btrfs_super_generation(sb), generation);
2903                 return -EUCLEAN;
2904         }
2905
2906         return btrfs_validate_super(fs_info, sb, -1);
2907 }
2908
2909 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2910                                            struct btrfs_device *scrub_dev)
2911 {
2912         int     i;
2913         u64     bytenr;
2914         u64     gen;
2915         int ret = 0;
2916         struct page *page;
2917         struct btrfs_fs_info *fs_info = sctx->fs_info;
2918
2919         if (BTRFS_FS_ERROR(fs_info))
2920                 return -EROFS;
2921
2922         page = alloc_page(GFP_KERNEL);
2923         if (!page) {
2924                 spin_lock(&sctx->stat_lock);
2925                 sctx->stat.malloc_errors++;
2926                 spin_unlock(&sctx->stat_lock);
2927                 return -ENOMEM;
2928         }
2929
2930         /* Seed devices of a new filesystem have their own generation. */
2931         if (scrub_dev->fs_devices != fs_info->fs_devices)
2932                 gen = scrub_dev->generation;
2933         else
2934                 gen = btrfs_get_last_trans_committed(fs_info);
2935
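             /* Verify each super block copy of the device being scrubbed. */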
2936         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2937                 ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr);
2938                 if (ret == -ENOENT)
2939                         break;
2940
2941                 if (ret) {
2942                         spin_lock(&sctx->stat_lock);
2943                         sctx->stat.super_errors++;
2944                         spin_unlock(&sctx->stat_lock);
2945                         continue;
2946                 }
2947
2948                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
2949                     scrub_dev->commit_total_bytes)
2950                         break;
2951                 if (!btrfs_check_super_location(scrub_dev, bytenr))
2952                         continue;
2953
2954                 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
2955                 if (ret) {
2956                         spin_lock(&sctx->stat_lock);
2957                         sctx->stat.super_errors++;
2958                         spin_unlock(&sctx->stat_lock);
2959                 }
2960         }
2961         __free_page(page);
2962         return 0;
2963 }
2964
2965 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
2966 {
2967         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
2968                                         &fs_info->scrub_lock)) {
2969                 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
2970
2971                 fs_info->scrub_workers = NULL;
2972                 mutex_unlock(&fs_info->scrub_lock);
2973
2974                 if (scrub_workers)
2975                         destroy_workqueue(scrub_workers);
2976         }
2977 }
2978
2979 /*
2980  * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
2981  */
2982 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
2983 {
2984         struct workqueue_struct *scrub_workers = NULL;
2985         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
2986         int max_active = fs_info->thread_pool_size;
2987         int ret = -ENOMEM;
2988
2989         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
2990                 return 0;
2991
2992         scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
2993         if (!scrub_workers)
2994                 return -ENOMEM;
2995
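             /*
              * Either install the freshly allocated workqueue, or if another
              * thread beat us to it, drop ours and reuse the existing one.
              */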
2996         mutex_lock(&fs_info->scrub_lock);
2997         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
2998                 ASSERT(fs_info->scrub_workers == NULL);
2999                 fs_info->scrub_workers = scrub_workers;
3000                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
3001                 mutex_unlock(&fs_info->scrub_lock);
3002                 return 0;
3003         }
3004         /* Another thread raced in and created the workers for us. */
3005         refcount_inc(&fs_info->scrub_workers_refcnt);
3006         mutex_unlock(&fs_info->scrub_lock);
3007
3008         ret = 0;
3009
3010         destroy_workqueue(scrub_workers);
3011         return ret;
3012 }
3013
3014 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3015                     u64 end, struct btrfs_scrub_progress *progress,
3016                     int readonly, int is_dev_replace)
3017 {
3018         struct btrfs_dev_lookup_args args = { .devid = devid };
3019         struct scrub_ctx *sctx;
3020         int ret;
3021         struct btrfs_device *dev;
3022         unsigned int nofs_flag;
3023         bool need_commit = false;
3024
3025         if (btrfs_fs_closing(fs_info))
3026                 return -EAGAIN;
3027
3028         /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
3029         ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
3030
3031         /*
3032          * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
3033          * value (max nodesize / min sectorsize), thus nodesize should always
3034          * be fine.
3035          */
3036         ASSERT(fs_info->nodesize <=
3037                SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
3038
3039         /* Allocate outside of device_list_mutex */
3040         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
3041         if (IS_ERR(sctx))
3042                 return PTR_ERR(sctx);
3043
3044         ret = scrub_workers_get(fs_info);
3045         if (ret)
3046                 goto out_free_ctx;
3047
3048         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3049         dev = btrfs_find_device(fs_info->fs_devices, &args);
3050         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3051                      !is_dev_replace)) {
3052                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3053                 ret = -ENODEV;
3054                 goto out;
3055         }
3056
3057         if (!is_dev_replace && !readonly &&
3058             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3059                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3060                 btrfs_err_in_rcu(fs_info,
3061                         "scrub: devid %llu: filesystem on %s is not writable",
3062                                  devid, btrfs_dev_name(dev));
3063                 ret = -EROFS;
3064                 goto out;
3065         }
3066
3067         mutex_lock(&fs_info->scrub_lock);
3068         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3069             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
3070                 mutex_unlock(&fs_info->scrub_lock);
3071                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3072                 ret = -EIO;
3073                 goto out;
3074         }
3075
3076         down_read(&fs_info->dev_replace.rwsem);
3077         if (dev->scrub_ctx ||
3078             (!is_dev_replace &&
3079              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3080                 up_read(&fs_info->dev_replace.rwsem);
3081                 mutex_unlock(&fs_info->scrub_lock);
3082                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3083                 ret = -EINPROGRESS;
3084                 goto out;
3085         }
3086         up_read(&fs_info->dev_replace.rwsem);
3087
3088         sctx->readonly = readonly;
3089         dev->scrub_ctx = sctx;
3090         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3091
3092         /*
3093          * By checking @scrub_pause_req here, we can avoid the
3094          * race between transaction commit and scrubbing.
3095          */
3096         __scrub_blocked_if_needed(fs_info);
3097         atomic_inc(&fs_info->scrubs_running);
3098         mutex_unlock(&fs_info->scrub_lock);
3099
3100         /*
3101          * In order to avoid deadlock with reclaim when there is a transaction
3102          * trying to pause scrub, make sure we use GFP_NOFS for all the
3103          * allocations done by btrfs_scrub_sectors() and scrub_sectors_for_parity()
3104          * invoked by our callees. The pausing request is done when the
3105          * transaction commit starts, and it blocks the transaction until scrub
3106          * is paused (done at specific points in scrub_stripe() or right
3107          * above, before incrementing fs_info->scrubs_running).
3108          */
3109         nofs_flag = memalloc_nofs_save();
3110         if (!is_dev_replace) {
3111                 u64 old_super_errors;
3112
3113                 spin_lock(&sctx->stat_lock);
3114                 old_super_errors = sctx->stat.super_errors;
3115                 spin_unlock(&sctx->stat_lock);
3116
3117                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3118                 /*
3119                  * By holding the device list mutex we avoid racing with
3120                  * super block writes kicked off by a log tree sync.
3121                  */
3122                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3123                 ret = scrub_supers(sctx, dev);
3124                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3125
3126                 spin_lock(&sctx->stat_lock);
3127                 /*
3128                  * Super block errors were found, but we cannot commit a transaction
3129                  * in the current context, since btrfs_commit_transaction() needs
3130                  * to pause the currently running scrub (held by ourselves).
3131                  */
3132                 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
3133                         need_commit = true;
3134                 spin_unlock(&sctx->stat_lock);
3135         }
3136
3137         if (!ret)
3138                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
3139         memalloc_nofs_restore(nofs_flag);
3140
3141         atomic_dec(&fs_info->scrubs_running);
3142         wake_up(&fs_info->scrub_pause_wait);
3143
3144         if (progress)
3145                 memcpy(progress, &sctx->stat, sizeof(*progress));
3146
3147         if (!is_dev_replace)
3148                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
3149                         ret ? "not finished" : "finished", devid, ret);
3150
3151         mutex_lock(&fs_info->scrub_lock);
3152         dev->scrub_ctx = NULL;
3153         mutex_unlock(&fs_info->scrub_lock);
3154
3155         scrub_workers_put(fs_info);
3156         scrub_put_ctx(sctx);
3157
3158         /*
3159          * We found some super block errors before, now try to force a
3160          * transaction commit, as scrub has finished.
3161          */
3162         if (need_commit) {
3163                 struct btrfs_trans_handle *trans;
3164
3165                 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3166                 if (IS_ERR(trans)) {
3167                         ret = PTR_ERR(trans);
3168                         btrfs_err(fs_info,
3169         "scrub: failed to start transaction to fix super block errors: %d", ret);
3170                         return ret;
3171                 }
3172                 ret = btrfs_commit_transaction(trans);
3173                 if (ret < 0)
3174                         btrfs_err(fs_info,
3175         "scrub: failed to commit transaction to fix super block errors: %d", ret);
3176         }
3177         return ret;
3178 out:
3179         scrub_workers_put(fs_info);
3180 out_free_ctx:
3181         scrub_free_ctx(sctx);
3182
3183         return ret;
3184 }
3185
3186 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
3187 {
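             /*
              * Request a pause and wait until every running scrub has reached
              * one of its pause points.
              */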
3188         mutex_lock(&fs_info->scrub_lock);
3189         atomic_inc(&fs_info->scrub_pause_req);
3190         while (atomic_read(&fs_info->scrubs_paused) !=
3191                atomic_read(&fs_info->scrubs_running)) {
3192                 mutex_unlock(&fs_info->scrub_lock);
3193                 wait_event(fs_info->scrub_pause_wait,
3194                            atomic_read(&fs_info->scrubs_paused) ==
3195                            atomic_read(&fs_info->scrubs_running));
3196                 mutex_lock(&fs_info->scrub_lock);
3197         }
3198         mutex_unlock(&fs_info->scrub_lock);
3199 }
3200
3201 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
3202 {
3203         atomic_dec(&fs_info->scrub_pause_req);
3204         wake_up(&fs_info->scrub_pause_wait);
3205 }
3206
3207 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3208 {
3209         mutex_lock(&fs_info->scrub_lock);
3210         if (!atomic_read(&fs_info->scrubs_running)) {
3211                 mutex_unlock(&fs_info->scrub_lock);
3212                 return -ENOTCONN;
3213         }
3214
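             /* Request cancellation and wait for all running scrubs to finish. */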
3215         atomic_inc(&fs_info->scrub_cancel_req);
3216         while (atomic_read(&fs_info->scrubs_running)) {
3217                 mutex_unlock(&fs_info->scrub_lock);
3218                 wait_event(fs_info->scrub_pause_wait,
3219                            atomic_read(&fs_info->scrubs_running) == 0);
3220                 mutex_lock(&fs_info->scrub_lock);
3221         }
3222         atomic_dec(&fs_info->scrub_cancel_req);
3223         mutex_unlock(&fs_info->scrub_lock);
3224
3225         return 0;
3226 }
3227
3228 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
3229 {
3230         struct btrfs_fs_info *fs_info = dev->fs_info;
3231         struct scrub_ctx *sctx;
3232
3233         mutex_lock(&fs_info->scrub_lock);
3234         sctx = dev->scrub_ctx;
3235         if (!sctx) {
3236                 mutex_unlock(&fs_info->scrub_lock);
3237                 return -ENOTCONN;
3238         }
3239         atomic_inc(&sctx->cancel_req);
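             /* Wait until the scrub on this device has stopped and detached its ctx. */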
3240         while (dev->scrub_ctx) {
3241                 mutex_unlock(&fs_info->scrub_lock);
3242                 wait_event(fs_info->scrub_pause_wait,
3243                            dev->scrub_ctx == NULL);
3244                 mutex_lock(&fs_info->scrub_lock);
3245         }
3246         mutex_unlock(&fs_info->scrub_lock);
3247
3248         return 0;
3249 }
3250
3251 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
3252                          struct btrfs_scrub_progress *progress)
3253 {
3254         struct btrfs_dev_lookup_args args = { .devid = devid };
3255         struct btrfs_device *dev;
3256         struct scrub_ctx *sctx = NULL;
3257
3258         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3259         dev = btrfs_find_device(fs_info->fs_devices, &args);
3260         if (dev)
3261                 sctx = dev->scrub_ctx;
3262         if (sctx)
3263                 memcpy(progress, &sctx->stat, sizeof(*progress));
3264         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3265
3266         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3267 }