fs/btrfs/scrub.c
1 /*
2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27 #include "extent_io.h"
28 #include "dev-replace.h"
29 #include "check-integrity.h"
30 #include "rcu-string.h"
31 #include "raid56.h"
32
33 /*
34  * This is only the first step towards a full-featured scrub. It reads all
35  * extents and super blocks and verifies the checksums. In case a bad checksum
36  * is found or the extent cannot be read, good data will be written back if
37  * any can be found.
38  *
39  * Future enhancements:
40  *  - In case an unrepairable extent is encountered, track which files are
41  *    affected and report them
42  *  - track and record media errors, throw out bad devices
43  *  - add a mode to also read unallocated space
44  */
45
46 struct scrub_block;
47 struct scrub_ctx;
48
49 /*
50  * the following three values only influence performance.
51  * The last one configures the number of parallel and outstanding I/O
52  * operations. The first two values configure an upper limit for the number
53  * of (dynamically allocated) pages that are added to a bio.
54  */
55 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
56 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
57 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
58
59 /*
60  * the following value times PAGE_SIZE needs to be large enough to match the
61  * largest node/leaf/sector size that shall be supported.
62  * Values larger than BTRFS_STRIPE_LEN are not supported.
63  */
64 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
65
66 struct scrub_recover {
67         atomic_t                refs;
68         struct btrfs_bio        *bbio;
69         u64                     map_length;
70 };
71
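/*
 * One page of a scrub block: records where the data lives (logical and
 * physical address, device, mirror number) and, for data extents with a
 * checksum, the expected csum. 'recover' is only set for pages that are
 * part of a recheck/repair attempt.
 */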
72 struct scrub_page {
73         struct scrub_block      *sblock;
74         struct page             *page;
75         struct btrfs_device     *dev;
76         struct list_head        list;
77         u64                     flags;  /* extent flags */
78         u64                     generation;
79         u64                     logical;
80         u64                     physical;
81         u64                     physical_for_dev_replace;
82         atomic_t                refs;
83         struct {
84                 unsigned int    mirror_num:8;
85                 unsigned int    have_csum:1;
86                 unsigned int    io_error:1;
87         };
88         u8                      csum[BTRFS_CSUM_SIZE];
89
90         struct scrub_recover    *recover;
91 };
92
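/*
 * A read or write bio that is being filled or is in flight for one
 * device; it carries up to SCRUB_PAGES_PER_RD_BIO/_WR_BIO scrub pages.
 */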
93 struct scrub_bio {
94         int                     index;
95         struct scrub_ctx        *sctx;
96         struct btrfs_device     *dev;
97         struct bio              *bio;
98         int                     err;
99         u64                     logical;
100         u64                     physical;
101 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
102         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
103 #else
104         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
105 #endif
106         int                     page_count;
107         int                     next_free;
108         struct btrfs_work       work;
109 };
110
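/*
 * One unit of verification: the group of pages that make up a single
 * node, leaf, sector or super block copy and that are checked together
 * against one checksum or header.
 */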
111 struct scrub_block {
112         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
113         int                     page_count;
114         atomic_t                outstanding_pages;
115         atomic_t                refs; /* free mem on transition to zero */
116         struct scrub_ctx        *sctx;
117         struct scrub_parity     *sparity;
118         struct {
119                 unsigned int    header_error:1;
120                 unsigned int    checksum_error:1;
121                 unsigned int    no_io_error_seen:1;
122                 unsigned int    generation_error:1; /* also sets header_error */
123
124                 /* The following is for the data used to check parity. */
125                 /* It applies only to data that has a checksum. */
126                 unsigned int    data_corrected:1;
127         };
128         struct btrfs_work       work;
129 };
130
131 /* Used for chunks with a parity stripe, such as RAID5/6 */
132 struct scrub_parity {
133         struct scrub_ctx        *sctx;
134
135         struct btrfs_device     *scrub_dev;
136
137         u64                     logic_start;
138
139         u64                     logic_end;
140
141         int                     nsectors;
142
143         int                     stripe_len;
144
145         atomic_t                refs;
146
147         struct list_head        spages;
148
149         /* Work of parity check and repair */
150         struct btrfs_work       work;
151
152         /* Mark the parity blocks which have data */
153         unsigned long           *dbitmap;
154
155         /*
156          * Mark the parity blocks which have data, but where errors happened
157          * when reading or checking that data
158          */
159         unsigned long           *ebitmap;
160
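        /*
         * Backing storage for the two bitmaps above (dbitmap and ebitmap
         * are expected to point into this flexible array).
         */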
161         unsigned long           bitmap[0];
162 };
163
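/*
 * Write-side state used by the device-replace path: the target device
 * and the write bio currently being filled with repaired or copied pages.
 */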
164 struct scrub_wr_ctx {
165         struct scrub_bio *wr_curr_bio;
166         struct btrfs_device *tgtdev;
167         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
168         atomic_t flush_all_writes;
169         struct mutex wr_lock;
170 };
171
172 struct scrub_ctx {
173         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
174         struct btrfs_fs_info    *fs_info;
175         int                     first_free;
176         int                     curr;
177         atomic_t                bios_in_flight;
178         atomic_t                workers_pending;
179         spinlock_t              list_lock;
180         wait_queue_head_t       list_wait;
181         u16                     csum_size;
182         struct list_head        csum_list;
183         atomic_t                cancel_req;
184         int                     readonly;
185         int                     pages_per_rd_bio;
186         u32                     sectorsize;
187         u32                     nodesize;
188
189         int                     is_dev_replace;
190         struct scrub_wr_ctx     wr_ctx;
191
192         /*
193          * statistics
194          */
195         struct btrfs_scrub_progress stat;
196         spinlock_t              stat_lock;
197
198         /*
199          * Use a ref counter to avoid use-after-free issues. Scrub workers
200          * decrement bios_in_flight and workers_pending and then do a wakeup
201          * on the list_wait wait queue. We must ensure the main scrub task
202          * doesn't free the scrub context before or while the workers are
203          * doing the wakeup() call.
204          */
205         atomic_t                refs;
206 };
207
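/*
 * Deferred fixup for a data block that has no checksum (nodatasum);
 * handled from a worker because the repair path joins a transaction.
 */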
208 struct scrub_fixup_nodatasum {
209         struct scrub_ctx        *sctx;
210         struct btrfs_device     *dev;
211         u64                     logical;
212         struct btrfs_root       *root;
213         struct btrfs_work       work;
214         int                     mirror_num;
215 };
216
217 struct scrub_nocow_inode {
218         u64                     inum;
219         u64                     offset;
220         u64                     root;
221         struct list_head        list;
222 };
223
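/*
 * Work item for copying a NOCOW extent to the device-replace target;
 * 'inodes' collects the (inum, offset, root) references of the extent.
 */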
224 struct scrub_copy_nocow_ctx {
225         struct scrub_ctx        *sctx;
226         u64                     logical;
227         u64                     len;
228         int                     mirror_num;
229         u64                     physical_for_dev_replace;
230         struct list_head        inodes;
231         struct btrfs_work       work;
232 };
233
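/*
 * Context handed to the backref walkers when printing warnings about
 * corrupted or unreadable blocks, including path resolution for inodes.
 */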
234 struct scrub_warning {
235         struct btrfs_path       *path;
236         u64                     extent_item_size;
237         const char              *errstr;
238         sector_t                sector;
239         u64                     logical;
240         struct btrfs_device     *dev;
241 };
242
243 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
244 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
245 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
246 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
247 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
248 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
249                                      struct scrub_block *sblocks_for_recheck);
250 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
251                                 struct scrub_block *sblock,
252                                 int retry_failed_mirror);
253 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
254 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
255                                              struct scrub_block *sblock_good);
256 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
257                                             struct scrub_block *sblock_good,
258                                             int page_num, int force_write);
259 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
260 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
261                                            int page_num);
262 static int scrub_checksum_data(struct scrub_block *sblock);
263 static int scrub_checksum_tree_block(struct scrub_block *sblock);
264 static int scrub_checksum_super(struct scrub_block *sblock);
265 static void scrub_block_get(struct scrub_block *sblock);
266 static void scrub_block_put(struct scrub_block *sblock);
267 static void scrub_page_get(struct scrub_page *spage);
268 static void scrub_page_put(struct scrub_page *spage);
269 static void scrub_parity_get(struct scrub_parity *sparity);
270 static void scrub_parity_put(struct scrub_parity *sparity);
271 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
272                                     struct scrub_page *spage);
273 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
274                        u64 physical, struct btrfs_device *dev, u64 flags,
275                        u64 gen, int mirror_num, u8 *csum, int force,
276                        u64 physical_for_dev_replace);
277 static void scrub_bio_end_io(struct bio *bio);
278 static void scrub_bio_end_io_worker(struct btrfs_work *work);
279 static void scrub_block_complete(struct scrub_block *sblock);
280 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
281                                u64 extent_logical, u64 extent_len,
282                                u64 *extent_physical,
283                                struct btrfs_device **extent_dev,
284                                int *extent_mirror_num);
285 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
286                               struct scrub_wr_ctx *wr_ctx,
287                               struct btrfs_fs_info *fs_info,
288                               struct btrfs_device *dev,
289                               int is_dev_replace);
290 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
291 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
292                                     struct scrub_page *spage);
293 static void scrub_wr_submit(struct scrub_ctx *sctx);
294 static void scrub_wr_bio_end_io(struct bio *bio);
295 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
296 static int write_page_nocow(struct scrub_ctx *sctx,
297                             u64 physical_for_dev_replace, struct page *page);
298 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
299                                       struct scrub_copy_nocow_ctx *ctx);
300 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
301                             int mirror_num, u64 physical_for_dev_replace);
302 static void copy_nocow_pages_worker(struct btrfs_work *work);
303 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
304 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
305 static void scrub_put_ctx(struct scrub_ctx *sctx);
306
307
308 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
309 {
310         atomic_inc(&sctx->refs);
311         atomic_inc(&sctx->bios_in_flight);
312 }
313
314 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
315 {
316         atomic_dec(&sctx->bios_in_flight);
317         wake_up(&sctx->list_wait);
318         scrub_put_ctx(sctx);
319 }
320
321 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
322 {
323         while (atomic_read(&fs_info->scrub_pause_req)) {
324                 mutex_unlock(&fs_info->scrub_lock);
325                 wait_event(fs_info->scrub_pause_wait,
326                    atomic_read(&fs_info->scrub_pause_req) == 0);
327                 mutex_lock(&fs_info->scrub_lock);
328         }
329 }
330
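/*
 * Pause bookkeeping: scrub_pause_on() marks this scrub as paused and wakes
 * any waiter (e.g. a transaction commit); scrub_pause_off() waits until the
 * pause request is gone before continuing. scrub_blocked_if_needed() below
 * combines the two for a simple "yield if someone asked us to pause" check.
 */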
331 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
332 {
333         atomic_inc(&fs_info->scrubs_paused);
334         wake_up(&fs_info->scrub_pause_wait);
335 }
336
337 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
338 {
339         mutex_lock(&fs_info->scrub_lock);
340         __scrub_blocked_if_needed(fs_info);
341         atomic_dec(&fs_info->scrubs_paused);
342         mutex_unlock(&fs_info->scrub_lock);
343
344         wake_up(&fs_info->scrub_pause_wait);
345 }
346
347 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
348 {
349         scrub_pause_on(fs_info);
350         scrub_pause_off(fs_info);
351 }
352
353 /*
354  * used for workers that require transaction commits (i.e., for the
355  * NOCOW case)
356  */
357 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
358 {
359         struct btrfs_fs_info *fs_info = sctx->fs_info;
360
361         atomic_inc(&sctx->refs);
362         /*
363          * increment scrubs_running to prevent cancel requests from
364          * completing as long as a worker is running. we must also
365          * increment scrubs_paused to prevent deadlocking on pause
366          * requests used for transaction commits (as the worker uses a
367          * transaction context). it is safe to regard the worker
368          * as paused for all practical matters. effectively, we only
369          * prevent cancellation requests from completing.
370          */
371         mutex_lock(&fs_info->scrub_lock);
372         atomic_inc(&fs_info->scrubs_running);
373         atomic_inc(&fs_info->scrubs_paused);
374         mutex_unlock(&fs_info->scrub_lock);
375
376         /*
377          * The check of the @scrubs_running == @scrubs_paused condition
378          * inside wait_event() is not an atomic operation, which means we
379          * may inc/dec @scrubs_running/@scrubs_paused at any time. Wake up
380          * @scrub_pause_wait as often as we can so that a transaction
381          * commit is blocked for as short a time as possible.
382          */
383         wake_up(&fs_info->scrub_pause_wait);
384
385         atomic_inc(&sctx->workers_pending);
386 }
387
388 /* used for workers that require transaction commits */
389 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
390 {
391         struct btrfs_fs_info *fs_info = sctx->fs_info;
392
393         /*
394          * see scrub_pending_trans_workers_inc() for why we're pretending
395          * to be paused in the scrub counters
396          */
397         mutex_lock(&fs_info->scrub_lock);
398         atomic_dec(&fs_info->scrubs_running);
399         atomic_dec(&fs_info->scrubs_paused);
400         mutex_unlock(&fs_info->scrub_lock);
401         atomic_dec(&sctx->workers_pending);
402         wake_up(&fs_info->scrub_pause_wait);
403         wake_up(&sctx->list_wait);
404         scrub_put_ctx(sctx);
405 }
406
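/* Free any checksums still queued on the per-context csum_list. */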
407 static void scrub_free_csums(struct scrub_ctx *sctx)
408 {
409         while (!list_empty(&sctx->csum_list)) {
410                 struct btrfs_ordered_sum *sum;
411                 sum = list_first_entry(&sctx->csum_list,
412                                        struct btrfs_ordered_sum, list);
413                 list_del(&sum->list);
414                 kfree(sum);
415         }
416 }
417
418 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
419 {
420         int i;
421
422         if (!sctx)
423                 return;
424
425         scrub_free_wr_ctx(&sctx->wr_ctx);
426
427         /* this can happen when scrub is cancelled */
428         if (sctx->curr != -1) {
429                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
430
431                 for (i = 0; i < sbio->page_count; i++) {
432                         WARN_ON(!sbio->pagev[i]->page);
433                         scrub_block_put(sbio->pagev[i]->sblock);
434                 }
435                 bio_put(sbio->bio);
436         }
437
438         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
439                 struct scrub_bio *sbio = sctx->bios[i];
440
441                 if (!sbio)
442                         break;
443                 kfree(sbio);
444         }
445
446         scrub_free_csums(sctx);
447         kfree(sctx);
448 }
449
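/* Drop a reference on the scrub context; the last put frees it. */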
450 static void scrub_put_ctx(struct scrub_ctx *sctx)
451 {
452         if (atomic_dec_and_test(&sctx->refs))
453                 scrub_free_ctx(sctx);
454 }
455
456 static noinline_for_stack
457 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
458 {
459         struct scrub_ctx *sctx;
460         int             i;
461         struct btrfs_fs_info *fs_info = dev->fs_info;
462         int ret;
463
464         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
465         if (!sctx)
466                 goto nomem;
467         atomic_set(&sctx->refs, 1);
468         sctx->is_dev_replace = is_dev_replace;
469         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
470         sctx->curr = -1;
471         sctx->fs_info = dev->fs_info;
472         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
473                 struct scrub_bio *sbio;
474
475                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
476                 if (!sbio)
477                         goto nomem;
478                 sctx->bios[i] = sbio;
479
480                 sbio->index = i;
481                 sbio->sctx = sctx;
482                 sbio->page_count = 0;
483                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
484                                 scrub_bio_end_io_worker, NULL, NULL);
485
486                 if (i != SCRUB_BIOS_PER_SCTX - 1)
487                         sctx->bios[i]->next_free = i + 1;
488                 else
489                         sctx->bios[i]->next_free = -1;
490         }
491         sctx->first_free = 0;
492         sctx->nodesize = fs_info->nodesize;
493         sctx->sectorsize = fs_info->sectorsize;
494         atomic_set(&sctx->bios_in_flight, 0);
495         atomic_set(&sctx->workers_pending, 0);
496         atomic_set(&sctx->cancel_req, 0);
497         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
498         INIT_LIST_HEAD(&sctx->csum_list);
499
500         spin_lock_init(&sctx->list_lock);
501         spin_lock_init(&sctx->stat_lock);
502         init_waitqueue_head(&sctx->list_wait);
503
504         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
505                                  fs_info->dev_replace.tgtdev, is_dev_replace);
506         if (ret) {
507                 scrub_free_ctx(sctx);
508                 return ERR_PTR(ret);
509         }
510         return sctx;
511
512 nomem:
513         scrub_free_ctx(sctx);
514         return ERR_PTR(-ENOMEM);
515 }
516
517 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
518                                      void *warn_ctx)
519 {
520         u64 isize;
521         u32 nlink;
522         int ret;
523         int i;
524         struct extent_buffer *eb;
525         struct btrfs_inode_item *inode_item;
526         struct scrub_warning *swarn = warn_ctx;
527         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
528         struct inode_fs_paths *ipath = NULL;
529         struct btrfs_root *local_root;
530         struct btrfs_key root_key;
531         struct btrfs_key key;
532
533         root_key.objectid = root;
534         root_key.type = BTRFS_ROOT_ITEM_KEY;
535         root_key.offset = (u64)-1;
536         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
537         if (IS_ERR(local_root)) {
538                 ret = PTR_ERR(local_root);
539                 goto err;
540         }
541
542         /*
543          * this makes the path point to (inum INODE_ITEM ioff)
544          */
545         key.objectid = inum;
546         key.type = BTRFS_INODE_ITEM_KEY;
547         key.offset = 0;
548
549         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
550         if (ret) {
551                 btrfs_release_path(swarn->path);
552                 goto err;
553         }
554
555         eb = swarn->path->nodes[0];
556         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
557                                         struct btrfs_inode_item);
558         isize = btrfs_inode_size(eb, inode_item);
559         nlink = btrfs_inode_nlink(eb, inode_item);
560         btrfs_release_path(swarn->path);
561
562         ipath = init_ipath(4096, local_root, swarn->path);
563         if (IS_ERR(ipath)) {
564                 ret = PTR_ERR(ipath);
565                 ipath = NULL;
566                 goto err;
567         }
568         ret = paths_from_inode(inum, ipath);
569
570         if (ret < 0)
571                 goto err;
572
573         /*
574          * we deliberately ignore the fact that ipath might have been too
575          * small to hold all of the paths here
576          */
577         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
578                 btrfs_warn_in_rcu(fs_info,
579                                   "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
580                                   swarn->errstr, swarn->logical,
581                                   rcu_str_deref(swarn->dev->name),
582                                   (unsigned long long)swarn->sector,
583                                   root, inum, offset,
584                                   min(isize - offset, (u64)PAGE_SIZE), nlink,
585                                   (char *)(unsigned long)ipath->fspath->val[i]);
586
587         free_ipath(ipath);
588         return 0;
589
590 err:
591         btrfs_warn_in_rcu(fs_info,
592                           "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
593                           swarn->errstr, swarn->logical,
594                           rcu_str_deref(swarn->dev->name),
595                           (unsigned long long)swarn->sector,
596                           root, inum, offset, ret);
597
598         free_ipath(ipath);
599         return 0;
600 }
601
602 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
603 {
604         struct btrfs_device *dev;
605         struct btrfs_fs_info *fs_info;
606         struct btrfs_path *path;
607         struct btrfs_key found_key;
608         struct extent_buffer *eb;
609         struct btrfs_extent_item *ei;
610         struct scrub_warning swarn;
611         unsigned long ptr = 0;
612         u64 extent_item_pos;
613         u64 flags = 0;
614         u64 ref_root;
615         u32 item_size;
616         u8 ref_level = 0;
617         int ret;
618
619         WARN_ON(sblock->page_count < 1);
620         dev = sblock->pagev[0]->dev;
621         fs_info = sblock->sctx->fs_info;
622
623         path = btrfs_alloc_path();
624         if (!path)
625                 return;
626
627         swarn.sector = (sblock->pagev[0]->physical) >> 9;
628         swarn.logical = sblock->pagev[0]->logical;
629         swarn.errstr = errstr;
630         swarn.dev = NULL;
631
632         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
633                                   &flags);
634         if (ret < 0)
635                 goto out;
636
637         extent_item_pos = swarn.logical - found_key.objectid;
638         swarn.extent_item_size = found_key.offset;
639
640         eb = path->nodes[0];
641         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
642         item_size = btrfs_item_size_nr(eb, path->slots[0]);
643
644         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
645                 do {
646                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
647                                                       item_size, &ref_root,
648                                                       &ref_level);
649                         btrfs_warn_in_rcu(fs_info,
650                                 "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu",
651                                 errstr, swarn.logical,
652                                 rcu_str_deref(dev->name),
653                                 (unsigned long long)swarn.sector,
654                                 ref_level ? "node" : "leaf",
655                                 ret < 0 ? -1 : ref_level,
656                                 ret < 0 ? -1 : ref_root);
657                 } while (ret != 1);
658                 btrfs_release_path(path);
659         } else {
660                 btrfs_release_path(path);
661                 swarn.path = path;
662                 swarn.dev = dev;
663                 iterate_extent_inodes(fs_info, found_key.objectid,
664                                         extent_item_pos, 1,
665                                         scrub_print_warning_inode, &swarn);
666         }
667
668 out:
669         btrfs_free_path(path);
670 }
671
672 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
673 {
674         struct page *page = NULL;
675         unsigned long index;
676         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
677         int ret;
678         int corrected = 0;
679         struct btrfs_key key;
680         struct inode *inode = NULL;
681         struct btrfs_fs_info *fs_info;
682         u64 end = offset + PAGE_SIZE - 1;
683         struct btrfs_root *local_root;
684         int srcu_index;
685
686         key.objectid = root;
687         key.type = BTRFS_ROOT_ITEM_KEY;
688         key.offset = (u64)-1;
689
690         fs_info = fixup->root->fs_info;
691         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
692
693         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
694         if (IS_ERR(local_root)) {
695                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
696                 return PTR_ERR(local_root);
697         }
698
699         key.type = BTRFS_INODE_ITEM_KEY;
700         key.objectid = inum;
701         key.offset = 0;
702         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
703         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
704         if (IS_ERR(inode))
705                 return PTR_ERR(inode);
706
707         index = offset >> PAGE_SHIFT;
708
709         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
710         if (!page) {
711                 ret = -ENOMEM;
712                 goto out;
713         }
714
715         if (PageUptodate(page)) {
716                 if (PageDirty(page)) {
717                         /*
718                          * we need to write the data to the defective sector. the
719                          * data that was in that sector is not in memory,
720                          * because the page was modified. we must not write the
721                          * modified page to that sector.
722                          *
723                          * TODO: what could be done here: wait for the delalloc
724                          *       runner to write out that page (might involve
725                          *       COW) and see whether the sector is still
726                          *       referenced afterwards.
727                          *
728                          * For the time being, we'll treat this error as
729                          * uncorrectable, although there is a chance that a
730                          * later scrub will find the bad sector again and that
731                          * no dirty page will be in memory by then.
732                          */
733                         ret = -EIO;
734                         goto out;
735                 }
736                 ret = repair_io_failure(inode, offset, PAGE_SIZE,
737                                         fixup->logical, page,
738                                         offset - page_offset(page),
739                                         fixup->mirror_num);
740                 unlock_page(page);
741                 corrected = !ret;
742         } else {
743                 /*
744                  * we need to get good data first. the general readpage path
745                  * will call repair_io_failure for us; we just have to make
746                  * sure we read the bad mirror.
747                  */
748                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
749                                         EXTENT_DAMAGED);
750                 if (ret) {
751                         /* set_extent_bits should give proper error */
752                         WARN_ON(ret > 0);
753                         if (ret > 0)
754                                 ret = -EFAULT;
755                         goto out;
756                 }
757
758                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
759                                                 btrfs_get_extent,
760                                                 fixup->mirror_num);
761                 wait_on_page_locked(page);
762
763                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
764                                                 end, EXTENT_DAMAGED, 0, NULL);
765                 if (!corrected)
766                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
767                                                 EXTENT_DAMAGED);
768         }
769
770 out:
771         if (page)
772                 put_page(page);
773
774         iput(inode);
775
776         if (ret < 0)
777                 return ret;
778
779         if (ret == 0 && corrected) {
780                 /*
781                  * we only need to call readpage for one of the inodes belonging
782                  * to this extent. so make iterate_extent_inodes stop
783                  */
784                 return 1;
785         }
786
787         return -EIO;
788 }
789
790 static void scrub_fixup_nodatasum(struct btrfs_work *work)
791 {
792         struct btrfs_fs_info *fs_info;
793         int ret;
794         struct scrub_fixup_nodatasum *fixup;
795         struct scrub_ctx *sctx;
796         struct btrfs_trans_handle *trans = NULL;
797         struct btrfs_path *path;
798         int uncorrectable = 0;
799
800         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
801         sctx = fixup->sctx;
802         fs_info = fixup->root->fs_info;
803
804         path = btrfs_alloc_path();
805         if (!path) {
806                 spin_lock(&sctx->stat_lock);
807                 ++sctx->stat.malloc_errors;
808                 spin_unlock(&sctx->stat_lock);
809                 uncorrectable = 1;
810                 goto out;
811         }
812
813         trans = btrfs_join_transaction(fixup->root);
814         if (IS_ERR(trans)) {
815                 uncorrectable = 1;
816                 goto out;
817         }
818
819         /*
820          * the idea is to trigger a regular read through the standard path. we
821          * read a page from the (failed) logical address by specifying the
822          * corresponding copy number (mirror) of the failed sector. thus,
823          * that readpage is expected to fail.
824          * that is the point where on-the-fly error correction will kick in
825          * (once it's finished) and rewrite the failed sector if a good copy
826          * can be found.
827          */
828         ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
829                                           scrub_fixup_readpage, fixup);
830         if (ret < 0) {
831                 uncorrectable = 1;
832                 goto out;
833         }
834         WARN_ON(ret != 1);
835
836         spin_lock(&sctx->stat_lock);
837         ++sctx->stat.corrected_errors;
838         spin_unlock(&sctx->stat_lock);
839
840 out:
841         if (trans && !IS_ERR(trans))
842                 btrfs_end_transaction(trans);
843         if (uncorrectable) {
844                 spin_lock(&sctx->stat_lock);
845                 ++sctx->stat.uncorrectable_errors;
846                 spin_unlock(&sctx->stat_lock);
847                 btrfs_dev_replace_stats_inc(
848                         &fs_info->dev_replace.num_uncorrectable_read_errors);
849                 btrfs_err_rl_in_rcu(fs_info,
850                     "unable to fixup (nodatasum) error at logical %llu on dev %s",
851                         fixup->logical, rcu_str_deref(fixup->dev->name));
852         }
853
854         btrfs_free_path(path);
855         kfree(fixup);
856
857         scrub_pending_trans_workers_dec(sctx);
858 }
859
860 static inline void scrub_get_recover(struct scrub_recover *recover)
861 {
862         atomic_inc(&recover->refs);
863 }
864
865 static inline void scrub_put_recover(struct scrub_recover *recover)
866 {
867         if (atomic_dec_and_test(&recover->refs)) {
868                 btrfs_put_bbio(recover->bbio);
869                 kfree(recover);
870         }
871 }
872
873 /*
874  * scrub_handle_errored_block gets called when either verification of the
875  * pages failed or the bio failed to read, e.g. with EIO. In the latter
876  * case, this function handles all pages in the bio, even though only one
877  * may be bad.
878  * The goal of this function is to repair the errored block by using the
879  * contents of one of the mirrors.
880  */
881 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
882 {
883         struct scrub_ctx *sctx = sblock_to_check->sctx;
884         struct btrfs_device *dev;
885         struct btrfs_fs_info *fs_info;
886         u64 length;
887         u64 logical;
888         unsigned int failed_mirror_index;
889         unsigned int is_metadata;
890         unsigned int have_csum;
891         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
892         struct scrub_block *sblock_bad;
893         int ret;
894         int mirror_index;
895         int page_num;
896         int success;
897         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
898                                       DEFAULT_RATELIMIT_BURST);
899
900         BUG_ON(sblock_to_check->page_count < 1);
901         fs_info = sctx->fs_info;
902         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
903                 /*
904                  * if we find an error in a super block, we just report it.
905                  * Super blocks will get rewritten with the next transaction
906                  * commit anyway.
907                  */
908                 spin_lock(&sctx->stat_lock);
909                 ++sctx->stat.super_errors;
910                 spin_unlock(&sctx->stat_lock);
911                 return 0;
912         }
913         length = sblock_to_check->page_count * PAGE_SIZE;
914         logical = sblock_to_check->pagev[0]->logical;
915         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
916         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
917         is_metadata = !(sblock_to_check->pagev[0]->flags &
918                         BTRFS_EXTENT_FLAG_DATA);
919         have_csum = sblock_to_check->pagev[0]->have_csum;
920         dev = sblock_to_check->pagev[0]->dev;
921
922         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
923                 sblocks_for_recheck = NULL;
924                 goto nodatasum_case;
925         }
926
927         /*
928          * read all mirrors one after the other. This includes re-reading
929          * the extent or metadata block that failed (the reason this fixup
930          * code was called), this time page by page, in order to know which
931          * pages caused I/O errors and which ones are good (for all
932          * mirrors).
933          * It is the goal to handle the situation when more than one
934          * mirror contains I/O errors, but the errors do not
935          * overlap, i.e. the data can be repaired by selecting the
936          * pages from those mirrors without I/O error on the
937          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
938          * would be that mirror #1 has an I/O error on the first page,
939          * the second page is good, and mirror #2 has an I/O error on
940          * the second page, but the first page is good.
941          * Then the first page of the first mirror can be repaired by
942          * taking the first page of the second mirror, and the
943          * second page of the second mirror can be repaired by
944          * copying the contents of the 2nd page of the 1st mirror.
945          * One more note: if the pages of one mirror contain I/O
946          * errors, the checksum cannot be verified. In order to get
947          * the best data for repairing, the first attempt is to find
948          * a mirror without I/O errors and with a validated checksum.
949          * Only if this is not possible, the pages are picked from
950          * mirrors with I/O errors without considering the checksum.
951          * If the latter is the case, at the end, the checksum of the
952          * repaired area is verified in order to correctly maintain
953          * the statistics.
954          */
955
956         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
957                                       sizeof(*sblocks_for_recheck), GFP_NOFS);
958         if (!sblocks_for_recheck) {
959                 spin_lock(&sctx->stat_lock);
960                 sctx->stat.malloc_errors++;
961                 sctx->stat.read_errors++;
962                 sctx->stat.uncorrectable_errors++;
963                 spin_unlock(&sctx->stat_lock);
964                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
965                 goto out;
966         }
967
968         /* setup the context, map the logical blocks and alloc the pages */
969         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
970         if (ret) {
971                 spin_lock(&sctx->stat_lock);
972                 sctx->stat.read_errors++;
973                 sctx->stat.uncorrectable_errors++;
974                 spin_unlock(&sctx->stat_lock);
975                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
976                 goto out;
977         }
978         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
979         sblock_bad = sblocks_for_recheck + failed_mirror_index;
980
981         /* build and submit the bios for the failed mirror, check checksums */
982         scrub_recheck_block(fs_info, sblock_bad, 1);
983
984         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
985             sblock_bad->no_io_error_seen) {
986                 /*
987                  * the error disappeared after reading page by page, or
988                  * the area was part of a huge bio and other parts of the
989                  * bio caused I/O errors, or the block layer merged several
990                  * read requests into one and the error was caused by a
991                  * different bio (usually one of the latter two cases is
992                  * the cause)
993                  */
994                 spin_lock(&sctx->stat_lock);
995                 sctx->stat.unverified_errors++;
996                 sblock_to_check->data_corrected = 1;
997                 spin_unlock(&sctx->stat_lock);
998
999                 if (sctx->is_dev_replace)
1000                         scrub_write_block_to_dev_replace(sblock_bad);
1001                 goto out;
1002         }
1003
1004         if (!sblock_bad->no_io_error_seen) {
1005                 spin_lock(&sctx->stat_lock);
1006                 sctx->stat.read_errors++;
1007                 spin_unlock(&sctx->stat_lock);
1008                 if (__ratelimit(&_rs))
1009                         scrub_print_warning("i/o error", sblock_to_check);
1010                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1011         } else if (sblock_bad->checksum_error) {
1012                 spin_lock(&sctx->stat_lock);
1013                 sctx->stat.csum_errors++;
1014                 spin_unlock(&sctx->stat_lock);
1015                 if (__ratelimit(&_rs))
1016                         scrub_print_warning("checksum error", sblock_to_check);
1017                 btrfs_dev_stat_inc_and_print(dev,
1018                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1019         } else if (sblock_bad->header_error) {
1020                 spin_lock(&sctx->stat_lock);
1021                 sctx->stat.verify_errors++;
1022                 spin_unlock(&sctx->stat_lock);
1023                 if (__ratelimit(&_rs))
1024                         scrub_print_warning("checksum/header error",
1025                                             sblock_to_check);
1026                 if (sblock_bad->generation_error)
1027                         btrfs_dev_stat_inc_and_print(dev,
1028                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1029                 else
1030                         btrfs_dev_stat_inc_and_print(dev,
1031                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1032         }
1033
1034         if (sctx->readonly) {
1035                 ASSERT(!sctx->is_dev_replace);
1036                 goto out;
1037         }
1038
1039         if (!is_metadata && !have_csum) {
1040                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1041
1042                 WARN_ON(sctx->is_dev_replace);
1043
1044 nodatasum_case:
1045
1046                 /*
1047                  * !is_metadata and !have_csum: this means that the data
1048                  * might not be COWed, i.e. it might be modified
1049                  * concurrently. The general strategy of working on the
1050                  * commit root does not help in the case when COW is not
1051                  * used.
1052                  */
1053                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1054                 if (!fixup_nodatasum)
1055                         goto did_not_correct_error;
1056                 fixup_nodatasum->sctx = sctx;
1057                 fixup_nodatasum->dev = dev;
1058                 fixup_nodatasum->logical = logical;
1059                 fixup_nodatasum->root = fs_info->extent_root;
1060                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1061                 scrub_pending_trans_workers_inc(sctx);
1062                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1063                                 scrub_fixup_nodatasum, NULL, NULL);
1064                 btrfs_queue_work(fs_info->scrub_workers,
1065                                  &fixup_nodatasum->work);
1066                 goto out;
1067         }
1068
1069         /*
1070          * now build and submit the bios for the other mirrors, check
1071          * checksums.
1072          * First try to pick the mirror which is completely without I/O
1073          * errors and also does not have a checksum error.
1074          * If one is found, and if a checksum is present, the full block
1075          * that is known to contain an error is rewritten. Afterwards
1076          * the block is known to be corrected.
1077          * If a mirror is found which is completely correct, and no
1078          * checksum is present, only those pages are rewritten that had
1079          * an I/O error in the block to be repaired, since it cannot be
1080          * determined, which copy of the other pages is better (and it
1081          * could happen otherwise that a correct page would be
1082          * overwritten by a bad one).
1083          */
1084         for (mirror_index = 0;
1085              mirror_index < BTRFS_MAX_MIRRORS &&
1086              sblocks_for_recheck[mirror_index].page_count > 0;
1087              mirror_index++) {
1088                 struct scrub_block *sblock_other;
1089
1090                 if (mirror_index == failed_mirror_index)
1091                         continue;
1092                 sblock_other = sblocks_for_recheck + mirror_index;
1093
1094                 /* build and submit the bios, check checksums */
1095                 scrub_recheck_block(fs_info, sblock_other, 0);
1096
1097                 if (!sblock_other->header_error &&
1098                     !sblock_other->checksum_error &&
1099                     sblock_other->no_io_error_seen) {
1100                         if (sctx->is_dev_replace) {
1101                                 scrub_write_block_to_dev_replace(sblock_other);
1102                                 goto corrected_error;
1103                         } else {
1104                                 ret = scrub_repair_block_from_good_copy(
1105                                                 sblock_bad, sblock_other);
1106                                 if (!ret)
1107                                         goto corrected_error;
1108                         }
1109                 }
1110         }
1111
1112         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1113                 goto did_not_correct_error;
1114
1115         /*
1116          * In case of I/O errors in the area that is supposed to be
1117          * repaired, continue by picking good copies of those pages.
1118          * Select the good pages from mirrors to rewrite bad pages from
1119          * the area to fix. Afterwards verify the checksum of the block
1120          * that is supposed to be repaired. This verification step is
1121          * only done for the purpose of statistics counting and for the
1122          * final scrub report on whether errors remain.
1123          * A perfect algorithm could make use of the checksum and try
1124          * all possible combinations of pages from the different mirrors
1125          * until the checksum verification succeeds. For example, when
1126          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1127          * of mirror #2 is readable but the final checksum test fails,
1128          * then the 2nd page of mirror #3 could be tried to see whether
1129          * the final checksum then succeeds. But this would be a rare
1130          * exception and is therefore not implemented. At least we avoid
1131          * overwriting the good copy.
1132          * A more useful improvement would be to pick the sectors
1133          * without I/O error based on sector sizes (512 bytes on legacy
1134          * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1135          * mirror could be repaired by taking 512 bytes of a different
1136          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1137          * area are unreadable.
1138          */
1139         success = 1;
1140         for (page_num = 0; page_num < sblock_bad->page_count;
1141              page_num++) {
1142                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1143                 struct scrub_block *sblock_other = NULL;
1144
1145                 /* skip no-io-error page in scrub */
1146                 if (!page_bad->io_error && !sctx->is_dev_replace)
1147                         continue;
1148
1149                 /* try to find no-io-error page in mirrors */
1150                 if (page_bad->io_error) {
1151                         for (mirror_index = 0;
1152                              mirror_index < BTRFS_MAX_MIRRORS &&
1153                              sblocks_for_recheck[mirror_index].page_count > 0;
1154                              mirror_index++) {
1155                                 if (!sblocks_for_recheck[mirror_index].
1156                                     pagev[page_num]->io_error) {
1157                                         sblock_other = sblocks_for_recheck +
1158                                                        mirror_index;
1159                                         break;
1160                                 }
1161                         }
1162                         if (!sblock_other)
1163                                 success = 0;
1164                 }
1165
1166                 if (sctx->is_dev_replace) {
1167                         /*
1168                          * did not find a mirror to fetch the page
1169                          * from. scrub_write_page_to_dev_replace()
1170                          * handles this case (page->io_error) by
1171                          * filling the block with zeros before
1172                          * submitting the write request.
1173                          */
1174                         if (!sblock_other)
1175                                 sblock_other = sblock_bad;
1176
1177                         if (scrub_write_page_to_dev_replace(sblock_other,
1178                                                             page_num) != 0) {
1179                                 btrfs_dev_replace_stats_inc(
1180                                         &fs_info->dev_replace.num_write_errors);
1181                                 success = 0;
1182                         }
1183                 } else if (sblock_other) {
1184                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1185                                                                sblock_other,
1186                                                                page_num, 0);
1187                         if (0 == ret)
1188                                 page_bad->io_error = 0;
1189                         else
1190                                 success = 0;
1191                 }
1192         }
1193
1194         if (success && !sctx->is_dev_replace) {
1195                 if (is_metadata || have_csum) {
1196                         /*
1197                          * need to verify the checksum now that all
1198                          * sectors on disk are repaired (the write
1199                          * request for data to be repaired is on its way).
1200                          * Just be lazy and use scrub_recheck_block()
1201                          * which re-reads the data before the checksum
1202                          * is verified, but most likely the data comes out
1203                          * of the page cache.
1204                          */
1205                         scrub_recheck_block(fs_info, sblock_bad, 1);
1206                         if (!sblock_bad->header_error &&
1207                             !sblock_bad->checksum_error &&
1208                             sblock_bad->no_io_error_seen)
1209                                 goto corrected_error;
1210                         else
1211                                 goto did_not_correct_error;
1212                 } else {
1213 corrected_error:
1214                         spin_lock(&sctx->stat_lock);
1215                         sctx->stat.corrected_errors++;
1216                         sblock_to_check->data_corrected = 1;
1217                         spin_unlock(&sctx->stat_lock);
1218                         btrfs_err_rl_in_rcu(fs_info,
1219                                 "fixed up error at logical %llu on dev %s",
1220                                 logical, rcu_str_deref(dev->name));
1221                 }
1222         } else {
1223 did_not_correct_error:
1224                 spin_lock(&sctx->stat_lock);
1225                 sctx->stat.uncorrectable_errors++;
1226                 spin_unlock(&sctx->stat_lock);
1227                 btrfs_err_rl_in_rcu(fs_info,
1228                         "unable to fixup (regular) error at logical %llu on dev %s",
1229                         logical, rcu_str_deref(dev->name));
1230         }
1231
1232 out:
1233         if (sblocks_for_recheck) {
1234                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1235                      mirror_index++) {
1236                         struct scrub_block *sblock = sblocks_for_recheck +
1237                                                      mirror_index;
1238                         struct scrub_recover *recover;
1239                         int page_index;
1240
1241                         for (page_index = 0; page_index < sblock->page_count;
1242                              page_index++) {
1243                                 sblock->pagev[page_index]->sblock = NULL;
1244                                 recover = sblock->pagev[page_index]->recover;
1245                                 if (recover) {
1246                                         scrub_put_recover(recover);
1247                                         sblock->pagev[page_index]->recover =
1248                                                                         NULL;
1249                                 }
1250                                 scrub_page_put(sblock->pagev[page_index]);
1251                         }
1252                 }
1253                 kfree(sblocks_for_recheck);
1254         }
1255
1256         return 0;
1257 }
1258
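/*
 * Number of different ways a block can be read back: 2 for RAID5 (data or
 * rebuilt from parity), 3 for RAID6 (data, P or Q rebuild), otherwise
 * simply the number of stripes, i.e. the copies of a mirrored profile.
 */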
1259 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1260 {
1261         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1262                 return 2;
1263         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1264                 return 3;
1265         else
1266                 return (int)bbio->num_stripes;
1267 }
1268
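/*
 * Map a logical address back to a stripe of the bbio: for RAID5/6 the
 * raid_map gives the logical start of each stripe, so find the data stripe
 * that covers @logical and the offset into it; for mirrored profiles the
 * mirror number directly selects the stripe.
 */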
1269 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1270                                                  u64 *raid_map,
1271                                                  u64 mapped_length,
1272                                                  int nstripes, int mirror,
1273                                                  int *stripe_index,
1274                                                  u64 *stripe_offset)
1275 {
1276         int i;
1277
1278         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1279                 /* RAID5/6 */
1280                 for (i = 0; i < nstripes; i++) {
1281                         if (raid_map[i] == RAID6_Q_STRIPE ||
1282                             raid_map[i] == RAID5_P_STRIPE)
1283                                 continue;
1284
1285                         if (logical >= raid_map[i] &&
1286                             logical < raid_map[i] + mapped_length)
1287                                 break;
1288                 }
1289
1290                 *stripe_index = i;
1291                 *stripe_offset = logical - raid_map[i];
1292         } else {
1293                 /* The other RAID type */
1294                 *stripe_index = mirror;
1295                 *stripe_offset = 0;
1296         }
1297 }
1298
1299 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1300                                      struct scrub_block *sblocks_for_recheck)
1301 {
1302         struct scrub_ctx *sctx = original_sblock->sctx;
1303         struct btrfs_fs_info *fs_info = sctx->fs_info;
1304         u64 length = original_sblock->page_count * PAGE_SIZE;
1305         u64 logical = original_sblock->pagev[0]->logical;
1306         u64 generation = original_sblock->pagev[0]->generation;
1307         u64 flags = original_sblock->pagev[0]->flags;
1308         u64 have_csum = original_sblock->pagev[0]->have_csum;
1309         struct scrub_recover *recover;
1310         struct btrfs_bio *bbio;
1311         u64 sublen;
1312         u64 mapped_length;
1313         u64 stripe_offset;
1314         int stripe_index;
1315         int page_index = 0;
1316         int mirror_index;
1317         int nmirrors;
1318         int ret;
1319
1320         /*
1321          * note: the two members refs and outstanding_pages
1322          * are not used (and not set) in the blocks that are used for
1323          * the recheck procedure
1324          */
1325
1326         while (length > 0) {
1327                 sublen = min_t(u64, length, PAGE_SIZE);
1328                 mapped_length = sublen;
1329                 bbio = NULL;
1330
1331                 /*
1332                  * with a length of PAGE_SIZE, each returned stripe
1333                  * represents one mirror
1334                  */
1335                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1336                                 logical, &mapped_length, &bbio, 0, 1);
1337                 if (ret || !bbio || mapped_length < sublen) {
1338                         btrfs_put_bbio(bbio);
1339                         return -EIO;
1340                 }
1341
1342                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1343                 if (!recover) {
1344                         btrfs_put_bbio(bbio);
1345                         return -ENOMEM;
1346                 }
1347
1348                 atomic_set(&recover->refs, 1);
1349                 recover->bbio = bbio;
1350                 recover->map_length = mapped_length;
1351
1352                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1353
1354                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1355
1356                 for (mirror_index = 0; mirror_index < nmirrors;
1357                      mirror_index++) {
1358                         struct scrub_block *sblock;
1359                         struct scrub_page *page;
1360
1361                         sblock = sblocks_for_recheck + mirror_index;
1362                         sblock->sctx = sctx;
1363
1364                         page = kzalloc(sizeof(*page), GFP_NOFS);
1365                         if (!page) {
1366 leave_nomem:
1367                                 spin_lock(&sctx->stat_lock);
1368                                 sctx->stat.malloc_errors++;
1369                                 spin_unlock(&sctx->stat_lock);
1370                                 scrub_put_recover(recover);
1371                                 return -ENOMEM;
1372                         }
1373                         scrub_page_get(page);
1374                         sblock->pagev[page_index] = page;
1375                         page->sblock = sblock;
1376                         page->flags = flags;
1377                         page->generation = generation;
1378                         page->logical = logical;
1379                         page->have_csum = have_csum;
1380                         if (have_csum)
1381                                 memcpy(page->csum,
1382                                        original_sblock->pagev[0]->csum,
1383                                        sctx->csum_size);
1384
1385                         scrub_stripe_index_and_offset(logical,
1386                                                       bbio->map_type,
1387                                                       bbio->raid_map,
1388                                                       mapped_length,
1389                                                       bbio->num_stripes -
1390                                                       bbio->num_tgtdevs,
1391                                                       mirror_index,
1392                                                       &stripe_index,
1393                                                       &stripe_offset);
1394                         page->physical = bbio->stripes[stripe_index].physical +
1395                                          stripe_offset;
1396                         page->dev = bbio->stripes[stripe_index].dev;
1397
1398                         BUG_ON(page_index >= original_sblock->page_count);
1399                         page->physical_for_dev_replace =
1400                                 original_sblock->pagev[page_index]->
1401                                 physical_for_dev_replace;
1402                         /* for missing devices, dev->bdev is NULL */
1403                         page->mirror_num = mirror_index + 1;
1404                         sblock->page_count++;
1405                         page->page = alloc_page(GFP_NOFS);
1406                         if (!page->page)
1407                                 goto leave_nomem;
1408
1409                         scrub_get_recover(recover);
1410                         page->recover = recover;
1411                 }
1412                 scrub_put_recover(recover);
1413                 length -= sublen;
1414                 logical += sublen;
1415                 page_index++;
1416         }
1417
1418         return 0;
1419 }
1420
1421 struct scrub_bio_ret {
1422         struct completion event;
1423         int error;
1424 };
1425
1426 static void scrub_bio_wait_endio(struct bio *bio)
1427 {
1428         struct scrub_bio_ret *ret = bio->bi_private;
1429
1430         ret->error = bio->bi_error;
1431         complete(&ret->event);
1432 }
1433
1434 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1435 {
1436         return page->recover &&
1437                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1438 }
1439
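/*
 * Synchronously read one page that sits on a RAID5/6 stripe by handing
 * the bio to the raid56 recovery code. The completion in scrub_bio_ret
 * is used to wait for the rebuilt data; any error is reported as -EIO.
 */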
1440 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1441                                         struct bio *bio,
1442                                         struct scrub_page *page)
1443 {
1444         struct scrub_bio_ret done;
1445         int ret;
1446
1447         init_completion(&done.event);
1448         done.error = 0;
1449         bio->bi_iter.bi_sector = page->logical >> 9;
1450         bio->bi_private = &done;
1451         bio->bi_end_io = scrub_bio_wait_endio;
1452
1453         ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1454                                     page->recover->map_length,
1455                                     page->mirror_num, 0);
1456         if (ret)
1457                 return ret;
1458
1459         wait_for_completion(&done.event);
1460         if (done.error)
1461                 return -EIO;
1462
1463         return 0;
1464 }
1465
1466 /*
1467  * This function checks the on-disk data for checksum errors, header
1468  * errors and read I/O errors. If any I/O error happens, the exact pages
1469  * that failed to read are marked as bad. The goal is to enable scrub to
1470  * take the pages that are not errored from all the mirrors so that the
1471  * pages that are errored in the mirror just handled can be repaired.
1472  */
1473 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1474                                 struct scrub_block *sblock,
1475                                 int retry_failed_mirror)
1476 {
1477         int page_num;
1478
1479         sblock->no_io_error_seen = 1;
1480
1481         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1482                 struct bio *bio;
1483                 struct scrub_page *page = sblock->pagev[page_num];
1484
1485                 if (page->dev->bdev == NULL) {
1486                         page->io_error = 1;
1487                         sblock->no_io_error_seen = 0;
1488                         continue;
1489                 }
1490
1491                 WARN_ON(!page->page);
1492                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1493                 if (!bio) {
1494                         page->io_error = 1;
1495                         sblock->no_io_error_seen = 0;
1496                         continue;
1497                 }
1498                 bio->bi_bdev = page->dev->bdev;
1499
1500                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1501                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1502                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1503                                 sblock->no_io_error_seen = 0;
1504                 } else {
1505                         bio->bi_iter.bi_sector = page->physical >> 9;
1506                         bio_set_op_attrs(bio, REQ_OP_READ, 0);
1507
1508                         if (btrfsic_submit_bio_wait(bio))
1509                                 sblock->no_io_error_seen = 0;
1510                 }
1511
1512                 bio_put(bio);
1513         }
1514
1515         if (sblock->no_io_error_seen)
1516                 scrub_recheck_block_checksum(sblock);
1517 }
1518
1519 static inline int scrub_check_fsid(u8 fsid[],
1520                                    struct scrub_page *spage)
1521 {
1522         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1523         int ret;
1524
1525         ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1526         return !ret;
1527 }
1528
1529 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1530 {
1531         sblock->header_error = 0;
1532         sblock->checksum_error = 0;
1533         sblock->generation_error = 0;
1534
1535         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1536                 scrub_checksum_data(sblock);
1537         else
1538                 scrub_checksum_tree_block(sblock);
1539 }
1540
1541 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1542                                              struct scrub_block *sblock_good)
1543 {
1544         int page_num;
1545         int ret = 0;
1546
1547         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1548                 int ret_sub;
1549
1550                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1551                                                            sblock_good,
1552                                                            page_num, 1);
1553                 if (ret_sub)
1554                         ret = ret_sub;
1555         }
1556
1557         return ret;
1558 }
1559
1560 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1561                                             struct scrub_block *sblock_good,
1562                                             int page_num, int force_write)
1563 {
1564         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1565         struct scrub_page *page_good = sblock_good->pagev[page_num];
1566         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1567
1568         BUG_ON(page_bad->page == NULL);
1569         BUG_ON(page_good->page == NULL);
1570         if (force_write || sblock_bad->header_error ||
1571             sblock_bad->checksum_error || page_bad->io_error) {
1572                 struct bio *bio;
1573                 int ret;
1574
1575                 if (!page_bad->dev->bdev) {
1576                         btrfs_warn_rl(fs_info,
1577                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1578                         return -EIO;
1579                 }
1580
1581                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1582                 if (!bio)
1583                         return -EIO;
1584                 bio->bi_bdev = page_bad->dev->bdev;
1585                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1586                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1587
1588                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1589                 if (PAGE_SIZE != ret) {
1590                         bio_put(bio);
1591                         return -EIO;
1592                 }
1593
1594                 if (btrfsic_submit_bio_wait(bio)) {
1595                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1596                                 BTRFS_DEV_STAT_WRITE_ERRS);
1597                         btrfs_dev_replace_stats_inc(
1598                                 &fs_info->dev_replace.num_write_errors);
1599                         bio_put(bio);
1600                         return -EIO;
1601                 }
1602                 bio_put(bio);
1603         }
1604
1605         return 0;
1606 }
1607
1608 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1609 {
1610         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1611         int page_num;
1612
1613         /*
1614          * This block is used to check the parity on the source device,
1615          * so the data need not be written to the destination device.
1616          */
1617         if (sblock->sparity)
1618                 return;
1619
1620         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1621                 int ret;
1622
1623                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1624                 if (ret)
1625                         btrfs_dev_replace_stats_inc(
1626                                 &fs_info->dev_replace.num_write_errors);
1627         }
1628 }
1629
1630 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1631                                            int page_num)
1632 {
1633         struct scrub_page *spage = sblock->pagev[page_num];
1634
1635         BUG_ON(spage->page == NULL);
1636         if (spage->io_error) {
1637                 void *mapped_buffer = kmap_atomic(spage->page);
1638
1639                 memset(mapped_buffer, 0, PAGE_SIZE);
1640                 flush_dcache_page(spage->page);
1641                 kunmap_atomic(mapped_buffer);
1642         }
1643         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1644 }
1645
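/*
 * Queue one page for writing to the dev-replace target. Pages that are
 * physically and logically contiguous are collected into the current
 * write bio; as soon as a page does not fit that pattern, or the bio is
 * full (pages_per_wr_bio), the bio is submitted and a fresh one is
 * started.
 */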
1646 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1647                                     struct scrub_page *spage)
1648 {
1649         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1650         struct scrub_bio *sbio;
1651         int ret;
1652
1653         mutex_lock(&wr_ctx->wr_lock);
1654 again:
1655         if (!wr_ctx->wr_curr_bio) {
1656                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1657                                               GFP_KERNEL);
1658                 if (!wr_ctx->wr_curr_bio) {
1659                         mutex_unlock(&wr_ctx->wr_lock);
1660                         return -ENOMEM;
1661                 }
1662                 wr_ctx->wr_curr_bio->sctx = sctx;
1663                 wr_ctx->wr_curr_bio->page_count = 0;
1664         }
1665         sbio = wr_ctx->wr_curr_bio;
1666         if (sbio->page_count == 0) {
1667                 struct bio *bio;
1668
1669                 sbio->physical = spage->physical_for_dev_replace;
1670                 sbio->logical = spage->logical;
1671                 sbio->dev = wr_ctx->tgtdev;
1672                 bio = sbio->bio;
1673                 if (!bio) {
1674                         bio = btrfs_io_bio_alloc(GFP_KERNEL,
1675                                         wr_ctx->pages_per_wr_bio);
1676                         if (!bio) {
1677                                 mutex_unlock(&wr_ctx->wr_lock);
1678                                 return -ENOMEM;
1679                         }
1680                         sbio->bio = bio;
1681                 }
1682
1683                 bio->bi_private = sbio;
1684                 bio->bi_end_io = scrub_wr_bio_end_io;
1685                 bio->bi_bdev = sbio->dev->bdev;
1686                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1687                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1688                 sbio->err = 0;
1689         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1690                    spage->physical_for_dev_replace ||
1691                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1692                    spage->logical) {
1693                 scrub_wr_submit(sctx);
1694                 goto again;
1695         }
1696
1697         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1698         if (ret != PAGE_SIZE) {
1699                 if (sbio->page_count < 1) {
1700                         bio_put(sbio->bio);
1701                         sbio->bio = NULL;
1702                         mutex_unlock(&wr_ctx->wr_lock);
1703                         return -EIO;
1704                 }
1705                 scrub_wr_submit(sctx);
1706                 goto again;
1707         }
1708
1709         sbio->pagev[sbio->page_count] = spage;
1710         scrub_page_get(spage);
1711         sbio->page_count++;
1712         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1713                 scrub_wr_submit(sctx);
1714         mutex_unlock(&wr_ctx->wr_lock);
1715
1716         return 0;
1717 }
1718
1719 static void scrub_wr_submit(struct scrub_ctx *sctx)
1720 {
1721         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1722         struct scrub_bio *sbio;
1723
1724         if (!wr_ctx->wr_curr_bio)
1725                 return;
1726
1727         sbio = wr_ctx->wr_curr_bio;
1728         wr_ctx->wr_curr_bio = NULL;
1729         WARN_ON(!sbio->bio->bi_bdev);
1730         scrub_pending_bio_inc(sctx);
1731         /* Process all writes in a single worker thread, so the block layer
1732          * can order the requests before sending them to the driver; this
1733          * doubled the write performance on spinning disks when measured
1734          * with Linux 3.5 */
1735         btrfsic_submit_bio(sbio->bio);
1736 }
1737
1738 static void scrub_wr_bio_end_io(struct bio *bio)
1739 {
1740         struct scrub_bio *sbio = bio->bi_private;
1741         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1742
1743         sbio->err = bio->bi_error;
1744         sbio->bio = bio;
1745
1746         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1747                          scrub_wr_bio_end_io_worker, NULL, NULL);
1748         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1749 }
1750
1751 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1752 {
1753         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1754         struct scrub_ctx *sctx = sbio->sctx;
1755         int i;
1756
1757         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1758         if (sbio->err) {
1759                 struct btrfs_dev_replace *dev_replace =
1760                         &sbio->sctx->fs_info->dev_replace;
1761
1762                 for (i = 0; i < sbio->page_count; i++) {
1763                         struct scrub_page *spage = sbio->pagev[i];
1764
1765                         spage->io_error = 1;
1766                         btrfs_dev_replace_stats_inc(&dev_replace->
1767                                                     num_write_errors);
1768                 }
1769         }
1770
1771         for (i = 0; i < sbio->page_count; i++)
1772                 scrub_page_put(sbio->pagev[i]);
1773
1774         bio_put(sbio->bio);
1775         kfree(sbio);
1776         scrub_pending_bio_dec(sctx);
1777 }
1778
1779 static int scrub_checksum(struct scrub_block *sblock)
1780 {
1781         u64 flags;
1782         int ret;
1783
1784         /*
1785          * No need to initialize these stats currently, because the
1786          * callers of this function only use the return value instead
1787          * of these stat values.
1788          *
1789          * Todo:
1790          * always use the stats
1791          */
1792         sblock->header_error = 0;
1793         sblock->generation_error = 0;
1794         sblock->checksum_error = 0;
1795
1796         WARN_ON(sblock->page_count < 1);
1797         flags = sblock->pagev[0]->flags;
1798         ret = 0;
1799         if (flags & BTRFS_EXTENT_FLAG_DATA)
1800                 ret = scrub_checksum_data(sblock);
1801         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1802                 ret = scrub_checksum_tree_block(sblock);
1803         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1804                 (void)scrub_checksum_super(sblock);
1805         else
1806                 WARN_ON(1);
1807         if (ret)
1808                 scrub_handle_errored_block(sblock);
1809
1810         return ret;
1811 }
1812
1813 static int scrub_checksum_data(struct scrub_block *sblock)
1814 {
1815         struct scrub_ctx *sctx = sblock->sctx;
1816         u8 csum[BTRFS_CSUM_SIZE];
1817         u8 *on_disk_csum;
1818         struct page *page;
1819         void *buffer;
1820         u32 crc = ~(u32)0;
1821         u64 len;
1822         int index;
1823
1824         BUG_ON(sblock->page_count < 1);
1825         if (!sblock->pagev[0]->have_csum)
1826                 return 0;
1827
1828         on_disk_csum = sblock->pagev[0]->csum;
1829         page = sblock->pagev[0]->page;
1830         buffer = kmap_atomic(page);
1831
1832         len = sctx->sectorsize;
1833         index = 0;
1834         for (;;) {
1835                 u64 l = min_t(u64, len, PAGE_SIZE);
1836
1837                 crc = btrfs_csum_data(buffer, crc, l);
1838                 kunmap_atomic(buffer);
1839                 len -= l;
1840                 if (len == 0)
1841                         break;
1842                 index++;
1843                 BUG_ON(index >= sblock->page_count);
1844                 BUG_ON(!sblock->pagev[index]->page);
1845                 page = sblock->pagev[index]->page;
1846                 buffer = kmap_atomic(page);
1847         }
1848
1849         btrfs_csum_final(crc, csum);
1850         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1851                 sblock->checksum_error = 1;
1852
1853         return sblock->checksum_error;
1854 }
1855
1856 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1857 {
1858         struct scrub_ctx *sctx = sblock->sctx;
1859         struct btrfs_header *h;
1860         struct btrfs_fs_info *fs_info = sctx->fs_info;
1861         u8 calculated_csum[BTRFS_CSUM_SIZE];
1862         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1863         struct page *page;
1864         void *mapped_buffer;
1865         u64 mapped_size;
1866         void *p;
1867         u32 crc = ~(u32)0;
1868         u64 len;
1869         int index;
1870
1871         BUG_ON(sblock->page_count < 1);
1872         page = sblock->pagev[0]->page;
1873         mapped_buffer = kmap_atomic(page);
1874         h = (struct btrfs_header *)mapped_buffer;
1875         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1876
1877         /*
1878          * we don't use the getter functions here, as we
1879          * a) don't have an extent buffer and
1880          * b) the page is already kmapped
1881          */
1882         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1883                 sblock->header_error = 1;
1884
1885         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
1886                 sblock->header_error = 1;
1887                 sblock->generation_error = 1;
1888         }
1889
1890         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1891                 sblock->header_error = 1;
1892
1893         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1894                    BTRFS_UUID_SIZE))
1895                 sblock->header_error = 1;
1896
1897         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1898         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1899         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1900         index = 0;
1901         for (;;) {
1902                 u64 l = min_t(u64, len, mapped_size);
1903
1904                 crc = btrfs_csum_data(p, crc, l);
1905                 kunmap_atomic(mapped_buffer);
1906                 len -= l;
1907                 if (len == 0)
1908                         break;
1909                 index++;
1910                 BUG_ON(index >= sblock->page_count);
1911                 BUG_ON(!sblock->pagev[index]->page);
1912                 page = sblock->pagev[index]->page;
1913                 mapped_buffer = kmap_atomic(page);
1914                 mapped_size = PAGE_SIZE;
1915                 p = mapped_buffer;
1916         }
1917
1918         btrfs_csum_final(crc, calculated_csum);
1919         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1920                 sblock->checksum_error = 1;
1921
1922         return sblock->header_error || sblock->checksum_error;
1923 }
1924
1925 static int scrub_checksum_super(struct scrub_block *sblock)
1926 {
1927         struct btrfs_super_block *s;
1928         struct scrub_ctx *sctx = sblock->sctx;
1929         u8 calculated_csum[BTRFS_CSUM_SIZE];
1930         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1931         struct page *page;
1932         void *mapped_buffer;
1933         u64 mapped_size;
1934         void *p;
1935         u32 crc = ~(u32)0;
1936         int fail_gen = 0;
1937         int fail_cor = 0;
1938         u64 len;
1939         int index;
1940
1941         BUG_ON(sblock->page_count < 1);
1942         page = sblock->pagev[0]->page;
1943         mapped_buffer = kmap_atomic(page);
1944         s = (struct btrfs_super_block *)mapped_buffer;
1945         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1946
1947         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1948                 ++fail_cor;
1949
1950         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1951                 ++fail_gen;
1952
1953         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1954                 ++fail_cor;
1955
1956         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1957         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1958         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1959         index = 0;
1960         for (;;) {
1961                 u64 l = min_t(u64, len, mapped_size);
1962
1963                 crc = btrfs_csum_data(p, crc, l);
1964                 kunmap_atomic(mapped_buffer);
1965                 len -= l;
1966                 if (len == 0)
1967                         break;
1968                 index++;
1969                 BUG_ON(index >= sblock->page_count);
1970                 BUG_ON(!sblock->pagev[index]->page);
1971                 page = sblock->pagev[index]->page;
1972                 mapped_buffer = kmap_atomic(page);
1973                 mapped_size = PAGE_SIZE;
1974                 p = mapped_buffer;
1975         }
1976
1977         btrfs_csum_final(crc, calculated_csum);
1978         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1979                 ++fail_cor;
1980
1981         if (fail_cor + fail_gen) {
1982                 /*
1983                  * If we find an error in a super block, we just report it.
1984                  * The super blocks get rewritten with the next transaction
1985                  * commit anyway.
1986                  */
1987                 spin_lock(&sctx->stat_lock);
1988                 ++sctx->stat.super_errors;
1989                 spin_unlock(&sctx->stat_lock);
1990                 if (fail_cor)
1991                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1992                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1993                 else
1994                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1995                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1996         }
1997
1998         return fail_cor + fail_gen;
1999 }
2000
2001 static void scrub_block_get(struct scrub_block *sblock)
2002 {
2003         atomic_inc(&sblock->refs);
2004 }
2005
2006 static void scrub_block_put(struct scrub_block *sblock)
2007 {
2008         if (atomic_dec_and_test(&sblock->refs)) {
2009                 int i;
2010
2011                 if (sblock->sparity)
2012                         scrub_parity_put(sblock->sparity);
2013
2014                 for (i = 0; i < sblock->page_count; i++)
2015                         scrub_page_put(sblock->pagev[i]);
2016                 kfree(sblock);
2017         }
2018 }
2019
2020 static void scrub_page_get(struct scrub_page *spage)
2021 {
2022         atomic_inc(&spage->refs);
2023 }
2024
2025 static void scrub_page_put(struct scrub_page *spage)
2026 {
2027         if (atomic_dec_and_test(&spage->refs)) {
2028                 if (spage->page)
2029                         __free_page(spage->page);
2030                 kfree(spage);
2031         }
2032 }
2033
2034 static void scrub_submit(struct scrub_ctx *sctx)
2035 {
2036         struct scrub_bio *sbio;
2037
2038         if (sctx->curr == -1)
2039                 return;
2040
2041         sbio = sctx->bios[sctx->curr];
2042         sctx->curr = -1;
2043         scrub_pending_bio_inc(sctx);
2044         btrfsic_submit_bio(sbio->bio);
2045 }
2046
2047 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2048                                     struct scrub_page *spage)
2049 {
2050         struct scrub_block *sblock = spage->sblock;
2051         struct scrub_bio *sbio;
2052         int ret;
2053
2054 again:
2055         /*
2056          * grab a fresh bio or wait for one to become available
2057          */
2058         while (sctx->curr == -1) {
2059                 spin_lock(&sctx->list_lock);
2060                 sctx->curr = sctx->first_free;
2061                 if (sctx->curr != -1) {
2062                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2063                         sctx->bios[sctx->curr]->next_free = -1;
2064                         sctx->bios[sctx->curr]->page_count = 0;
2065                         spin_unlock(&sctx->list_lock);
2066                 } else {
2067                         spin_unlock(&sctx->list_lock);
2068                         wait_event(sctx->list_wait, sctx->first_free != -1);
2069                 }
2070         }
2071         sbio = sctx->bios[sctx->curr];
2072         if (sbio->page_count == 0) {
2073                 struct bio *bio;
2074
2075                 sbio->physical = spage->physical;
2076                 sbio->logical = spage->logical;
2077                 sbio->dev = spage->dev;
2078                 bio = sbio->bio;
2079                 if (!bio) {
2080                         bio = btrfs_io_bio_alloc(GFP_KERNEL,
2081                                         sctx->pages_per_rd_bio);
2082                         if (!bio)
2083                                 return -ENOMEM;
2084                         sbio->bio = bio;
2085                 }
2086
2087                 bio->bi_private = sbio;
2088                 bio->bi_end_io = scrub_bio_end_io;
2089                 bio->bi_bdev = sbio->dev->bdev;
2090                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2091                 bio_set_op_attrs(bio, REQ_OP_READ, 0);
2092                 sbio->err = 0;
2093         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2094                    spage->physical ||
2095                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2096                    spage->logical ||
2097                    sbio->dev != spage->dev) {
2098                 scrub_submit(sctx);
2099                 goto again;
2100         }
2101
2102         sbio->pagev[sbio->page_count] = spage;
2103         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2104         if (ret != PAGE_SIZE) {
2105                 if (sbio->page_count < 1) {
2106                         bio_put(sbio->bio);
2107                         sbio->bio = NULL;
2108                         return -EIO;
2109                 }
2110                 scrub_submit(sctx);
2111                 goto again;
2112         }
2113
2114         scrub_block_get(sblock); /* one for the page added to the bio */
2115         atomic_inc(&sblock->outstanding_pages);
2116         sbio->page_count++;
2117         if (sbio->page_count == sctx->pages_per_rd_bio)
2118                 scrub_submit(sctx);
2119
2120         return 0;
2121 }
2122
2123 static void scrub_missing_raid56_end_io(struct bio *bio)
2124 {
2125         struct scrub_block *sblock = bio->bi_private;
2126         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2127
2128         if (bio->bi_error)
2129                 sblock->no_io_error_seen = 0;
2130
2131         bio_put(bio);
2132
2133         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2134 }
2135
2136 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2137 {
2138         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2139         struct scrub_ctx *sctx = sblock->sctx;
2140         struct btrfs_fs_info *fs_info = sctx->fs_info;
2141         u64 logical;
2142         struct btrfs_device *dev;
2143
2144         logical = sblock->pagev[0]->logical;
2145         dev = sblock->pagev[0]->dev;
2146
2147         if (sblock->no_io_error_seen)
2148                 scrub_recheck_block_checksum(sblock);
2149
2150         if (!sblock->no_io_error_seen) {
2151                 spin_lock(&sctx->stat_lock);
2152                 sctx->stat.read_errors++;
2153                 spin_unlock(&sctx->stat_lock);
2154                 btrfs_err_rl_in_rcu(fs_info,
2155                         "IO error rebuilding logical %llu for dev %s",
2156                         logical, rcu_str_deref(dev->name));
2157         } else if (sblock->header_error || sblock->checksum_error) {
2158                 spin_lock(&sctx->stat_lock);
2159                 sctx->stat.uncorrectable_errors++;
2160                 spin_unlock(&sctx->stat_lock);
2161                 btrfs_err_rl_in_rcu(fs_info,
2162                         "failed to rebuild valid logical %llu for dev %s",
2163                         logical, rcu_str_deref(dev->name));
2164         } else {
2165                 scrub_write_block_to_dev_replace(sblock);
2166         }
2167
2168         scrub_block_put(sblock);
2169
2170         if (sctx->is_dev_replace &&
2171             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2172                 mutex_lock(&sctx->wr_ctx.wr_lock);
2173                 scrub_wr_submit(sctx);
2174                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2175         }
2176
2177         scrub_pending_bio_dec(sctx);
2178 }
2179
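/*
 * Pages that live on a missing device (RAID5/6 dev-replace only) cannot
 * be read directly. Instead a "missing" rbio is allocated and the raid56
 * code rebuilds the data from the remaining stripes; the result is then
 * checked and written to the replace target in
 * scrub_missing_raid56_worker().
 */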
2180 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2181 {
2182         struct scrub_ctx *sctx = sblock->sctx;
2183         struct btrfs_fs_info *fs_info = sctx->fs_info;
2184         u64 length = sblock->page_count * PAGE_SIZE;
2185         u64 logical = sblock->pagev[0]->logical;
2186         struct btrfs_bio *bbio = NULL;
2187         struct bio *bio;
2188         struct btrfs_raid_bio *rbio;
2189         int ret;
2190         int i;
2191
2192         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2193                         &length, &bbio, 0, 1);
2194         if (ret || !bbio || !bbio->raid_map)
2195                 goto bbio_out;
2196
2197         if (WARN_ON(!sctx->is_dev_replace ||
2198                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2199                 /*
2200                  * We shouldn't be scrubbing a missing device. Even for dev
2201                  * replace, we should only get here for RAID 5/6. We either
2202                  * managed to mount something with no mirrors remaining or
2203                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2204                  */
2205                 goto bbio_out;
2206         }
2207
2208         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2209         if (!bio)
2210                 goto bbio_out;
2211
2212         bio->bi_iter.bi_sector = logical >> 9;
2213         bio->bi_private = sblock;
2214         bio->bi_end_io = scrub_missing_raid56_end_io;
2215
2216         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2217         if (!rbio)
2218                 goto rbio_out;
2219
2220         for (i = 0; i < sblock->page_count; i++) {
2221                 struct scrub_page *spage = sblock->pagev[i];
2222
2223                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2224         }
2225
2226         btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2227                         scrub_missing_raid56_worker, NULL, NULL);
2228         scrub_block_get(sblock);
2229         scrub_pending_bio_inc(sctx);
2230         raid56_submit_missing_rbio(rbio);
2231         return;
2232
2233 rbio_out:
2234         bio_put(bio);
2235 bbio_out:
2236         btrfs_put_bbio(bbio);
2237         spin_lock(&sctx->stat_lock);
2238         sctx->stat.malloc_errors++;
2239         spin_unlock(&sctx->stat_lock);
2240 }
2241
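/*
 * Split the range [logical, logical + len) into PAGE_SIZE scrub_pages,
 * group them into one scrub_block and queue every page for reading.
 * Blocks that sit on a missing device are routed through the raid56
 * rebuild path instead of being read directly.
 */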
2242 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2243                        u64 physical, struct btrfs_device *dev, u64 flags,
2244                        u64 gen, int mirror_num, u8 *csum, int force,
2245                        u64 physical_for_dev_replace)
2246 {
2247         struct scrub_block *sblock;
2248         int index;
2249
2250         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2251         if (!sblock) {
2252                 spin_lock(&sctx->stat_lock);
2253                 sctx->stat.malloc_errors++;
2254                 spin_unlock(&sctx->stat_lock);
2255                 return -ENOMEM;
2256         }
2257
2258         /* one ref inside this function, plus one for each page added to
2259          * a bio later on */
2260         atomic_set(&sblock->refs, 1);
2261         sblock->sctx = sctx;
2262         sblock->no_io_error_seen = 1;
2263
2264         for (index = 0; len > 0; index++) {
2265                 struct scrub_page *spage;
2266                 u64 l = min_t(u64, len, PAGE_SIZE);
2267
2268                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2269                 if (!spage) {
2270 leave_nomem:
2271                         spin_lock(&sctx->stat_lock);
2272                         sctx->stat.malloc_errors++;
2273                         spin_unlock(&sctx->stat_lock);
2274                         scrub_block_put(sblock);
2275                         return -ENOMEM;
2276                 }
2277                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2278                 scrub_page_get(spage);
2279                 sblock->pagev[index] = spage;
2280                 spage->sblock = sblock;
2281                 spage->dev = dev;
2282                 spage->flags = flags;
2283                 spage->generation = gen;
2284                 spage->logical = logical;
2285                 spage->physical = physical;
2286                 spage->physical_for_dev_replace = physical_for_dev_replace;
2287                 spage->mirror_num = mirror_num;
2288                 if (csum) {
2289                         spage->have_csum = 1;
2290                         memcpy(spage->csum, csum, sctx->csum_size);
2291                 } else {
2292                         spage->have_csum = 0;
2293                 }
2294                 sblock->page_count++;
2295                 spage->page = alloc_page(GFP_KERNEL);
2296                 if (!spage->page)
2297                         goto leave_nomem;
2298                 len -= l;
2299                 logical += l;
2300                 physical += l;
2301                 physical_for_dev_replace += l;
2302         }
2303
2304         WARN_ON(sblock->page_count == 0);
2305         if (dev->missing) {
2306                 /*
2307                  * This case should only be hit for RAID 5/6 device replace. See
2308                  * the comment in scrub_missing_raid56_pages() for details.
2309                  */
2310                 scrub_missing_raid56_pages(sblock);
2311         } else {
2312                 for (index = 0; index < sblock->page_count; index++) {
2313                         struct scrub_page *spage = sblock->pagev[index];
2314                         int ret;
2315
2316                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2317                         if (ret) {
2318                                 scrub_block_put(sblock);
2319                                 return ret;
2320                         }
2321                 }
2322
2323                 if (force)
2324                         scrub_submit(sctx);
2325         }
2326
2327         /* last one frees, either here or in bio completion for last page */
2328         scrub_block_put(sblock);
2329         return 0;
2330 }
2331
2332 static void scrub_bio_end_io(struct bio *bio)
2333 {
2334         struct scrub_bio *sbio = bio->bi_private;
2335         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2336
2337         sbio->err = bio->bi_error;
2338         sbio->bio = bio;
2339
2340         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2341 }
2342
2343 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2344 {
2345         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2346         struct scrub_ctx *sctx = sbio->sctx;
2347         int i;
2348
2349         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2350         if (sbio->err) {
2351                 for (i = 0; i < sbio->page_count; i++) {
2352                         struct scrub_page *spage = sbio->pagev[i];
2353
2354                         spage->io_error = 1;
2355                         spage->sblock->no_io_error_seen = 0;
2356                 }
2357         }
2358
2359         /* now complete the scrub_block items that have all pages completed */
2360         for (i = 0; i < sbio->page_count; i++) {
2361                 struct scrub_page *spage = sbio->pagev[i];
2362                 struct scrub_block *sblock = spage->sblock;
2363
2364                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2365                         scrub_block_complete(sblock);
2366                 scrub_block_put(sblock);
2367         }
2368
2369         bio_put(sbio->bio);
2370         sbio->bio = NULL;
2371         spin_lock(&sctx->list_lock);
2372         sbio->next_free = sctx->first_free;
2373         sctx->first_free = sbio->index;
2374         spin_unlock(&sctx->list_lock);
2375
2376         if (sctx->is_dev_replace &&
2377             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2378                 mutex_lock(&sctx->wr_ctx.wr_lock);
2379                 scrub_wr_submit(sctx);
2380                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2381         }
2382
2383         scrub_pending_bio_dec(sctx);
2384 }
2385
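/*
 * Mark the sectors covered by [start, start + len) in the given
 * per-stripe bitmap. The range is expressed relative to logic_start and
 * may wrap around the end of the stripe, in which case the tail wraps
 * back to bit 0. E.g. (hypothetical numbers) with a 64K stripe_len and
 * 4K sectors (16 sectors per stripe), a 16K range starting at sector 14
 * sets bits 14-15 and then wraps to bits 0-1.
 */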
2386 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2387                                        unsigned long *bitmap,
2388                                        u64 start, u64 len)
2389 {
2390         u32 offset;
2391         int nsectors;
2392         int sectorsize = sparity->sctx->fs_info->sectorsize;
2393
2394         if (len >= sparity->stripe_len) {
2395                 bitmap_set(bitmap, 0, sparity->nsectors);
2396                 return;
2397         }
2398
2399         start -= sparity->logic_start;
2400         start = div_u64_rem(start, sparity->stripe_len, &offset);
2401         offset /= sectorsize;
2402         nsectors = (int)len / sectorsize;
2403
2404         if (offset + nsectors <= sparity->nsectors) {
2405                 bitmap_set(bitmap, offset, nsectors);
2406                 return;
2407         }
2408
2409         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2410         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2411 }
2412
2413 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2414                                                    u64 start, u64 len)
2415 {
2416         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2417 }
2418
2419 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2420                                                   u64 start, u64 len)
2421 {
2422         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2423 }
2424
2425 static void scrub_block_complete(struct scrub_block *sblock)
2426 {
2427         int corrupted = 0;
2428
2429         if (!sblock->no_io_error_seen) {
2430                 corrupted = 1;
2431                 scrub_handle_errored_block(sblock);
2432         } else {
2433                 /*
2434                  * In the dev replace case: if the block has a checksum
2435                  * error, it is written via the repair mechanism; otherwise
2436                  * it is written to the replace target right here.
2437                  */
2438                 corrupted = scrub_checksum(sblock);
2439                 if (!corrupted && sblock->sctx->is_dev_replace)
2440                         scrub_write_block_to_dev_replace(sblock);
2441         }
2442
2443         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2444                 u64 start = sblock->pagev[0]->logical;
2445                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2446                           PAGE_SIZE;
2447
2448                 scrub_parity_mark_sectors_error(sblock->sparity,
2449                                                 start, end - start);
2450         }
2451 }
2452
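/*
 * Look up the data checksum for @logical in the pre-loaded csum_list.
 * The list is sorted by bytenr, so entries that end before @logical can
 * be dropped (counted as csum_discards). On a hit, the checksum of the
 * matching sector is copied to @csum and 1 is returned; a miss returns 0
 * and the caller accounts it as no_csum.
 */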
2453 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2454 {
2455         struct btrfs_ordered_sum *sum = NULL;
2456         unsigned long index;
2457         unsigned long num_sectors;
2458
2459         while (!list_empty(&sctx->csum_list)) {
2460                 sum = list_first_entry(&sctx->csum_list,
2461                                        struct btrfs_ordered_sum, list);
2462                 if (sum->bytenr > logical)
2463                         return 0;
2464                 if (sum->bytenr + sum->len > logical)
2465                         break;
2466
2467                 ++sctx->stat.csum_discards;
2468                 list_del(&sum->list);
2469                 kfree(sum);
2470                 sum = NULL;
2471         }
2472         if (!sum)
2473                 return 0;
2474
2475         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2476         num_sectors = sum->len / sctx->sectorsize;
2477         memcpy(csum, sum->sums + index, sctx->csum_size);
2478         if (index == num_sectors - 1) {
2479                 list_del(&sum->list);
2480                 kfree(sum);
2481         }
2482         return 1;
2483 }
2484
2485 /* scrub_extent() tries to collect up to 64 kB for each bio */
2486 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2487                         u64 physical, struct btrfs_device *dev, u64 flags,
2488                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2489 {
2490         int ret;
2491         u8 csum[BTRFS_CSUM_SIZE];
2492         u32 blocksize;
2493
2494         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2495                 blocksize = sctx->sectorsize;
2496                 spin_lock(&sctx->stat_lock);
2497                 sctx->stat.data_extents_scrubbed++;
2498                 sctx->stat.data_bytes_scrubbed += len;
2499                 spin_unlock(&sctx->stat_lock);
2500         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2501                 blocksize = sctx->nodesize;
2502                 spin_lock(&sctx->stat_lock);
2503                 sctx->stat.tree_extents_scrubbed++;
2504                 sctx->stat.tree_bytes_scrubbed += len;
2505                 spin_unlock(&sctx->stat_lock);
2506         } else {
2507                 blocksize = sctx->sectorsize;
2508                 WARN_ON(1);
2509         }
2510
2511         while (len) {
2512                 u64 l = min_t(u64, len, blocksize);
2513                 int have_csum = 0;
2514
2515                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2516                         /* push csums to sbio */
2517                         have_csum = scrub_find_csum(sctx, logical, csum);
2518                         if (have_csum == 0)
2519                                 ++sctx->stat.no_csum;
2520                         if (sctx->is_dev_replace && !have_csum) {
2521                                 ret = copy_nocow_pages(sctx, logical, l,
2522                                                        mirror_num,
2523                                                       physical_for_dev_replace);
2524                                 goto behind_scrub_pages;
2525                         }
2526                 }
2527                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2528                                   mirror_num, have_csum ? csum : NULL, 0,
2529                                   physical_for_dev_replace);
2530 behind_scrub_pages:
2531                 if (ret)
2532                         return ret;
2533                 len -= l;
2534                 logical += l;
2535                 physical += l;
2536                 physical_for_dev_replace += l;
2537         }
2538         return 0;
2539 }
2540
2541 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2542                                   u64 logical, u64 len,
2543                                   u64 physical, struct btrfs_device *dev,
2544                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2545 {
2546         struct scrub_ctx *sctx = sparity->sctx;
2547         struct scrub_block *sblock;
2548         int index;
2549
2550         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2551         if (!sblock) {
2552                 spin_lock(&sctx->stat_lock);
2553                 sctx->stat.malloc_errors++;
2554                 spin_unlock(&sctx->stat_lock);
2555                 return -ENOMEM;
2556         }
2557
2558         /* one ref inside this function, plus one for each page added to
2559          * a bio later on */
2560         atomic_set(&sblock->refs, 1);
2561         sblock->sctx = sctx;
2562         sblock->no_io_error_seen = 1;
2563         sblock->sparity = sparity;
2564         scrub_parity_get(sparity);
2565
2566         for (index = 0; len > 0; index++) {
2567                 struct scrub_page *spage;
2568                 u64 l = min_t(u64, len, PAGE_SIZE);
2569
2570                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2571                 if (!spage) {
2572 leave_nomem:
2573                         spin_lock(&sctx->stat_lock);
2574                         sctx->stat.malloc_errors++;
2575                         spin_unlock(&sctx->stat_lock);
2576                         scrub_block_put(sblock);
2577                         return -ENOMEM;
2578                 }
2579                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2580                 /* For scrub block */
2581                 scrub_page_get(spage);
2582                 sblock->pagev[index] = spage;
2583                 /* For scrub parity */
2584                 scrub_page_get(spage);
2585                 list_add_tail(&spage->list, &sparity->spages);
2586                 spage->sblock = sblock;
2587                 spage->dev = dev;
2588                 spage->flags = flags;
2589                 spage->generation = gen;
2590                 spage->logical = logical;
2591                 spage->physical = physical;
2592                 spage->mirror_num = mirror_num;
2593                 if (csum) {
2594                         spage->have_csum = 1;
2595                         memcpy(spage->csum, csum, sctx->csum_size);
2596                 } else {
2597                         spage->have_csum = 0;
2598                 }
2599                 sblock->page_count++;
2600                 spage->page = alloc_page(GFP_KERNEL);
2601                 if (!spage->page)
2602                         goto leave_nomem;
2603                 len -= l;
2604                 logical += l;
2605                 physical += l;
2606         }
2607
2608         WARN_ON(sblock->page_count == 0);
2609         for (index = 0; index < sblock->page_count; index++) {
2610                 struct scrub_page *spage = sblock->pagev[index];
2611                 int ret;
2612
2613                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2614                 if (ret) {
2615                         scrub_block_put(sblock);
2616                         return ret;
2617                 }
2618         }
2619
2620         /* last one frees, either here or in bio completion for last page */
2621         scrub_block_put(sblock);
2622         return 0;
2623 }
2624
2625 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2626                                    u64 logical, u64 len,
2627                                    u64 physical, struct btrfs_device *dev,
2628                                    u64 flags, u64 gen, int mirror_num)
2629 {
2630         struct scrub_ctx *sctx = sparity->sctx;
2631         int ret;
2632         u8 csum[BTRFS_CSUM_SIZE];
2633         u32 blocksize;
2634
2635         if (dev->missing) {
2636                 scrub_parity_mark_sectors_error(sparity, logical, len);
2637                 return 0;
2638         }
2639
2640         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2641                 blocksize = sctx->sectorsize;
2642         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2643                 blocksize = sctx->nodesize;
2644         } else {
2645                 blocksize = sctx->sectorsize;
2646                 WARN_ON(1);
2647         }
2648
2649         while (len) {
2650                 u64 l = min_t(u64, len, blocksize);
2651                 int have_csum = 0;
2652
2653                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2654                         /* push csums to sbio */
2655                         have_csum = scrub_find_csum(sctx, logical, csum);
2656                         if (have_csum == 0)
2657                                 goto skip;
2658                 }
2659                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2660                                              flags, gen, mirror_num,
2661                                              have_csum ? csum : NULL);
2662                 if (ret)
2663                         return ret;
2664 skip:
2665                 len -= l;
2666                 logical += l;
2667                 physical += l;
2668         }
2669         return 0;
2670 }
2671
2672 /*
2673  * Given a physical address, this will calculate its
2674  * logical offset. If this is a parity stripe, it will return
2675  * the leftmost data stripe's logical offset.
2676  *
2677  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2678  */
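/*
 * Example (hypothetical layout): RAID5 over 3 devices with a 64K
 * stripe_len, so nr_data_stripes() == 2. For the device holding stripe
 * num, each 64K of physical offset corresponds to one data stripe out of
 * every full stripe, i.e. 128K of logical space, which is why
 * last_offset is scaled by nr_data_stripes(). The loop then walks the
 * candidate stripes of that full stripe and, using the same rotation as
 * the allocator, finds the one that actually lives on device num (or
 * reports that this device holds the parity for that full stripe).
 */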
2679 static int get_raid56_logic_offset(u64 physical, int num,
2680                                    struct map_lookup *map, u64 *offset,
2681                                    u64 *stripe_start)
2682 {
2683         int i;
2684         int j = 0;
2685         u64 stripe_nr;
2686         u64 last_offset;
2687         u32 stripe_index;
2688         u32 rot;
2689
2690         last_offset = (physical - map->stripes[num].physical) *
2691                       nr_data_stripes(map);
2692         if (stripe_start)
2693                 *stripe_start = last_offset;
2694
2695         *offset = last_offset;
2696         for (i = 0; i < nr_data_stripes(map); i++) {
2697                 *offset = last_offset + i * map->stripe_len;
2698
2699                 stripe_nr = div_u64(*offset, map->stripe_len);
2700                 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2701
2702                 /* Work out the disk rotation on this stripe-set */
2703                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2704                 /* calculate which stripe this data is located on */
2705                 rot += i;
2706                 stripe_index = rot % map->num_stripes;
2707                 if (stripe_index == num)
2708                         return 0;
2709                 if (stripe_index < num)
2710                         j++;
2711         }
2712         *offset = last_offset + j * map->stripe_len;
2713         return 1;
2714 }
2715
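/*
 * Final teardown of a scrub_parity. Any bits still set in ebitmap at
 * this point are sectors that could neither be read nor repaired, so
 * they are accounted as read/uncorrectable errors before the attached
 * pages are released.
 */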
2716 static void scrub_free_parity(struct scrub_parity *sparity)
2717 {
2718         struct scrub_ctx *sctx = sparity->sctx;
2719         struct scrub_page *curr, *next;
2720         int nbits;
2721
2722         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2723         if (nbits) {
2724                 spin_lock(&sctx->stat_lock);
2725                 sctx->stat.read_errors += nbits;
2726                 sctx->stat.uncorrectable_errors += nbits;
2727                 spin_unlock(&sctx->stat_lock);
2728         }
2729
2730         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2731                 list_del_init(&curr->list);
2732                 scrub_page_put(curr);
2733         }
2734
2735         kfree(sparity);
2736 }
2737
2738 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2739 {
2740         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2741                                                     work);
2742         struct scrub_ctx *sctx = sparity->sctx;
2743
2744         scrub_free_parity(sparity);
2745         scrub_pending_bio_dec(sctx);
2746 }
2747
2748 static void scrub_parity_bio_endio(struct bio *bio)
2749 {
2750         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2751         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2752
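        /*
         * If the parity scrub bio failed, none of the sectors we meant to
         * check can be trusted, so fold the whole data bitmap into the
         * error bitmap. The actual cleanup is deferred to a worker because
         * this runs in bio completion context.
         */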
2753         if (bio->bi_error)
2754                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2755                           sparity->nsectors);
2756
2757         bio_put(bio);
2758
2759         btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
2760                         scrub_parity_bio_endio_worker, NULL, NULL);
2761         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2762 }
2763
2764 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2765 {
2766         struct scrub_ctx *sctx = sparity->sctx;
2767         struct btrfs_fs_info *fs_info = sctx->fs_info;
2768         struct bio *bio;
2769         struct btrfs_raid_bio *rbio;
2770         struct scrub_page *spage;
2771         struct btrfs_bio *bbio = NULL;
2772         u64 length;
2773         int ret;
2774
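        /*
         * Drop sectors that already failed (ebitmap) from the set whose
         * parity still needs checking (dbitmap); if nothing is left, just
         * free the context. Otherwise map the full stripe, build a scrub
         * rbio over the collected pages and hand it to the RAID56 layer,
         * which verifies and, if necessary, rewrites the parity.
         */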
2775         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2776                            sparity->nsectors))
2777                 goto out;
2778
2779         length = sparity->logic_end - sparity->logic_start;
2780         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2781                                &length, &bbio, 0, 1);
2782         if (ret || !bbio || !bbio->raid_map)
2783                 goto bbio_out;
2784
2785         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2786         if (!bio)
2787                 goto bbio_out;
2788
2789         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2790         bio->bi_private = sparity;
2791         bio->bi_end_io = scrub_parity_bio_endio;
2792
2793         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2794                                               length, sparity->scrub_dev,
2795                                               sparity->dbitmap,
2796                                               sparity->nsectors);
2797         if (!rbio)
2798                 goto rbio_out;
2799
2800         list_for_each_entry(spage, &sparity->spages, list)
2801                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2802
2803         scrub_pending_bio_inc(sctx);
2804         raid56_parity_submit_scrub_rbio(rbio);
2805         return;
2806
2807 rbio_out:
2808         bio_put(bio);
2809 bbio_out:
2810         btrfs_put_bbio(bbio);
2811         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2812                   sparity->nsectors);
2813         spin_lock(&sctx->stat_lock);
2814         sctx->stat.malloc_errors++;
2815         spin_unlock(&sctx->stat_lock);
2816 out:
2817         scrub_free_parity(sparity);
2818 }
2819
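/*
 * Bytes needed for a bitmap with one bit per sector of a full stripe,
 * rounded up to whole longs. Purely as an illustration: with a 64K
 * stripe_len and 4K sectors, nsectors is 16, which on a 64-bit kernel
 * rounds up to a single unsigned long, i.e. 8 bytes per bitmap.
 */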
2820 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2821 {
2822         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2823 }
2824
2825 static void scrub_parity_get(struct scrub_parity *sparity)
2826 {
2827         atomic_inc(&sparity->refs);
2828 }
2829
2830 static void scrub_parity_put(struct scrub_parity *sparity)
2831 {
2832         if (!atomic_dec_and_test(&sparity->refs))
2833                 return;
2834
2835         scrub_parity_check_and_repair(sparity);
2836 }
2837
2838 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2839                                                   struct map_lookup *map,
2840                                                   struct btrfs_device *sdev,
2841                                                   struct btrfs_path *path,
2842                                                   u64 logic_start,
2843                                                   u64 logic_end)
2844 {
2845         struct btrfs_fs_info *fs_info = sctx->fs_info;
2846         struct btrfs_root *root = fs_info->extent_root;
2847         struct btrfs_root *csum_root = fs_info->csum_root;
2848         struct btrfs_extent_item *extent;
2849         struct btrfs_bio *bbio = NULL;
2850         u64 flags;
2851         int ret;
2852         int slot;
2853         struct extent_buffer *l;
2854         struct btrfs_key key;
2855         u64 generation;
2856         u64 extent_logical;
2857         u64 extent_physical;
2858         u64 extent_len;
2859         u64 mapped_length;
2860         struct btrfs_device *extent_dev;
2861         struct scrub_parity *sparity;
2862         int nsectors;
2863         int bitmap_len;
2864         int extent_mirror_num;
2865         int stop_loop = 0;
2866
2867         nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
2868         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
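        /*
         * dbitmap (sectors that carry data to be checked) and ebitmap
         * (sectors that hit errors) are allocated together right behind
         * struct scrub_parity, hence the 2 * bitmap_len tail; the two
         * halves are split up below once the allocation succeeds.
         */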
2869         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2870                           GFP_NOFS);
2871         if (!sparity) {
2872                 spin_lock(&sctx->stat_lock);
2873                 sctx->stat.malloc_errors++;
2874                 spin_unlock(&sctx->stat_lock);
2875                 return -ENOMEM;
2876         }
2877
2878         sparity->stripe_len = map->stripe_len;
2879         sparity->nsectors = nsectors;
2880         sparity->sctx = sctx;
2881         sparity->scrub_dev = sdev;
2882         sparity->logic_start = logic_start;
2883         sparity->logic_end = logic_end;
2884         atomic_set(&sparity->refs, 1);
2885         INIT_LIST_HEAD(&sparity->spages);
2886         sparity->dbitmap = sparity->bitmap;
2887         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2888
2889         ret = 0;
2890         while (logic_start < logic_end) {
2891                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2892                         key.type = BTRFS_METADATA_ITEM_KEY;
2893                 else
2894                         key.type = BTRFS_EXTENT_ITEM_KEY;
2895                 key.objectid = logic_start;
2896                 key.offset = (u64)-1;
2897
2898                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2899                 if (ret < 0)
2900                         goto out;
2901
2902                 if (ret > 0) {
2903                         ret = btrfs_previous_extent_item(root, path, 0);
2904                         if (ret < 0)
2905                                 goto out;
2906                         if (ret > 0) {
2907                                 btrfs_release_path(path);
2908                                 ret = btrfs_search_slot(NULL, root, &key,
2909                                                         path, 0, 0);
2910                                 if (ret < 0)
2911                                         goto out;
2912                         }
2913                 }
2914
2915                 stop_loop = 0;
2916                 while (1) {
2917                         u64 bytes;
2918
2919                         l = path->nodes[0];
2920                         slot = path->slots[0];
2921                         if (slot >= btrfs_header_nritems(l)) {
2922                                 ret = btrfs_next_leaf(root, path);
2923                                 if (ret == 0)
2924                                         continue;
2925                                 if (ret < 0)
2926                                         goto out;
2927
2928                                 stop_loop = 1;
2929                                 break;
2930                         }
2931                         btrfs_item_key_to_cpu(l, &key, slot);
2932
2933                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2934                             key.type != BTRFS_METADATA_ITEM_KEY)
2935                                 goto next;
2936
2937                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2938                                 bytes = fs_info->nodesize;
2939                         else
2940                                 bytes = key.offset;
2941
2942                         if (key.objectid + bytes <= logic_start)
2943                                 goto next;
2944
2945                         if (key.objectid >= logic_end) {
2946                                 stop_loop = 1;
2947                                 break;
2948                         }
2949
2950                         while (key.objectid >= logic_start + map->stripe_len)
2951                                 logic_start += map->stripe_len;
2952
2953                         extent = btrfs_item_ptr(l, slot,
2954                                                 struct btrfs_extent_item);
2955                         flags = btrfs_extent_flags(l, extent);
2956                         generation = btrfs_extent_generation(l, extent);
2957
2958                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
2959                             (key.objectid < logic_start ||
2960                              key.objectid + bytes >
2961                              logic_start + map->stripe_len)) {
2962                                 btrfs_err(fs_info,
2963                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2964                                           key.objectid, logic_start);
2965                                 spin_lock(&sctx->stat_lock);
2966                                 sctx->stat.uncorrectable_errors++;
2967                                 spin_unlock(&sctx->stat_lock);
2968                                 goto next;
2969                         }
2970 again:
2971                         extent_logical = key.objectid;
2972                         extent_len = bytes;
2973
2974                         if (extent_logical < logic_start) {
2975                                 extent_len -= logic_start - extent_logical;
2976                                 extent_logical = logic_start;
2977                         }
2978
2979                         if (extent_logical + extent_len >
2980                             logic_start + map->stripe_len)
2981                                 extent_len = logic_start + map->stripe_len -
2982                                              extent_logical;
2983
2984                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2985                                                        extent_len);
2986
2987                         mapped_length = extent_len;
2988                         bbio = NULL;
2989                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
2990                                         extent_logical, &mapped_length, &bbio,
2991                                         0);
2992                         if (!ret) {
2993                                 if (!bbio || mapped_length < extent_len)
2994                                         ret = -EIO;
2995                         }
2996                         if (ret) {
2997                                 btrfs_put_bbio(bbio);
2998                                 goto out;
2999                         }
3000                         extent_physical = bbio->stripes[0].physical;
3001                         extent_mirror_num = bbio->mirror_num;
3002                         extent_dev = bbio->stripes[0].dev;
3003                         btrfs_put_bbio(bbio);
3004
3005                         ret = btrfs_lookup_csums_range(csum_root,
3006                                                 extent_logical,
3007                                                 extent_logical + extent_len - 1,
3008                                                 &sctx->csum_list, 1);
3009                         if (ret)
3010                                 goto out;
3011
3012                         ret = scrub_extent_for_parity(sparity, extent_logical,
3013                                                       extent_len,
3014                                                       extent_physical,
3015                                                       extent_dev, flags,
3016                                                       generation,
3017                                                       extent_mirror_num);
3018
3019                         scrub_free_csums(sctx);
3020
3021                         if (ret)
3022                                 goto out;
3023
3024                         if (extent_logical + extent_len <
3025                             key.objectid + bytes) {
3026                                 logic_start += map->stripe_len;
3027
3028                                 if (logic_start >= logic_end) {
3029                                         stop_loop = 1;
3030                                         break;
3031                                 }
3032
3033                                 if (logic_start < key.objectid + bytes) {
3034                                         cond_resched();
3035                                         goto again;
3036                                 }
3037                         }
3038 next:
3039                         path->slots[0]++;
3040                 }
3041
3042                 btrfs_release_path(path);
3043
3044                 if (stop_loop)
3045                         break;
3046
3047                 logic_start += map->stripe_len;
3048         }
3049 out:
3050         if (ret < 0)
3051                 scrub_parity_mark_sectors_error(sparity, logic_start,
3052                                                 logic_end - logic_start);
3053         scrub_parity_put(sparity);
3054         scrub_submit(sctx);
3055         mutex_lock(&sctx->wr_ctx.wr_lock);
3056         scrub_wr_submit(sctx);
3057         mutex_unlock(&sctx->wr_ctx.wr_lock);
3058
3059         btrfs_release_path(path);
3060         return ret < 0 ? ret : 0;
3061 }
3062
3063 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3064                                            struct map_lookup *map,
3065                                            struct btrfs_device *scrub_dev,
3066                                            int num, u64 base, u64 length,
3067                                            int is_dev_replace)
3068 {
3069         struct btrfs_path *path, *ppath;
3070         struct btrfs_fs_info *fs_info = sctx->fs_info;
3071         struct btrfs_root *root = fs_info->extent_root;
3072         struct btrfs_root *csum_root = fs_info->csum_root;
3073         struct btrfs_extent_item *extent;
3074         struct blk_plug plug;
3075         u64 flags;
3076         int ret;
3077         int slot;
3078         u64 nstripes;
3079         struct extent_buffer *l;
3080         u64 physical;
3081         u64 logical;
3082         u64 logic_end;
3083         u64 physical_end;
3084         u64 generation;
3085         int mirror_num;
3086         struct reada_control *reada1;
3087         struct reada_control *reada2;
3088         struct btrfs_key key;
3089         struct btrfs_key key_end;
3090         u64 increment = map->stripe_len;
3091         u64 offset;
3092         u64 extent_logical;
3093         u64 extent_physical;
3094         u64 extent_len;
3095         u64 stripe_logical;
3096         u64 stripe_end;
3097         struct btrfs_device *extent_dev;
3098         int extent_mirror_num;
3099         int stop_loop = 0;
3100
3101         physical = map->stripes[num].physical;
3102         offset = 0;
3103         nstripes = div_u64(length, map->stripe_len);
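        /*
         * Work out, for this device's stripe copy 'num': 'offset', the
         * logical offset (relative to the chunk start) of the first stripe
         * stored on this device; 'increment', the logical distance between
         * two consecutive stripes stored on this device; and 'mirror_num',
         * which copy this device holds. As an illustration, for RAID0 with
         * 4 stripes and num = 2 this gives offset = 2 * stripe_len and
         * increment = 4 * stripe_len.
         */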
3104         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3105                 offset = map->stripe_len * num;
3106                 increment = map->stripe_len * map->num_stripes;
3107                 mirror_num = 1;
3108         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3109                 int factor = map->num_stripes / map->sub_stripes;
3110                 offset = map->stripe_len * (num / map->sub_stripes);
3111                 increment = map->stripe_len * factor;
3112                 mirror_num = num % map->sub_stripes + 1;
3113         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3114                 increment = map->stripe_len;
3115                 mirror_num = num % map->num_stripes + 1;
3116         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3117                 increment = map->stripe_len;
3118                 mirror_num = num % map->num_stripes + 1;
3119         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3120                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3121                 increment = map->stripe_len * nr_data_stripes(map);
3122                 mirror_num = 1;
3123         } else {
3124                 increment = map->stripe_len;
3125                 mirror_num = 1;
3126         }
3127
3128         path = btrfs_alloc_path();
3129         if (!path)
3130                 return -ENOMEM;
3131
3132         ppath = btrfs_alloc_path();
3133         if (!ppath) {
3134                 btrfs_free_path(path);
3135                 return -ENOMEM;
3136         }
3137
3138         /*
3139          * work on commit root. The related disk blocks are static as
3140          * long as COW is applied. This means it is safe to rewrite
3141          * them to repair disk errors without any race conditions.
3142          */
3143         path->search_commit_root = 1;
3144         path->skip_locking = 1;
3145
3146         ppath->search_commit_root = 1;
3147         ppath->skip_locking = 1;
3148         /*
3149          * trigger the readahead for the extent tree and csum tree and
3150          * wait for completion. During readahead, the scrub is officially
3151          * paused to not hold off transaction commits.
3152          */
3153         logical = base + offset;
3154         physical_end = physical + nstripes * map->stripe_len;
3155         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3156                 get_raid56_logic_offset(physical_end, num,
3157                                         map, &logic_end, NULL);
3158                 logic_end += base;
3159         } else {
3160                 logic_end = logical + increment * nstripes;
3161         }
3162         wait_event(sctx->list_wait,
3163                    atomic_read(&sctx->bios_in_flight) == 0);
3164         scrub_blocked_if_needed(fs_info);
3165
3166         /* FIXME it might be better to start readahead at commit root */
3167         key.objectid = logical;
3168         key.type = BTRFS_EXTENT_ITEM_KEY;
3169         key.offset = (u64)0;
3170         key_end.objectid = logic_end;
3171         key_end.type = BTRFS_METADATA_ITEM_KEY;
3172         key_end.offset = (u64)-1;
3173         reada1 = btrfs_reada_add(root, &key, &key_end);
3174
3175         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3176         key.type = BTRFS_EXTENT_CSUM_KEY;
3177         key.offset = logical;
3178         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3179         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3180         key_end.offset = logic_end;
3181         reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3182
3183         if (!IS_ERR(reada1))
3184                 btrfs_reada_wait(reada1);
3185         if (!IS_ERR(reada2))
3186                 btrfs_reada_wait(reada2);
3187
3188
3189         /*
3190          * collect all data csums for the stripe to avoid seeking during
3191          * the scrub. This might currently (crc32) end up being about 1MB
3192          */
3193         blk_start_plug(&plug);
3194
3195         /*
3196          * now find all extents for each stripe and scrub them
3197          */
3198         ret = 0;
3199         while (physical < physical_end) {
3200                 /*
3201                  * canceled?
3202                  */
3203                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3204                     atomic_read(&sctx->cancel_req)) {
3205                         ret = -ECANCELED;
3206                         goto out;
3207                 }
3208                 /*
3209                  * check to see if we have to pause
3210                  */
3211                 if (atomic_read(&fs_info->scrub_pause_req)) {
3212                         /* push queued extents */
3213                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3214                         scrub_submit(sctx);
3215                         mutex_lock(&sctx->wr_ctx.wr_lock);
3216                         scrub_wr_submit(sctx);
3217                         mutex_unlock(&sctx->wr_ctx.wr_lock);
3218                         wait_event(sctx->list_wait,
3219                                    atomic_read(&sctx->bios_in_flight) == 0);
3220                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3221                         scrub_blocked_if_needed(fs_info);
3222                 }
3223
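                /*
                 * For RAID5/6 the physical stripe at 'physical' may hold
                 * parity (P or Q) rather than data. In that case hand the
                 * whole full stripe to scrub_raid56_parity() and skip the
                 * regular per-extent scrubbing for this stripe.
                 */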
3224                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3225                         ret = get_raid56_logic_offset(physical, num, map,
3226                                                       &logical,
3227                                                       &stripe_logical);
3228                         logical += base;
3229                         if (ret) {
3230                                 /* it is a parity stripe */
3231                                 stripe_logical += base;
3232                                 stripe_end = stripe_logical + increment;
3233                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3234                                                           ppath, stripe_logical,
3235                                                           stripe_end);
3236                                 if (ret)
3237                                         goto out;
3238                                 goto skip;
3239                         }
3240                 }
3241
3242                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3243                         key.type = BTRFS_METADATA_ITEM_KEY;
3244                 else
3245                         key.type = BTRFS_EXTENT_ITEM_KEY;
3246                 key.objectid = logical;
3247                 key.offset = (u64)-1;
3248
3249                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3250                 if (ret < 0)
3251                         goto out;
3252
3253                 if (ret > 0) {
3254                         ret = btrfs_previous_extent_item(root, path, 0);
3255                         if (ret < 0)
3256                                 goto out;
3257                         if (ret > 0) {
3258                                 /* there's no smaller item, so stick with the
3259                                  * larger one */
3260                                 btrfs_release_path(path);
3261                                 ret = btrfs_search_slot(NULL, root, &key,
3262                                                         path, 0, 0);
3263                                 if (ret < 0)
3264                                         goto out;
3265                         }
3266                 }
3267
3268                 stop_loop = 0;
3269                 while (1) {
3270                         u64 bytes;
3271
3272                         l = path->nodes[0];
3273                         slot = path->slots[0];
3274                         if (slot >= btrfs_header_nritems(l)) {
3275                                 ret = btrfs_next_leaf(root, path);
3276                                 if (ret == 0)
3277                                         continue;
3278                                 if (ret < 0)
3279                                         goto out;
3280
3281                                 stop_loop = 1;
3282                                 break;
3283                         }
3284                         btrfs_item_key_to_cpu(l, &key, slot);
3285
3286                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3287                             key.type != BTRFS_METADATA_ITEM_KEY)
3288                                 goto next;
3289
3290                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3291                                 bytes = fs_info->nodesize;
3292                         else
3293                                 bytes = key.offset;
3294
3295                         if (key.objectid + bytes <= logical)
3296                                 goto next;
3297
3298                         if (key.objectid >= logical + map->stripe_len) {
3299                                 /* out of this device extent */
3300                                 if (key.objectid >= logic_end)
3301                                         stop_loop = 1;
3302                                 break;
3303                         }
3304
3305                         extent = btrfs_item_ptr(l, slot,
3306                                                 struct btrfs_extent_item);
3307                         flags = btrfs_extent_flags(l, extent);
3308                         generation = btrfs_extent_generation(l, extent);
3309
3310                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311                             (key.objectid < logical ||
3312                              key.objectid + bytes >
3313                              logical + map->stripe_len)) {
3314                                 btrfs_err(fs_info,
3315                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3316                                        key.objectid, logical);
3317                                 spin_lock(&sctx->stat_lock);
3318                                 sctx->stat.uncorrectable_errors++;
3319                                 spin_unlock(&sctx->stat_lock);
3320                                 goto next;
3321                         }
3322
3323 again:
3324                         extent_logical = key.objectid;
3325                         extent_len = bytes;
3326
3327                         /*
3328                          * trim extent to this stripe
3329                          */
3330                         if (extent_logical < logical) {
3331                                 extent_len -= logical - extent_logical;
3332                                 extent_logical = logical;
3333                         }
3334                         if (extent_logical + extent_len >
3335                             logical + map->stripe_len) {
3336                                 extent_len = logical + map->stripe_len -
3337                                              extent_logical;
3338                         }
3339
3340                         extent_physical = extent_logical - logical + physical;
3341                         extent_dev = scrub_dev;
3342                         extent_mirror_num = mirror_num;
3343                         if (is_dev_replace)
3344                                 scrub_remap_extent(fs_info, extent_logical,
3345                                                    extent_len, &extent_physical,
3346                                                    &extent_dev,
3347                                                    &extent_mirror_num);
3348
3349                         ret = btrfs_lookup_csums_range(csum_root,
3350                                                        extent_logical,
3351                                                        extent_logical +
3352                                                        extent_len - 1,
3353                                                        &sctx->csum_list, 1);
3354                         if (ret)
3355                                 goto out;
3356
3357                         ret = scrub_extent(sctx, extent_logical, extent_len,
3358                                            extent_physical, extent_dev, flags,
3359                                            generation, extent_mirror_num,
3360                                            extent_logical - logical + physical);
3361
3362                         scrub_free_csums(sctx);
3363
3364                         if (ret)
3365                                 goto out;
3366
3367                         if (extent_logical + extent_len <
3368                             key.objectid + bytes) {
3369                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3370                                         /*
3371                                          * loop until we find the next data stripe
3372                                          * or we have finished all stripes.
3373                                          */
3374 loop:
3375                                         physical += map->stripe_len;
3376                                         ret = get_raid56_logic_offset(physical,
3377                                                         num, map, &logical,
3378                                                         &stripe_logical);
3379                                         logical += base;
3380
3381                                         if (ret && physical < physical_end) {
3382                                                 stripe_logical += base;
3383                                                 stripe_end = stripe_logical +
3384                                                                 increment;
3385                                                 ret = scrub_raid56_parity(sctx,
3386                                                         map, scrub_dev, ppath,
3387                                                         stripe_logical,
3388                                                         stripe_end);
3389                                                 if (ret)
3390                                                         goto out;
3391                                                 goto loop;
3392                                         }
3393                                 } else {
3394                                         physical += map->stripe_len;
3395                                         logical += increment;
3396                                 }
3397                                 if (logical < key.objectid + bytes) {
3398                                         cond_resched();
3399                                         goto again;
3400                                 }
3401
3402                                 if (physical >= physical_end) {
3403                                         stop_loop = 1;
3404                                         break;
3405                                 }
3406                         }
3407 next:
3408                         path->slots[0]++;
3409                 }
3410                 btrfs_release_path(path);
3411 skip:
3412                 logical += increment;
3413                 physical += map->stripe_len;
3414                 spin_lock(&sctx->stat_lock);
3415                 if (stop_loop)
3416                         sctx->stat.last_physical = map->stripes[num].physical +
3417                                                    length;
3418                 else
3419                         sctx->stat.last_physical = physical;
3420                 spin_unlock(&sctx->stat_lock);
3421                 if (stop_loop)
3422                         break;
3423         }
3424 out:
3425         /* push queued extents */
3426         scrub_submit(sctx);
3427         mutex_lock(&sctx->wr_ctx.wr_lock);
3428         scrub_wr_submit(sctx);
3429         mutex_unlock(&sctx->wr_ctx.wr_lock);
3430
3431         blk_finish_plug(&plug);
3432         btrfs_free_path(path);
3433         btrfs_free_path(ppath);
3434         return ret < 0 ? ret : 0;
3435 }
3436
3437 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3438                                           struct btrfs_device *scrub_dev,
3439                                           u64 chunk_offset, u64 length,
3440                                           u64 dev_offset,
3441                                           struct btrfs_block_group_cache *cache,
3442                                           int is_dev_replace)
3443 {
3444         struct btrfs_fs_info *fs_info = sctx->fs_info;
3445         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3446         struct map_lookup *map;
3447         struct extent_map *em;
3448         int i;
3449         int ret = 0;
3450
3451         read_lock(&map_tree->map_tree.lock);
3452         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3453         read_unlock(&map_tree->map_tree.lock);
3454
3455         if (!em) {
3456                 /*
3457                  * Might have been an unused block group deleted by the cleaner
3458                  * kthread or relocation.
3459                  */
3460                 spin_lock(&cache->lock);
3461                 if (!cache->removed)
3462                         ret = -EINVAL;
3463                 spin_unlock(&cache->lock);
3464
3465                 return ret;
3466         }
3467
3468         map = em->map_lookup;
3469         if (em->start != chunk_offset)
3470                 goto out;
3471
3472         if (em->len < length)
3473                 goto out;
3474
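        /*
         * Find the stripe(s) of this chunk that live on the device being
         * scrubbed at the given physical offset and scrub each of them.
         */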
3475         for (i = 0; i < map->num_stripes; ++i) {
3476                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3477                     map->stripes[i].physical == dev_offset) {
3478                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3479                                            chunk_offset, length,
3480                                            is_dev_replace);
3481                         if (ret)
3482                                 goto out;
3483                 }
3484         }
3485 out:
3486         free_extent_map(em);
3487
3488         return ret;
3489 }
3490
3491 static noinline_for_stack
3492 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3493                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3494                            int is_dev_replace)
3495 {
3496         struct btrfs_dev_extent *dev_extent = NULL;
3497         struct btrfs_path *path;
3498         struct btrfs_fs_info *fs_info = sctx->fs_info;
3499         struct btrfs_root *root = fs_info->dev_root;
3500         u64 length;
3501         u64 chunk_offset;
3502         int ret = 0;
3503         int ro_set;
3504         int slot;
3505         struct extent_buffer *l;
3506         struct btrfs_key key;
3507         struct btrfs_key found_key;
3508         struct btrfs_block_group_cache *cache;
3509         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3510
3511         path = btrfs_alloc_path();
3512         if (!path)
3513                 return -ENOMEM;
3514
3515         path->reada = READA_FORWARD;
3516         path->search_commit_root = 1;
3517         path->skip_locking = 1;
3518
3519         key.objectid = scrub_dev->devid;
3520         key.offset = 0ull;
3521         key.type = BTRFS_DEV_EXTENT_KEY;
3522
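        /*
         * Walk all dev extents of scrub_dev within [start, end). For each
         * one, look up the owning block group, set it read-only (where
         * possible) for the duration, scrub the matching chunk stripe and
         * then restore the block group.
         */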
3523         while (1) {
3524                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3525                 if (ret < 0)
3526                         break;
3527                 if (ret > 0) {
3528                         if (path->slots[0] >=
3529                             btrfs_header_nritems(path->nodes[0])) {
3530                                 ret = btrfs_next_leaf(root, path);
3531                                 if (ret < 0)
3532                                         break;
3533                                 if (ret > 0) {
3534                                         ret = 0;
3535                                         break;
3536                                 }
3537                         } else {
3538                                 ret = 0;
3539                         }
3540                 }
3541
3542                 l = path->nodes[0];
3543                 slot = path->slots[0];
3544
3545                 btrfs_item_key_to_cpu(l, &found_key, slot);
3546
3547                 if (found_key.objectid != scrub_dev->devid)
3548                         break;
3549
3550                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3551                         break;
3552
3553                 if (found_key.offset >= end)
3554                         break;
3555
3556                 if (found_key.offset < key.offset)
3557                         break;
3558
3559                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3560                 length = btrfs_dev_extent_length(l, dev_extent);
3561
3562                 if (found_key.offset + length <= start)
3563                         goto skip;
3564
3565                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3566
3567                 /*
3568                  * get a reference on the corresponding block group to prevent
3569                  * the chunk from going away while we scrub it
3570                  */
3571                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3572
3573                 /* some chunks have been removed but not yet committed to
3574                  * disk, continue scrubbing */
3575                 if (!cache)
3576                         goto skip;
3577
3578                 /*
3579                  * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3580                  * to avoid deadlock caused by:
3581                  * btrfs_inc_block_group_ro()
3582                  * -> btrfs_wait_for_commit()
3583                  * -> btrfs_commit_transaction()
3584                  * -> btrfs_scrub_pause()
3585                  */
3586                 scrub_pause_on(fs_info);
3587                 ret = btrfs_inc_block_group_ro(root, cache);
3588                 if (!ret && is_dev_replace) {
3589                         /*
3590                          * If we are doing a device replace wait for any tasks
3591                          * that started delalloc right before we set the block
3592                          * group to RO mode, as they might have just allocated
3593                          * an extent from it or decided they could do a nocow
3594                          * write. And if any such tasks did that, wait for their
3595                          * ordered extents to complete and then commit the
3596                          * current transaction, so that we can later see the new
3597                          * extent items in the extent tree - the ordered extents
3598                          * create delayed data references (for cow writes) when
3599                          * they complete, which will be run and insert the
3600                          * corresponding extent items into the extent tree when
3601                          * we commit the transaction they used when running
3602                          * inode.c:btrfs_finish_ordered_io(). We later use
3603                          * the commit root of the extent tree to find extents
3604                          * to copy from the srcdev into the tgtdev, and we don't
3605                          * want to miss any new extents.
3606                          */
3607                         btrfs_wait_block_group_reservations(cache);
3608                         btrfs_wait_nocow_writers(cache);
3609                         ret = btrfs_wait_ordered_roots(fs_info, -1,
3610                                                        cache->key.objectid,
3611                                                        cache->key.offset);
3612                         if (ret > 0) {
3613                                 struct btrfs_trans_handle *trans;
3614
3615                                 trans = btrfs_join_transaction(root);
3616                                 if (IS_ERR(trans))
3617                                         ret = PTR_ERR(trans);
3618                                 else
3619                                         ret = btrfs_commit_transaction(trans);
3620                                 if (ret) {
3621                                         scrub_pause_off(fs_info);
3622                                         btrfs_put_block_group(cache);
3623                                         break;
3624                                 }
3625                         }
3626                 }
3627                 scrub_pause_off(fs_info);
3628
3629                 if (ret == 0) {
3630                         ro_set = 1;
3631                 } else if (ret == -ENOSPC) {
3632                         /*
3633                          * btrfs_inc_block_group_ro returns -ENOSPC when it
3634                          * fails to create a new chunk for metadata.
3635                          * This is not a problem for scrub/replace, because
3636                          * metadata is always COWed, and our scrub pauses
3637                          * commit_transactions.
3638                          */
3639                         ro_set = 0;
3640                 } else {
3641                         btrfs_warn(fs_info,
3642                                    "failed setting block group ro, ret=%d",
3643                                    ret);
3644                         btrfs_put_block_group(cache);
3645                         break;
3646                 }
3647
3648                 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3649                 dev_replace->cursor_right = found_key.offset + length;
3650                 dev_replace->cursor_left = found_key.offset;
3651                 dev_replace->item_needs_writeback = 1;
3652                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3653                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3654                                   found_key.offset, cache, is_dev_replace);
3655
3656                 /*
3657                  * flush, submit all pending read and write bios, and
3658                  * afterwards wait for them.
3659                  * Note that in the dev replace case, a read request causes
3660                  * write requests that are submitted in the read completion
3661                  * worker. Therefore in the current situation, it is required
3662                  * that all write requests are flushed, so that all read and
3663                  * write requests are really completed when bios_in_flight
3664                  * changes to 0.
3665                  */
3666                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3667                 scrub_submit(sctx);
3668                 mutex_lock(&sctx->wr_ctx.wr_lock);
3669                 scrub_wr_submit(sctx);
3670                 mutex_unlock(&sctx->wr_ctx.wr_lock);
3671
3672                 wait_event(sctx->list_wait,
3673                            atomic_read(&sctx->bios_in_flight) == 0);
3674
3675                 scrub_pause_on(fs_info);
3676
3677                 /*
3678                  * must be called before we decrease @scrub_paused.
3679                  * make sure we don't block transaction commit while
3680                  * we are waiting for pending workers to finish.
3681                  */
3682                 wait_event(sctx->list_wait,
3683                            atomic_read(&sctx->workers_pending) == 0);
3684                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3685
3686                 scrub_pause_off(fs_info);
3687
3688                 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3689                 dev_replace->cursor_left = dev_replace->cursor_right;
3690                 dev_replace->item_needs_writeback = 1;
3691                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3692
3693                 if (ro_set)
3694                         btrfs_dec_block_group_ro(cache);
3695
3696                 /*
3697                  * We might have prevented the cleaner kthread from deleting
3698                  * this block group if it was already unused because we raced
3699                  * and set it to RO mode first. So add it back to the unused
3700                  * list, otherwise it might not ever be deleted unless a manual
3701                  * balance is triggered or it becomes used and unused again.
3702                  */
3703                 spin_lock(&cache->lock);
3704                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3705                     btrfs_block_group_used(&cache->item) == 0) {
3706                         spin_unlock(&cache->lock);
3707                         spin_lock(&fs_info->unused_bgs_lock);
3708                         if (list_empty(&cache->bg_list)) {
3709                                 btrfs_get_block_group(cache);
3710                                 list_add_tail(&cache->bg_list,
3711                                               &fs_info->unused_bgs);
3712                         }
3713                         spin_unlock(&fs_info->unused_bgs_lock);
3714                 } else {
3715                         spin_unlock(&cache->lock);
3716                 }
3717
3718                 btrfs_put_block_group(cache);
3719                 if (ret)
3720                         break;
3721                 if (is_dev_replace &&
3722                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3723                         ret = -EIO;
3724                         break;
3725                 }
3726                 if (sctx->stat.malloc_errors > 0) {
3727                         ret = -ENOMEM;
3728                         break;
3729                 }
3730 skip:
3731                 key.offset = found_key.offset + length;
3732                 btrfs_release_path(path);
3733         }
3734
3735         btrfs_free_path(path);
3736
3737         return ret;
3738 }
3739
3740 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3741                                            struct btrfs_device *scrub_dev)
3742 {
3743         int     i;
3744         u64     bytenr;
3745         u64     gen;
3746         int     ret;
3747         struct btrfs_fs_info *fs_info = sctx->fs_info;
3748
3749         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3750                 return -EIO;
3751
3752         /* Seed devices of a new filesystem have their own generation. */
3753         if (scrub_dev->fs_devices != fs_info->fs_devices)
3754                 gen = scrub_dev->generation;
3755         else
3756                 gen = fs_info->last_trans_committed;
3757
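        /*
         * Scrub every superblock mirror that fits inside the committed size
         * of the device; bytenr is used as both the logical and the physical
         * address, since superblocks live at fixed device offsets.
         */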
3758         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3759                 bytenr = btrfs_sb_offset(i);
3760                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3761                     scrub_dev->commit_total_bytes)
3762                         break;
3763
3764                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3765                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3766                                   NULL, 1, bytenr);
3767                 if (ret)
3768                         return ret;
3769         }
3770         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3771
3772         return 0;
3773 }
3774
3775 /*
3776  * get a reference count on fs_info->scrub_workers. start the workers if necessary
3777  */
3778 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3779                                                 int is_dev_replace)
3780 {
3781         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3782         int max_active = fs_info->thread_pool_size;
3783
3784         if (fs_info->scrub_workers_refcnt == 0) {
3785                 if (is_dev_replace)
3786                         fs_info->scrub_workers =
3787                                 btrfs_alloc_workqueue(fs_info, "scrub", flags,
3788                                                       1, 4);
3789                 else
3790                         fs_info->scrub_workers =
3791                                 btrfs_alloc_workqueue(fs_info, "scrub", flags,
3792                                                       max_active, 4);
3793                 if (!fs_info->scrub_workers)
3794                         goto fail_scrub_workers;
3795
3796                 fs_info->scrub_wr_completion_workers =
3797                         btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3798                                               max_active, 2);
3799                 if (!fs_info->scrub_wr_completion_workers)
3800                         goto fail_scrub_wr_completion_workers;
3801
3802                 fs_info->scrub_nocow_workers =
3803                         btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
3804                 if (!fs_info->scrub_nocow_workers)
3805                         goto fail_scrub_nocow_workers;
3806                 fs_info->scrub_parity_workers =
3807                         btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3808                                               max_active, 2);
3809                 if (!fs_info->scrub_parity_workers)
3810                         goto fail_scrub_parity_workers;
3811         }
3812         ++fs_info->scrub_workers_refcnt;
3813         return 0;
3814
3815 fail_scrub_parity_workers:
3816         btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3817 fail_scrub_nocow_workers:
3818         btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3819 fail_scrub_wr_completion_workers:
3820         btrfs_destroy_workqueue(fs_info->scrub_workers);
3821 fail_scrub_workers:
3822         return -ENOMEM;
3823 }
3824
3825 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3826 {
3827         if (--fs_info->scrub_workers_refcnt == 0) {
3828                 btrfs_destroy_workqueue(fs_info->scrub_workers);
3829                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3830                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3831                 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
3832         }
3833         WARN_ON(fs_info->scrub_workers_refcnt < 0);
3834 }
3835
3836 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3837                     u64 end, struct btrfs_scrub_progress *progress,
3838                     int readonly, int is_dev_replace)
3839 {
3840         struct scrub_ctx *sctx;
3841         int ret;
3842         struct btrfs_device *dev;
3843         struct rcu_string *name;
3844
3845         if (btrfs_fs_closing(fs_info))
3846                 return -EINVAL;
3847
3848         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3849                 /*
3850                  * scrub, as it is implemented, is unable to calculate
3851                  * the checksum in this case. Do not handle this
3852                  * situation at all because it won't ever happen.
3853                  */
3854                 btrfs_err(fs_info,
3855                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3856                        fs_info->nodesize,
3857                        BTRFS_STRIPE_LEN);
3858                 return -EINVAL;
3859         }
3860
3861         if (fs_info->sectorsize != PAGE_SIZE) {
3862                 /* not supported for data w/o checksums */
3863                 btrfs_err_rl(fs_info,
3864                            "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
3865                        fs_info->sectorsize, PAGE_SIZE);
3866                 return -EINVAL;
3867         }
3868
3869         if (fs_info->nodesize >
3870             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3871             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3872                 /*
3873                  * would exhaust the array bounds of the pagev member in
3874                  * struct scrub_block
3875                  */
3876                 btrfs_err(fs_info,
3877                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3878                        fs_info->nodesize,
3879                        SCRUB_MAX_PAGES_PER_BLOCK,
3880                        fs_info->sectorsize,
3881                        SCRUB_MAX_PAGES_PER_BLOCK);
3882                 return -EINVAL;
3883         }
3884
3885
3886         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3887         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3888         if (!dev || (dev->missing && !is_dev_replace)) {
3889                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3890                 return -ENODEV;
3891         }
3892
3893         if (!is_dev_replace && !readonly && !dev->writeable) {
3894                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3895                 rcu_read_lock();
3896                 name = rcu_dereference(dev->name);
3897                 btrfs_err(fs_info, "scrub: device %s is not writable",
3898                           name->str);
3899                 rcu_read_unlock();
3900                 return -EROFS;
3901         }
3902
3903         mutex_lock(&fs_info->scrub_lock);
3904         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3905                 mutex_unlock(&fs_info->scrub_lock);
3906                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3907                 return -EIO;
3908         }
3909
3910         btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3911         if (dev->scrub_device ||
3912             (!is_dev_replace &&
3913              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3914                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3915                 mutex_unlock(&fs_info->scrub_lock);
3916                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3917                 return -EINPROGRESS;
3918         }
3919         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3920
3921         ret = scrub_workers_get(fs_info, is_dev_replace);
3922         if (ret) {
3923                 mutex_unlock(&fs_info->scrub_lock);
3924                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3925                 return ret;
3926         }
3927
3928         sctx = scrub_setup_ctx(dev, is_dev_replace);
3929         if (IS_ERR(sctx)) {
3930                 mutex_unlock(&fs_info->scrub_lock);
3931                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3932                 scrub_workers_put(fs_info);
3933                 return PTR_ERR(sctx);
3934         }
3935         sctx->readonly = readonly;
3936         dev->scrub_device = sctx;
3937         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3938
3939         /*
3940          * by checking @scrub_pause_req here, we can avoid the
3941          * race between committing a transaction and scrubbing.
3942          */
3943         __scrub_blocked_if_needed(fs_info);
3944         atomic_inc(&fs_info->scrubs_running);
3945         mutex_unlock(&fs_info->scrub_lock);
3946
3947         if (!is_dev_replace) {
3948                 /*
3949                  * by holding the device list mutex, we avoid racing
3950                  * with super block writes kicked off by a log tree sync.
3951                  */
3952                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3953                 ret = scrub_supers(sctx, dev);
3954                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3955         }
3956
3957         if (!ret)
3958                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
3959                                              is_dev_replace);
3960
3961         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3962         atomic_dec(&fs_info->scrubs_running);
3963         wake_up(&fs_info->scrub_pause_wait);
3964
3965         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3966
3967         if (progress)
3968                 memcpy(progress, &sctx->stat, sizeof(*progress));
3969
3970         mutex_lock(&fs_info->scrub_lock);
3971         dev->scrub_device = NULL;
3972         scrub_workers_put(fs_info);
3973         mutex_unlock(&fs_info->scrub_lock);
3974
3975         scrub_put_ctx(sctx);
3976
3977         return ret;
3978 }
3979
3980 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
3981 {
3982         mutex_lock(&fs_info->scrub_lock);
3983         atomic_inc(&fs_info->scrub_pause_req);
3984         while (atomic_read(&fs_info->scrubs_paused) !=
3985                atomic_read(&fs_info->scrubs_running)) {
3986                 mutex_unlock(&fs_info->scrub_lock);
3987                 wait_event(fs_info->scrub_pause_wait,
3988                            atomic_read(&fs_info->scrubs_paused) ==
3989                            atomic_read(&fs_info->scrubs_running));
3990                 mutex_lock(&fs_info->scrub_lock);
3991         }
3992         mutex_unlock(&fs_info->scrub_lock);
3993 }
3994
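/*
 * Drop the pause request taken in btrfs_scrub_pause() and wake up any
 * scrub waiting to continue.
 */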
3995 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
3996 {
3997         atomic_dec(&fs_info->scrub_pause_req);
3998         wake_up(&fs_info->scrub_pause_wait);
3999 }
4000
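/*
 * Cancel all scrubs running on this filesystem.  Returns -ENOTCONN if no
 * scrub is running, otherwise waits until every scrub has stopped before
 * returning 0.
 */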
4001 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4002 {
4003         mutex_lock(&fs_info->scrub_lock);
4004         if (!atomic_read(&fs_info->scrubs_running)) {
4005                 mutex_unlock(&fs_info->scrub_lock);
4006                 return -ENOTCONN;
4007         }
4008
4009         atomic_inc(&fs_info->scrub_cancel_req);
4010         while (atomic_read(&fs_info->scrubs_running)) {
4011                 mutex_unlock(&fs_info->scrub_lock);
4012                 wait_event(fs_info->scrub_pause_wait,
4013                            atomic_read(&fs_info->scrubs_running) == 0);
4014                 mutex_lock(&fs_info->scrub_lock);
4015         }
4016         atomic_dec(&fs_info->scrub_cancel_req);
4017         mutex_unlock(&fs_info->scrub_lock);
4018
4019         return 0;
4020 }
4021
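/*
 * Cancel the scrub running on a single device.  Returns -ENOTCONN if the
 * device is not being scrubbed, otherwise waits until the device's scrub
 * context has been detached.
 */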
4022 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4023                            struct btrfs_device *dev)
4024 {
4025         struct scrub_ctx *sctx;
4026
4027         mutex_lock(&fs_info->scrub_lock);
4028         sctx = dev->scrub_device;
4029         if (!sctx) {
4030                 mutex_unlock(&fs_info->scrub_lock);
4031                 return -ENOTCONN;
4032         }
4033         atomic_inc(&sctx->cancel_req);
4034         while (dev->scrub_device) {
4035                 mutex_unlock(&fs_info->scrub_lock);
4036                 wait_event(fs_info->scrub_pause_wait,
4037                            dev->scrub_device == NULL);
4038                 mutex_lock(&fs_info->scrub_lock);
4039         }
4040         mutex_unlock(&fs_info->scrub_lock);
4041
4042         return 0;
4043 }
4044
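/*
 * Copy the current scrub statistics of the device identified by @devid
 * into @progress.  Returns -ENODEV if the device does not exist and
 * -ENOTCONN if it exists but is not being scrubbed.
 */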
4045 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4046                          struct btrfs_scrub_progress *progress)
4047 {
4048         struct btrfs_device *dev;
4049         struct scrub_ctx *sctx = NULL;
4050
4051         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4052         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4053         if (dev)
4054                 sctx = dev->scrub_device;
4055         if (sctx)
4056                 memcpy(progress, &sctx->stat, sizeof(*progress));
4057         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4058
4059         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4060 }
4061
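/*
 * Map @extent_logical/@extent_len to the physical address, device and
 * mirror number of the first stripe returned by btrfs_map_block().  If
 * the mapping fails, the output parameters are left untouched.
 */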
4062 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4063                                u64 extent_logical, u64 extent_len,
4064                                u64 *extent_physical,
4065                                struct btrfs_device **extent_dev,
4066                                int *extent_mirror_num)
4067 {
4068         u64 mapped_length;
4069         struct btrfs_bio *bbio = NULL;
4070         int ret;
4071
4072         mapped_length = extent_len;
4073         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4074                               &mapped_length, &bbio, 0);
4075         if (ret || !bbio || mapped_length < extent_len ||
4076             !bbio->stripes[0].dev->bdev) {
4077                 btrfs_put_bbio(bbio);
4078                 return;
4079         }
4080
4081         *extent_physical = bbio->stripes[0].physical;
4082         *extent_mirror_num = bbio->mirror_num;
4083         *extent_dev = bbio->stripes[0].dev;
4084         btrfs_put_bbio(bbio);
4085 }
4086
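/*
 * Initialize the write context used to send data to the replace target
 * device.  For a plain scrub (not a device replace) only the lock is set
 * up and no target device is assigned.
 */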
4087 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
4088                               struct scrub_wr_ctx *wr_ctx,
4089                               struct btrfs_fs_info *fs_info,
4090                               struct btrfs_device *dev,
4091                               int is_dev_replace)
4092 {
4093         WARN_ON(wr_ctx->wr_curr_bio != NULL);
4094
4095         mutex_init(&wr_ctx->wr_lock);
4096         wr_ctx->wr_curr_bio = NULL;
4097         if (!is_dev_replace)
4098                 return 0;
4099
4100         WARN_ON(!dev->bdev);
4101         wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
4102         wr_ctx->tgtdev = dev;
4103         atomic_set(&wr_ctx->flush_all_writes, 0);
4104         return 0;
4105 }
4106
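/*
 * Tear down the write context: free the current write bio, if any, under
 * the write lock.
 */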
4107 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
4108 {
4109         mutex_lock(&wr_ctx->wr_lock);
4110         kfree(wr_ctx->wr_curr_bio);
4111         wr_ctx->wr_curr_bio = NULL;
4112         mutex_unlock(&wr_ctx->wr_lock);
4113 }
4114
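/*
 * Queue a worker that copies the pages of the logical range
 * [@logical, @logical + @len) to @physical_for_dev_replace on the
 * replace target via the page cache (the "nocow" copy path).  Returns
 * -ENOMEM if the work context cannot be allocated.
 */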
4115 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4116                             int mirror_num, u64 physical_for_dev_replace)
4117 {
4118         struct scrub_copy_nocow_ctx *nocow_ctx;
4119         struct btrfs_fs_info *fs_info = sctx->fs_info;
4120
4121         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4122         if (!nocow_ctx) {
4123                 spin_lock(&sctx->stat_lock);
4124                 sctx->stat.malloc_errors++;
4125                 spin_unlock(&sctx->stat_lock);
4126                 return -ENOMEM;
4127         }
4128
4129         scrub_pending_trans_workers_inc(sctx);
4130
4131         nocow_ctx->sctx = sctx;
4132         nocow_ctx->logical = logical;
4133         nocow_ctx->len = len;
4134         nocow_ctx->mirror_num = mirror_num;
4135         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
4136         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4137                         copy_nocow_pages_worker, NULL, NULL);
4138         INIT_LIST_HEAD(&nocow_ctx->inodes);
4139         btrfs_queue_work(fs_info->scrub_nocow_workers,
4140                          &nocow_ctx->work);
4141
4142         return 0;
4143 }
4144
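/*
 * Callback for iterate_inodes_from_logical(): remember each inode
 * (root, inum, offset) that references the extent so that the worker can
 * copy its pages later.
 */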
4145 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4146 {
4147         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4148         struct scrub_nocow_inode *nocow_inode;
4149
4150         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4151         if (!nocow_inode)
4152                 return -ENOMEM;
4153         nocow_inode->inum = inum;
4154         nocow_inode->offset = offset;
4155         nocow_inode->root = root;
4156         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4157         return 0;
4158 }
4159
4160 #define COPY_COMPLETE 1
4161
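/*
 * Worker for the nocow copy path: collect all inodes that reference the
 * extent at @nocow_ctx->logical and copy their pages to the replace
 * target.  If the copy could not even be attempted (allocation,
 * transaction join or backref walk failure), the dev-replace
 * uncorrectable read error counter is bumped.
 */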
4162 static void copy_nocow_pages_worker(struct btrfs_work *work)
4163 {
4164         struct scrub_copy_nocow_ctx *nocow_ctx =
4165                 container_of(work, struct scrub_copy_nocow_ctx, work);
4166         struct scrub_ctx *sctx = nocow_ctx->sctx;
4167         struct btrfs_fs_info *fs_info = sctx->fs_info;
4168         struct btrfs_root *root = fs_info->extent_root;
4169         u64 logical = nocow_ctx->logical;
4170         u64 len = nocow_ctx->len;
4171         int mirror_num = nocow_ctx->mirror_num;
4172         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4173         int ret;
4174         struct btrfs_trans_handle *trans = NULL;
4175         struct btrfs_path *path;
4176         int not_written = 0;
4177
4178         path = btrfs_alloc_path();
4179         if (!path) {
4180                 spin_lock(&sctx->stat_lock);
4181                 sctx->stat.malloc_errors++;
4182                 spin_unlock(&sctx->stat_lock);
4183                 not_written = 1;
4184                 goto out;
4185         }
4186
4187         trans = btrfs_join_transaction(root);
4188         if (IS_ERR(trans)) {
4189                 not_written = 1;
4190                 goto out;
4191         }
4192
4193         ret = iterate_inodes_from_logical(logical, fs_info, path,
4194                                           record_inode_for_nocow, nocow_ctx);
4195         if (ret != 0 && ret != -ENOENT) {
4196                 btrfs_warn(fs_info,
4197                            "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4198                            logical, physical_for_dev_replace, len, mirror_num,
4199                            ret);
4200                 not_written = 1;
4201                 goto out;
4202         }
4203
4204         btrfs_end_transaction(trans);
4205         trans = NULL;
4206         while (!list_empty(&nocow_ctx->inodes)) {
4207                 struct scrub_nocow_inode *entry;
4208                 entry = list_first_entry(&nocow_ctx->inodes,
4209                                          struct scrub_nocow_inode,
4210                                          list);
4211                 list_del_init(&entry->list);
4212                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4213                                                  entry->root, nocow_ctx);
4214                 kfree(entry);
4215                 if (ret == COPY_COMPLETE) {
4216                         ret = 0;
4217                         break;
4218                 } else if (ret) {
4219                         break;
4220                 }
4221         }
4222 out:
4223         while (!list_empty(&nocow_ctx->inodes)) {
4224                 struct scrub_nocow_inode *entry;
4225                 entry = list_first_entry(&nocow_ctx->inodes,
4226                                          struct scrub_nocow_inode,
4227                                          list);
4228                 list_del_init(&entry->list);
4229                 kfree(entry);
4230         }
4231         if (trans && !IS_ERR(trans))
4232                 btrfs_end_transaction(trans);
4233         if (not_written)
4234                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4235                                             num_uncorrectable_read_errors);
4236
4237         btrfs_free_path(path);
4238         kfree(nocow_ctx);
4239
4240         scrub_pending_trans_workers_dec(sctx);
4241 }
4242
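/*
 * Check that the file range [@start, @start + @len) still maps to the
 * block at @logical.  Returns 0 if it does, 1 if an ordered extent is
 * pending or the mapping has changed (the caller should skip this
 * inode), or a negative error.
 */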
4243 static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4244                                  u64 logical)
4245 {
4246         struct extent_state *cached_state = NULL;
4247         struct btrfs_ordered_extent *ordered;
4248         struct extent_io_tree *io_tree;
4249         struct extent_map *em;
4250         u64 lockstart = start, lockend = start + len - 1;
4251         int ret = 0;
4252
4253         io_tree = &BTRFS_I(inode)->io_tree;
4254
4255         lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
4256         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4257         if (ordered) {
4258                 btrfs_put_ordered_extent(ordered);
4259                 ret = 1;
4260                 goto out_unlock;
4261         }
4262
4263         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4264         if (IS_ERR(em)) {
4265                 ret = PTR_ERR(em);
4266                 goto out_unlock;
4267         }
4268
4269         /*
4270          * This extent does not actually cover the logical extent anymore,
4271          * move on to the next inode.
4272          */
4273         if (em->block_start > logical ||
4274             em->block_start + em->block_len < logical + len) {
4275                 free_extent_map(em);
4276                 ret = 1;
4277                 goto out_unlock;
4278         }
4279         free_extent_map(em);
4280
4281 out_unlock:
4282         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4283                              GFP_NOFS);
4284         return ret;
4285 }
4286
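/*
 * Copy the pages of one inode that references the nocow extent: dirty
 * pages are skipped, pages that are not yet uptodate are read in first,
 * and the page contents are then written to the replace target with
 * write_page_nocow().  Returns COPY_COMPLETE when the whole range was
 * copied, 0 if this inode no longer covers the extent (the caller moves
 * on to the next one), or a negative error.
 */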
4287 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4288                                       struct scrub_copy_nocow_ctx *nocow_ctx)
4289 {
4290         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
4291         struct btrfs_key key;
4292         struct inode *inode;
4293         struct page *page;
4294         struct btrfs_root *local_root;
4295         struct extent_io_tree *io_tree;
4296         u64 physical_for_dev_replace;
4297         u64 nocow_ctx_logical;
4298         u64 len = nocow_ctx->len;
4299         unsigned long index;
4300         int srcu_index;
4301         int ret = 0;
4302         int err = 0;
4303
4304         key.objectid = root;
4305         key.type = BTRFS_ROOT_ITEM_KEY;
4306         key.offset = (u64)-1;
4307
4308         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4309
4310         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4311         if (IS_ERR(local_root)) {
4312                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4313                 return PTR_ERR(local_root);
4314         }
4315
4316         key.type = BTRFS_INODE_ITEM_KEY;
4317         key.objectid = inum;
4318         key.offset = 0;
4319         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4320         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4321         if (IS_ERR(inode))
4322                 return PTR_ERR(inode);
4323
4324         /* Avoid racing with truncate/dio/punch hole. */
4325         inode_lock(inode);
4326         inode_dio_wait(inode);
4327
4328         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4329         io_tree = &BTRFS_I(inode)->io_tree;
4330         nocow_ctx_logical = nocow_ctx->logical;
4331
4332         ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
4333         if (ret) {
4334                 ret = ret > 0 ? 0 : ret;
4335                 goto out;
4336         }
4337
4338         while (len >= PAGE_SIZE) {
4339                 index = offset >> PAGE_SHIFT;
4340 again:
4341                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4342                 if (!page) {
4343                         btrfs_err(fs_info, "find_or_create_page() failed");
4344                         ret = -ENOMEM;
4345                         goto out;
4346                 }
4347
4348                 if (PageUptodate(page)) {
4349                         if (PageDirty(page))
4350                                 goto next_page;
4351                 } else {
4352                         ClearPageError(page);
4353                         err = extent_read_full_page(io_tree, page,
4354                                                            btrfs_get_extent,
4355                                                            nocow_ctx->mirror_num);
4356                         if (err) {
4357                                 ret = err;
4358                                 goto next_page;
4359                         }
4360
4361                         lock_page(page);
4362                         /*
4363                          * If the page has been removed from the page
4364                          * cache, the data on it is meaningless: it may be
4365                          * stale, and the new data may have been written
4366                          * into a new page in the page cache.
4367                          */
4368                         if (page->mapping != inode->i_mapping) {
4369                                 unlock_page(page);
4370                                 put_page(page);
4371                                 goto again;
4372                         }
4373                         if (!PageUptodate(page)) {
4374                                 ret = -EIO;
4375                                 goto next_page;
4376                         }
4377                 }
4378
4379                 ret = check_extent_to_block(inode, offset, len,
4380                                             nocow_ctx_logical);
4381                 if (ret) {
4382                         ret = ret > 0 ? 0 : ret;
4383                         goto next_page;
4384                 }
4385
4386                 err = write_page_nocow(nocow_ctx->sctx,
4387                                        physical_for_dev_replace, page);
4388                 if (err)
4389                         ret = err;
4390 next_page:
4391                 unlock_page(page);
4392                 put_page(page);
4393
4394                 if (ret)
4395                         break;
4396
4397                 offset += PAGE_SIZE;
4398                 physical_for_dev_replace += PAGE_SIZE;
4399                 nocow_ctx_logical += PAGE_SIZE;
4400                 len -= PAGE_SIZE;
4401         }
4402         ret = COPY_COMPLETE;
4403 out:
4404         inode_unlock(inode);
4405         iput(inode);
4406         return ret;
4407 }
4408
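/*
 * Synchronously write a single page to @physical_for_dev_replace on the
 * replace target device.  Returns -ENOMEM if no bio could be allocated
 * and -EIO if there is no usable target device or the write fails; write
 * failures also bump the device's write error counter.
 */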
4409 static int write_page_nocow(struct scrub_ctx *sctx,
4410                             u64 physical_for_dev_replace, struct page *page)
4411 {
4412         struct bio *bio;
4413         struct btrfs_device *dev;
4414         int ret;
4415
4416         dev = sctx->wr_ctx.tgtdev;
4417         if (!dev)
4418                 return -EIO;
4419         if (!dev->bdev) {
4420                 btrfs_warn_rl(dev->fs_info,
4421                         "scrub write_page_nocow(bdev == NULL) is unexpected");
4422                 return -EIO;
4423         }
4424         bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
4425         if (!bio) {
4426                 spin_lock(&sctx->stat_lock);
4427                 sctx->stat.malloc_errors++;
4428                 spin_unlock(&sctx->stat_lock);
4429                 return -ENOMEM;
4430         }
4431         bio->bi_iter.bi_size = 0;
4432         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4433         bio->bi_bdev = dev->bdev;
4434         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
4435         ret = bio_add_page(bio, page, PAGE_SIZE, 0);
4436         if (ret != PAGE_SIZE) {
4437 leave_with_eio:
4438                 bio_put(bio);
4439                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4440                 return -EIO;
4441         }
4442
4443         if (btrfsic_submit_bio_wait(bio))
4444                 goto leave_with_eio;
4445
4446         bio_put(bio);
4447         return 0;
4448 }