btrfs: remove redundant calls to flush_dcache_page
[linux-block.git] / fs / btrfs / raid56.c
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
53b381b3
DW
2/*
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
53b381b3 5 */
c1d7c514 6
53b381b3 7#include <linux/sched.h>
53b381b3
DW
8#include <linux/bio.h>
9#include <linux/slab.h>
53b381b3 10#include <linux/blkdev.h>
53b381b3
DW
11#include <linux/raid/pq.h>
12#include <linux/hash.h>
13#include <linux/list_sort.h>
14#include <linux/raid/xor.h>
818e010b 15#include <linux/mm.h>
cea62800 16#include "misc.h"
53b381b3 17#include "ctree.h"
53b381b3 18#include "disk-io.h"
53b381b3
DW
19#include "volumes.h"
20#include "raid56.h"
21#include "async-thread.h"
53b381b3
DW
22
23/* set when additional merges to this rbio are not allowed */
24#define RBIO_RMW_LOCKED_BIT 1
25
4ae10b3a
CM
26/*
27 * set when this rbio is sitting in the hash, but it is just a cache
28 * of past RMW
29 */
30#define RBIO_CACHE_BIT 2
31
32/*
33 * set when it is safe to trust the stripe_pages for caching
34 */
35#define RBIO_CACHE_READY_BIT 3
36
4ae10b3a
CM
37#define RBIO_CACHE_SIZE 1024
38
8a953348
DS
39#define BTRFS_STRIPE_HASH_TABLE_BITS 11
40
41/* Used by the raid56 code to lock stripes for read/modify/write */
42struct btrfs_stripe_hash {
43 struct list_head hash_list;
44 spinlock_t lock;
45};
46
47/* Used by the raid56 code to lock stripes for read/modify/write */
48struct btrfs_stripe_hash_table {
49 struct list_head stripe_cache;
50 spinlock_t cache_lock;
51 int cache_size;
52 struct btrfs_stripe_hash table[];
53};
54
eb357060
QW
55/*
56 * A bvec like structure to present a sector inside a page.
57 *
58 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
59 */
60struct sector_ptr {
61 struct page *page;
00425dd9
QW
62 unsigned int pgoff:24;
63 unsigned int uptodate:8;
eb357060
QW
64};
65
1b94b556 66enum btrfs_rbio_ops {
b4ee1782
OS
67 BTRFS_RBIO_WRITE,
68 BTRFS_RBIO_READ_REBUILD,
69 BTRFS_RBIO_PARITY_SCRUB,
70 BTRFS_RBIO_REBUILD_MISSING,
1b94b556
MX
71};
72
53b381b3 73struct btrfs_raid_bio {
4c664611 74 struct btrfs_io_context *bioc;
53b381b3 75
53b381b3
DW
76 /* while we're doing rmw on a stripe
77 * we put it into a hash table so we can
78 * lock the stripe and merge more rbios
79 * into it.
80 */
81 struct list_head hash_list;
82
4ae10b3a
CM
83 /*
84 * LRU list for the stripe cache
85 */
86 struct list_head stripe_cache;
87
53b381b3
DW
88 /*
89 * for scheduling work in the helper threads
90 */
385de0ef 91 struct work_struct work;
53b381b3
DW
92
93 /*
94 * bio list and bio_list_lock are used
95 * to add more bios into the stripe
96 * in hopes of avoiding the full rmw
97 */
98 struct bio_list bio_list;
99 spinlock_t bio_list_lock;
100
6ac0f488
CM
101 /* also protected by the bio_list_lock, the
102 * plug list is used by the plugging code
103 * to collect partial bios while plugged. The
104 * stripe locking code also uses it to hand off
53b381b3
DW
105 * the stripe lock to the next pending IO
106 */
107 struct list_head plug_list;
108
109 /*
110 * flags that tell us if it is safe to
111 * merge with this bio
112 */
113 unsigned long flags;
114
53b381b3
DW
115 /*
116 * set if we're doing a parity rebuild
117 * for a read from higher up, which is handled
118 * differently from a parity rebuild as part of
119 * rmw
120 */
1b94b556 121 enum btrfs_rbio_ops operation;
53b381b3 122
29b06838
QW
123 /* Size of each individual stripe on disk */
124 u32 stripe_len;
53b381b3 125
29b06838
QW
126 /* How many pages there are for the full stripe including P/Q */
127 u16 nr_pages;
53b381b3 128
94efbe19
QW
129 /* How many sectors there are for the full stripe including P/Q */
130 u16 nr_sectors;
131
29b06838
QW
132 /* Number of data stripes (no p/q) */
133 u8 nr_data;
134
143823cf 135 /* Number of all stripes (including P/Q) */
29b06838
QW
136 u8 real_stripes;
137
138 /* How many pages there are for each stripe */
139 u8 stripe_npages;
140
94efbe19
QW
141 /* How many sectors there are for each stripe */
142 u8 stripe_nsectors;
143
29b06838
QW
144 /* First bad stripe, -1 means no corruption */
145 s8 faila;
146
147 /* Second bad stripe (for RAID6 use) */
148 s8 failb;
149
150 /* Stripe number that we're scrubbing */
151 u8 scrubp;
53b381b3
DW
152
153 /*
154 * size of all the bios in the bio_list. This
155 * helps us decide if the rbio maps to a full
156 * stripe or not
157 */
158 int bio_list_bytes;
159
4245215d
MX
160 int generic_bio_cnt;
161
dec95574 162 refcount_t refs;
53b381b3 163
b89e1b01
MX
164 atomic_t stripes_pending;
165
166 atomic_t error;
c67c68eb
QW
167
168 /* Bitmap to record which horizontal stripe has data */
169 unsigned long dbitmap;
170
171 /* Allocated with stripe_nsectors-many bits for finish_*() calls */
172 unsigned long finish_pbitmap;
173
53b381b3
DW
174 /*
175 * These are arrays of pointers. We allocate the
176 * rbio big enough to hold them all and set up their
177 * locations when the rbio is allocated
178 */
179
180 /* pointers to pages that we allocated for
181 * reading/writing stripes directly from the disk (including P/Q)
182 */
183 struct page **stripe_pages;
184
00425dd9
QW
185 /* Pointers to the sectors in the bio_list, for faster lookup */
186 struct sector_ptr *bio_sectors;
187
5a6ac9ea 188 /*
eb357060
QW
189 * For subpage support, we need to map each sector to above
190 * stripe_pages.
5a6ac9ea 191 */
eb357060
QW
192 struct sector_ptr *stripe_sectors;
193
1389053e
KC
194 /* allocated with real_stripes-many pointers for finish_*() calls */
195 void **finish_pointers;
53b381b3
DW
196};
197
198static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
199static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
385de0ef
CH
200static void rmw_work(struct work_struct *work);
201static void read_rebuild_work(struct work_struct *work);
53b381b3
DW
202static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
203static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
204static void __free_raid_bio(struct btrfs_raid_bio *rbio);
205static void index_rbio_pages(struct btrfs_raid_bio *rbio);
206static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
207
5a6ac9ea
MX
208static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
209 int need_check);
385de0ef 210static void scrub_parity_work(struct work_struct *work);
5a6ac9ea 211
385de0ef 212static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
ac638859 213{
385de0ef
CH
214 INIT_WORK(&rbio->work, work_func);
215 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
ac638859
DS
216}
217
53b381b3
DW
218/*
219 * the stripe hash table is used for locking, and to collect
220 * bios in hopes of making a full stripe
221 */
222int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
223{
224 struct btrfs_stripe_hash_table *table;
225 struct btrfs_stripe_hash_table *x;
226 struct btrfs_stripe_hash *cur;
227 struct btrfs_stripe_hash *h;
228 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
229 int i;
230
231 if (info->stripe_hash_table)
232 return 0;
233
83c8266a
DS
234 /*
235 * The table is large, starting with order 4 and can go as high as
236 * order 7 in case lock debugging is turned on.
237 *
238 * Try harder to allocate and fallback to vmalloc to lower the chance
239 * of a failing mount.
240 */
ee787f95 241 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
818e010b
DS
242 if (!table)
243 return -ENOMEM;
53b381b3 244
4ae10b3a
CM
245 spin_lock_init(&table->cache_lock);
246 INIT_LIST_HEAD(&table->stripe_cache);
247
53b381b3
DW
248 h = table->table;
249
250 for (i = 0; i < num_entries; i++) {
251 cur = h + i;
252 INIT_LIST_HEAD(&cur->hash_list);
253 spin_lock_init(&cur->lock);
53b381b3
DW
254 }
255
256 x = cmpxchg(&info->stripe_hash_table, NULL, table);
fe3b7bb0 257 kvfree(x);
53b381b3
DW
258 return 0;
259}
260
4ae10b3a
CM
261/*
262 * caching an rbio means to copy anything from the
ac26df8b 263 * bio_sectors array into the stripe_pages array. We
4ae10b3a
CM
264 * use the page uptodate bit in the stripe cache array
265 * to indicate if it has valid data
266 *
267 * once the caching is done, we set the cache ready
268 * bit.
269 */
270static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
271{
272 int i;
4ae10b3a
CM
273 int ret;
274
275 ret = alloc_rbio_pages(rbio);
276 if (ret)
277 return;
278
00425dd9
QW
279 for (i = 0; i < rbio->nr_sectors; i++) {
280 /* Some range not covered by bio (partial write), skip it */
281 if (!rbio->bio_sectors[i].page)
282 continue;
283
284 ASSERT(rbio->stripe_sectors[i].page);
285 memcpy_page(rbio->stripe_sectors[i].page,
286 rbio->stripe_sectors[i].pgoff,
287 rbio->bio_sectors[i].page,
288 rbio->bio_sectors[i].pgoff,
289 rbio->bioc->fs_info->sectorsize);
290 rbio->stripe_sectors[i].uptodate = 1;
291 }
4ae10b3a
CM
292 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
293}
294
53b381b3
DW
295/*
296 * we hash on the first logical address of the stripe
297 */
298static int rbio_bucket(struct btrfs_raid_bio *rbio)
299{
4c664611 300 u64 num = rbio->bioc->raid_map[0];
53b381b3
DW
301
302 /*
303 * we shift down quite a bit. We're using byte
304 * addressing, and most of the lower bits are zeros.
305 * This tends to upset hash_64, and it consistently
306 * returns just one or two different values.
307 *
308 * shifting off the lower bits fixes things.
309 */
310 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
311}
312
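For context, here is a minimal user-space sketch (not part of raid56.c) of the bucketing above. The multiplier merely stands in for the kernel's hash_64() and the sample addresses are invented:

/* Illustrative sketch only -- mirrors rbio_bucket(), not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define TABLE_BITS 11	/* BTRFS_STRIPE_HASH_TABLE_BITS */

static unsigned int bucket_for(uint64_t logical)
{
	/*
	 * Full-stripe starts are byte addresses whose low bits are always
	 * zero, so hash the value shifted down by 16 bits instead.
	 */
	uint64_t v = logical >> 16;

	/* stand-in for hash_64(): multiply-shift hash kept to TABLE_BITS */
	return (unsigned int)((v * 0x61C8864680B583EBULL) >> (64 - TABLE_BITS));
}

int main(void)
{
	/* hypothetical full-stripe start addresses, 128 KiB apart */
	for (int i = 0; i < 4; i++)
		printf("stripe %d -> bucket %u\n", i,
		       bucket_for(0x40000000ULL + (uint64_t)i * 0x20000));
	return 0;
}
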
d4e28d9b
QW
313static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
314 unsigned int page_nr)
315{
316 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
317 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
318 int i;
319
320 ASSERT(page_nr < rbio->nr_pages);
321
322 for (i = sectors_per_page * page_nr;
323 i < sectors_per_page * page_nr + sectors_per_page;
324 i++) {
325 if (!rbio->stripe_sectors[i].uptodate)
326 return false;
327 }
328 return true;
329}
330
eb357060
QW
331/*
332 * Update the stripe_sectors[] array to use correct page and pgoff
333 *
334 * Should be called every time any page pointer in stripe_pages[] got modified.
335 */
336static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
337{
338 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
339 u32 offset;
340 int i;
341
342 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
343 int page_index = offset >> PAGE_SHIFT;
344
345 ASSERT(page_index < rbio->nr_pages);
346 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
347 rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
348 }
349}
350
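A small stand-alone sketch (not part of the file) of the sector-to-page mapping built above, assuming 4 KiB sectors; the 64 KiB page size is a made-up value chosen to show the subpage case where one page holds several sectors:

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize = 4096;	/* assumed 4 KiB sectors */
	const unsigned int page_size = 65536;	/* hypothetical 64 KiB pages */
	unsigned int offset = 0;

	for (int i = 0; i < 20; i++, offset += sectorsize)
		printf("sector %2d -> page %u, pgoff %5u\n",
		       i, offset / page_size, offset % page_size);
	/* sectors 0..15 land in page 0; sector 16 starts page 1 */
	return 0;
}
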
4ae10b3a 351/*
d4e28d9b
QW
352 * Stealing an rbio means taking all the uptodate pages from the stripe array
353 * in the source rbio and putting them into the destination rbio.
354 *
355 * This will also update the involved stripe_sectors[] which are referring to
356 * the old pages.
4ae10b3a
CM
357 */
358static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
359{
360 int i;
361 struct page *s;
362 struct page *d;
363
364 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
365 return;
366
367 for (i = 0; i < dest->nr_pages; i++) {
368 s = src->stripe_pages[i];
d4e28d9b 369 if (!s || !full_page_sectors_uptodate(src, i))
4ae10b3a 370 continue;
4ae10b3a
CM
371
372 d = dest->stripe_pages[i];
373 if (d)
374 __free_page(d);
375
376 dest->stripe_pages[i] = s;
377 src->stripe_pages[i] = NULL;
378 }
eb357060
QW
379 index_stripe_sectors(dest);
380 index_stripe_sectors(src);
4ae10b3a
CM
381}
382
53b381b3
DW
383/*
384 * merging means we take the bio_list from the victim and
385 * splice it into the destination. The victim should
386 * be discarded afterwards.
387 *
388 * must be called with dest->bio_list_lock held
389 */
390static void merge_rbio(struct btrfs_raid_bio *dest,
391 struct btrfs_raid_bio *victim)
392{
393 bio_list_merge(&dest->bio_list, &victim->bio_list);
394 dest->bio_list_bytes += victim->bio_list_bytes;
bd8f7e62
QW
395 /* Also inherit the bitmaps from @victim. */
396 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
397 dest->stripe_nsectors);
4245215d 398 dest->generic_bio_cnt += victim->generic_bio_cnt;
53b381b3
DW
399 bio_list_init(&victim->bio_list);
400}
401
402/*
4ae10b3a
CM
403 * used to prune items that are in the cache. The caller
404 * must hold the hash table lock.
405 */
406static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
407{
408 int bucket = rbio_bucket(rbio);
409 struct btrfs_stripe_hash_table *table;
410 struct btrfs_stripe_hash *h;
411 int freeit = 0;
412
413 /*
414 * check the bit again under the hash table lock.
415 */
416 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
417 return;
418
6a258d72 419 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
420 h = table->table + bucket;
421
422 /* hold the lock for the bucket because we may be
423 * removing it from the hash table
424 */
425 spin_lock(&h->lock);
426
427 /*
428 * hold the lock for the bio list because we need
429 * to make sure the bio list is empty
430 */
431 spin_lock(&rbio->bio_list_lock);
432
433 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
434 list_del_init(&rbio->stripe_cache);
435 table->cache_size -= 1;
436 freeit = 1;
437
438 /* if the bio list isn't empty, this rbio is
439 * still involved in an IO. We take it out
440 * of the cache list, and drop the ref that
441 * was held for the list.
442 *
443 * If the bio_list was empty, we also remove
444 * the rbio from the hash_table, and drop
445 * the corresponding ref
446 */
447 if (bio_list_empty(&rbio->bio_list)) {
448 if (!list_empty(&rbio->hash_list)) {
449 list_del_init(&rbio->hash_list);
dec95574 450 refcount_dec(&rbio->refs);
4ae10b3a
CM
451 BUG_ON(!list_empty(&rbio->plug_list));
452 }
453 }
454 }
455
456 spin_unlock(&rbio->bio_list_lock);
457 spin_unlock(&h->lock);
458
459 if (freeit)
460 __free_raid_bio(rbio);
461}
462
463/*
464 * prune a given rbio from the cache
465 */
466static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
467{
468 struct btrfs_stripe_hash_table *table;
469 unsigned long flags;
470
471 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
472 return;
473
6a258d72 474 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
475
476 spin_lock_irqsave(&table->cache_lock, flags);
477 __remove_rbio_from_cache(rbio);
478 spin_unlock_irqrestore(&table->cache_lock, flags);
479}
480
481/*
482 * remove everything in the cache
483 */
48a3b636 484static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
4ae10b3a
CM
485{
486 struct btrfs_stripe_hash_table *table;
487 unsigned long flags;
488 struct btrfs_raid_bio *rbio;
489
490 table = info->stripe_hash_table;
491
492 spin_lock_irqsave(&table->cache_lock, flags);
493 while (!list_empty(&table->stripe_cache)) {
494 rbio = list_entry(table->stripe_cache.next,
495 struct btrfs_raid_bio,
496 stripe_cache);
497 __remove_rbio_from_cache(rbio);
498 }
499 spin_unlock_irqrestore(&table->cache_lock, flags);
500}
501
502/*
503 * remove all cached entries and free the hash table
504 * used by unmount
53b381b3
DW
505 */
506void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
507{
508 if (!info->stripe_hash_table)
509 return;
4ae10b3a 510 btrfs_clear_rbio_cache(info);
f749303b 511 kvfree(info->stripe_hash_table);
53b381b3
DW
512 info->stripe_hash_table = NULL;
513}
514
4ae10b3a
CM
515/*
516 * insert an rbio into the stripe cache. It
517 * must have already been prepared by calling
518 * cache_rbio_pages
519 *
520 * If this rbio was already cached, it gets
521 * moved to the front of the lru.
522 *
523 * If the size of the rbio cache is too big, we
524 * prune an item.
525 */
526static void cache_rbio(struct btrfs_raid_bio *rbio)
527{
528 struct btrfs_stripe_hash_table *table;
529 unsigned long flags;
530
531 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
532 return;
533
6a258d72 534 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
535
536 spin_lock_irqsave(&table->cache_lock, flags);
537 spin_lock(&rbio->bio_list_lock);
538
539 /* bump our ref if we were not in the list before */
540 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
dec95574 541 refcount_inc(&rbio->refs);
4ae10b3a
CM
542
543 if (!list_empty(&rbio->stripe_cache)){
544 list_move(&rbio->stripe_cache, &table->stripe_cache);
545 } else {
546 list_add(&rbio->stripe_cache, &table->stripe_cache);
547 table->cache_size += 1;
548 }
549
550 spin_unlock(&rbio->bio_list_lock);
551
552 if (table->cache_size > RBIO_CACHE_SIZE) {
553 struct btrfs_raid_bio *found;
554
555 found = list_entry(table->stripe_cache.prev,
556 struct btrfs_raid_bio,
557 stripe_cache);
558
559 if (found != rbio)
560 __remove_rbio_from_cache(found);
561 }
562
563 spin_unlock_irqrestore(&table->cache_lock, flags);
4ae10b3a
CM
564}
565
53b381b3
DW
566/*
567 * helper function to run the xor_blocks api. It is only
568 * able to do MAX_XOR_BLOCKS at a time, so we need to
569 * loop through.
570 */
571static void run_xor(void **pages, int src_cnt, ssize_t len)
572{
573 int src_off = 0;
574 int xor_src_cnt = 0;
575 void *dest = pages[src_cnt];
576
577 while(src_cnt > 0) {
578 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
579 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
580
581 src_cnt -= xor_src_cnt;
582 src_off += xor_src_cnt;
583 }
584}
585
586/*
176571a1
DS
587 * Returns true if the bio list inside this rbio covers an entire stripe (no
588 * rmw required).
53b381b3 589 */
176571a1 590static int rbio_is_full(struct btrfs_raid_bio *rbio)
53b381b3 591{
176571a1 592 unsigned long flags;
53b381b3
DW
593 unsigned long size = rbio->bio_list_bytes;
594 int ret = 1;
595
176571a1 596 spin_lock_irqsave(&rbio->bio_list_lock, flags);
53b381b3
DW
597 if (size != rbio->nr_data * rbio->stripe_len)
598 ret = 0;
53b381b3 599 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
53b381b3 600 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
176571a1 601
53b381b3
DW
602 return ret;
603}
604
605/*
606 * returns 1 if it is safe to merge two rbios together.
607 * The merging is safe if the two rbios correspond to
608 * the same stripe and if they are both going in the same
609 * direction (read vs write), and if neither one is
610 * locked for final IO
611 *
612 * The caller is responsible for locking such that
613 * rmw_locked is safe to test
614 */
615static int rbio_can_merge(struct btrfs_raid_bio *last,
616 struct btrfs_raid_bio *cur)
617{
618 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
619 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
620 return 0;
621
4ae10b3a
CM
622 /*
623 * we can't merge with cached rbios, since the
624 * idea is that when we merge the destination
625 * rbio is going to run our IO for us. We can
01327610 626 * steal from cached rbios though, other functions
4ae10b3a
CM
627 * handle that.
628 */
629 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
630 test_bit(RBIO_CACHE_BIT, &cur->flags))
631 return 0;
632
4c664611 633 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
53b381b3
DW
634 return 0;
635
5a6ac9ea
MX
636 /* we can't merge with different operations */
637 if (last->operation != cur->operation)
638 return 0;
639 /*
640 * We need to read the full stripe from the drive, then
641 * check and repair the parity and write the new results.
642 *
643 * We're not allowed to add any new bios to the
644 * bio list here, anyone else that wants to
645 * change this stripe needs to do their own rmw.
646 */
db34be19 647 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
53b381b3 648 return 0;
53b381b3 649
db34be19 650 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
b4ee1782
OS
651 return 0;
652
cc54ff62
LB
653 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
654 int fa = last->faila;
655 int fb = last->failb;
656 int cur_fa = cur->faila;
657 int cur_fb = cur->failb;
658
659 if (last->faila >= last->failb) {
660 fa = last->failb;
661 fb = last->faila;
662 }
663
664 if (cur->faila >= cur->failb) {
665 cur_fa = cur->failb;
666 cur_fb = cur->faila;
667 }
668
669 if (fa != cur_fa || fb != cur_fb)
670 return 0;
671 }
53b381b3
DW
672 return 1;
673}
674
3e77605d
QW
675static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
676 unsigned int stripe_nr,
677 unsigned int sector_nr)
678{
679 ASSERT(stripe_nr < rbio->real_stripes);
680 ASSERT(sector_nr < rbio->stripe_nsectors);
681
682 return stripe_nr * rbio->stripe_nsectors + sector_nr;
683}
684
685/* Return a sector from rbio->stripe_sectors, not from the bio list */
686static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
687 unsigned int stripe_nr,
688 unsigned int sector_nr)
689{
690 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
691 sector_nr)];
692}
693
1145059a
QW
694/* Grab a sector inside P stripe */
695static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
696 unsigned int sector_nr)
b7178a5f 697{
1145059a 698 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
b7178a5f
ZL
699}
700
1145059a
QW
701/* Grab a sector inside Q stripe, return NULL if not RAID6 */
702static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
703 unsigned int sector_nr)
53b381b3 704{
1145059a
QW
705 if (rbio->nr_data + 1 == rbio->real_stripes)
706 return NULL;
707 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
53b381b3
DW
708}
709
53b381b3
DW
710/*
711 * The first stripe in the table for a logical address
712 * has the lock. rbios are added in one of three ways:
713 *
714 * 1) Nobody has the stripe locked yet. The rbio is given
715 * the lock and 0 is returned. The caller must start the IO
716 * themselves.
717 *
718 * 2) Someone has the stripe locked, but we're able to merge
719 * with the lock owner. The rbio is freed and the IO will
720 * start automatically along with the existing rbio. 1 is returned.
721 *
722 * 3) Someone has the stripe locked, but we're not able to merge.
723 * The rbio is added to the lock owner's plug list, or merged into
724 * an rbio already on the plug list. When the lock owner unlocks,
725 * the next rbio on the list is run and the IO is started automatically.
726 * 1 is returned
727 *
728 * If we return 0, the caller still owns the rbio and must continue with
729 * IO submission. If we return 1, the caller must assume the rbio has
730 * already been freed.
731 */
732static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
733{
721860d5 734 struct btrfs_stripe_hash *h;
53b381b3
DW
735 struct btrfs_raid_bio *cur;
736 struct btrfs_raid_bio *pending;
737 unsigned long flags;
53b381b3 738 struct btrfs_raid_bio *freeit = NULL;
4ae10b3a 739 struct btrfs_raid_bio *cache_drop = NULL;
53b381b3 740 int ret = 0;
53b381b3 741
6a258d72 742 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
721860d5 743
53b381b3
DW
744 spin_lock_irqsave(&h->lock, flags);
745 list_for_each_entry(cur, &h->hash_list, hash_list) {
4c664611 746 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
9d6cb1b0 747 continue;
4ae10b3a 748
9d6cb1b0 749 spin_lock(&cur->bio_list_lock);
4ae10b3a 750
9d6cb1b0
JT
751 /* Can we steal this cached rbio's pages? */
752 if (bio_list_empty(&cur->bio_list) &&
753 list_empty(&cur->plug_list) &&
754 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
755 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
756 list_del_init(&cur->hash_list);
757 refcount_dec(&cur->refs);
53b381b3 758
9d6cb1b0
JT
759 steal_rbio(cur, rbio);
760 cache_drop = cur;
761 spin_unlock(&cur->bio_list_lock);
4ae10b3a 762
9d6cb1b0
JT
763 goto lockit;
764 }
53b381b3 765
9d6cb1b0
JT
766 /* Can we merge into the lock owner? */
767 if (rbio_can_merge(cur, rbio)) {
768 merge_rbio(cur, rbio);
53b381b3 769 spin_unlock(&cur->bio_list_lock);
9d6cb1b0 770 freeit = rbio;
53b381b3
DW
771 ret = 1;
772 goto out;
773 }
9d6cb1b0
JT
774
775
776 /*
777 * We couldn't merge with the running rbio, see if we can merge
778 * with the pending ones. We don't have to check for rmw_locked
779 * because there is no way they are inside finish_rmw right now
780 */
781 list_for_each_entry(pending, &cur->plug_list, plug_list) {
782 if (rbio_can_merge(pending, rbio)) {
783 merge_rbio(pending, rbio);
784 spin_unlock(&cur->bio_list_lock);
785 freeit = rbio;
786 ret = 1;
787 goto out;
788 }
789 }
790
791 /*
792 * No merging, put us on the tail of the plug list, our rbio
793 * will be started when the currently running rbio unlocks
794 */
795 list_add_tail(&rbio->plug_list, &cur->plug_list);
796 spin_unlock(&cur->bio_list_lock);
797 ret = 1;
798 goto out;
53b381b3 799 }
4ae10b3a 800lockit:
dec95574 801 refcount_inc(&rbio->refs);
53b381b3
DW
802 list_add(&rbio->hash_list, &h->hash_list);
803out:
804 spin_unlock_irqrestore(&h->lock, flags);
4ae10b3a
CM
805 if (cache_drop)
806 remove_rbio_from_cache(cache_drop);
53b381b3
DW
807 if (freeit)
808 __free_raid_bio(freeit);
809 return ret;
810}
811
812/*
813 * called as rmw or parity rebuild is completed. If the plug list has more
814 * rbios waiting for this stripe, the next one on the list will be started
815 */
816static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
817{
818 int bucket;
819 struct btrfs_stripe_hash *h;
820 unsigned long flags;
4ae10b3a 821 int keep_cache = 0;
53b381b3
DW
822
823 bucket = rbio_bucket(rbio);
6a258d72 824 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
53b381b3 825
4ae10b3a
CM
826 if (list_empty(&rbio->plug_list))
827 cache_rbio(rbio);
828
53b381b3
DW
829 spin_lock_irqsave(&h->lock, flags);
830 spin_lock(&rbio->bio_list_lock);
831
832 if (!list_empty(&rbio->hash_list)) {
4ae10b3a
CM
833 /*
834 * if we're still cached and there is no other IO
835 * to perform, just leave this rbio here for others
836 * to steal from later
837 */
838 if (list_empty(&rbio->plug_list) &&
839 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
840 keep_cache = 1;
841 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
842 BUG_ON(!bio_list_empty(&rbio->bio_list));
843 goto done;
844 }
53b381b3
DW
845
846 list_del_init(&rbio->hash_list);
dec95574 847 refcount_dec(&rbio->refs);
53b381b3
DW
848
849 /*
850 * we use the plug list to hold all the rbios
851 * waiting for the chance to lock this stripe.
852 * hand the lock over to one of them.
853 */
854 if (!list_empty(&rbio->plug_list)) {
855 struct btrfs_raid_bio *next;
856 struct list_head *head = rbio->plug_list.next;
857
858 next = list_entry(head, struct btrfs_raid_bio,
859 plug_list);
860
861 list_del_init(&rbio->plug_list);
862
863 list_add(&next->hash_list, &h->hash_list);
dec95574 864 refcount_inc(&next->refs);
53b381b3
DW
865 spin_unlock(&rbio->bio_list_lock);
866 spin_unlock_irqrestore(&h->lock, flags);
867
1b94b556 868 if (next->operation == BTRFS_RBIO_READ_REBUILD)
e66d8d5a 869 start_async_work(next, read_rebuild_work);
b4ee1782
OS
870 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
871 steal_rbio(rbio, next);
e66d8d5a 872 start_async_work(next, read_rebuild_work);
b4ee1782 873 } else if (next->operation == BTRFS_RBIO_WRITE) {
4ae10b3a 874 steal_rbio(rbio, next);
cf6a4a75 875 start_async_work(next, rmw_work);
5a6ac9ea
MX
876 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
877 steal_rbio(rbio, next);
a81b747d 878 start_async_work(next, scrub_parity_work);
4ae10b3a 879 }
53b381b3
DW
880
881 goto done_nolock;
53b381b3
DW
882 }
883 }
4ae10b3a 884done:
53b381b3
DW
885 spin_unlock(&rbio->bio_list_lock);
886 spin_unlock_irqrestore(&h->lock, flags);
887
888done_nolock:
4ae10b3a
CM
889 if (!keep_cache)
890 remove_rbio_from_cache(rbio);
53b381b3
DW
891}
892
893static void __free_raid_bio(struct btrfs_raid_bio *rbio)
894{
895 int i;
896
dec95574 897 if (!refcount_dec_and_test(&rbio->refs))
53b381b3
DW
898 return;
899
4ae10b3a 900 WARN_ON(!list_empty(&rbio->stripe_cache));
53b381b3
DW
901 WARN_ON(!list_empty(&rbio->hash_list));
902 WARN_ON(!bio_list_empty(&rbio->bio_list));
903
904 for (i = 0; i < rbio->nr_pages; i++) {
905 if (rbio->stripe_pages[i]) {
906 __free_page(rbio->stripe_pages[i]);
907 rbio->stripe_pages[i] = NULL;
908 }
909 }
af8e2d1d 910
4c664611 911 btrfs_put_bioc(rbio->bioc);
53b381b3
DW
912 kfree(rbio);
913}
914
7583d8d0 915static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
53b381b3 916{
7583d8d0
LB
917 struct bio *next;
918
919 while (cur) {
920 next = cur->bi_next;
921 cur->bi_next = NULL;
922 cur->bi_status = err;
923 bio_endio(cur);
924 cur = next;
925 }
53b381b3
DW
926}
927
928/*
929 * this frees the rbio and runs through all the bios in the
930 * bio_list and calls end_io on them
931 */
4e4cbee9 932static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
53b381b3
DW
933{
934 struct bio *cur = bio_list_get(&rbio->bio_list);
7583d8d0 935 struct bio *extra;
4245215d
MX
936
937 if (rbio->generic_bio_cnt)
6a258d72 938 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
bd8f7e62
QW
939 /*
940 * Clear the data bitmap, as the rbio may be cached for later usage.
941 * Do this before unlock_stripe() so there will be no new bio
942 * for this rbio.
943 */
944 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
4245215d 945
7583d8d0
LB
946 /*
947 * At this moment, rbio->bio_list is empty, however since rbio does not
948 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
949 * hash list, rbio may be merged with others so that rbio->bio_list
950 * becomes non-empty.
951 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
952 * more and we can call bio_endio() on all queued bios.
953 */
954 unlock_stripe(rbio);
955 extra = bio_list_get(&rbio->bio_list);
956 __free_raid_bio(rbio);
53b381b3 957
7583d8d0
LB
958 rbio_endio_bio_list(cur, err);
959 if (extra)
960 rbio_endio_bio_list(extra, err);
53b381b3
DW
961}
962
963/*
964 * end io function used by finish_rmw. When we finally
965 * get here, we've written a full stripe
966 */
4246a0b6 967static void raid_write_end_io(struct bio *bio)
53b381b3
DW
968{
969 struct btrfs_raid_bio *rbio = bio->bi_private;
4e4cbee9 970 blk_status_t err = bio->bi_status;
a6111d11 971 int max_errors;
53b381b3
DW
972
973 if (err)
974 fail_bio_stripe(rbio, bio);
975
976 bio_put(bio);
977
b89e1b01 978 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
979 return;
980
58efbc9f 981 err = BLK_STS_OK;
53b381b3
DW
982
983 /* OK, we have written all the stripes we need to. */
a6111d11 984 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
4c664611 985 0 : rbio->bioc->max_errors;
a6111d11 986 if (atomic_read(&rbio->error) > max_errors)
4e4cbee9 987 err = BLK_STS_IOERR;
53b381b3 988
4246a0b6 989 rbio_orig_end_io(rbio, err);
53b381b3
DW
990}
991
3e77605d
QW
992/**
993 * Get a sector pointer specified by its @stripe_nr and @sector_nr
994 *
995 * @rbio: The raid bio
996 * @stripe_nr: Stripe number, valid range [0, real_stripe)
997 * @sector_nr: Sector number inside the stripe,
998 * valid range [0, stripe_nsectors)
999 * @bio_list_only: Whether to use sectors inside the bio list only.
1000 *
1001 * The read/modify/write code wants to reuse the original bio page as much
1002 * as possible, and only use stripe_sectors as fallback.
1003 */
1004static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
1005 int stripe_nr, int sector_nr,
1006 bool bio_list_only)
1007{
1008 struct sector_ptr *sector;
1009 int index;
1010
1011 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
1012 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1013
1014 index = stripe_nr * rbio->stripe_nsectors + sector_nr;
1015 ASSERT(index >= 0 && index < rbio->nr_sectors);
1016
1017 spin_lock_irq(&rbio->bio_list_lock);
1018 sector = &rbio->bio_sectors[index];
1019 if (sector->page || bio_list_only) {
1020 /* Don't return sector without a valid page pointer */
1021 if (!sector->page)
1022 sector = NULL;
1023 spin_unlock_irq(&rbio->bio_list_lock);
1024 return sector;
1025 }
1026 spin_unlock_irq(&rbio->bio_list_lock);
1027
1028 return &rbio->stripe_sectors[index];
1029}
1030
53b381b3
DW
1031/*
1032 * allocation and initial setup for the btrfs_raid_bio. Note
1033 * this does not allocate any pages for rbio->stripe_pages.
1034 */
2ff7e61e 1035static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
4c664611 1036 struct btrfs_io_context *bioc,
cc353a8b 1037 u32 stripe_len)
53b381b3 1038{
843de58b
QW
1039 const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
1040 const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
1041 const unsigned int num_pages = stripe_npages * real_stripes;
94efbe19
QW
1042 const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
1043 const unsigned int num_sectors = stripe_nsectors * real_stripes;
53b381b3
DW
1044 struct btrfs_raid_bio *rbio;
1045 int nr_data = 0;
53b381b3
DW
1046 void *p;
1047
843de58b 1048 ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
94efbe19
QW
1049 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1050 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
c67c68eb
QW
1051 /*
1052 * Our current stripe len should be fixed to 64k thus stripe_nsectors
1053 * (at most 16) should be no larger than BITS_PER_LONG.
1054 */
1055 ASSERT(stripe_nsectors <= BITS_PER_LONG);
843de58b 1056
1389053e
KC
1057 rbio = kzalloc(sizeof(*rbio) +
1058 sizeof(*rbio->stripe_pages) * num_pages +
00425dd9 1059 sizeof(*rbio->bio_sectors) * num_sectors +
eb357060 1060 sizeof(*rbio->stripe_sectors) * num_sectors +
c67c68eb 1061 sizeof(*rbio->finish_pointers) * real_stripes,
1389053e 1062 GFP_NOFS);
af8e2d1d 1063 if (!rbio)
53b381b3 1064 return ERR_PTR(-ENOMEM);
53b381b3
DW
1065
1066 bio_list_init(&rbio->bio_list);
1067 INIT_LIST_HEAD(&rbio->plug_list);
1068 spin_lock_init(&rbio->bio_list_lock);
4ae10b3a 1069 INIT_LIST_HEAD(&rbio->stripe_cache);
53b381b3 1070 INIT_LIST_HEAD(&rbio->hash_list);
4c664611 1071 rbio->bioc = bioc;
53b381b3
DW
1072 rbio->stripe_len = stripe_len;
1073 rbio->nr_pages = num_pages;
94efbe19 1074 rbio->nr_sectors = num_sectors;
2c8cdd6e 1075 rbio->real_stripes = real_stripes;
5a6ac9ea 1076 rbio->stripe_npages = stripe_npages;
94efbe19 1077 rbio->stripe_nsectors = stripe_nsectors;
53b381b3
DW
1078 rbio->faila = -1;
1079 rbio->failb = -1;
dec95574 1080 refcount_set(&rbio->refs, 1);
b89e1b01
MX
1081 atomic_set(&rbio->error, 0);
1082 atomic_set(&rbio->stripes_pending, 0);
53b381b3
DW
1083
1084 /*
ac26df8b
QW
1085 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
1086 * we allocated past the end of the rbio.
53b381b3
DW
1087 */
1088 p = rbio + 1;
1389053e
KC
1089#define CONSUME_ALLOC(ptr, count) do { \
1090 ptr = p; \
1091 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1092 } while (0)
1093 CONSUME_ALLOC(rbio->stripe_pages, num_pages);
00425dd9 1094 CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
eb357060 1095 CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
1389053e 1096 CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1389053e 1097#undef CONSUME_ALLOC
53b381b3 1098
4c664611 1099 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
10f11900 1100 nr_data = real_stripes - 1;
4c664611 1101 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
2c8cdd6e 1102 nr_data = real_stripes - 2;
53b381b3 1103 else
10f11900 1104 BUG();
53b381b3
DW
1105
1106 rbio->nr_data = nr_data;
1107 return rbio;
1108}
1109
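A minimal sketch (not kernel code) of the single-allocation layout that alloc_rbio() builds with CONSUME_ALLOC above: one zeroed buffer holds the struct followed by its pointer arrays, so a single free releases everything. The names here are hypothetical:

/* Illustrative sketch only -- not kernel code. */
#include <stdlib.h>

struct demo {
	int nr_pages;
	int nr_sectors;
	void **pages;	/* both arrays live in the same allocation ... */
	void **sectors;	/* ... laid out right after the struct itself */
};

static struct demo *demo_alloc(int nr_pages, int nr_sectors)
{
	unsigned char *p;
	struct demo *d;

	d = calloc(1, sizeof(*d) +
		      sizeof(void *) * nr_pages +
		      sizeof(void *) * nr_sectors);
	if (!d)
		return NULL;

	d->nr_pages = nr_pages;
	d->nr_sectors = nr_sectors;

	/* carve the trailing arrays out of the memory past the struct */
	p = (unsigned char *)(d + 1);
	d->pages = (void **)p;
	p += sizeof(void *) * nr_pages;
	d->sectors = (void **)p;
	return d;
}

int main(void)
{
	struct demo *d = demo_alloc(16, 64);

	free(d);	/* one free() releases the struct and both arrays */
	return 0;
}
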
1110/* allocate pages for all the stripes in the bio, including parity */
1111static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1112{
eb357060
QW
1113 int ret;
1114
1115 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
1116 if (ret < 0)
1117 return ret;
1118 /* Mapping all sectors */
1119 index_stripe_sectors(rbio);
1120 return 0;
53b381b3
DW
1121}
1122
b7178a5f 1123/* only allocate pages for p/q stripes */
53b381b3
DW
1124static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1125{
f77183dc 1126 const int data_pages = rbio->nr_data * rbio->stripe_npages;
eb357060 1127 int ret;
53b381b3 1128
eb357060
QW
1129 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1130 rbio->stripe_pages + data_pages);
1131 if (ret < 0)
1132 return ret;
1133
1134 index_stripe_sectors(rbio);
1135 return 0;
53b381b3
DW
1136}
1137
1138/*
3e77605d
QW
1139 * Add a single sector @sector into our list of bios for IO.
1140 *
1141 * Return 0 if everything went well.
1142 * Return <0 for error.
53b381b3 1143 */
3e77605d
QW
1144static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1145 struct bio_list *bio_list,
1146 struct sector_ptr *sector,
1147 unsigned int stripe_nr,
1148 unsigned int sector_nr,
1149 unsigned long bio_max_len,
1150 unsigned int opf)
53b381b3 1151{
3e77605d 1152 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
53b381b3 1153 struct bio *last = bio_list->tail;
53b381b3
DW
1154 int ret;
1155 struct bio *bio;
4c664611 1156 struct btrfs_io_stripe *stripe;
53b381b3
DW
1157 u64 disk_start;
1158
3e77605d
QW
1159 /*
1160 * Note: here stripe_nr has taken device replace into consideration,
1161 * thus it can be larger than rbio->real_stripes.
1162 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1163 */
1164 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1165 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1166 ASSERT(sector->page);
1167
4c664611 1168 stripe = &rbio->bioc->stripes[stripe_nr];
3e77605d 1169 disk_start = stripe->physical + sector_nr * sectorsize;
53b381b3
DW
1170
1171 /* if the device is missing, just fail this stripe */
1172 if (!stripe->dev->bdev)
1173 return fail_rbio_index(rbio, stripe_nr);
1174
1175 /* see if we can add this page onto our existing bio */
1176 if (last) {
1201b58b 1177 u64 last_end = last->bi_iter.bi_sector << 9;
4f024f37 1178 last_end += last->bi_iter.bi_size;
53b381b3
DW
1179
1180 /*
1181 * we can't merge these if they are from different
1182 * devices or if they are not contiguous
1183 */
f90ae76a 1184 if (last_end == disk_start && !last->bi_status &&
309dca30 1185 last->bi_bdev == stripe->dev->bdev) {
3e77605d
QW
1186 ret = bio_add_page(last, sector->page, sectorsize,
1187 sector->pgoff);
1188 if (ret == sectorsize)
53b381b3
DW
1189 return 0;
1190 }
1191 }
1192
1193 /* put a new bio on the list */
e1b4b44e
CH
1194 bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
1195 opf, GFP_NOFS);
4f024f37 1196 bio->bi_iter.bi_sector = disk_start >> 9;
e01bf588 1197 bio->bi_private = rbio;
53b381b3 1198
3e77605d 1199 bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
53b381b3
DW
1200 bio_list_add(bio_list, bio);
1201 return 0;
1202}
1203
1204/*
1205 * while we're doing the read/modify/write cycle, we could
1206 * have errors in reading pages off the disk. This checks
1207 * for errors and if we're not able to read the page it'll
1208 * trigger parity reconstruction. The rmw will be finished
1209 * after we've reconstructed the failed stripes
1210 */
1211static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1212{
1213 if (rbio->faila >= 0 || rbio->failb >= 0) {
2c8cdd6e 1214 BUG_ON(rbio->faila == rbio->real_stripes - 1);
53b381b3
DW
1215 __raid56_parity_recover(rbio);
1216 } else {
1217 finish_rmw(rbio);
1218 }
1219}
1220
00425dd9
QW
1221static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1222{
1223 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1224 struct bio_vec bvec;
1225 struct bvec_iter iter;
1226 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1227 rbio->bioc->raid_map[0];
1228
1229 if (bio_flagged(bio, BIO_CLONED))
1230 bio->bi_iter = btrfs_bio(bio)->iter;
1231
1232 bio_for_each_segment(bvec, bio, iter) {
1233 u32 bvec_offset;
1234
1235 for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1236 bvec_offset += sectorsize, offset += sectorsize) {
1237 int index = offset / sectorsize;
1238 struct sector_ptr *sector = &rbio->bio_sectors[index];
1239
1240 sector->page = bvec.bv_page;
1241 sector->pgoff = bvec.bv_offset + bvec_offset;
1242 ASSERT(sector->pgoff < PAGE_SIZE);
1243 }
1244 }
1245}
1246
53b381b3
DW
1247/*
1248 * helper function to walk our bio list and populate the bio_sectors array with
1249 * the result. This seems expensive, but it is faster than constantly
1250 * searching through the bio list as we set up the IO in finish_rmw or stripe
1251 * reconstruction.
1252 *
1253 * This must be called before you trust the answers from sector_in_rbio()
1254 */
1255static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1256{
1257 struct bio *bio;
53b381b3
DW
1258
1259 spin_lock_irq(&rbio->bio_list_lock);
00425dd9
QW
1260 bio_list_for_each(bio, &rbio->bio_list)
1261 index_one_bio(rbio, bio);
1262
53b381b3
DW
1263 spin_unlock_irq(&rbio->bio_list_lock);
1264}
1265
1266/*
1267 * this is called from one of two situations. We either
1268 * have a full stripe from the higher layers, or we've read all
1269 * the missing bits off disk.
1270 *
1271 * This will calculate the parity and then send down any
1272 * changed blocks.
1273 */
1274static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1275{
4c664611 1276 struct btrfs_io_context *bioc = rbio->bioc;
1145059a 1277 const u32 sectorsize = bioc->fs_info->sectorsize;
1389053e 1278 void **pointers = rbio->finish_pointers;
53b381b3
DW
1279 int nr_data = rbio->nr_data;
1280 int stripe;
3e77605d 1281 int sectornr;
c17af965 1282 bool has_qstripe;
53b381b3
DW
1283 struct bio_list bio_list;
1284 struct bio *bio;
53b381b3
DW
1285 int ret;
1286
1287 bio_list_init(&bio_list);
1288
c17af965
DS
1289 if (rbio->real_stripes - rbio->nr_data == 1)
1290 has_qstripe = false;
1291 else if (rbio->real_stripes - rbio->nr_data == 2)
1292 has_qstripe = true;
1293 else
53b381b3 1294 BUG();
53b381b3 1295
bd8f7e62
QW
1296 /* We should have at least one data sector. */
1297 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1298
53b381b3
DW
1299 /* at this point we either have a full stripe,
1300 * or we've read the full stripe from the drive.
1301 * recalculate the parity and write the new results.
1302 *
1303 * We're not allowed to add any new bios to the
1304 * bio list here, anyone else that wants to
1305 * change this stripe needs to do their own rmw.
1306 */
1307 spin_lock_irq(&rbio->bio_list_lock);
1308 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1309 spin_unlock_irq(&rbio->bio_list_lock);
1310
b89e1b01 1311 atomic_set(&rbio->error, 0);
53b381b3
DW
1312
1313 /*
1314 * now that we've set rmw_locked, run through the
1315 * bio list one last time and map the page pointers
4ae10b3a
CM
1316 *
1317 * We don't cache full rbios because we're assuming
1318 * the higher layers are unlikely to use this area of
1319 * the disk again soon. If they do use it again,
1320 * hopefully they will send another full bio.
53b381b3
DW
1321 */
1322 index_rbio_pages(rbio);
4ae10b3a
CM
1323 if (!rbio_is_full(rbio))
1324 cache_rbio_pages(rbio);
1325 else
1326 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
53b381b3 1327
3e77605d 1328 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1145059a
QW
1329 struct sector_ptr *sector;
1330
1331 /* First collect one sector from each data stripe */
53b381b3 1332 for (stripe = 0; stripe < nr_data; stripe++) {
1145059a
QW
1333 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1334 pointers[stripe] = kmap_local_page(sector->page) +
1335 sector->pgoff;
53b381b3
DW
1336 }
1337
1145059a
QW
1338 /* Then add the parity stripe */
1339 sector = rbio_pstripe_sector(rbio, sectornr);
1340 sector->uptodate = 1;
1341 pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
53b381b3 1342
c17af965 1343 if (has_qstripe) {
53b381b3 1344 /*
1145059a
QW
1345 * RAID6, add the qstripe and call the library function
1346 * to fill in our p/q
53b381b3 1347 */
1145059a
QW
1348 sector = rbio_qstripe_sector(rbio, sectornr);
1349 sector->uptodate = 1;
1350 pointers[stripe++] = kmap_local_page(sector->page) +
1351 sector->pgoff;
53b381b3 1352
1145059a 1353 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
53b381b3
DW
1354 pointers);
1355 } else {
1356 /* raid5 */
1145059a
QW
1357 memcpy(pointers[nr_data], pointers[0], sectorsize);
1358 run_xor(pointers + 1, nr_data - 1, sectorsize);
53b381b3 1359 }
94a0b58d
IW
1360 for (stripe = stripe - 1; stripe >= 0; stripe--)
1361 kunmap_local(pointers[stripe]);
53b381b3
DW
1362 }
1363
1364 /*
1365 * time to start writing. Make bios for everything from the
1366 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1367 * everything else.
1368 */
2c8cdd6e 1369 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
3e77605d
QW
1370 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1371 struct sector_ptr *sector;
1372
bd8f7e62
QW
1373 /* This vertical stripe has no data, skip it. */
1374 if (!test_bit(sectornr, &rbio->dbitmap))
1375 continue;
1376
53b381b3 1377 if (stripe < rbio->nr_data) {
3e77605d
QW
1378 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1379 if (!sector)
53b381b3
DW
1380 continue;
1381 } else {
3e77605d 1382 sector = rbio_stripe_sector(rbio, stripe, sectornr);
53b381b3
DW
1383 }
1384
3e77605d
QW
1385 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1386 sectornr, rbio->stripe_len,
1387 REQ_OP_WRITE);
53b381b3
DW
1388 if (ret)
1389 goto cleanup;
1390 }
1391 }
1392
4c664611 1393 if (likely(!bioc->num_tgtdevs))
2c8cdd6e
MX
1394 goto write_data;
1395
1396 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
4c664611 1397 if (!bioc->tgtdev_map[stripe])
2c8cdd6e
MX
1398 continue;
1399
3e77605d
QW
1400 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1401 struct sector_ptr *sector;
1402
bd8f7e62
QW
1403 /* This vertical stripe has no data, skip it. */
1404 if (!test_bit(sectornr, &rbio->dbitmap))
1405 continue;
1406
2c8cdd6e 1407 if (stripe < rbio->nr_data) {
3e77605d
QW
1408 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1409 if (!sector)
2c8cdd6e
MX
1410 continue;
1411 } else {
3e77605d 1412 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2c8cdd6e
MX
1413 }
1414
3e77605d 1415 ret = rbio_add_io_sector(rbio, &bio_list, sector,
4c664611 1416 rbio->bioc->tgtdev_map[stripe],
3e77605d 1417 sectornr, rbio->stripe_len,
e01bf588 1418 REQ_OP_WRITE);
2c8cdd6e
MX
1419 if (ret)
1420 goto cleanup;
1421 }
1422 }
1423
1424write_data:
b89e1b01
MX
1425 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1426 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
53b381b3 1427
bf28a605 1428 while ((bio = bio_list_pop(&bio_list))) {
53b381b3 1429 bio->bi_end_io = raid_write_end_io;
4e49ea4a
MC
1430
1431 submit_bio(bio);
53b381b3
DW
1432 }
1433 return;
1434
1435cleanup:
58efbc9f 1436 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1437
1438 while ((bio = bio_list_pop(&bio_list)))
1439 bio_put(bio);
53b381b3
DW
1440}
1441
1442/*
1443 * helper to find the stripe number for a given bio. Used to figure out which
1444 * stripe has failed. This expects the bio to correspond to a physical disk,
1445 * so it looks up based on physical sector numbers.
1446 */
1447static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1448 struct bio *bio)
1449{
4f024f37 1450 u64 physical = bio->bi_iter.bi_sector;
53b381b3 1451 int i;
4c664611 1452 struct btrfs_io_stripe *stripe;
53b381b3
DW
1453
1454 physical <<= 9;
1455
4c664611
QW
1456 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1457 stripe = &rbio->bioc->stripes[i];
83025863 1458 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
309dca30 1459 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
53b381b3
DW
1460 return i;
1461 }
1462 }
1463 return -1;
1464}
1465
1466/*
1467 * helper to find the stripe number for a given
1468 * bio (before mapping). Used to figure out which stripe has
1469 * failed. This looks up based on logical block numbers.
1470 */
1471static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1472 struct bio *bio)
1473{
1201b58b 1474 u64 logical = bio->bi_iter.bi_sector << 9;
53b381b3
DW
1475 int i;
1476
53b381b3 1477 for (i = 0; i < rbio->nr_data; i++) {
4c664611 1478 u64 stripe_start = rbio->bioc->raid_map[i];
83025863
NB
1479
1480 if (in_range(logical, stripe_start, rbio->stripe_len))
53b381b3 1481 return i;
53b381b3
DW
1482 }
1483 return -1;
1484}
1485
1486/*
1487 * returns -EIO if we had too many failures
1488 */
1489static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1490{
1491 unsigned long flags;
1492 int ret = 0;
1493
1494 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1495
1496 /* we already know this stripe is bad, move on */
1497 if (rbio->faila == failed || rbio->failb == failed)
1498 goto out;
1499
1500 if (rbio->faila == -1) {
1501 /* first failure on this rbio */
1502 rbio->faila = failed;
b89e1b01 1503 atomic_inc(&rbio->error);
53b381b3
DW
1504 } else if (rbio->failb == -1) {
1505 /* second failure on this rbio */
1506 rbio->failb = failed;
b89e1b01 1507 atomic_inc(&rbio->error);
53b381b3
DW
1508 } else {
1509 ret = -EIO;
1510 }
1511out:
1512 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1513
1514 return ret;
1515}
1516
1517/*
1518 * helper to fail a stripe based on a physical disk
1519 * bio.
1520 */
1521static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1522 struct bio *bio)
1523{
1524 int failed = find_bio_stripe(rbio, bio);
1525
1526 if (failed < 0)
1527 return -EIO;
1528
1529 return fail_rbio_index(rbio, failed);
1530}
1531
5fdb7afc
QW
1532/*
1533 * For subpage case, we can no longer set page Uptodate directly for
1534 * stripe_pages[], thus we need to locate the sector.
1535 */
1536static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1537 struct page *page,
1538 unsigned int pgoff)
1539{
1540 int i;
1541
1542 for (i = 0; i < rbio->nr_sectors; i++) {
1543 struct sector_ptr *sector = &rbio->stripe_sectors[i];
1544
1545 if (sector->page == page && sector->pgoff == pgoff)
1546 return sector;
1547 }
1548 return NULL;
1549}
1550
53b381b3
DW
1551/*
1552 * this sets each page in the bio uptodate. It should only be used on private
1553 * rbio pages, nothing that comes in from the higher layers
1554 */
5fdb7afc 1555static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
53b381b3 1556{
5fdb7afc 1557 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
0198e5b7 1558 struct bio_vec *bvec;
6dc4f100 1559 struct bvec_iter_all iter_all;
6592e58c 1560
0198e5b7 1561 ASSERT(!bio_flagged(bio, BIO_CLONED));
53b381b3 1562
5fdb7afc
QW
1563 bio_for_each_segment_all(bvec, bio, iter_all) {
1564 struct sector_ptr *sector;
1565 int pgoff;
1566
1567 for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1568 pgoff += sectorsize) {
1569 sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1570 ASSERT(sector);
1571 if (sector)
1572 sector->uptodate = 1;
1573 }
1574 }
53b381b3
DW
1575}
1576
1577/*
1578 * end io for the read phase of the rmw cycle. All the bios here are physical
1579 * stripe bios we've read from the disk so we can recalculate the parity of the
1580 * stripe.
1581 *
1582 * This will usually kick off finish_rmw once all the bios are read in, but it
1583 * may trigger parity reconstruction if we had any errors along the way
1584 */
4246a0b6 1585static void raid_rmw_end_io(struct bio *bio)
53b381b3
DW
1586{
1587 struct btrfs_raid_bio *rbio = bio->bi_private;
1588
4e4cbee9 1589 if (bio->bi_status)
53b381b3
DW
1590 fail_bio_stripe(rbio, bio);
1591 else
5fdb7afc 1592 set_bio_pages_uptodate(rbio, bio);
53b381b3
DW
1593
1594 bio_put(bio);
1595
b89e1b01 1596 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
1597 return;
1598
4c664611 1599 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
53b381b3
DW
1600 goto cleanup;
1601
1602 /*
1603 * this will normally call finish_rmw to start our write
1604 * but if there are any failed stripes we'll reconstruct
1605 * from parity first
1606 */
1607 validate_rbio_for_rmw(rbio);
1608 return;
1609
1610cleanup:
1611
58efbc9f 1612 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
1613}
1614
53b381b3
DW
1615/*
1616 * the stripe must be locked by the caller. It will
1617 * unlock after all the writes are done
1618 */
1619static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1620{
1621 int bios_to_read = 0;
53b381b3
DW
1622 struct bio_list bio_list;
1623 int ret;
3e77605d 1624 int sectornr;
53b381b3
DW
1625 int stripe;
1626 struct bio *bio;
1627
1628 bio_list_init(&bio_list);
1629
1630 ret = alloc_rbio_pages(rbio);
1631 if (ret)
1632 goto cleanup;
1633
1634 index_rbio_pages(rbio);
1635
b89e1b01 1636 atomic_set(&rbio->error, 0);
53b381b3
DW
1637 /*
1638 * build a list of bios to read all the missing parts of this
1639 * stripe
1640 */
1641 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
3e77605d
QW
1642 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1643 struct sector_ptr *sector;
1644
53b381b3 1645 /*
3e77605d
QW
1646 * We want to find all the sectors missing from the
1647 * rbio and read them from the disk. If sector_in_rbio()
1648 * finds a page in the bio list we don't need to read
1649 * it off the stripe.
53b381b3 1650 */
3e77605d
QW
1651 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1652 if (sector)
53b381b3
DW
1653 continue;
1654
3e77605d 1655 sector = rbio_stripe_sector(rbio, stripe, sectornr);
4ae10b3a 1656 /*
3e77605d
QW
1657 * The bio cache may have handed us an uptodate page.
1658 * If so, be happy and use it.
4ae10b3a 1659 */
3e77605d 1660 if (sector->uptodate)
4ae10b3a
CM
1661 continue;
1662
3e77605d
QW
1663 ret = rbio_add_io_sector(rbio, &bio_list, sector,
1664 stripe, sectornr, rbio->stripe_len,
e01bf588 1665 REQ_OP_READ);
53b381b3
DW
1666 if (ret)
1667 goto cleanup;
1668 }
1669 }
1670
1671 bios_to_read = bio_list_size(&bio_list);
1672 if (!bios_to_read) {
1673 /*
1674 * this can happen if others have merged with
1675 * us, it means there is nothing left to read.
1676 * But if there are missing devices it may not be
1677 * safe to do the full stripe write yet.
1678 */
1679 goto finish;
1680 }
1681
1682 /*
4c664611
QW
1683 * The bioc may be freed once we submit the last bio. Make sure not to
1684 * touch it after that.
53b381b3 1685 */
b89e1b01 1686 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 1687 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
1688 bio->bi_end_io = raid_rmw_end_io;
1689
6a258d72 1690 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 1691
4e49ea4a 1692 submit_bio(bio);
53b381b3
DW
1693 }
1694 /* the actual write will happen once the reads are done */
1695 return 0;
1696
1697cleanup:
58efbc9f 1698 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1699
1700 while ((bio = bio_list_pop(&bio_list)))
1701 bio_put(bio);
1702
53b381b3
DW
1703 return -EIO;
1704
1705finish:
1706 validate_rbio_for_rmw(rbio);
1707 return 0;
1708}
1709
1710/*
1711 * if the upper layers pass in a full stripe, we thank them by only allocating
1712 * enough pages to hold the parity, and sending it all down quickly.
1713 */
1714static int full_stripe_write(struct btrfs_raid_bio *rbio)
1715{
1716 int ret;
1717
1718 ret = alloc_rbio_parity_pages(rbio);
3cd846d1
MX
1719 if (ret) {
1720 __free_raid_bio(rbio);
53b381b3 1721 return ret;
3cd846d1 1722 }
53b381b3
DW
1723
1724 ret = lock_stripe_add(rbio);
1725 if (ret == 0)
1726 finish_rmw(rbio);
1727 return 0;
1728}
1729
1730/*
1731 * partial stripe writes get handed over to async helpers.
1732 * We're really hoping to merge a few more writes into this
1733 * rbio before calculating new parity
1734 */
1735static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1736{
1737 int ret;
1738
1739 ret = lock_stripe_add(rbio);
1740 if (ret == 0)
cf6a4a75 1741 start_async_work(rbio, rmw_work);
53b381b3
DW
1742 return 0;
1743}
1744
1745/*
1746 * sometimes while we were reading from the drive to
1747 * recalculate parity, enough new bios come in to create
1748 * a full stripe. So we do a check here to see if we can
1749 * go directly to finish_rmw
1750 */
1751static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1752{
1753 /* head off into rmw land if we don't have a full stripe */
1754 if (!rbio_is_full(rbio))
1755 return partial_stripe_write(rbio);
1756 return full_stripe_write(rbio);
1757}
1758
6ac0f488
CM
1759/*
1760 * We use plugging call backs to collect full stripes.
1761 * Any time we get a partial stripe write while plugged
1762 * we collect it into a list. When the unplug comes down,
1763 * we sort the list by logical block number and merge
1764 * everything we can into the same rbios
1765 */
1766struct btrfs_plug_cb {
1767 struct blk_plug_cb cb;
1768 struct btrfs_fs_info *info;
1769 struct list_head rbio_list;
385de0ef 1770 struct work_struct work;
6ac0f488
CM
1771};
1772
1773/*
1774 * rbios on the plug list are sorted for easier merging.
1775 */
4f0f586b
ST
1776static int plug_cmp(void *priv, const struct list_head *a,
1777 const struct list_head *b)
6ac0f488 1778{
214cc184
DS
1779 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1780 plug_list);
1781 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1782 plug_list);
4f024f37
KO
1783 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1784 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
6ac0f488
CM
1785
1786 if (a_sector < b_sector)
1787 return -1;
1788 if (a_sector > b_sector)
1789 return 1;
1790 return 0;
1791}
1792
1793static void run_plug(struct btrfs_plug_cb *plug)
1794{
1795 struct btrfs_raid_bio *cur;
1796 struct btrfs_raid_bio *last = NULL;
1797
1798 /*
1799 * sort our plug list then try to merge
1800 * everything we can in hopes of creating full
1801 * stripes.
1802 */
1803 list_sort(NULL, &plug->rbio_list, plug_cmp);
1804 while (!list_empty(&plug->rbio_list)) {
1805 cur = list_entry(plug->rbio_list.next,
1806 struct btrfs_raid_bio, plug_list);
1807 list_del_init(&cur->plug_list);
1808
1809 if (rbio_is_full(cur)) {
c7b562c5
DS
1810 int ret;
1811
6ac0f488 1812 /* we have a full stripe, send it down */
c7b562c5
DS
1813 ret = full_stripe_write(cur);
1814 BUG_ON(ret);
6ac0f488
CM
1815 continue;
1816 }
1817 if (last) {
1818 if (rbio_can_merge(last, cur)) {
1819 merge_rbio(last, cur);
1820 __free_raid_bio(cur);
1821 continue;
1822
1823 }
1824 __raid56_parity_write(last);
1825 }
1826 last = cur;
1827 }
1828 if (last) {
1829 __raid56_parity_write(last);
1830 }
1831 kfree(plug);
1832}
1833
1834/*
1835 * if the unplug comes from schedule, we have to push the
1836 * work off to a helper thread
1837 */
385de0ef 1838static void unplug_work(struct work_struct *work)
6ac0f488
CM
1839{
1840 struct btrfs_plug_cb *plug;
1841 plug = container_of(work, struct btrfs_plug_cb, work);
1842 run_plug(plug);
1843}
1844
1845static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1846{
1847 struct btrfs_plug_cb *plug;
1848 plug = container_of(cb, struct btrfs_plug_cb, cb);
1849
1850 if (from_schedule) {
385de0ef
CH
1851 INIT_WORK(&plug->work, unplug_work);
1852 queue_work(plug->info->rmw_workers, &plug->work);
6ac0f488
CM
1853 return;
1854 }
1855 run_plug(plug);
1856}
1857
bd8f7e62
QW
1858/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1859static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1860{
1861 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1862 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1863 const u64 full_stripe_start = rbio->bioc->raid_map[0];
1864 const u32 orig_len = orig_bio->bi_iter.bi_size;
1865 const u32 sectorsize = fs_info->sectorsize;
1866 u64 cur_logical;
1867
1868 ASSERT(orig_logical >= full_stripe_start &&
1869 orig_logical + orig_len <= full_stripe_start +
1870 rbio->nr_data * rbio->stripe_len);
1871
1872 bio_list_add(&rbio->bio_list, orig_bio);
1873 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1874
1875 /* Update the dbitmap. */
1876 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1877 cur_logical += sectorsize) {
1878 int bit = ((u32)(cur_logical - full_stripe_start) >>
1879 fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1880
1881 set_bit(bit, &rbio->dbitmap);
1882 }
1883}
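/*
 * Illustrative sketch (not part of this file; the helper name is
 * hypothetical): the dbitmap bit computed in the loop above is simply the
 * sector offset of a logical address from the start of the full stripe,
 * folded into the per-stripe sector count.
 */
static inline int sketch_dbitmap_bit(u64 logical, u64 full_stripe_start,
                                     u32 sectorsize_bits, u32 stripe_nsectors)
{
        /*
         * e.g. 8K past the stripe start with 4K sectors and 16 sectors per
         * stripe -> bit (8192 >> 12) % 16 == 2
         */
        return ((u32)(logical - full_stripe_start) >> sectorsize_bits) %
                stripe_nsectors;
}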
1884
53b381b3
DW
1885/*
1886 * our main entry point for writes from the rest of the FS.
1887 */
cc353a8b 1888int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
53b381b3 1889{
6a258d72 1890 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 1891 struct btrfs_raid_bio *rbio;
6ac0f488
CM
1892 struct btrfs_plug_cb *plug = NULL;
1893 struct blk_plug_cb *cb;
4245215d 1894 int ret;
53b381b3 1895
4c664611 1896 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 1897 if (IS_ERR(rbio)) {
4c664611 1898 btrfs_put_bioc(bioc);
53b381b3 1899 return PTR_ERR(rbio);
af8e2d1d 1900 }
1b94b556 1901 rbio->operation = BTRFS_RBIO_WRITE;
bd8f7e62 1902 rbio_add_bio(rbio, bio);
6ac0f488 1903
0b246afa 1904 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
1905 rbio->generic_bio_cnt = 1;
1906
6ac0f488
CM
1907 /*
1908 * don't plug on full rbios, just get them out the door
1909 * as quickly as we can
1910 */
4245215d
MX
1911 if (rbio_is_full(rbio)) {
1912 ret = full_stripe_write(rbio);
1913 if (ret)
0b246afa 1914 btrfs_bio_counter_dec(fs_info);
4245215d
MX
1915 return ret;
1916 }
6ac0f488 1917
0b246afa 1918 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
6ac0f488
CM
1919 if (cb) {
1920 plug = container_of(cb, struct btrfs_plug_cb, cb);
1921 if (!plug->info) {
0b246afa 1922 plug->info = fs_info;
6ac0f488
CM
1923 INIT_LIST_HEAD(&plug->rbio_list);
1924 }
1925 list_add_tail(&rbio->plug_list, &plug->rbio_list);
4245215d 1926 ret = 0;
6ac0f488 1927 } else {
4245215d
MX
1928 ret = __raid56_parity_write(rbio);
1929 if (ret)
0b246afa 1930 btrfs_bio_counter_dec(fs_info);
6ac0f488 1931 }
4245215d 1932 return ret;
53b381b3
DW
1933}
1934
1935/*
1936 * all parity reconstruction happens here. We've read in everything
1937 * we can find from the drives and this does the heavy lifting of
1938 * sorting the good from the bad.
1939 */
1940static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1941{
07e4d380
QW
1942 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1943 int sectornr, stripe;
53b381b3 1944 void **pointers;
94a0b58d 1945 void **unmap_array;
53b381b3 1946 int faila = -1, failb = -1;
58efbc9f 1947 blk_status_t err;
53b381b3
DW
1948 int i;
1949
07e4d380
QW
1950 /*
1951 * This array stores the pointer for each sector, with each sector's
1952 * pgoff already added to the pointer
1953 */
31e818fe 1954 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
53b381b3 1955 if (!pointers) {
58efbc9f 1956 err = BLK_STS_RESOURCE;
53b381b3
DW
1957 goto cleanup_io;
1958 }
1959
94a0b58d
IW
1960 /*
1961 * Store copy of pointers that does not get reordered during
1962 * reconstruction so that kunmap_local works.
1963 */
1964 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1965 if (!unmap_array) {
1966 err = BLK_STS_RESOURCE;
1967 goto cleanup_pointers;
1968 }
1969
53b381b3
DW
1970 faila = rbio->faila;
1971 failb = rbio->failb;
1972
b4ee1782
OS
1973 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1974 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
53b381b3
DW
1975 spin_lock_irq(&rbio->bio_list_lock);
1976 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1977 spin_unlock_irq(&rbio->bio_list_lock);
1978 }
1979
1980 index_rbio_pages(rbio);
1981
07e4d380
QW
1982 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1983 struct sector_ptr *sector;
1984
5a6ac9ea
MX
1985 /*
1986 * Now we just use the bitmap to mark the horizontal stripes in
1987 * which we have data when doing parity scrub.
1988 */
1989 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
c67c68eb 1990 !test_bit(sectornr, &rbio->dbitmap))
5a6ac9ea
MX
1991 continue;
1992
94a0b58d 1993 /*
07e4d380 1994 * Set up our array of pointers with sectors from each stripe
94a0b58d
IW
1995 *
1996 * NOTE: store a duplicate array of pointers to preserve the
1997 * pointer order
53b381b3 1998 */
2c8cdd6e 1999 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
53b381b3 2000 /*
07e4d380 2001 * If we're rebuilding a read, we have to use
53b381b3
DW
2002 * pages from the bio list
2003 */
b4ee1782
OS
2004 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2005 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
53b381b3 2006 (stripe == faila || stripe == failb)) {
07e4d380 2007 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
53b381b3 2008 } else {
07e4d380 2009 sector = rbio_stripe_sector(rbio, stripe, sectornr);
53b381b3 2010 }
07e4d380
QW
2011 ASSERT(sector->page);
2012 pointers[stripe] = kmap_local_page(sector->page) +
2013 sector->pgoff;
94a0b58d 2014 unmap_array[stripe] = pointers[stripe];
53b381b3
DW
2015 }
2016
07e4d380 2017 /* All raid6 handling here */
4c664611 2018 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
07e4d380 2019 /* Single failure, rebuild from parity raid5 style */
53b381b3
DW
2020 if (failb < 0) {
2021 if (faila == rbio->nr_data) {
2022 /*
2023 * Just the P stripe has failed, without
2024 * a bad data or Q stripe.
2025 * TODO, we should redo the xor here.
2026 */
58efbc9f 2027 err = BLK_STS_IOERR;
53b381b3
DW
2028 goto cleanup;
2029 }
2030 /*
2031 * a single failure in raid6 is rebuilt
2032 * in the pstripe code below
2033 */
2034 goto pstripe;
2035 }
2036
2037 /* make sure our ps and qs are in order */
b7d2083a
NB
2038 if (faila > failb)
2039 swap(faila, failb);
53b381b3
DW
2040
2041 /* If the q stripe failed, do a pstripe reconstruction
2042 * from the xors.
2043 * If both the q stripe and the P stripe failed, we're
2044 * here due to a crc mismatch and we can't give them the
2045 * data they want
2046 */
4c664611
QW
2047 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
2048 if (rbio->bioc->raid_map[faila] ==
8e5cfb55 2049 RAID5_P_STRIPE) {
58efbc9f 2050 err = BLK_STS_IOERR;
53b381b3
DW
2051 goto cleanup;
2052 }
2053 /*
2054 * otherwise we have one bad data stripe and
2055 * a good P stripe. raid5!
2056 */
2057 goto pstripe;
2058 }
2059
4c664611 2060 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2c8cdd6e 2061 raid6_datap_recov(rbio->real_stripes,
07e4d380 2062 sectorsize, faila, pointers);
53b381b3 2063 } else {
2c8cdd6e 2064 raid6_2data_recov(rbio->real_stripes,
07e4d380 2065 sectorsize, faila, failb,
53b381b3
DW
2066 pointers);
2067 }
2068 } else {
2069 void *p;
2070
2071 /* rebuild from P stripe here (raid5 or raid6) */
2072 BUG_ON(failb != -1);
2073pstripe:
2074 /* Copy parity block into failed block to start with */
07e4d380 2075 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
53b381b3
DW
2076
2077 /* rearrange the pointer array */
2078 p = pointers[faila];
2079 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
2080 pointers[stripe] = pointers[stripe + 1];
2081 pointers[rbio->nr_data - 1] = p;
2082
2083 /* xor in the rest */
07e4d380 2084 run_xor(pointers, rbio->nr_data - 1, sectorsize);
53b381b3
DW
2085 }
2086 /* if we're doing this rebuild as part of an rmw, go through
2087 * and set all of our private rbio pages in the
2088 * failed stripes as uptodate. This way finish_rmw will
2089 * know they can be trusted. If this was a read reconstruction,
2090 * other endio functions will fiddle the uptodate bits
2091 */
1b94b556 2092 if (rbio->operation == BTRFS_RBIO_WRITE) {
07e4d380 2093 for (i = 0; i < rbio->stripe_nsectors; i++) {
53b381b3 2094 if (faila != -1) {
07e4d380
QW
2095 sector = rbio_stripe_sector(rbio, faila, i);
2096 sector->uptodate = 1;
53b381b3
DW
2097 }
2098 if (failb != -1) {
07e4d380
QW
2099 sector = rbio_stripe_sector(rbio, failb, i);
2100 sector->uptodate = 1;
53b381b3
DW
2101 }
2102 }
2103 }
94a0b58d
IW
2104 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
2105 kunmap_local(unmap_array[stripe]);
53b381b3
DW
2106 }
2107
58efbc9f 2108 err = BLK_STS_OK;
53b381b3 2109cleanup:
94a0b58d
IW
2110 kfree(unmap_array);
2111cleanup_pointers:
53b381b3
DW
2112 kfree(pointers);
2113
2114cleanup_io:
580c6efa
LB
2115 /*
2116 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
2117 * valid rbio which is consistent with ondisk content, thus such a
2118 * valid rbio can be cached to avoid further disk reads.
2119 */
2120 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2121 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
44ac474d
LB
2122 /*
2123 * - In case of two failures, where rbio->failb != -1:
2124 *
2125 * Do not cache this rbio since the above read reconstruction
2126 * (raid6_datap_recov() or raid6_2data_recov()) may have
2127 * changed some content of stripes which are not identical to
2128 * on-disk content any more, otherwise, a later write/recover
2129 * may steal stripe_pages from this rbio and end up with
2130 * corruptions or rebuild failures.
2131 *
2132 * - In case of single failure, where rbio->failb == -1:
2133 *
2134 * Cache this rbio iff the above read reconstruction is
52042d8e 2135 * executed without problems.
44ac474d
LB
2136 */
2137 if (err == BLK_STS_OK && rbio->failb < 0)
4ae10b3a
CM
2138 cache_rbio_pages(rbio);
2139 else
2140 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2141
4246a0b6 2142 rbio_orig_end_io(rbio, err);
58efbc9f 2143 } else if (err == BLK_STS_OK) {
53b381b3
DW
2144 rbio->faila = -1;
2145 rbio->failb = -1;
5a6ac9ea
MX
2146
2147 if (rbio->operation == BTRFS_RBIO_WRITE)
2148 finish_rmw(rbio);
2149 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2150 finish_parity_scrub(rbio, 0);
2151 else
2152 BUG();
53b381b3 2153 } else {
4246a0b6 2154 rbio_orig_end_io(rbio, err);
53b381b3
DW
2155 }
2156}
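/*
 * Illustrative, self-contained sketch (not used by the code above; names are
 * hypothetical): the pstripe path in __raid_recover_end_io() rebuilds a
 * single missing sector by copying the parity sector and XOR-ing in every
 * surviving data sector, which is what the memcpy() + run_xor() pair above
 * amounts to.
 */
static void sketch_raid5_rebuild_sector(u8 *dest, u8 * const *survivors,
                                        int nr_survivors, u32 sectorsize)
{
        u32 off;
        int s;

        /* Start from the first surviving sector (the parity copy). */
        memcpy(dest, survivors[0], sectorsize);
        /* XOR in the remaining survivors; XOR order does not matter. */
        for (s = 1; s < nr_survivors; s++)
                for (off = 0; off < sectorsize; off++)
                        dest[off] ^= survivors[s][off];
}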
2157
2158/*
2159 * This is called only for stripes we've read from disk to
2160 * reconstruct the parity.
2161 */
4246a0b6 2162static void raid_recover_end_io(struct bio *bio)
53b381b3
DW
2163{
2164 struct btrfs_raid_bio *rbio = bio->bi_private;
2165
2166 /*
2167 * we only read stripe pages off the disk, set them
2168 * up to date if there were no errors
2169 */
4e4cbee9 2170 if (bio->bi_status)
53b381b3
DW
2171 fail_bio_stripe(rbio, bio);
2172 else
5fdb7afc 2173 set_bio_pages_uptodate(rbio, bio);
53b381b3
DW
2174 bio_put(bio);
2175
b89e1b01 2176 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
2177 return;
2178
4c664611 2179 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
58efbc9f 2180 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
2181 else
2182 __raid_recover_end_io(rbio);
2183}
2184
2185/*
2186 * reads everything we need off the disk to reconstruct
2187 * the parity. endio handlers trigger final reconstruction
2188 * when the IO is done.
2189 *
2190 * This is used both for reads from the higher layers and for
2191 * parity construction required to finish a rmw cycle.
2192 */
2193static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2194{
2195 int bios_to_read = 0;
53b381b3
DW
2196 struct bio_list bio_list;
2197 int ret;
3e77605d 2198 int sectornr;
53b381b3
DW
2199 int stripe;
2200 struct bio *bio;
2201
2202 bio_list_init(&bio_list);
2203
2204 ret = alloc_rbio_pages(rbio);
2205 if (ret)
2206 goto cleanup;
2207
b89e1b01 2208 atomic_set(&rbio->error, 0);
53b381b3
DW
2209
2210 /*
4ae10b3a
CM
2211 * read everything that hasn't failed. Thanks to the
2212 * stripe cache, it is possible that some or all of these
2213 * pages are going to be uptodate.
53b381b3 2214 */
2c8cdd6e 2215 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
5588383e 2216 if (rbio->faila == stripe || rbio->failb == stripe) {
b89e1b01 2217 atomic_inc(&rbio->error);
53b381b3 2218 continue;
5588383e 2219 }
53b381b3 2220
3e77605d
QW
2221 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2222 struct sector_ptr *sector;
53b381b3
DW
2223
2224 /*
2225 * the rmw code may have already read this
2226 * page in
2227 */
3e77605d
QW
2228 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2229 if (sector->uptodate)
53b381b3
DW
2230 continue;
2231
3e77605d
QW
2232 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2233 stripe, sectornr, rbio->stripe_len,
2234 REQ_OP_READ);
53b381b3
DW
2235 if (ret < 0)
2236 goto cleanup;
2237 }
2238 }
2239
2240 bios_to_read = bio_list_size(&bio_list);
2241 if (!bios_to_read) {
2242 /*
2243 * we might have no bios to read just because the pages
2244 * were up to date, or we might have no bios to read because
2245 * the devices were gone.
2246 */
4c664611 2247 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
53b381b3 2248 __raid_recover_end_io(rbio);
813f8a0e 2249 return 0;
53b381b3
DW
2250 } else {
2251 goto cleanup;
2252 }
2253 }
2254
2255 /*
4c664611
QW
2256 * The bioc may be freed once we submit the last bio. Make sure not to
2257 * touch it after that.
53b381b3 2258 */
b89e1b01 2259 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2260 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
2261 bio->bi_end_io = raid_recover_end_io;
2262
6a258d72 2263 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 2264
4e49ea4a 2265 submit_bio(bio);
53b381b3 2266 }
813f8a0e 2267
53b381b3
DW
2268 return 0;
2269
2270cleanup:
b4ee1782
OS
2271 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2272 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
58efbc9f 2273 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2274
2275 while ((bio = bio_list_pop(&bio_list)))
2276 bio_put(bio);
2277
53b381b3
DW
2278 return -EIO;
2279}
2280
2281/*
2282 * the main entry point for reads from the higher layers. This
2283 * is really only called when the normal read path had a failure,
2284 * so we assume the bio they send down corresponds to a failed part
2285 * of the drive.
2286 */
6a258d72 2287int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
cc353a8b 2288 u32 stripe_len, int mirror_num, int generic_io)
53b381b3 2289{
6a258d72 2290 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3
DW
2291 struct btrfs_raid_bio *rbio;
2292 int ret;
2293
abad60c6 2294 if (generic_io) {
4c664611 2295 ASSERT(bioc->mirror_num == mirror_num);
c3a3b19b 2296 btrfs_bio(bio)->mirror_num = mirror_num;
abad60c6
LB
2297 }
2298
4c664611 2299 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 2300 if (IS_ERR(rbio)) {
6e9606d2 2301 if (generic_io)
4c664611 2302 btrfs_put_bioc(bioc);
53b381b3 2303 return PTR_ERR(rbio);
af8e2d1d 2304 }
53b381b3 2305
1b94b556 2306 rbio->operation = BTRFS_RBIO_READ_REBUILD;
bd8f7e62 2307 rbio_add_bio(rbio, bio);
53b381b3
DW
2308
2309 rbio->faila = find_logical_bio_stripe(rbio, bio);
2310 if (rbio->faila == -1) {
0b246afa 2311 btrfs_warn(fs_info,
4c664611 2312"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
1201b58b 2313 __func__, bio->bi_iter.bi_sector << 9,
4c664611 2314 (u64)bio->bi_iter.bi_size, bioc->map_type);
6e9606d2 2315 if (generic_io)
4c664611 2316 btrfs_put_bioc(bioc);
53b381b3
DW
2317 kfree(rbio);
2318 return -EIO;
2319 }
2320
4245215d 2321 if (generic_io) {
0b246afa 2322 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
2323 rbio->generic_bio_cnt = 1;
2324 } else {
4c664611 2325 btrfs_get_bioc(bioc);
4245215d
MX
2326 }
2327
53b381b3 2328 /*
8810f751
LB
2329 * Loop retry:
2330 * for 'mirror == 2', reconstruct from all other stripes.
2331 * for 'mirror_num > 2', select a stripe to fail on every retry.
53b381b3 2332 */
8810f751
LB
2333 if (mirror_num > 2) {
2334 /*
2335 * 'mirror == 3' is to fail the p stripe and
2336 * reconstruct from the q stripe. 'mirror > 3' is to
2337 * fail a data stripe and reconstruct from p+q stripe.
2338 */
2339 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2340 ASSERT(rbio->failb > 0);
2341 if (rbio->failb <= rbio->faila)
2342 rbio->failb--;
2343 }
53b381b3
DW
2344
2345 ret = lock_stripe_add(rbio);
2346
2347 /*
2348 * __raid56_parity_recover will end the bio with
2349 * any errors it hits. We don't want to return
2350 * its error value up the stack because our caller
2351 * will end up calling bio_endio with any nonzero
2352 * return
2353 */
2354 if (ret == 0)
2355 __raid56_parity_recover(rbio);
2356 /*
2357 * our rbio has been added to the list of
2358 * rbios that will be handled after the
2359 * currently lock owner is done
2360 */
2361 return 0;
2362
2363}
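/*
 * Illustrative example (hypothetical helper mirroring the mirror_num logic
 * above, not used by this file): on a 4-device RAID6 (two data stripes plus
 * P and Q, so real_stripes == 4), mirror_num == 3 fails the P stripe
 * (failb = 4 - (3 - 1) = 2) and mirror_num == 4 fails a data stripe
 * (failb = 1), with a final decrement if the choice collides with the
 * already-failed faila.
 */
static inline int sketch_failb_for_mirror(int real_stripes, int mirror_num,
                                          int faila)
{
        int failb = real_stripes - (mirror_num - 1);

        if (failb <= faila)
                failb--;
        return failb;
}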
2364
385de0ef 2365static void rmw_work(struct work_struct *work)
53b381b3
DW
2366{
2367 struct btrfs_raid_bio *rbio;
2368
2369 rbio = container_of(work, struct btrfs_raid_bio, work);
2370 raid56_rmw_stripe(rbio);
2371}
2372
385de0ef 2373static void read_rebuild_work(struct work_struct *work)
53b381b3
DW
2374{
2375 struct btrfs_raid_bio *rbio;
2376
2377 rbio = container_of(work, struct btrfs_raid_bio, work);
2378 __raid56_parity_recover(rbio);
2379}
5a6ac9ea
MX
2380
2381/*
2382 * The following code is used to scrub/replace the parity stripe
2383 *
4c664611 2384 * Caller must have already increased bio_counter for getting @bioc.
ae6529c3 2385 *
5a6ac9ea
MX
2386 * Note: We need to make sure all the pages that are added into the scrub/replace
2387 * raid bio are correct and will not be changed during the scrub/replace. That
2388 * is, those pages just hold metadata or file data with checksum.
2389 */
2390
6a258d72
QW
2391struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2392 struct btrfs_io_context *bioc,
cc353a8b 2393 u32 stripe_len, struct btrfs_device *scrub_dev,
6a258d72 2394 unsigned long *dbitmap, int stripe_nsectors)
5a6ac9ea 2395{
6a258d72 2396 struct btrfs_fs_info *fs_info = bioc->fs_info;
5a6ac9ea
MX
2397 struct btrfs_raid_bio *rbio;
2398 int i;
2399
4c664611 2400 rbio = alloc_rbio(fs_info, bioc, stripe_len);
5a6ac9ea
MX
2401 if (IS_ERR(rbio))
2402 return NULL;
2403 bio_list_add(&rbio->bio_list, bio);
2404 /*
2405 * This is a special bio which is used to hold the completion handler
2406 * and make the scrub rbio similar to the other types
2407 */
2408 ASSERT(!bio->bi_iter.bi_size);
2409 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2410
9cd3a7eb 2411 /*
4c664611 2412 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
9cd3a7eb
LB
2413 * to the end position, so this search can start from the first parity
2414 * stripe.
2415 */
2416 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
4c664611 2417 if (bioc->stripes[i].dev == scrub_dev) {
5a6ac9ea
MX
2418 rbio->scrubp = i;
2419 break;
2420 }
2421 }
9cd3a7eb 2422 ASSERT(i < rbio->real_stripes);
5a6ac9ea 2423
c67c68eb 2424 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
5a6ac9ea 2425
ae6529c3 2426 /*
4c664611 2427 * We have already increased bio_counter when getting bioc, record it
ae6529c3
QW
2428 * so we can free it at rbio_orig_end_io().
2429 */
2430 rbio->generic_bio_cnt = 1;
2431
5a6ac9ea
MX
2432 return rbio;
2433}
2434
b4ee1782
OS
2435/* Used for both parity scrub and missing. */
2436void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
6346f6bf 2437 unsigned int pgoff, u64 logical)
5a6ac9ea 2438{
6346f6bf 2439 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
5a6ac9ea
MX
2440 int stripe_offset;
2441 int index;
2442
4c664611 2443 ASSERT(logical >= rbio->bioc->raid_map[0]);
6346f6bf 2444 ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
5a6ac9ea 2445 rbio->stripe_len * rbio->nr_data);
4c664611 2446 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
6346f6bf
QW
2447 index = stripe_offset / sectorsize;
2448 rbio->bio_sectors[index].page = page;
2449 rbio->bio_sectors[index].pgoff = pgoff;
5a6ac9ea
MX
2450}
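/*
 * Illustrative arithmetic for the bio_sectors index above (a sketch with
 * hypothetical values, not driver code): with a 4K sector size, a sector
 * whose logical address is 12K past raid_map[0] is recorded at
 * bio_sectors[12K / 4K] == bio_sectors[3].
 */
static inline int sketch_bio_sector_index(u64 logical, u64 full_stripe_start,
                                          u32 sectorsize)
{
        return (int)(logical - full_stripe_start) / sectorsize;
}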
2451
2452/*
2453 * We only scrub the parity for horizontal stripes where we have correct data,
2454 * so we needn't allocate all pages for all the stripes.
2455 */
2456static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2457{
3907ce29
QW
2458 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2459 int stripe;
2460 int sectornr;
2461
c67c68eb 2462 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3907ce29
QW
2463 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2464 struct page *page;
2465 int index = (stripe * rbio->stripe_nsectors + sectornr) *
2466 sectorsize >> PAGE_SHIFT;
5a6ac9ea 2467
5a6ac9ea
MX
2468 if (rbio->stripe_pages[index])
2469 continue;
2470
b0ee5e1e 2471 page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2472 if (!page)
2473 return -ENOMEM;
2474 rbio->stripe_pages[index] = page;
5a6ac9ea
MX
2475 }
2476 }
eb357060 2477 index_stripe_sectors(rbio);
5a6ac9ea
MX
2478 return 0;
2479}
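/*
 * Illustrative sketch of the stripe_pages index computed above (hypothetical
 * helper, not used by this file): with 4K pages, a 4K sector size and 16
 * sectors per stripe, sector 5 of stripe 2 maps to page index
 * (2 * 16 + 5) * 4096 >> PAGE_SHIFT == 37.
 */
static inline int sketch_stripe_page_index(int stripe, int sectornr,
                                           int stripe_nsectors, u32 sectorsize)
{
        return (stripe * stripe_nsectors + sectornr) * sectorsize >> PAGE_SHIFT;
}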
2480
5a6ac9ea
MX
2481static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2482 int need_check)
2483{
4c664611 2484 struct btrfs_io_context *bioc = rbio->bioc;
46900662 2485 const u32 sectorsize = bioc->fs_info->sectorsize;
1389053e 2486 void **pointers = rbio->finish_pointers;
c67c68eb 2487 unsigned long *pbitmap = &rbio->finish_pbitmap;
5a6ac9ea
MX
2488 int nr_data = rbio->nr_data;
2489 int stripe;
3e77605d 2490 int sectornr;
c17af965 2491 bool has_qstripe;
46900662
QW
2492 struct sector_ptr p_sector = { 0 };
2493 struct sector_ptr q_sector = { 0 };
5a6ac9ea
MX
2494 struct bio_list bio_list;
2495 struct bio *bio;
76035976 2496 int is_replace = 0;
5a6ac9ea
MX
2497 int ret;
2498
2499 bio_list_init(&bio_list);
2500
c17af965
DS
2501 if (rbio->real_stripes - rbio->nr_data == 1)
2502 has_qstripe = false;
2503 else if (rbio->real_stripes - rbio->nr_data == 2)
2504 has_qstripe = true;
2505 else
5a6ac9ea 2506 BUG();
5a6ac9ea 2507
4c664611 2508 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
76035976 2509 is_replace = 1;
c67c68eb 2510 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
76035976
MX
2511 }
2512
5a6ac9ea
MX
2513 /*
2514 * The higher layers (scrubber) are unlikely to
2515 * use this area of the disk again soon, so don't cache
2516 * it.
2517 */
2518 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2519
2520 if (!need_check)
2521 goto writeback;
2522
46900662
QW
2523 p_sector.page = alloc_page(GFP_NOFS);
2524 if (!p_sector.page)
5a6ac9ea 2525 goto cleanup;
46900662
QW
2526 p_sector.pgoff = 0;
2527 p_sector.uptodate = 1;
5a6ac9ea 2528
c17af965 2529 if (has_qstripe) {
d70cef0d 2530 /* RAID6, allocate and map temp space for the Q stripe */
46900662
QW
2531 q_sector.page = alloc_page(GFP_NOFS);
2532 if (!q_sector.page) {
2533 __free_page(p_sector.page);
2534 p_sector.page = NULL;
5a6ac9ea
MX
2535 goto cleanup;
2536 }
46900662
QW
2537 q_sector.pgoff = 0;
2538 q_sector.uptodate = 1;
2539 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
5a6ac9ea
MX
2540 }
2541
2542 atomic_set(&rbio->error, 0);
2543
d70cef0d 2544 /* Map the parity stripe just once */
46900662 2545 pointers[nr_data] = kmap_local_page(p_sector.page);
d70cef0d 2546
c67c68eb 2547 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
46900662 2548 struct sector_ptr *sector;
5a6ac9ea 2549 void *parity;
46900662 2550
5a6ac9ea
MX
2551 /* first collect one page from each data stripe */
2552 for (stripe = 0; stripe < nr_data; stripe++) {
46900662
QW
2553 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2554 pointers[stripe] = kmap_local_page(sector->page) +
2555 sector->pgoff;
5a6ac9ea
MX
2556 }
2557
c17af965 2558 if (has_qstripe) {
d70cef0d 2559 /* RAID6, call the library function to fill in our P/Q */
46900662 2560 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
5a6ac9ea
MX
2561 pointers);
2562 } else {
2563 /* raid5 */
46900662
QW
2564 memcpy(pointers[nr_data], pointers[0], sectorsize);
2565 run_xor(pointers + 1, nr_data - 1, sectorsize);
5a6ac9ea
MX
2566 }
2567
01327610 2568 /* Check scrubbing parity and repair it */
46900662
QW
2569 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2570 parity = kmap_local_page(sector->page) + sector->pgoff;
2571 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2572 memcpy(parity, pointers[rbio->scrubp], sectorsize);
5a6ac9ea
MX
2573 else
2574 /* Parity is right, needn't writeback */
c67c68eb 2575 bitmap_clear(&rbio->dbitmap, sectornr, 1);
58c1a35c 2576 kunmap_local(parity);
5a6ac9ea 2577
94a0b58d
IW
2578 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2579 kunmap_local(pointers[stripe]);
5a6ac9ea
MX
2580 }
2581
94a0b58d 2582 kunmap_local(pointers[nr_data]);
46900662
QW
2583 __free_page(p_sector.page);
2584 p_sector.page = NULL;
2585 if (q_sector.page) {
94a0b58d 2586 kunmap_local(pointers[rbio->real_stripes - 1]);
46900662
QW
2587 __free_page(q_sector.page);
2588 q_sector.page = NULL;
d70cef0d 2589 }
5a6ac9ea
MX
2590
2591writeback:
2592 /*
2593 * time to start writing. Make bios for everything from the
2594 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2595 * everything else.
2596 */
c67c68eb 2597 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3e77605d 2598 struct sector_ptr *sector;
5a6ac9ea 2599
3e77605d
QW
2600 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2601 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2602 sectornr, rbio->stripe_len, REQ_OP_WRITE);
5a6ac9ea
MX
2603 if (ret)
2604 goto cleanup;
2605 }
2606
76035976
MX
2607 if (!is_replace)
2608 goto submit_write;
2609
3e77605d
QW
2610 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2611 struct sector_ptr *sector;
76035976 2612
3e77605d
QW
2613 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2614 ret = rbio_add_io_sector(rbio, &bio_list, sector,
4c664611 2615 bioc->tgtdev_map[rbio->scrubp],
3e77605d 2616 sectornr, rbio->stripe_len, REQ_OP_WRITE);
76035976
MX
2617 if (ret)
2618 goto cleanup;
2619 }
2620
2621submit_write:
5a6ac9ea
MX
2622 nr_data = bio_list_size(&bio_list);
2623 if (!nr_data) {
2624 /* Every parity is right */
58efbc9f 2625 rbio_orig_end_io(rbio, BLK_STS_OK);
5a6ac9ea
MX
2626 return;
2627 }
2628
2629 atomic_set(&rbio->stripes_pending, nr_data);
2630
bf28a605 2631 while ((bio = bio_list_pop(&bio_list))) {
a6111d11 2632 bio->bi_end_io = raid_write_end_io;
4e49ea4a
MC
2633
2634 submit_bio(bio);
5a6ac9ea
MX
2635 }
2636 return;
2637
2638cleanup:
58efbc9f 2639 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2640
2641 while ((bio = bio_list_pop(&bio_list)))
2642 bio_put(bio);
5a6ac9ea
MX
2643}
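/*
 * Illustrative, self-contained sketch (hypothetical names, not used above):
 * the RAID5 branch of finish_parity_scrub() recomputes parity as the XOR of
 * all data sectors and only queues a writeback when the recomputed parity
 * differs from what is currently on disk.
 */
static bool sketch_parity_needs_repair(u8 * const *data, int nr_data,
                                       const u8 *ondisk_parity, u8 *scratch,
                                       u32 sectorsize)
{
        u32 off;
        int d;

        /* Recompute parity as the XOR of all data sectors. */
        memcpy(scratch, data[0], sectorsize);
        for (d = 1; d < nr_data; d++)
                for (off = 0; off < sectorsize; off++)
                        scratch[off] ^= data[d][off];

        /* Writeback is only needed when the on-disk parity is wrong. */
        return memcmp(scratch, ondisk_parity, sectorsize) != 0;
}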
2644
2645static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2646{
2647 if (stripe >= 0 && stripe < rbio->nr_data)
2648 return 1;
2649 return 0;
2650}
2651
2652/*
2653 * While we're doing the parity check and repair, we could have errors
2654 * in reading pages off the disk. This checks for errors and if we're
2655 * not able to read the page it'll trigger parity reconstruction. The
2656 * parity scrub will be finished after we've reconstructed the failed
2657 * stripes
2658 */
2659static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2660{
4c664611 2661 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
5a6ac9ea
MX
2662 goto cleanup;
2663
2664 if (rbio->faila >= 0 || rbio->failb >= 0) {
2665 int dfail = 0, failp = -1;
2666
2667 if (is_data_stripe(rbio, rbio->faila))
2668 dfail++;
2669 else if (is_parity_stripe(rbio->faila))
2670 failp = rbio->faila;
2671
2672 if (is_data_stripe(rbio, rbio->failb))
2673 dfail++;
2674 else if (is_parity_stripe(rbio->failb))
2675 failp = rbio->failb;
2676
2677 /*
2678 * Because we can not use a scrubbing parity to repair
2679 * the data, the repair capability is reduced.
2680 * (In the case of RAID5, we can not repair anything)
2681 */
4c664611 2682 if (dfail > rbio->bioc->max_errors - 1)
5a6ac9ea
MX
2683 goto cleanup;
2684
2685 /*
2686 * If all the data is good and only the parity is wrong, just
2687 * repair the parity.
2688 */
2689 if (dfail == 0) {
2690 finish_parity_scrub(rbio, 0);
2691 return;
2692 }
2693
2694 /*
2695 * This means we got one corrupted data stripe and one
2696 * corrupted parity on RAID6. If the corrupted parity
01327610 2697 * is the scrubbing parity, luckily we can use the other one to repair
5a6ac9ea
MX
2698 * the data; otherwise we can not repair the data stripe.
2699 */
2700 if (failp != rbio->scrubp)
2701 goto cleanup;
2702
2703 __raid_recover_end_io(rbio);
2704 } else {
2705 finish_parity_scrub(rbio, 1);
2706 }
2707 return;
2708
2709cleanup:
58efbc9f 2710 rbio_orig_end_io(rbio, BLK_STS_IOERR);
5a6ac9ea
MX
2711}
2712
2713/*
2714 * end io for the read phase of the rmw cycle. All the bios here are physical
2715 * stripe bios we've read from the disk so we can recalculate the parity of the
2716 * stripe.
2717 *
2718 * This will usually kick off finish_rmw once all the bios are read in, but it
2719 * may trigger parity reconstruction if we had any errors along the way
2720 */
4246a0b6 2721static void raid56_parity_scrub_end_io(struct bio *bio)
5a6ac9ea
MX
2722{
2723 struct btrfs_raid_bio *rbio = bio->bi_private;
2724
4e4cbee9 2725 if (bio->bi_status)
5a6ac9ea
MX
2726 fail_bio_stripe(rbio, bio);
2727 else
5fdb7afc 2728 set_bio_pages_uptodate(rbio, bio);
5a6ac9ea
MX
2729
2730 bio_put(bio);
2731
2732 if (!atomic_dec_and_test(&rbio->stripes_pending))
2733 return;
2734
2735 /*
2736 * this will normally call finish_rmw to start our write
2737 * but if there are any failed stripes we'll reconstruct
2738 * from parity first
2739 */
2740 validate_rbio_for_parity_scrub(rbio);
2741}
2742
2743static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2744{
2745 int bios_to_read = 0;
5a6ac9ea
MX
2746 struct bio_list bio_list;
2747 int ret;
3e77605d 2748 int sectornr;
5a6ac9ea
MX
2749 int stripe;
2750 struct bio *bio;
2751
785884fc
LB
2752 bio_list_init(&bio_list);
2753
5a6ac9ea
MX
2754 ret = alloc_rbio_essential_pages(rbio);
2755 if (ret)
2756 goto cleanup;
2757
5a6ac9ea
MX
2758 atomic_set(&rbio->error, 0);
2759 /*
2760 * build a list of bios to read all the missing parts of this
2761 * stripe
2762 */
2c8cdd6e 2763 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
c67c68eb 2764 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3e77605d 2765 struct sector_ptr *sector;
5a6ac9ea 2766 /*
3e77605d
QW
2767 * We want to find all the sectors missing from the
2768 * rbio and read them from the disk. If sector_in_rbio()
2769 * finds a sector in the bio list we don't need to read
2770 * it off the stripe.
5a6ac9ea 2771 */
3e77605d
QW
2772 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2773 if (sector)
5a6ac9ea
MX
2774 continue;
2775
3e77605d 2776 sector = rbio_stripe_sector(rbio, stripe, sectornr);
5a6ac9ea 2777 /*
3e77605d
QW
2778 * The bio cache may have handed us an uptodate sector.
2779 * If so, be happy and use it.
5a6ac9ea 2780 */
3e77605d 2781 if (sector->uptodate)
5a6ac9ea
MX
2782 continue;
2783
3e77605d
QW
2784 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2785 stripe, sectornr, rbio->stripe_len,
2786 REQ_OP_READ);
5a6ac9ea
MX
2787 if (ret)
2788 goto cleanup;
2789 }
2790 }
2791
2792 bios_to_read = bio_list_size(&bio_list);
2793 if (!bios_to_read) {
2794 /*
2795 * this can happen if others have merged with
2796 * us; it means there is nothing left to read.
2797 * But if there are missing devices it may not be
2798 * safe to do the full stripe write yet.
2799 */
2800 goto finish;
2801 }
2802
2803 /*
4c664611
QW
2804 * The bioc may be freed once we submit the last bio. Make sure not to
2805 * touch it after that.
5a6ac9ea
MX
2806 */
2807 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2808 while ((bio = bio_list_pop(&bio_list))) {
5a6ac9ea
MX
2809 bio->bi_end_io = raid56_parity_scrub_end_io;
2810
6a258d72 2811 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
5a6ac9ea 2812
4e49ea4a 2813 submit_bio(bio);
5a6ac9ea
MX
2814 }
2815 /* the actual write will happen once the reads are done */
2816 return;
2817
2818cleanup:
58efbc9f 2819 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2820
2821 while ((bio = bio_list_pop(&bio_list)))
2822 bio_put(bio);
2823
5a6ac9ea
MX
2824 return;
2825
2826finish:
2827 validate_rbio_for_parity_scrub(rbio);
2828}
2829
385de0ef 2830static void scrub_parity_work(struct work_struct *work)
5a6ac9ea
MX
2831{
2832 struct btrfs_raid_bio *rbio;
2833
2834 rbio = container_of(work, struct btrfs_raid_bio, work);
2835 raid56_parity_scrub_stripe(rbio);
2836}
2837
5a6ac9ea
MX
2838void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2839{
2840 if (!lock_stripe_add(rbio))
a81b747d 2841 start_async_work(rbio, scrub_parity_work);
5a6ac9ea 2842}
b4ee1782
OS
2843
2844/* The following code is used for dev replace of a missing RAID 5/6 device. */
2845
2846struct btrfs_raid_bio *
6a258d72
QW
2847raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2848 u64 length)
b4ee1782 2849{
6a258d72 2850 struct btrfs_fs_info *fs_info = bioc->fs_info;
b4ee1782
OS
2851 struct btrfs_raid_bio *rbio;
2852
4c664611 2853 rbio = alloc_rbio(fs_info, bioc, length);
b4ee1782
OS
2854 if (IS_ERR(rbio))
2855 return NULL;
2856
2857 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2858 bio_list_add(&rbio->bio_list, bio);
2859 /*
2860 * This is a special bio which is used to hold the completion handler
2861 * and make the scrub rbio similar to the other types
2862 */
2863 ASSERT(!bio->bi_iter.bi_size);
2864
2865 rbio->faila = find_logical_bio_stripe(rbio, bio);
2866 if (rbio->faila == -1) {
2867 BUG();
2868 kfree(rbio);
2869 return NULL;
2870 }
2871
ae6529c3 2872 /*
4c664611 2873 * When we get bioc, we have already increased bio_counter, record it
ae6529c3
QW
2874 * so we can free it at rbio_orig_end_io()
2875 */
2876 rbio->generic_bio_cnt = 1;
2877
b4ee1782
OS
2878 return rbio;
2879}
2880
b4ee1782
OS
2881void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2882{
2883 if (!lock_stripe_add(rbio))
e66d8d5a 2884 start_async_work(rbio, read_rebuild_work);
b4ee1782 2885}