fs/btrfs/raid56.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
        struct list_head hash_list;
        spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
        struct list_head stripe_cache;
        spinlock_t cache_lock;
        int cache_size;
        struct btrfs_stripe_hash table[];
};

enum btrfs_rbio_ops {
        BTRFS_RBIO_WRITE,
        BTRFS_RBIO_READ_REBUILD,
        BTRFS_RBIO_PARITY_SCRUB,
        BTRFS_RBIO_REBUILD_MISSING,
};

struct btrfs_raid_bio {
        struct btrfs_io_context *bioc;

        /* while we're doing rmw on a stripe
         * we put it into a hash table so we can
         * lock the stripe and merge more rbios
         * into it.
         */
        struct list_head hash_list;

        /*
         * LRU list for the stripe cache
         */
        struct list_head stripe_cache;

        /*
         * for scheduling work in the helper threads
         */
        struct btrfs_work work;

        /*
         * bio list and bio_list_lock are used
         * to add more bios into the stripe
         * in hopes of avoiding the full rmw
         */
        struct bio_list bio_list;
        spinlock_t bio_list_lock;

        /* also protected by the bio_list_lock, the
         * plug list is used by the plugging code
         * to collect partial bios while plugged. The
         * stripe locking code also uses it to hand off
         * the stripe lock to the next pending IO
         */
        struct list_head plug_list;

        /*
         * flags that tell us if it is safe to
         * merge with this bio
         */
        unsigned long flags;

        /* size of each individual stripe on disk */
        int stripe_len;

        /* number of data stripes (no p/q) */
        int nr_data;

        int real_stripes;

        int stripe_npages;
        /*
         * set if we're doing a parity rebuild
         * for a read from higher up, which is handled
         * differently from a parity rebuild as part of
         * rmw
         */
        enum btrfs_rbio_ops operation;

        /* first bad stripe */
        int faila;

        /* second bad stripe (for raid6 use) */
        int failb;

        int scrubp;
        /*
         * number of pages needed to represent the full
         * stripe
         */
        int nr_pages;

        /*
         * size of all the bios in the bio_list. This
         * helps us decide if the rbio maps to a full
         * stripe or not
         */
        int bio_list_bytes;

        int generic_bio_cnt;

        refcount_t refs;

        atomic_t stripes_pending;

        atomic_t error;
        /*
         * these are two arrays of pointers. We allocate the
         * rbio big enough to hold them both and setup their
         * locations when the rbio is allocated
         */

        /* pointers to pages that we allocated for
         * reading/writing stripes directly from the disk (including P/Q)
         */
        struct page **stripe_pages;

        /*
         * pointers to the pages in the bio_list. Stored
         * here for faster lookup
         */
        struct page **bio_pages;

        /*
         * bitmap to record which horizontal stripe has data
         */
        unsigned long *dbitmap;

        /* allocated with real_stripes-many pointers for finish_*() calls */
        void **finish_pointers;

        /* allocated with stripe_npages-many bits for finish_*() calls */
        unsigned long *finish_pbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
                                         int need_check);
static void scrub_parity_work(struct btrfs_work *work);

static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
        btrfs_init_work(&rbio->work, work_func, NULL, NULL);
        btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
        struct btrfs_stripe_hash_table *table;
        struct btrfs_stripe_hash_table *x;
        struct btrfs_stripe_hash *cur;
        struct btrfs_stripe_hash *h;
        int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
        int i;

        if (info->stripe_hash_table)
                return 0;

        /*
         * The table is large, starting with order 4 and can go as high as
         * order 7 in case lock debugging is turned on.
         *
         * Try harder to allocate and fallback to vmalloc to lower the chance
         * of a failing mount.
         */
        table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
        if (!table)
                return -ENOMEM;

        spin_lock_init(&table->cache_lock);
        INIT_LIST_HEAD(&table->stripe_cache);

        h = table->table;

        for (i = 0; i < num_entries; i++) {
                cur = h + i;
                INIT_LIST_HEAD(&cur->hash_list);
                spin_lock_init(&cur->lock);
        }

        x = cmpxchg(&info->stripe_hash_table, NULL, table);
        kvfree(x);
        return 0;
}

4ae10b3a
CM
240/*
241 * caching an rbio means to copy anything from the
242 * bio_pages array into the stripe_pages array. We
243 * use the page uptodate bit in the stripe cache array
244 * to indicate if it has valid data
245 *
246 * once the caching is done, we set the cache ready
247 * bit.
248 */
249static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
250{
251 int i;
4ae10b3a
CM
252 int ret;
253
254 ret = alloc_rbio_pages(rbio);
255 if (ret)
256 return;
257
258 for (i = 0; i < rbio->nr_pages; i++) {
259 if (!rbio->bio_pages[i])
260 continue;
261
80cc8384 262 copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
4ae10b3a
CM
263 SetPageUptodate(rbio->stripe_pages[i]);
264 }
265 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
266}
267
53b381b3
DW
268/*
269 * we hash on the first logical address of the stripe
270 */
271static int rbio_bucket(struct btrfs_raid_bio *rbio)
272{
4c664611 273 u64 num = rbio->bioc->raid_map[0];
53b381b3
DW
274
275 /*
276 * we shift down quite a bit. We're using byte
277 * addressing, and most of the lower bits are zeros.
278 * This tends to upset hash_64, and it consistently
279 * returns just one or two different values.
280 *
281 * shifting off the lower bits fixes things.
282 */
283 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
284}
285
4ae10b3a
CM
286/*
287 * stealing an rbio means taking all the uptodate pages from the stripe
288 * array in the source rbio and putting them into the destination rbio
289 */
290static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
291{
292 int i;
293 struct page *s;
294 struct page *d;
295
296 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
297 return;
298
299 for (i = 0; i < dest->nr_pages; i++) {
300 s = src->stripe_pages[i];
301 if (!s || !PageUptodate(s)) {
302 continue;
303 }
304
305 d = dest->stripe_pages[i];
306 if (d)
307 __free_page(d);
308
309 dest->stripe_pages[i] = s;
310 src->stripe_pages[i] = NULL;
311 }
312}
313
53b381b3
DW
314/*
315 * merging means we take the bio_list from the victim and
316 * splice it into the destination. The victim should
317 * be discarded afterwards.
318 *
319 * must be called with dest->rbio_list_lock held
320 */
321static void merge_rbio(struct btrfs_raid_bio *dest,
322 struct btrfs_raid_bio *victim)
323{
324 bio_list_merge(&dest->bio_list, &victim->bio_list);
325 dest->bio_list_bytes += victim->bio_list_bytes;
4245215d 326 dest->generic_bio_cnt += victim->generic_bio_cnt;
53b381b3
DW
327 bio_list_init(&victim->bio_list);
328}
329
330/*
4ae10b3a
CM
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
333 */
334static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335{
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
339 int freeit = 0;
340
341 /*
342 * check the bit again under the hash table lock.
343 */
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 return;
346
6a258d72 347 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
348 h = table->table + bucket;
349
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
352 */
353 spin_lock(&h->lock);
354
355 /*
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
358 */
359 spin_lock(&rbio->bio_list_lock);
360
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
364 freeit = 1;
365
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
370 *
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
374 */
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
dec95574 378 refcount_dec(&rbio->refs);
4ae10b3a
CM
379 BUG_ON(!list_empty(&rbio->plug_list));
380 }
381 }
382 }
383
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
386
387 if (freeit)
388 __free_raid_bio(rbio);
389}
390
391/*
392 * prune a given rbio from the cache
393 */
394static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395{
396 struct btrfs_stripe_hash_table *table;
397 unsigned long flags;
398
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 return;
401
6a258d72 402 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
403
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
407}
408
409/*
410 * remove everything in the cache
411 */
48a3b636 412static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
4ae10b3a
CM
413{
414 struct btrfs_stripe_hash_table *table;
415 unsigned long flags;
416 struct btrfs_raid_bio *rbio;
417
418 table = info->stripe_hash_table;
419
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
424 stripe_cache);
425 __remove_rbio_from_cache(rbio);
426 }
427 spin_unlock_irqrestore(&table->cache_lock, flags);
428}
429
430/*
431 * remove all cached entries and free the hash table
432 * used by unmount
53b381b3
DW
433 */
434void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435{
436 if (!info->stripe_hash_table)
437 return;
4ae10b3a 438 btrfs_clear_rbio_cache(info);
f749303b 439 kvfree(info->stripe_hash_table);
53b381b3
DW
440 info->stripe_hash_table = NULL;
441}
442
4ae10b3a
CM
443/*
444 * insert an rbio into the stripe cache. It
445 * must have already been prepared by calling
446 * cache_rbio_pages
447 *
448 * If this rbio was already cached, it gets
449 * moved to the front of the lru.
450 *
451 * If the size of the rbio cache is too big, we
452 * prune an item.
453 */
454static void cache_rbio(struct btrfs_raid_bio *rbio)
455{
456 struct btrfs_stripe_hash_table *table;
457 unsigned long flags;
458
459 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
460 return;
461
6a258d72 462 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
463
464 spin_lock_irqsave(&table->cache_lock, flags);
465 spin_lock(&rbio->bio_list_lock);
466
467 /* bump our ref if we were not in the list before */
468 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
dec95574 469 refcount_inc(&rbio->refs);
4ae10b3a
CM
470
471 if (!list_empty(&rbio->stripe_cache)){
472 list_move(&rbio->stripe_cache, &table->stripe_cache);
473 } else {
474 list_add(&rbio->stripe_cache, &table->stripe_cache);
475 table->cache_size += 1;
476 }
477
478 spin_unlock(&rbio->bio_list_lock);
479
480 if (table->cache_size > RBIO_CACHE_SIZE) {
481 struct btrfs_raid_bio *found;
482
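		/* evict the least recently used entry, which sits at the list tail */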
483 found = list_entry(table->stripe_cache.prev,
484 struct btrfs_raid_bio,
485 stripe_cache);
486
487 if (found != rbio)
488 __remove_rbio_from_cache(found);
489 }
490
491 spin_unlock_irqrestore(&table->cache_lock, flags);
4ae10b3a
CM
492}
493
53b381b3
DW
494/*
495 * helper function to run the xor_blocks api. It is only
496 * able to do MAX_XOR_BLOCKS at a time, so we need to
497 * loop through.
498 */
499static void run_xor(void **pages, int src_cnt, ssize_t len)
500{
501 int src_off = 0;
502 int xor_src_cnt = 0;
503 void *dest = pages[src_cnt];
504
505 while(src_cnt > 0) {
506 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
507 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
508
509 src_cnt -= xor_src_cnt;
510 src_off += xor_src_cnt;
511 }
512}
513
514/*
176571a1
DS
515 * Returns true if the bio list inside this rbio covers an entire stripe (no
516 * rmw required).
53b381b3 517 */
176571a1 518static int rbio_is_full(struct btrfs_raid_bio *rbio)
53b381b3 519{
176571a1 520 unsigned long flags;
53b381b3
DW
521 unsigned long size = rbio->bio_list_bytes;
522 int ret = 1;
523
176571a1 524 spin_lock_irqsave(&rbio->bio_list_lock, flags);
53b381b3
DW
525 if (size != rbio->nr_data * rbio->stripe_len)
526 ret = 0;
53b381b3 527 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
53b381b3 528 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
176571a1 529
53b381b3
DW
530 return ret;
531}
532
533/*
534 * returns 1 if it is safe to merge two rbios together.
535 * The merging is safe if the two rbios correspond to
536 * the same stripe and if they are both going in the same
537 * direction (read vs write), and if neither one is
538 * locked for final IO
539 *
540 * The caller is responsible for locking such that
541 * rmw_locked is safe to test
542 */
543static int rbio_can_merge(struct btrfs_raid_bio *last,
544 struct btrfs_raid_bio *cur)
545{
546 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
547 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
548 return 0;
549
4ae10b3a
CM
550 /*
551 * we can't merge with cached rbios, since the
552 * idea is that when we merge the destination
553 * rbio is going to run our IO for us. We can
01327610 554 * steal from cached rbios though, other functions
4ae10b3a
CM
555 * handle that.
556 */
557 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
558 test_bit(RBIO_CACHE_BIT, &cur->flags))
559 return 0;
560
4c664611 561 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
53b381b3
DW
562 return 0;
563
	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub has to read the full stripe from the drive, then
	 * check and repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;
53b381b3 577
db34be19 578 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
b4ee1782
OS
579 return 0;
580
cc54ff62
LB
581 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
582 int fa = last->faila;
583 int fb = last->failb;
584 int cur_fa = cur->faila;
585 int cur_fb = cur->failb;
586
587 if (last->faila >= last->failb) {
588 fa = last->failb;
589 fb = last->faila;
590 }
591
592 if (cur->faila >= cur->failb) {
593 cur_fa = cur->failb;
594 cur_fb = cur->faila;
595 }
596
597 if (fa != cur_fa || fb != cur_fb)
598 return 0;
599 }
53b381b3
DW
600 return 1;
601}
602
b7178a5f
ZL
603static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
604 int index)
605{
606 return stripe * rbio->stripe_npages + index;
607}
608
609/*
610 * these are just the pages from the rbio array, not from anything
611 * the FS sent down to us
612 */
613static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
614 int index)
615{
616 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
617}
618
53b381b3
DW
619/*
620 * helper to index into the pstripe
621 */
622static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
623{
b7178a5f 624 return rbio_stripe_page(rbio, rbio->nr_data, index);
53b381b3
DW
625}
626
627/*
628 * helper to index into the qstripe, returns null
629 * if there is no qstripe
630 */
631static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
632{
2c8cdd6e 633 if (rbio->nr_data + 1 == rbio->real_stripes)
53b381b3 634 return NULL;
b7178a5f 635 return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
53b381b3
DW
636}
637
638/*
639 * The first stripe in the table for a logical address
640 * has the lock. rbios are added in one of three ways:
641 *
642 * 1) Nobody has the stripe locked yet. The rbio is given
643 * the lock and 0 is returned. The caller must start the IO
644 * themselves.
645 *
646 * 2) Someone has the stripe locked, but we're able to merge
647 * with the lock owner. The rbio is freed and the IO will
648 * start automatically along with the existing rbio. 1 is returned.
649 *
650 * 3) Someone has the stripe locked, but we're not able to merge.
651 * The rbio is added to the lock owner's plug list, or merged into
652 * an rbio already on the plug list. When the lock owner unlocks,
653 * the next rbio on the list is run and the IO is started automatically.
654 * 1 is returned
655 *
656 * If we return 0, the caller still owns the rbio and must continue with
657 * IO submission. If we return 1, the caller must assume the rbio has
658 * already been freed.
659 */
660static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
661{
721860d5 662 struct btrfs_stripe_hash *h;
53b381b3
DW
663 struct btrfs_raid_bio *cur;
664 struct btrfs_raid_bio *pending;
665 unsigned long flags;
53b381b3 666 struct btrfs_raid_bio *freeit = NULL;
4ae10b3a 667 struct btrfs_raid_bio *cache_drop = NULL;
53b381b3 668 int ret = 0;
53b381b3 669
6a258d72 670 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
721860d5 671
53b381b3
DW
672 spin_lock_irqsave(&h->lock, flags);
673 list_for_each_entry(cur, &h->hash_list, hash_list) {
4c664611 674 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
9d6cb1b0 675 continue;
4ae10b3a 676
9d6cb1b0 677 spin_lock(&cur->bio_list_lock);
4ae10b3a 678
9d6cb1b0
JT
679 /* Can we steal this cached rbio's pages? */
680 if (bio_list_empty(&cur->bio_list) &&
681 list_empty(&cur->plug_list) &&
682 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
683 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
684 list_del_init(&cur->hash_list);
685 refcount_dec(&cur->refs);
53b381b3 686
9d6cb1b0
JT
687 steal_rbio(cur, rbio);
688 cache_drop = cur;
689 spin_unlock(&cur->bio_list_lock);
4ae10b3a 690
9d6cb1b0
JT
691 goto lockit;
692 }
53b381b3 693
9d6cb1b0
JT
694 /* Can we merge into the lock owner? */
695 if (rbio_can_merge(cur, rbio)) {
696 merge_rbio(cur, rbio);
53b381b3 697 spin_unlock(&cur->bio_list_lock);
9d6cb1b0 698 freeit = rbio;
53b381b3
DW
699 ret = 1;
700 goto out;
701 }
9d6cb1b0
JT
702
703
704 /*
705 * We couldn't merge with the running rbio, see if we can merge
706 * with the pending ones. We don't have to check for rmw_locked
707 * because there is no way they are inside finish_rmw right now
708 */
709 list_for_each_entry(pending, &cur->plug_list, plug_list) {
710 if (rbio_can_merge(pending, rbio)) {
711 merge_rbio(pending, rbio);
712 spin_unlock(&cur->bio_list_lock);
713 freeit = rbio;
714 ret = 1;
715 goto out;
716 }
717 }
718
		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
723 list_add_tail(&rbio->plug_list, &cur->plug_list);
724 spin_unlock(&cur->bio_list_lock);
725 ret = 1;
726 goto out;
53b381b3 727 }
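	/*
	 * Nobody has this stripe locked any more (either it was never in the
	 * hash or we just dropped the cached rbio above), so take the lock.
	 */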
4ae10b3a 728lockit:
dec95574 729 refcount_inc(&rbio->refs);
53b381b3
DW
730 list_add(&rbio->hash_list, &h->hash_list);
731out:
732 spin_unlock_irqrestore(&h->lock, flags);
4ae10b3a
CM
733 if (cache_drop)
734 remove_rbio_from_cache(cache_drop);
53b381b3
DW
735 if (freeit)
736 __free_raid_bio(freeit);
737 return ret;
738}
739
740/*
741 * called as rmw or parity rebuild is completed. If the plug list has more
742 * rbios waiting for this stripe, the next one on the list will be started
743 */
744static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
745{
746 int bucket;
747 struct btrfs_stripe_hash *h;
748 unsigned long flags;
4ae10b3a 749 int keep_cache = 0;
53b381b3
DW
750
751 bucket = rbio_bucket(rbio);
6a258d72 752 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
53b381b3 753
4ae10b3a
CM
754 if (list_empty(&rbio->plug_list))
755 cache_rbio(rbio);
756
53b381b3
DW
757 spin_lock_irqsave(&h->lock, flags);
758 spin_lock(&rbio->bio_list_lock);
759
760 if (!list_empty(&rbio->hash_list)) {
4ae10b3a
CM
761 /*
762 * if we're still cached and there is no other IO
763 * to perform, just leave this rbio here for others
764 * to steal from later
765 */
766 if (list_empty(&rbio->plug_list) &&
767 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
768 keep_cache = 1;
769 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
770 BUG_ON(!bio_list_empty(&rbio->bio_list));
771 goto done;
772 }
53b381b3
DW
773
774 list_del_init(&rbio->hash_list);
dec95574 775 refcount_dec(&rbio->refs);
53b381b3
DW
776
777 /*
778 * we use the plug list to hold all the rbios
779 * waiting for the chance to lock this stripe.
780 * hand the lock over to one of them.
781 */
782 if (!list_empty(&rbio->plug_list)) {
783 struct btrfs_raid_bio *next;
784 struct list_head *head = rbio->plug_list.next;
785
786 next = list_entry(head, struct btrfs_raid_bio,
787 plug_list);
788
789 list_del_init(&rbio->plug_list);
790
791 list_add(&next->hash_list, &h->hash_list);
dec95574 792 refcount_inc(&next->refs);
53b381b3
DW
793 spin_unlock(&rbio->bio_list_lock);
794 spin_unlock_irqrestore(&h->lock, flags);
795
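			/*
			 * Kick off the next waiter.  Where we can, hand it our
			 * cached stripe pages so it may avoid re-reading them
			 * from disk.
			 */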
1b94b556 796 if (next->operation == BTRFS_RBIO_READ_REBUILD)
e66d8d5a 797 start_async_work(next, read_rebuild_work);
b4ee1782
OS
798 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
799 steal_rbio(rbio, next);
e66d8d5a 800 start_async_work(next, read_rebuild_work);
b4ee1782 801 } else if (next->operation == BTRFS_RBIO_WRITE) {
4ae10b3a 802 steal_rbio(rbio, next);
cf6a4a75 803 start_async_work(next, rmw_work);
5a6ac9ea
MX
804 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
805 steal_rbio(rbio, next);
a81b747d 806 start_async_work(next, scrub_parity_work);
4ae10b3a 807 }
53b381b3
DW
808
809 goto done_nolock;
53b381b3
DW
810 }
811 }
4ae10b3a 812done:
53b381b3
DW
813 spin_unlock(&rbio->bio_list_lock);
814 spin_unlock_irqrestore(&h->lock, flags);
815
816done_nolock:
4ae10b3a
CM
817 if (!keep_cache)
818 remove_rbio_from_cache(rbio);
53b381b3
DW
819}
820
821static void __free_raid_bio(struct btrfs_raid_bio *rbio)
822{
823 int i;
824
dec95574 825 if (!refcount_dec_and_test(&rbio->refs))
53b381b3
DW
826 return;
827
4ae10b3a 828 WARN_ON(!list_empty(&rbio->stripe_cache));
53b381b3
DW
829 WARN_ON(!list_empty(&rbio->hash_list));
830 WARN_ON(!bio_list_empty(&rbio->bio_list));
831
832 for (i = 0; i < rbio->nr_pages; i++) {
833 if (rbio->stripe_pages[i]) {
834 __free_page(rbio->stripe_pages[i]);
835 rbio->stripe_pages[i] = NULL;
836 }
837 }
af8e2d1d 838
4c664611 839 btrfs_put_bioc(rbio->bioc);
53b381b3
DW
840 kfree(rbio);
841}
842
7583d8d0 843static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
53b381b3 844{
7583d8d0
LB
845 struct bio *next;
846
847 while (cur) {
848 next = cur->bi_next;
849 cur->bi_next = NULL;
850 cur->bi_status = err;
851 bio_endio(cur);
852 cur = next;
853 }
53b381b3
DW
854}
855
856/*
857 * this frees the rbio and runs through all the bios in the
858 * bio_list and calls end_io on them
859 */
4e4cbee9 860static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
53b381b3
DW
861{
862 struct bio *cur = bio_list_get(&rbio->bio_list);
7583d8d0 863 struct bio *extra;
4245215d
MX
864
865 if (rbio->generic_bio_cnt)
6a258d72 866 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
4245215d 867
7583d8d0
LB
868 /*
869 * At this moment, rbio->bio_list is empty, however since rbio does not
870 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
871 * hash list, rbio may be merged with others so that rbio->bio_list
872 * becomes non-empty.
873 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
874 * more and we can call bio_endio() on all queued bios.
875 */
876 unlock_stripe(rbio);
877 extra = bio_list_get(&rbio->bio_list);
878 __free_raid_bio(rbio);
53b381b3 879
7583d8d0
LB
880 rbio_endio_bio_list(cur, err);
881 if (extra)
882 rbio_endio_bio_list(extra, err);
53b381b3
DW
883}
884
885/*
886 * end io function used by finish_rmw. When we finally
887 * get here, we've written a full stripe
888 */
4246a0b6 889static void raid_write_end_io(struct bio *bio)
53b381b3
DW
890{
891 struct btrfs_raid_bio *rbio = bio->bi_private;
4e4cbee9 892 blk_status_t err = bio->bi_status;
a6111d11 893 int max_errors;
53b381b3
DW
894
895 if (err)
896 fail_bio_stripe(rbio, bio);
897
898 bio_put(bio);
899
b89e1b01 900 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
901 return;
902
58efbc9f 903 err = BLK_STS_OK;
53b381b3
DW
904
	/* OK, we have written all the stripes we need to. */
a6111d11 906 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
4c664611 907 0 : rbio->bioc->max_errors;
a6111d11 908 if (atomic_read(&rbio->error) > max_errors)
4e4cbee9 909 err = BLK_STS_IOERR;
53b381b3 910
4246a0b6 911 rbio_orig_end_io(rbio, err);
53b381b3
DW
912}
913
914/*
915 * the read/modify/write code wants to use the original bio for
916 * any pages it included, and then use the rbio for everything
917 * else. This function decides if a given index (stripe number)
918 * and page number in that stripe fall inside the original bio
919 * or the rbio.
920 *
921 * if you set bio_list_only, you'll get a NULL back for any ranges
922 * that are outside the bio_list
923 *
924 * This doesn't take any refs on anything, you get a bare page pointer
925 * and the caller must bump refs as required.
926 *
927 * You must call index_rbio_pages once before you can trust
928 * the answers from this function.
929 */
930static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
931 int index, int pagenr, int bio_list_only)
932{
933 int chunk_page;
934 struct page *p = NULL;
935
936 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
937
938 spin_lock_irq(&rbio->bio_list_lock);
939 p = rbio->bio_pages[chunk_page];
940 spin_unlock_irq(&rbio->bio_list_lock);
941
942 if (p || bio_list_only)
943 return p;
944
945 return rbio->stripe_pages[chunk_page];
946}
947
948/*
949 * number of pages we need for the entire stripe across all the
950 * drives
951 */
952static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
953{
09cbfeaf 954 return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
53b381b3
DW
955}
956
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this
 * does not allocate any pages for rbio->pages.
 */
2ff7e61e 961static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
4c664611 962 struct btrfs_io_context *bioc,
2ff7e61e 963 u64 stripe_len)
53b381b3
DW
964{
965 struct btrfs_raid_bio *rbio;
966 int nr_data = 0;
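	/* stripes added for a dev-replace target are not real RAID members */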
4c664611 967 int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
2c8cdd6e 968 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
5a6ac9ea 969 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
53b381b3
DW
970 void *p;
971
1389053e
KC
972 rbio = kzalloc(sizeof(*rbio) +
973 sizeof(*rbio->stripe_pages) * num_pages +
974 sizeof(*rbio->bio_pages) * num_pages +
975 sizeof(*rbio->finish_pointers) * real_stripes +
976 sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
977 sizeof(*rbio->finish_pbitmap) *
978 BITS_TO_LONGS(stripe_npages),
979 GFP_NOFS);
af8e2d1d 980 if (!rbio)
53b381b3 981 return ERR_PTR(-ENOMEM);
53b381b3
DW
982
983 bio_list_init(&rbio->bio_list);
984 INIT_LIST_HEAD(&rbio->plug_list);
985 spin_lock_init(&rbio->bio_list_lock);
4ae10b3a 986 INIT_LIST_HEAD(&rbio->stripe_cache);
53b381b3 987 INIT_LIST_HEAD(&rbio->hash_list);
4c664611 988 rbio->bioc = bioc;
53b381b3
DW
989 rbio->stripe_len = stripe_len;
990 rbio->nr_pages = num_pages;
2c8cdd6e 991 rbio->real_stripes = real_stripes;
5a6ac9ea 992 rbio->stripe_npages = stripe_npages;
53b381b3
DW
993 rbio->faila = -1;
994 rbio->failb = -1;
dec95574 995 refcount_set(&rbio->refs, 1);
b89e1b01
MX
996 atomic_set(&rbio->error, 0);
997 atomic_set(&rbio->stripes_pending, 0);
53b381b3
DW
998
999 /*
1389053e 1000 * the stripe_pages, bio_pages, etc arrays point to the extra
53b381b3
DW
1001 * memory we allocated past the end of the rbio
1002 */
1003 p = rbio + 1;
1389053e
KC
1004#define CONSUME_ALLOC(ptr, count) do { \
1005 ptr = p; \
1006 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1007 } while (0)
1008 CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1009 CONSUME_ALLOC(rbio->bio_pages, num_pages);
1010 CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1011 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1012 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1013#undef CONSUME_ALLOC
53b381b3 1014
4c664611 1015 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
10f11900 1016 nr_data = real_stripes - 1;
4c664611 1017 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
2c8cdd6e 1018 nr_data = real_stripes - 2;
53b381b3 1019 else
10f11900 1020 BUG();
53b381b3
DW
1021
1022 rbio->nr_data = nr_data;
1023 return rbio;
1024}
1025
1026/* allocate pages for all the stripes in the bio, including parity */
1027static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1028{
dd137dd1 1029 return btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
53b381b3
DW
1030}
1031
b7178a5f 1032/* only allocate pages for p/q stripes */
53b381b3
DW
1033static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1034{
dd137dd1 1035 int data_pages = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
53b381b3 1036
dd137dd1
STD
1037 return btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1038 rbio->stripe_pages + data_pages);
53b381b3
DW
1039}
1040
1041/*
1042 * add a single page from a specific stripe into our list of bios for IO
1043 * this will try to merge into existing bios if possible, and returns
1044 * zero if all went well.
1045 */
48a3b636
ES
1046static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1047 struct bio_list *bio_list,
1048 struct page *page,
1049 int stripe_nr,
1050 unsigned long page_index,
1051 unsigned long bio_max_len)
53b381b3
DW
1052{
1053 struct bio *last = bio_list->tail;
53b381b3
DW
1054 int ret;
1055 struct bio *bio;
4c664611 1056 struct btrfs_io_stripe *stripe;
53b381b3
DW
1057 u64 disk_start;
1058
4c664611 1059 stripe = &rbio->bioc->stripes[stripe_nr];
09cbfeaf 1060 disk_start = stripe->physical + (page_index << PAGE_SHIFT);
53b381b3
DW
1061
1062 /* if the device is missing, just fail this stripe */
1063 if (!stripe->dev->bdev)
1064 return fail_rbio_index(rbio, stripe_nr);
1065
1066 /* see if we can add this page onto our existing bio */
1067 if (last) {
1201b58b 1068 u64 last_end = last->bi_iter.bi_sector << 9;
4f024f37 1069 last_end += last->bi_iter.bi_size;
53b381b3
DW
1070
1071 /*
1072 * we can't merge these if they are from different
1073 * devices or if they are not contiguous
1074 */
f90ae76a 1075 if (last_end == disk_start && !last->bi_status &&
309dca30 1076 last->bi_bdev == stripe->dev->bdev) {
09cbfeaf
KS
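			/* bio_add_page() returns the number of bytes added */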
1077 ret = bio_add_page(last, page, PAGE_SIZE, 0);
1078 if (ret == PAGE_SIZE)
53b381b3
DW
1079 return 0;
1080 }
1081 }
1082
1083 /* put a new bio on the list */
c3a3b19b
QW
1084 bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1085 btrfs_bio(bio)->device = stripe->dev;
4f024f37 1086 bio->bi_iter.bi_size = 0;
74d46992 1087 bio_set_dev(bio, stripe->dev->bdev);
4f024f37 1088 bio->bi_iter.bi_sector = disk_start >> 9;
53b381b3 1089
09cbfeaf 1090 bio_add_page(bio, page, PAGE_SIZE, 0);
53b381b3
DW
1091 bio_list_add(bio_list, bio);
1092 return 0;
1093}
1094
1095/*
1096 * while we're doing the read/modify/write cycle, we could
1097 * have errors in reading pages off the disk. This checks
1098 * for errors and if we're not able to read the page it'll
1099 * trigger parity reconstruction. The rmw will be finished
1100 * after we've reconstructed the failed stripes
1101 */
1102static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1103{
1104 if (rbio->faila >= 0 || rbio->failb >= 0) {
2c8cdd6e 1105 BUG_ON(rbio->faila == rbio->real_stripes - 1);
53b381b3
DW
1106 __raid56_parity_recover(rbio);
1107 } else {
1108 finish_rmw(rbio);
1109 }
1110}
1111
53b381b3
DW
1112/*
1113 * helper function to walk our bio list and populate the bio_pages array with
1114 * the result. This seems expensive, but it is faster than constantly
1115 * searching through the bio list as we setup the IO in finish_rmw or stripe
1116 * reconstruction.
1117 *
1118 * This must be called before you trust the answers from page_in_rbio
1119 */
1120static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1121{
1122 struct bio *bio;
1123 u64 start;
1124 unsigned long stripe_offset;
1125 unsigned long page_index;
53b381b3
DW
1126
1127 spin_lock_irq(&rbio->bio_list_lock);
1128 bio_list_for_each(bio, &rbio->bio_list) {
6592e58c
FM
1129 struct bio_vec bvec;
1130 struct bvec_iter iter;
1131 int i = 0;
1132
1201b58b 1133 start = bio->bi_iter.bi_sector << 9;
4c664611 1134 stripe_offset = start - rbio->bioc->raid_map[0];
09cbfeaf 1135 page_index = stripe_offset >> PAGE_SHIFT;
53b381b3 1136
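		/*
		 * A cloned bio may have had bi_iter advanced; use the iterator
		 * btrfs saved at submit time so we walk the full original range.
		 */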
6592e58c 1137 if (bio_flagged(bio, BIO_CLONED))
c3a3b19b 1138 bio->bi_iter = btrfs_bio(bio)->iter;
6592e58c
FM
1139
1140 bio_for_each_segment(bvec, bio, iter) {
1141 rbio->bio_pages[page_index + i] = bvec.bv_page;
1142 i++;
1143 }
53b381b3
DW
1144 }
1145 spin_unlock_irq(&rbio->bio_list_lock);
1146}
1147
1148/*
1149 * this is called from one of two situations. We either
1150 * have a full stripe from the higher layers, or we've read all
1151 * the missing bits off disk.
1152 *
1153 * This will calculate the parity and then send down any
1154 * changed blocks.
1155 */
1156static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1157{
4c664611 1158 struct btrfs_io_context *bioc = rbio->bioc;
1389053e 1159 void **pointers = rbio->finish_pointers;
53b381b3
DW
1160 int nr_data = rbio->nr_data;
1161 int stripe;
1162 int pagenr;
c17af965 1163 bool has_qstripe;
53b381b3
DW
1164 struct bio_list bio_list;
1165 struct bio *bio;
53b381b3
DW
1166 int ret;
1167
1168 bio_list_init(&bio_list);
1169
c17af965
DS
1170 if (rbio->real_stripes - rbio->nr_data == 1)
1171 has_qstripe = false;
1172 else if (rbio->real_stripes - rbio->nr_data == 2)
1173 has_qstripe = true;
1174 else
53b381b3 1175 BUG();
53b381b3
DW
1176
1177 /* at this point we either have a full stripe,
1178 * or we've read the full stripe from the drive.
1179 * recalculate the parity and write the new results.
1180 *
1181 * We're not allowed to add any new bios to the
1182 * bio list here, anyone else that wants to
1183 * change this stripe needs to do their own rmw.
1184 */
1185 spin_lock_irq(&rbio->bio_list_lock);
1186 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1187 spin_unlock_irq(&rbio->bio_list_lock);
1188
b89e1b01 1189 atomic_set(&rbio->error, 0);
53b381b3
DW
1190
1191 /*
1192 * now that we've set rmw_locked, run through the
1193 * bio list one last time and map the page pointers
4ae10b3a
CM
1194 *
1195 * We don't cache full rbios because we're assuming
1196 * the higher layers are unlikely to use this area of
1197 * the disk again soon. If they do use it again,
1198 * hopefully they will send another full bio.
53b381b3
DW
1199 */
1200 index_rbio_pages(rbio);
4ae10b3a
CM
1201 if (!rbio_is_full(rbio))
1202 cache_rbio_pages(rbio);
1203 else
1204 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
53b381b3 1205
915e2290 1206 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
1207 struct page *p;
1208 /* first collect one page from each data stripe */
1209 for (stripe = 0; stripe < nr_data; stripe++) {
1210 p = page_in_rbio(rbio, stripe, pagenr, 0);
94a0b58d 1211 pointers[stripe] = kmap_local_page(p);
53b381b3
DW
1212 }
1213
1214 /* then add the parity stripe */
1215 p = rbio_pstripe_page(rbio, pagenr);
1216 SetPageUptodate(p);
94a0b58d 1217 pointers[stripe++] = kmap_local_page(p);
53b381b3 1218
c17af965 1219 if (has_qstripe) {
53b381b3
DW
1220
1221 /*
1222 * raid6, add the qstripe and call the
1223 * library function to fill in our p/q
1224 */
1225 p = rbio_qstripe_page(rbio, pagenr);
1226 SetPageUptodate(p);
94a0b58d 1227 pointers[stripe++] = kmap_local_page(p);
53b381b3 1228
2c8cdd6e 1229 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
53b381b3
DW
1230 pointers);
1231 } else {
1232 /* raid5 */
69d24804 1233 copy_page(pointers[nr_data], pointers[0]);
09cbfeaf 1234 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
53b381b3 1235 }
94a0b58d
IW
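		/* kmap_local mappings must be released in reverse order */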
1236 for (stripe = stripe - 1; stripe >= 0; stripe--)
1237 kunmap_local(pointers[stripe]);
53b381b3
DW
1238 }
1239
1240 /*
1241 * time to start writing. Make bios for everything from the
1242 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1243 * everything else.
1244 */
2c8cdd6e 1245 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
915e2290 1246 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
1247 struct page *page;
1248 if (stripe < rbio->nr_data) {
1249 page = page_in_rbio(rbio, stripe, pagenr, 1);
1250 if (!page)
1251 continue;
1252 } else {
1253 page = rbio_stripe_page(rbio, stripe, pagenr);
1254 }
1255
1256 ret = rbio_add_io_page(rbio, &bio_list,
1257 page, stripe, pagenr, rbio->stripe_len);
1258 if (ret)
1259 goto cleanup;
1260 }
1261 }
1262
4c664611 1263 if (likely(!bioc->num_tgtdevs))
2c8cdd6e
MX
1264 goto write_data;
1265
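	/* queue the same pages again for any dev-replace target devices */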
1266 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
4c664611 1267 if (!bioc->tgtdev_map[stripe])
2c8cdd6e
MX
1268 continue;
1269
915e2290 1270 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2c8cdd6e
MX
1271 struct page *page;
1272 if (stripe < rbio->nr_data) {
1273 page = page_in_rbio(rbio, stripe, pagenr, 1);
1274 if (!page)
1275 continue;
1276 } else {
1277 page = rbio_stripe_page(rbio, stripe, pagenr);
1278 }
1279
1280 ret = rbio_add_io_page(rbio, &bio_list, page,
4c664611 1281 rbio->bioc->tgtdev_map[stripe],
2c8cdd6e
MX
1282 pagenr, rbio->stripe_len);
1283 if (ret)
1284 goto cleanup;
1285 }
1286 }
1287
1288write_data:
b89e1b01
MX
1289 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1290 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
53b381b3 1291
bf28a605 1292 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
1293 bio->bi_private = rbio;
1294 bio->bi_end_io = raid_write_end_io;
ebcc3263 1295 bio->bi_opf = REQ_OP_WRITE;
4e49ea4a
MC
1296
1297 submit_bio(bio);
53b381b3
DW
1298 }
1299 return;
1300
1301cleanup:
58efbc9f 1302 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1303
1304 while ((bio = bio_list_pop(&bio_list)))
1305 bio_put(bio);
53b381b3
DW
1306}
1307
1308/*
1309 * helper to find the stripe number for a given bio. Used to figure out which
1310 * stripe has failed. This expects the bio to correspond to a physical disk,
1311 * so it looks up based on physical sector numbers.
1312 */
1313static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1314 struct bio *bio)
1315{
4f024f37 1316 u64 physical = bio->bi_iter.bi_sector;
53b381b3 1317 int i;
4c664611 1318 struct btrfs_io_stripe *stripe;
53b381b3
DW
1319
1320 physical <<= 9;
1321
4c664611
QW
1322 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1323 stripe = &rbio->bioc->stripes[i];
83025863 1324 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
309dca30 1325 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
53b381b3
DW
1326 return i;
1327 }
1328 }
1329 return -1;
1330}
1331
1332/*
1333 * helper to find the stripe number for a given
1334 * bio (before mapping). Used to figure out which stripe has
1335 * failed. This looks up based on logical block numbers.
1336 */
1337static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1338 struct bio *bio)
1339{
1201b58b 1340 u64 logical = bio->bi_iter.bi_sector << 9;
53b381b3
DW
1341 int i;
1342
53b381b3 1343 for (i = 0; i < rbio->nr_data; i++) {
4c664611 1344 u64 stripe_start = rbio->bioc->raid_map[i];
83025863
NB
1345
1346 if (in_range(logical, stripe_start, rbio->stripe_len))
53b381b3 1347 return i;
53b381b3
DW
1348 }
1349 return -1;
1350}
1351
1352/*
1353 * returns -EIO if we had too many failures
1354 */
1355static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1356{
1357 unsigned long flags;
1358 int ret = 0;
1359
1360 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1361
1362 /* we already know this stripe is bad, move on */
1363 if (rbio->faila == failed || rbio->failb == failed)
1364 goto out;
1365
1366 if (rbio->faila == -1) {
1367 /* first failure on this rbio */
1368 rbio->faila = failed;
b89e1b01 1369 atomic_inc(&rbio->error);
53b381b3
DW
1370 } else if (rbio->failb == -1) {
1371 /* second failure on this rbio */
1372 rbio->failb = failed;
b89e1b01 1373 atomic_inc(&rbio->error);
53b381b3
DW
1374 } else {
1375 ret = -EIO;
1376 }
1377out:
1378 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1379
1380 return ret;
1381}
1382
1383/*
1384 * helper to fail a stripe based on a physical disk
1385 * bio.
1386 */
1387static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1388 struct bio *bio)
1389{
1390 int failed = find_bio_stripe(rbio, bio);
1391
1392 if (failed < 0)
1393 return -EIO;
1394
1395 return fail_rbio_index(rbio, failed);
1396}
1397
1398/*
1399 * this sets each page in the bio uptodate. It should only be used on private
1400 * rbio pages, nothing that comes in from the higher layers
1401 */
1402static void set_bio_pages_uptodate(struct bio *bio)
1403{
0198e5b7 1404 struct bio_vec *bvec;
6dc4f100 1405 struct bvec_iter_all iter_all;
6592e58c 1406
0198e5b7 1407 ASSERT(!bio_flagged(bio, BIO_CLONED));
53b381b3 1408
2b070cfe 1409 bio_for_each_segment_all(bvec, bio, iter_all)
0198e5b7 1410 SetPageUptodate(bvec->bv_page);
53b381b3
DW
1411}
1412
1413/*
1414 * end io for the read phase of the rmw cycle. All the bios here are physical
1415 * stripe bios we've read from the disk so we can recalculate the parity of the
1416 * stripe.
1417 *
1418 * This will usually kick off finish_rmw once all the bios are read in, but it
1419 * may trigger parity reconstruction if we had any errors along the way
1420 */
4246a0b6 1421static void raid_rmw_end_io(struct bio *bio)
53b381b3
DW
1422{
1423 struct btrfs_raid_bio *rbio = bio->bi_private;
1424
4e4cbee9 1425 if (bio->bi_status)
53b381b3
DW
1426 fail_bio_stripe(rbio, bio);
1427 else
1428 set_bio_pages_uptodate(bio);
1429
1430 bio_put(bio);
1431
b89e1b01 1432 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
1433 return;
1434
4c664611 1435 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
53b381b3
DW
1436 goto cleanup;
1437
1438 /*
1439 * this will normally call finish_rmw to start our write
1440 * but if there are any failed stripes we'll reconstruct
1441 * from parity first
1442 */
1443 validate_rbio_for_rmw(rbio);
1444 return;
1445
1446cleanup:
1447
58efbc9f 1448 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
1449}
1450
53b381b3
DW
1451/*
1452 * the stripe must be locked by the caller. It will
1453 * unlock after all the writes are done
1454 */
1455static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1456{
1457 int bios_to_read = 0;
53b381b3
DW
1458 struct bio_list bio_list;
1459 int ret;
53b381b3
DW
1460 int pagenr;
1461 int stripe;
1462 struct bio *bio;
1463
1464 bio_list_init(&bio_list);
1465
1466 ret = alloc_rbio_pages(rbio);
1467 if (ret)
1468 goto cleanup;
1469
1470 index_rbio_pages(rbio);
1471
b89e1b01 1472 atomic_set(&rbio->error, 0);
53b381b3
DW
1473 /*
1474 * build a list of bios to read all the missing parts of this
1475 * stripe
1476 */
1477 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
915e2290 1478 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
1479 struct page *page;
1480 /*
1481 * we want to find all the pages missing from
1482 * the rbio and read them from the disk. If
1483 * page_in_rbio finds a page in the bio list
1484 * we don't need to read it off the stripe.
1485 */
1486 page = page_in_rbio(rbio, stripe, pagenr, 1);
1487 if (page)
1488 continue;
1489
1490 page = rbio_stripe_page(rbio, stripe, pagenr);
4ae10b3a
CM
1491 /*
1492 * the bio cache may have handed us an uptodate
1493 * page. If so, be happy and use it
1494 */
1495 if (PageUptodate(page))
1496 continue;
1497
53b381b3
DW
1498 ret = rbio_add_io_page(rbio, &bio_list, page,
1499 stripe, pagenr, rbio->stripe_len);
1500 if (ret)
1501 goto cleanup;
1502 }
1503 }
1504
1505 bios_to_read = bio_list_size(&bio_list);
1506 if (!bios_to_read) {
1507 /*
1508 * this can happen if others have merged with
1509 * us, it means there is nothing left to read.
1510 * But if there are missing devices it may not be
1511 * safe to do the full stripe write yet.
1512 */
1513 goto finish;
1514 }
1515
1516 /*
4c664611
QW
1517 * The bioc may be freed once we submit the last bio. Make sure not to
1518 * touch it after that.
53b381b3 1519 */
b89e1b01 1520 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 1521 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
1522 bio->bi_private = rbio;
1523 bio->bi_end_io = raid_rmw_end_io;
ebcc3263 1524 bio->bi_opf = REQ_OP_READ;
53b381b3 1525
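		/* run the completion handler in the RAID56 endio workqueue */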
6a258d72 1526 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 1527
4e49ea4a 1528 submit_bio(bio);
53b381b3
DW
1529 }
1530 /* the actual write will happen once the reads are done */
1531 return 0;
1532
1533cleanup:
58efbc9f 1534 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1535
1536 while ((bio = bio_list_pop(&bio_list)))
1537 bio_put(bio);
1538
53b381b3
DW
1539 return -EIO;
1540
1541finish:
1542 validate_rbio_for_rmw(rbio);
1543 return 0;
1544}
1545
1546/*
1547 * if the upper layers pass in a full stripe, we thank them by only allocating
1548 * enough pages to hold the parity, and sending it all down quickly.
1549 */
1550static int full_stripe_write(struct btrfs_raid_bio *rbio)
1551{
1552 int ret;
1553
1554 ret = alloc_rbio_parity_pages(rbio);
3cd846d1
MX
1555 if (ret) {
1556 __free_raid_bio(rbio);
53b381b3 1557 return ret;
3cd846d1 1558 }
53b381b3
DW
1559
1560 ret = lock_stripe_add(rbio);
1561 if (ret == 0)
1562 finish_rmw(rbio);
1563 return 0;
1564}
1565
1566/*
1567 * partial stripe writes get handed over to async helpers.
1568 * We're really hoping to merge a few more writes into this
1569 * rbio before calculating new parity
1570 */
1571static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1572{
1573 int ret;
1574
1575 ret = lock_stripe_add(rbio);
1576 if (ret == 0)
cf6a4a75 1577 start_async_work(rbio, rmw_work);
53b381b3
DW
1578 return 0;
1579}
1580
/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
1587static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1588{
1589 /* head off into rmw land if we don't have a full stripe */
1590 if (!rbio_is_full(rbio))
1591 return partial_stripe_write(rbio);
1592 return full_stripe_write(rbio);
1593}
1594
6ac0f488
CM
1595/*
1596 * We use plugging call backs to collect full stripes.
1597 * Any time we get a partial stripe write while plugged
1598 * we collect it into a list. When the unplug comes down,
1599 * we sort the list by logical block number and merge
1600 * everything we can into the same rbios
1601 */
1602struct btrfs_plug_cb {
1603 struct blk_plug_cb cb;
1604 struct btrfs_fs_info *info;
1605 struct list_head rbio_list;
1606 struct btrfs_work work;
1607};
1608
1609/*
1610 * rbios on the plug list are sorted for easier merging.
1611 */
4f0f586b
ST
1612static int plug_cmp(void *priv, const struct list_head *a,
1613 const struct list_head *b)
6ac0f488 1614{
214cc184
DS
1615 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1616 plug_list);
1617 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1618 plug_list);
4f024f37
KO
1619 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1620 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
6ac0f488
CM
1621
1622 if (a_sector < b_sector)
1623 return -1;
1624 if (a_sector > b_sector)
1625 return 1;
1626 return 0;
1627}
1628
1629static void run_plug(struct btrfs_plug_cb *plug)
1630{
1631 struct btrfs_raid_bio *cur;
1632 struct btrfs_raid_bio *last = NULL;
1633
1634 /*
1635 * sort our plug list then try to merge
1636 * everything we can in hopes of creating full
1637 * stripes.
1638 */
1639 list_sort(NULL, &plug->rbio_list, plug_cmp);
1640 while (!list_empty(&plug->rbio_list)) {
1641 cur = list_entry(plug->rbio_list.next,
1642 struct btrfs_raid_bio, plug_list);
1643 list_del_init(&cur->plug_list);
1644
1645 if (rbio_is_full(cur)) {
c7b562c5
DS
1646 int ret;
1647
6ac0f488 1648 /* we have a full stripe, send it down */
c7b562c5
DS
1649 ret = full_stripe_write(cur);
1650 BUG_ON(ret);
6ac0f488
CM
1651 continue;
1652 }
1653 if (last) {
1654 if (rbio_can_merge(last, cur)) {
1655 merge_rbio(last, cur);
1656 __free_raid_bio(cur);
1657 continue;
1658
1659 }
1660 __raid56_parity_write(last);
1661 }
1662 last = cur;
1663 }
1664 if (last) {
1665 __raid56_parity_write(last);
1666 }
1667 kfree(plug);
1668}
1669
1670/*
1671 * if the unplug comes from schedule, we have to push the
1672 * work off to a helper thread
1673 */
1674static void unplug_work(struct btrfs_work *work)
1675{
1676 struct btrfs_plug_cb *plug;
1677 plug = container_of(work, struct btrfs_plug_cb, work);
1678 run_plug(plug);
1679}
1680
1681static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1682{
1683 struct btrfs_plug_cb *plug;
1684 plug = container_of(cb, struct btrfs_plug_cb, cb);
1685
1686 if (from_schedule) {
a0cac0ec 1687 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
d05a33ac
QW
1688 btrfs_queue_work(plug->info->rmw_workers,
1689 &plug->work);
6ac0f488
CM
1690 return;
1691 }
1692 run_plug(plug);
1693}
1694
53b381b3
DW
1695/*
1696 * our main entry point for writes from the rest of the FS.
1697 */
6a258d72
QW
1698int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
1699 u64 stripe_len)
53b381b3 1700{
6a258d72 1701 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 1702 struct btrfs_raid_bio *rbio;
6ac0f488
CM
1703 struct btrfs_plug_cb *plug = NULL;
1704 struct blk_plug_cb *cb;
4245215d 1705 int ret;
53b381b3 1706
4c664611 1707 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 1708 if (IS_ERR(rbio)) {
4c664611 1709 btrfs_put_bioc(bioc);
53b381b3 1710 return PTR_ERR(rbio);
af8e2d1d 1711 }
53b381b3 1712 bio_list_add(&rbio->bio_list, bio);
4f024f37 1713 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1b94b556 1714 rbio->operation = BTRFS_RBIO_WRITE;
6ac0f488 1715
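	/*
	 * Hold the fs-wide bio counter (dev-replace waits on it) until this
	 * rbio finishes.
	 */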
0b246afa 1716 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
1717 rbio->generic_bio_cnt = 1;
1718
6ac0f488
CM
1719 /*
1720 * don't plug on full rbios, just get them out the door
1721 * as quickly as we can
1722 */
4245215d
MX
1723 if (rbio_is_full(rbio)) {
1724 ret = full_stripe_write(rbio);
1725 if (ret)
0b246afa 1726 btrfs_bio_counter_dec(fs_info);
4245215d
MX
1727 return ret;
1728 }
6ac0f488 1729
0b246afa 1730 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
6ac0f488
CM
1731 if (cb) {
1732 plug = container_of(cb, struct btrfs_plug_cb, cb);
1733 if (!plug->info) {
0b246afa 1734 plug->info = fs_info;
6ac0f488
CM
1735 INIT_LIST_HEAD(&plug->rbio_list);
1736 }
1737 list_add_tail(&rbio->plug_list, &plug->rbio_list);
4245215d 1738 ret = 0;
6ac0f488 1739 } else {
4245215d
MX
1740 ret = __raid56_parity_write(rbio);
1741 if (ret)
0b246afa 1742 btrfs_bio_counter_dec(fs_info);
6ac0f488 1743 }
4245215d 1744 return ret;
53b381b3
DW
1745}
1746
1747/*
1748 * all parity reconstruction happens here. We've read in everything
1749 * we can find from the drives and this does the heavy lifting of
1750 * sorting the good from the bad.
1751 */
1752static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1753{
1754 int pagenr, stripe;
1755 void **pointers;
94a0b58d 1756 void **unmap_array;
53b381b3 1757 int faila = -1, failb = -1;
53b381b3 1758 struct page *page;
58efbc9f 1759 blk_status_t err;
53b381b3
DW
1760 int i;
1761
31e818fe 1762 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
53b381b3 1763 if (!pointers) {
58efbc9f 1764 err = BLK_STS_RESOURCE;
53b381b3
DW
1765 goto cleanup_io;
1766 }
1767
94a0b58d
IW
1768 /*
1769 * Store copy of pointers that does not get reordered during
1770 * reconstruction so that kunmap_local works.
1771 */
1772 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1773 if (!unmap_array) {
1774 err = BLK_STS_RESOURCE;
1775 goto cleanup_pointers;
1776 }
1777
53b381b3
DW
1778 faila = rbio->faila;
1779 failb = rbio->failb;
1780
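	/*
	 * For read rebuilds, block further merges into this rbio so the
	 * bio list stays stable while we reconstruct.
	 */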
b4ee1782
OS
1781 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1782 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
53b381b3
DW
1783 spin_lock_irq(&rbio->bio_list_lock);
1784 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1785 spin_unlock_irq(&rbio->bio_list_lock);
1786 }
1787
1788 index_rbio_pages(rbio);
1789
915e2290 1790 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
5a6ac9ea
MX
1791 /*
1792 * Now we just use bitmap to mark the horizontal stripes in
1793 * which we have data when doing parity scrub.
1794 */
1795 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1796 !test_bit(pagenr, rbio->dbitmap))
1797 continue;
1798
94a0b58d
IW
1799 /*
1800 * Setup our array of pointers with pages from each stripe
1801 *
1802 * NOTE: store a duplicate array of pointers to preserve the
1803 * pointer order
53b381b3 1804 */
2c8cdd6e 1805 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
53b381b3
DW
1806 /*
1807 * if we're rebuilding a read, we have to use
1808 * pages from the bio list
1809 */
b4ee1782
OS
1810 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1811 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
53b381b3
DW
1812 (stripe == faila || stripe == failb)) {
1813 page = page_in_rbio(rbio, stripe, pagenr, 0);
1814 } else {
1815 page = rbio_stripe_page(rbio, stripe, pagenr);
1816 }
94a0b58d
IW
1817 pointers[stripe] = kmap_local_page(page);
1818 unmap_array[stripe] = pointers[stripe];
53b381b3
DW
1819 }
1820
1821 /* all raid6 handling here */
4c664611 1822 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
53b381b3
DW
1823 /*
1824 * single failure, rebuild from parity raid5
1825 * style
1826 */
1827 if (failb < 0) {
1828 if (faila == rbio->nr_data) {
1829 /*
1830 * Just the P stripe has failed, without
1831 * a bad data or Q stripe.
1832 * TODO, we should redo the xor here.
1833 */
58efbc9f 1834 err = BLK_STS_IOERR;
53b381b3
DW
1835 goto cleanup;
1836 }
1837 /*
1838 * a single failure in raid6 is rebuilt
1839 * in the pstripe code below
1840 */
1841 goto pstripe;
1842 }
1843
1844 /* make sure our ps and qs are in order */
b7d2083a
NB
1845 if (faila > failb)
1846 swap(faila, failb);
53b381b3
DW
1847
 1848	 /* if the q stripe has failed, do a pstripe reconstruction
 1849	 * from the xors.
 1850	 * If both the q stripe and the P stripe have failed, we're
 1851	 * here due to a crc mismatch and we can't give them the
 1852	 * data they want.
1853 */
4c664611
QW
1854 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1855 if (rbio->bioc->raid_map[faila] ==
8e5cfb55 1856 RAID5_P_STRIPE) {
58efbc9f 1857 err = BLK_STS_IOERR;
53b381b3
DW
1858 goto cleanup;
1859 }
1860 /*
1861 * otherwise we have one bad data stripe and
1862 * a good P stripe. raid5!
1863 */
1864 goto pstripe;
1865 }
1866
4c664611 1867 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2c8cdd6e 1868 raid6_datap_recov(rbio->real_stripes,
53b381b3
DW
1869 PAGE_SIZE, faila, pointers);
1870 } else {
2c8cdd6e 1871 raid6_2data_recov(rbio->real_stripes,
53b381b3
DW
1872 PAGE_SIZE, faila, failb,
1873 pointers);
1874 }
1875 } else {
1876 void *p;
1877
1878 /* rebuild from P stripe here (raid5 or raid6) */
1879 BUG_ON(failb != -1);
1880pstripe:
1881 /* Copy parity block into failed block to start with */
69d24804 1882 copy_page(pointers[faila], pointers[rbio->nr_data]);
53b381b3
DW
1883
1884 /* rearrange the pointer array */
1885 p = pointers[faila];
1886 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1887 pointers[stripe] = pointers[stripe + 1];
1888 pointers[rbio->nr_data - 1] = p;
1889
1890 /* xor in the rest */
09cbfeaf 1891 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
53b381b3
DW
1892 }
1893 /* if we're doing this rebuild as part of an rmw, go through
1894 * and set all of our private rbio pages in the
1895 * failed stripes as uptodate. This way finish_rmw will
1896 * know they can be trusted. If this was a read reconstruction,
1897 * other endio functions will fiddle the uptodate bits
1898 */
1b94b556 1899 if (rbio->operation == BTRFS_RBIO_WRITE) {
915e2290 1900 for (i = 0; i < rbio->stripe_npages; i++) {
53b381b3
DW
1901 if (faila != -1) {
1902 page = rbio_stripe_page(rbio, faila, i);
1903 SetPageUptodate(page);
1904 }
1905 if (failb != -1) {
1906 page = rbio_stripe_page(rbio, failb, i);
1907 SetPageUptodate(page);
1908 }
1909 }
1910 }
94a0b58d
IW
1911 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
1912 kunmap_local(unmap_array[stripe]);
53b381b3
DW
1913 }
1914
58efbc9f 1915 err = BLK_STS_OK;
53b381b3 1916cleanup:
94a0b58d
IW
1917 kfree(unmap_array);
1918cleanup_pointers:
53b381b3
DW
1919 kfree(pointers);
1920
1921cleanup_io:
580c6efa
LB
1922 /*
1923 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
 1924	 * valid rbio which is consistent with the on-disk content, thus such a
1925 * valid rbio can be cached to avoid further disk reads.
1926 */
1927 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1928 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
44ac474d
LB
1929 /*
1930 * - In case of two failures, where rbio->failb != -1:
1931 *
1932 * Do not cache this rbio since the above read reconstruction
1933 * (raid6_datap_recov() or raid6_2data_recov()) may have
 1934	 * changed the content of some stripes so that they no longer match
 1935	 * the on-disk content. If such an rbio were cached, a later
 1936	 * write/recover could steal its stripe_pages and end up with
 1937	 * corruptions or rebuild failures.
1938 *
1939 * - In case of single failure, where rbio->failb == -1:
1940 *
1941 * Cache this rbio iff the above read reconstruction is
52042d8e 1942 * executed without problems.
44ac474d
LB
1943 */
1944 if (err == BLK_STS_OK && rbio->failb < 0)
4ae10b3a
CM
1945 cache_rbio_pages(rbio);
1946 else
1947 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1948
4246a0b6 1949 rbio_orig_end_io(rbio, err);
58efbc9f 1950 } else if (err == BLK_STS_OK) {
53b381b3
DW
1951 rbio->faila = -1;
1952 rbio->failb = -1;
5a6ac9ea
MX
1953
1954 if (rbio->operation == BTRFS_RBIO_WRITE)
1955 finish_rmw(rbio);
1956 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1957 finish_parity_scrub(rbio, 0);
1958 else
1959 BUG();
53b381b3 1960 } else {
4246a0b6 1961 rbio_orig_end_io(rbio, err);
53b381b3
DW
1962 }
1963}
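/*
 * A minimal user-space sketch of the single-failure rebuild path above
 * (copy P into the failed slot, rotate the failed slot to the end, xor in
 * the surviving data).  The buffer size, the byte-wise xor and the helper
 * name are illustrative stand-ins for PAGE_SIZE, run_xor() and the rbio
 * machinery, not kernel API.
 */
#include <string.h>

#define SKETCH_BLOCK 4096

static void sketch_rebuild_from_p(void **pointers, int nr_data, int faila)
{
	unsigned char *dest;
	void *p;
	int stripe;
	size_t i;

	/* start with a copy of the parity block, stored right after the data */
	memcpy(pointers[faila], pointers[nr_data], SKETCH_BLOCK);

	/* rotate the failed slot to the end so the good data is contiguous */
	p = pointers[faila];
	for (stripe = faila; stripe < nr_data - 1; stripe++)
		pointers[stripe] = pointers[stripe + 1];
	pointers[nr_data - 1] = p;

	/* xor the surviving data blocks into the parity copy to rebuild it */
	dest = pointers[nr_data - 1];
	for (stripe = 0; stripe < nr_data - 1; stripe++)
		for (i = 0; i < SKETCH_BLOCK; i++)
			dest[i] ^= ((unsigned char *)pointers[stripe])[i];
}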
1964
1965/*
1966 * This is called only for stripes we've read from disk to
1967 * reconstruct the parity.
1968 */
4246a0b6 1969static void raid_recover_end_io(struct bio *bio)
53b381b3
DW
1970{
1971 struct btrfs_raid_bio *rbio = bio->bi_private;
1972
1973 /*
 1974	 * we only read stripe pages off the disk, so set them
1975 * up to date if there were no errors
1976 */
4e4cbee9 1977 if (bio->bi_status)
53b381b3
DW
1978 fail_bio_stripe(rbio, bio);
1979 else
1980 set_bio_pages_uptodate(bio);
1981 bio_put(bio);
1982
b89e1b01 1983 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
1984 return;
1985
4c664611 1986 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
58efbc9f 1987 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
1988 else
1989 __raid_recover_end_io(rbio);
1990}
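/*
 * The completion accounting in the end_io handler above, reduced to a
 * stand-alone sketch with C11 atomics in place of the kernel's atomic_t
 * helpers; the struct and function names are made up for illustration,
 * and failures are counted per bio rather than per stripe.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct sketch_rbio {
	atomic_int stripes_pending;	/* one count per submitted read bio */
	atomic_int error;		/* reads that failed */
	int max_errors;			/* tolerated failures: 1 for raid5, 2 for raid6 */
};

static void sketch_end_with_error(struct sketch_rbio *rbio) { (void)rbio; }
static void sketch_recover(struct sketch_rbio *rbio) { (void)rbio; }

static void sketch_read_bio_done(struct sketch_rbio *rbio, bool bio_failed)
{
	if (bio_failed)
		atomic_fetch_add(&rbio->error, 1);

	/* only the last completing bio proceeds (atomic_dec_and_test) */
	if (atomic_fetch_sub(&rbio->stripes_pending, 1) != 1)
		return;

	if (atomic_load(&rbio->error) > rbio->max_errors)
		sketch_end_with_error(rbio);	/* too many failures to rebuild */
	else
		sketch_recover(rbio);		/* run the reconstruction */
}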
1991
1992/*
1993 * reads everything we need off the disk to reconstruct
1994 * the parity. endio handlers trigger final reconstruction
1995 * when the IO is done.
1996 *
1997 * This is used both for reads from the higher layers and for
1998 * parity construction required to finish a rmw cycle.
1999 */
2000static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2001{
2002 int bios_to_read = 0;
53b381b3
DW
2003 struct bio_list bio_list;
2004 int ret;
53b381b3
DW
2005 int pagenr;
2006 int stripe;
2007 struct bio *bio;
2008
2009 bio_list_init(&bio_list);
2010
2011 ret = alloc_rbio_pages(rbio);
2012 if (ret)
2013 goto cleanup;
2014
b89e1b01 2015 atomic_set(&rbio->error, 0);
53b381b3
DW
2016
2017 /*
4ae10b3a
CM
2018 * read everything that hasn't failed. Thanks to the
2019 * stripe cache, it is possible that some or all of these
2020 * pages are going to be uptodate.
53b381b3 2021 */
2c8cdd6e 2022 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
5588383e 2023 if (rbio->faila == stripe || rbio->failb == stripe) {
b89e1b01 2024 atomic_inc(&rbio->error);
53b381b3 2025 continue;
5588383e 2026 }
53b381b3 2027
915e2290 2028 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
2029 struct page *p;
2030
2031 /*
2032 * the rmw code may have already read this
2033 * page in
2034 */
2035 p = rbio_stripe_page(rbio, stripe, pagenr);
2036 if (PageUptodate(p))
2037 continue;
2038
2039 ret = rbio_add_io_page(rbio, &bio_list,
2040 rbio_stripe_page(rbio, stripe, pagenr),
2041 stripe, pagenr, rbio->stripe_len);
2042 if (ret < 0)
2043 goto cleanup;
2044 }
2045 }
2046
2047 bios_to_read = bio_list_size(&bio_list);
2048 if (!bios_to_read) {
2049 /*
2050 * we might have no bios to read just because the pages
2051 * were up to date, or we might have no bios to read because
2052 * the devices were gone.
2053 */
4c664611 2054 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
53b381b3 2055 __raid_recover_end_io(rbio);
813f8a0e 2056 return 0;
53b381b3
DW
2057 } else {
2058 goto cleanup;
2059 }
2060 }
2061
2062 /*
4c664611
QW
2063 * The bioc may be freed once we submit the last bio. Make sure not to
2064 * touch it after that.
53b381b3 2065 */
b89e1b01 2066 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2067 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
2068 bio->bi_private = rbio;
2069 bio->bi_end_io = raid_recover_end_io;
ebcc3263 2070 bio->bi_opf = REQ_OP_READ;
53b381b3 2071
6a258d72 2072 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 2073
4e49ea4a 2074 submit_bio(bio);
53b381b3 2075 }
813f8a0e 2076
53b381b3
DW
2077 return 0;
2078
2079cleanup:
b4ee1782
OS
2080 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2081 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
58efbc9f 2082 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2083
2084 while ((bio = bio_list_pop(&bio_list)))
2085 bio_put(bio);
2086
53b381b3
DW
2087 return -EIO;
2088}
2089
2090/*
2091 * the main entry point for reads from the higher layers. This
2092 * is really only called when the normal read path had a failure,
2093 * so we assume the bio they send down corresponds to a failed part
2094 * of the drive.
2095 */
6a258d72
QW
2096int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2097 u64 stripe_len, int mirror_num, int generic_io)
53b381b3 2098{
6a258d72 2099 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3
DW
2100 struct btrfs_raid_bio *rbio;
2101 int ret;
2102
abad60c6 2103 if (generic_io) {
4c664611 2104 ASSERT(bioc->mirror_num == mirror_num);
c3a3b19b 2105 btrfs_bio(bio)->mirror_num = mirror_num;
abad60c6
LB
2106 }
2107
4c664611 2108 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 2109 if (IS_ERR(rbio)) {
6e9606d2 2110 if (generic_io)
4c664611 2111 btrfs_put_bioc(bioc);
53b381b3 2112 return PTR_ERR(rbio);
af8e2d1d 2113 }
53b381b3 2114
1b94b556 2115 rbio->operation = BTRFS_RBIO_READ_REBUILD;
53b381b3 2116 bio_list_add(&rbio->bio_list, bio);
4f024f37 2117 rbio->bio_list_bytes = bio->bi_iter.bi_size;
53b381b3
DW
2118
2119 rbio->faila = find_logical_bio_stripe(rbio, bio);
2120 if (rbio->faila == -1) {
0b246afa 2121 btrfs_warn(fs_info,
4c664611 2122"%s could not find the bad stripe in raid56, so we cannot recover it (bio has logical %llu len %llu, bioc has map_type %llu)",
1201b58b 2123 __func__, bio->bi_iter.bi_sector << 9,
4c664611 2124 (u64)bio->bi_iter.bi_size, bioc->map_type);
6e9606d2 2125 if (generic_io)
4c664611 2126 btrfs_put_bioc(bioc);
53b381b3
DW
2127 kfree(rbio);
2128 return -EIO;
2129 }
2130
4245215d 2131 if (generic_io) {
0b246afa 2132 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
2133 rbio->generic_bio_cnt = 1;
2134 } else {
4c664611 2135 btrfs_get_bioc(bioc);
4245215d
MX
2136 }
2137
53b381b3 2138 /*
8810f751
LB
2139 * Loop retry:
 2140	 * for 'mirror_num == 2', reconstruct from all other stripes.
 2141	 * for 'mirror_num > 2', select an extra stripe to fail on every retry.
53b381b3 2142 */
8810f751
LB
2143 if (mirror_num > 2) {
2144 /*
 2145	 * 'mirror_num == 3' is to fail the p stripe and
 2146	 * reconstruct from the q stripe. 'mirror_num > 3' is to
 2147	 * fail a data stripe and reconstruct from the p+q stripes.
2148 */
2149 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2150 ASSERT(rbio->failb > 0);
2151 if (rbio->failb <= rbio->faila)
2152 rbio->failb--;
2153 }
53b381b3
DW
2154
2155 ret = lock_stripe_add(rbio);
2156
2157 /*
2158 * __raid56_parity_recover will end the bio with
2159 * any errors it hits. We don't want to return
2160 * its error value up the stack because our caller
2161 * will end up calling bio_endio with any nonzero
2162 * return
2163 */
2164 if (ret == 0)
2165 __raid56_parity_recover(rbio);
2166 /*
2167 * our rbio has been added to the list of
2168 * rbios that will be handled after the
 2169	 * current lock owner is done
2170 */
2171 return 0;
2172
2173}
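/*
 * Stand-alone sketch of the mirror_num -> extra-failure mapping used in
 * raid56_parity_recover() above; the plain-int interface and the helper
 * name are illustrative.  For a 4-device RAID6 (real_stripes == 4) with
 * faila == 0: mirror_num 3 fails stripe 2 (P), mirror_num 4 fails the
 * remaining data stripe 1.
 */
static int sketch_extra_stripe_to_fail(int real_stripes, int faila, int mirror_num)
{
	int failb;

	if (mirror_num <= 2)
		return -1;	/* no extra failure, plain reconstruction */

	/* mirror_num 3 maps to P (real_stripes - 2), larger values walk down */
	failb = real_stripes - (mirror_num - 1);
	if (failb <= faila)
		failb--;	/* skip over the stripe that already failed */
	return failb;
}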
2174
2175static void rmw_work(struct btrfs_work *work)
2176{
2177 struct btrfs_raid_bio *rbio;
2178
2179 rbio = container_of(work, struct btrfs_raid_bio, work);
2180 raid56_rmw_stripe(rbio);
2181}
2182
2183static void read_rebuild_work(struct btrfs_work *work)
2184{
2185 struct btrfs_raid_bio *rbio;
2186
2187 rbio = container_of(work, struct btrfs_raid_bio, work);
2188 __raid56_parity_recover(rbio);
2189}
5a6ac9ea
MX
2190
2191/*
2192 * The following code is used to scrub/replace the parity stripe
2193 *
4c664611 2194 * Caller must have already increased bio_counter for getting @bioc.
ae6529c3 2195 *
5a6ac9ea
MX
 2196	 * Note: We must make sure all the pages added to the scrub/replace
 2197	 * raid bio are correct and will not change during the scrub/replace, i.e.
 2198	 * they only hold metadata or file data covered by a checksum.
2199 */
2200
6a258d72
QW
2201struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2202 struct btrfs_io_context *bioc,
2203 u64 stripe_len, struct btrfs_device *scrub_dev,
2204 unsigned long *dbitmap, int stripe_nsectors)
5a6ac9ea 2205{
6a258d72 2206 struct btrfs_fs_info *fs_info = bioc->fs_info;
5a6ac9ea
MX
2207 struct btrfs_raid_bio *rbio;
2208 int i;
2209
4c664611 2210 rbio = alloc_rbio(fs_info, bioc, stripe_len);
5a6ac9ea
MX
2211 if (IS_ERR(rbio))
2212 return NULL;
2213 bio_list_add(&rbio->bio_list, bio);
2214 /*
2215 * This is a special bio which is used to hold the completion handler
 2216	 * and make the scrub rbio similar to the other types
2217 */
2218 ASSERT(!bio->bi_iter.bi_size);
2219 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2220
9cd3a7eb 2221 /*
4c664611 2222 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
9cd3a7eb
LB
2223 * to the end position, so this search can start from the first parity
2224 * stripe.
2225 */
2226 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
4c664611 2227 if (bioc->stripes[i].dev == scrub_dev) {
5a6ac9ea
MX
2228 rbio->scrubp = i;
2229 break;
2230 }
2231 }
9cd3a7eb 2232 ASSERT(i < rbio->real_stripes);
5a6ac9ea
MX
2233
 2234	 /* For now we only support a sectorsize equal to the page size */
0b246afa 2235 ASSERT(fs_info->sectorsize == PAGE_SIZE);
5a6ac9ea
MX
2236 ASSERT(rbio->stripe_npages == stripe_nsectors);
2237 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2238
ae6529c3 2239 /*
4c664611 2240	 * We have already increased bio_counter when getting the bioc; record
ae6529c3
QW
 2241	 * that here so it can be dropped at rbio_orig_end_io().
2242 */
2243 rbio->generic_bio_cnt = 1;
2244
5a6ac9ea
MX
2245 return rbio;
2246}
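/*
 * Sketch of the scrubp lookup above: after the BTRFS_MAP_WRITE sorting the
 * parity stripes sit at the tail of the stripe array, so only indexes
 * >= nr_data are compared against the device being scrubbed.  The generic
 * pointer interface is an illustration, not the kernel types.
 */
static int sketch_find_scrubp(void * const *stripe_devs, int real_stripes,
			      int nr_data, const void *scrub_dev)
{
	int i;

	for (i = nr_data; i < real_stripes; i++)
		if (stripe_devs[i] == scrub_dev)
			return i;	/* index of the parity stripe on scrub_dev */
	return -1;			/* the caller asserts this cannot happen */
}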
2247
b4ee1782
OS
2248/* Used for both parity scrub and missing. */
2249void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2250 u64 logical)
5a6ac9ea
MX
2251{
2252 int stripe_offset;
2253 int index;
2254
4c664611
QW
2255 ASSERT(logical >= rbio->bioc->raid_map[0]);
2256 ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
5a6ac9ea 2257 rbio->stripe_len * rbio->nr_data);
4c664611 2258 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
09cbfeaf 2259 index = stripe_offset >> PAGE_SHIFT;
5a6ac9ea
MX
2260 rbio->bio_pages[index] = page;
2261}
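/*
 * Worked example of the index math above, assuming 4K pages: if
 * raid_map[0] is the logical address of the full stripe, a page at
 * raid_map[0] + 8192 gives stripe_offset 8192 and bio_pages index 2.
 */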
2262
2263/*
 2264	 * We only scrub the parity for the horizontal stripes where we have
 2265	 * correct data, so we don't need to allocate pages for every stripe.
2266 */
2267static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2268{
2269 int i;
2270 int bit;
2271 int index;
2272 struct page *page;
2273
2274 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2c8cdd6e 2275 for (i = 0; i < rbio->real_stripes; i++) {
5a6ac9ea
MX
2276 index = i * rbio->stripe_npages + bit;
2277 if (rbio->stripe_pages[index])
2278 continue;
2279
b0ee5e1e 2280 page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2281 if (!page)
2282 return -ENOMEM;
2283 rbio->stripe_pages[index] = page;
5a6ac9ea
MX
2284 }
2285 }
2286 return 0;
2287}
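/*
 * Worked example of the stripe_pages layout used above: the array is
 * stored row by row, one row of stripe_npages pages per stripe, so with
 * stripe_npages == 4 the page for bit 2 of stripe 1 sits at index
 * 1 * 4 + 2 == 6.
 */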
2288
5a6ac9ea
MX
2289static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2290 int need_check)
2291{
4c664611 2292 struct btrfs_io_context *bioc = rbio->bioc;
1389053e
KC
2293 void **pointers = rbio->finish_pointers;
2294 unsigned long *pbitmap = rbio->finish_pbitmap;
5a6ac9ea
MX
2295 int nr_data = rbio->nr_data;
2296 int stripe;
2297 int pagenr;
c17af965 2298 bool has_qstripe;
5a6ac9ea
MX
2299 struct page *p_page = NULL;
2300 struct page *q_page = NULL;
2301 struct bio_list bio_list;
2302 struct bio *bio;
76035976 2303 int is_replace = 0;
5a6ac9ea
MX
2304 int ret;
2305
2306 bio_list_init(&bio_list);
2307
c17af965
DS
2308 if (rbio->real_stripes - rbio->nr_data == 1)
2309 has_qstripe = false;
2310 else if (rbio->real_stripes - rbio->nr_data == 2)
2311 has_qstripe = true;
2312 else
5a6ac9ea 2313 BUG();
5a6ac9ea 2314
4c664611 2315 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
76035976
MX
2316 is_replace = 1;
2317 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2318 }
2319
5a6ac9ea
MX
2320 /*
 2321	 * The higher layers (the scrubber) are unlikely to
 2322	 * use this area of the disk again soon, so don't cache
2323 * it.
2324 */
2325 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2326
2327 if (!need_check)
2328 goto writeback;
2329
b0ee5e1e 2330 p_page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2331 if (!p_page)
2332 goto cleanup;
2333 SetPageUptodate(p_page);
2334
c17af965 2335 if (has_qstripe) {
d70cef0d 2336 /* RAID6, allocate and map temp space for the Q stripe */
b0ee5e1e 2337 q_page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2338 if (!q_page) {
2339 __free_page(p_page);
2340 goto cleanup;
2341 }
2342 SetPageUptodate(q_page);
94a0b58d 2343 pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
5a6ac9ea
MX
2344 }
2345
2346 atomic_set(&rbio->error, 0);
2347
d70cef0d 2348 /* Map the parity stripe just once */
94a0b58d 2349 pointers[nr_data] = kmap_local_page(p_page);
d70cef0d 2350
5a6ac9ea
MX
2351 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2352 struct page *p;
2353 void *parity;
2354 /* first collect one page from each data stripe */
2355 for (stripe = 0; stripe < nr_data; stripe++) {
2356 p = page_in_rbio(rbio, stripe, pagenr, 0);
94a0b58d 2357 pointers[stripe] = kmap_local_page(p);
5a6ac9ea
MX
2358 }
2359
c17af965 2360 if (has_qstripe) {
d70cef0d 2361 /* RAID6, call the library function to fill in our P/Q */
2c8cdd6e 2362 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
5a6ac9ea
MX
2363 pointers);
2364 } else {
2365 /* raid5 */
69d24804 2366 copy_page(pointers[nr_data], pointers[0]);
09cbfeaf 2367 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
5a6ac9ea
MX
2368 }
2369
01327610 2370	 /* Check the scrubbed parity and repair it if it doesn't match */
5a6ac9ea 2371 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
58c1a35c 2372 parity = kmap_local_page(p);
09cbfeaf 2373 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
69d24804 2374 copy_page(parity, pointers[rbio->scrubp]);
5a6ac9ea
MX
2375 else
 2376	 /* Parity is right, no need to write it back */
2377 bitmap_clear(rbio->dbitmap, pagenr, 1);
58c1a35c 2378 kunmap_local(parity);
5a6ac9ea 2379
94a0b58d
IW
2380 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2381 kunmap_local(pointers[stripe]);
5a6ac9ea
MX
2382 }
2383
94a0b58d 2384 kunmap_local(pointers[nr_data]);
5a6ac9ea 2385 __free_page(p_page);
d70cef0d 2386 if (q_page) {
94a0b58d 2387 kunmap_local(pointers[rbio->real_stripes - 1]);
5a6ac9ea 2388 __free_page(q_page);
d70cef0d 2389 }
5a6ac9ea
MX
2390
2391writeback:
2392 /*
2393 * time to start writing. Make bios for everything from the
2394 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2395 * everything else.
2396 */
2397 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2398 struct page *page;
2399
2400 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2401 ret = rbio_add_io_page(rbio, &bio_list,
2402 page, rbio->scrubp, pagenr, rbio->stripe_len);
2403 if (ret)
2404 goto cleanup;
2405 }
2406
76035976
MX
2407 if (!is_replace)
2408 goto submit_write;
2409
2410 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2411 struct page *page;
2412
2413 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2414 ret = rbio_add_io_page(rbio, &bio_list, page,
4c664611 2415 bioc->tgtdev_map[rbio->scrubp],
76035976
MX
2416 pagenr, rbio->stripe_len);
2417 if (ret)
2418 goto cleanup;
2419 }
2420
2421submit_write:
5a6ac9ea
MX
2422 nr_data = bio_list_size(&bio_list);
2423 if (!nr_data) {
2424 /* Every parity is right */
58efbc9f 2425 rbio_orig_end_io(rbio, BLK_STS_OK);
5a6ac9ea
MX
2426 return;
2427 }
2428
2429 atomic_set(&rbio->stripes_pending, nr_data);
2430
bf28a605 2431 while ((bio = bio_list_pop(&bio_list))) {
5a6ac9ea 2432 bio->bi_private = rbio;
a6111d11 2433 bio->bi_end_io = raid_write_end_io;
ebcc3263 2434 bio->bi_opf = REQ_OP_WRITE;
4e49ea4a
MC
2435
2436 submit_bio(bio);
5a6ac9ea
MX
2437 }
2438 return;
2439
2440cleanup:
58efbc9f 2441 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2442
2443 while ((bio = bio_list_pop(&bio_list)))
2444 bio_put(bio);
5a6ac9ea
MX
2445}
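/*
 * A user-space sketch of the raid5 branch of the check above: recompute P
 * from the data blocks, compare it with the parity read from disk and
 * overwrite the on-disk copy only when they differ (the kernel clears the
 * dbitmap bit instead when they match).  Buffer size and names are
 * illustrative.
 */
#include <string.h>

#define SKETCH_BLOCK 4096

/* returns 1 when the parity was wrong and has been rewritten */
static int sketch_check_raid5_parity(void **data, int nr_data, void *ondisk_parity)
{
	unsigned char computed[SKETCH_BLOCK];
	int stripe;
	size_t i;

	memcpy(computed, data[0], SKETCH_BLOCK);
	for (stripe = 1; stripe < nr_data; stripe++)
		for (i = 0; i < SKETCH_BLOCK; i++)
			computed[i] ^= ((unsigned char *)data[stripe])[i];

	if (memcmp(ondisk_parity, computed, SKETCH_BLOCK) == 0)
		return 0;	/* parity is right, no writeback needed */

	memcpy(ondisk_parity, computed, SKETCH_BLOCK);
	return 1;
}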
2446
2447static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2448{
2449 if (stripe >= 0 && stripe < rbio->nr_data)
2450 return 1;
2451 return 0;
2452}
2453
2454/*
 2455	 * While we're doing the parity check and repair, we could have errors
 2456	 * in reading pages off the disk. This checks for errors, and if we're
 2457	 * not able to read a page it triggers parity reconstruction. The
 2458	 * parity scrub will be finished after we've reconstructed the failed
 2459	 * stripes.
2460 */
2461static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2462{
4c664611 2463 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
5a6ac9ea
MX
2464 goto cleanup;
2465
2466 if (rbio->faila >= 0 || rbio->failb >= 0) {
2467 int dfail = 0, failp = -1;
2468
2469 if (is_data_stripe(rbio, rbio->faila))
2470 dfail++;
2471 else if (is_parity_stripe(rbio->faila))
2472 failp = rbio->faila;
2473
2474 if (is_data_stripe(rbio, rbio->failb))
2475 dfail++;
2476 else if (is_parity_stripe(rbio->failb))
2477 failp = rbio->failb;
2478
2479 /*
 2480	 * Because we cannot use the parity being scrubbed to repair
 2481	 * the data, the repair capability is reduced by one (hence
 2482	 * the max_errors - 1 below). For RAID5 we cannot repair anything.
2483 */
4c664611 2484 if (dfail > rbio->bioc->max_errors - 1)
5a6ac9ea
MX
2485 goto cleanup;
2486
2487 /*
 2488	 * If all the data is good and only the parity is bad, just
2489 * repair the parity.
2490 */
2491 if (dfail == 0) {
2492 finish_parity_scrub(rbio, 0);
2493 return;
2494 }
2495
2496 /*
 2497	 * Here we have one corrupted data stripe and one
 2498	 * corrupted parity on RAID6. If the corrupted parity
01327610 2499	 * is the one being scrubbed, we can use the other parity to
5a6ac9ea
MX
 2500	 * repair the data; otherwise we cannot repair the data stripe.
2501 */
2502 if (failp != rbio->scrubp)
2503 goto cleanup;
2504
2505 __raid_recover_end_io(rbio);
2506 } else {
2507 finish_parity_scrub(rbio, 1);
2508 }
2509 return;
2510
2511cleanup:
58efbc9f 2512 rbio_orig_end_io(rbio, BLK_STS_IOERR);
5a6ac9ea
MX
2513}
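/*
 * Condensed sketch of the decision logic above (the initial error-count
 * check is omitted), using plain ints for the stripe indexes with -1
 * meaning "no failure"; the enum and helper name are illustrative.  It
 * keeps the conservative bail-out when the failed parity is not the one
 * being scrubbed.
 */
enum sketch_scrub_action {
	SKETCH_SCRUB_GIVE_UP,		/* rbio_orig_end_io(rbio, BLK_STS_IOERR) */
	SKETCH_SCRUB_REPAIR_PARITY,	/* finish_parity_scrub() */
	SKETCH_SCRUB_RECOVER_DATA,	/* __raid_recover_end_io() */
};

static enum sketch_scrub_action sketch_scrub_decide(int faila, int failb,
						    int nr_data, int scrubp,
						    int max_errors)
{
	int dfail = 0, failp = -1;

	if (faila < 0 && failb < 0)
		return SKETCH_SCRUB_REPAIR_PARITY;	/* nothing failed, recheck parity */

	if (faila >= 0 && faila < nr_data)
		dfail++;
	else if (faila >= nr_data)
		failp = faila;

	if (failb >= 0 && failb < nr_data)
		dfail++;
	else if (failb >= nr_data)
		failp = failb;

	if (dfail > max_errors - 1)
		return SKETCH_SCRUB_GIVE_UP;	/* the scrubbed parity cannot help */
	if (dfail == 0)
		return SKETCH_SCRUB_REPAIR_PARITY;
	if (failp != scrubp)
		return SKETCH_SCRUB_GIVE_UP;
	return SKETCH_SCRUB_RECOVER_DATA;	/* rebuild data from the other parity */
}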
2514
2515/*
 2516	 * end io for the read phase of the scrub cycle. All the bios here are physical
 2517	 * stripe bios we've read from the disk so we can recalculate the parity of the
 2518	 * stripe.
 2519	 *
 2520	 * This will usually kick off finish_parity_scrub once all the bios are read in,
 2521	 * but it may trigger parity reconstruction if we had any errors along the way
2522 */
4246a0b6 2523static void raid56_parity_scrub_end_io(struct bio *bio)
5a6ac9ea
MX
2524{
2525 struct btrfs_raid_bio *rbio = bio->bi_private;
2526
4e4cbee9 2527 if (bio->bi_status)
5a6ac9ea
MX
2528 fail_bio_stripe(rbio, bio);
2529 else
2530 set_bio_pages_uptodate(bio);
2531
2532 bio_put(bio);
2533
2534 if (!atomic_dec_and_test(&rbio->stripes_pending))
2535 return;
2536
2537 /*
 2538	 * this will normally call finish_parity_scrub to start our write
2539 * but if there are any failed stripes we'll reconstruct
2540 * from parity first
2541 */
2542 validate_rbio_for_parity_scrub(rbio);
2543}
2544
2545static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2546{
2547 int bios_to_read = 0;
5a6ac9ea
MX
2548 struct bio_list bio_list;
2549 int ret;
2550 int pagenr;
2551 int stripe;
2552 struct bio *bio;
2553
785884fc
LB
2554 bio_list_init(&bio_list);
2555
5a6ac9ea
MX
2556 ret = alloc_rbio_essential_pages(rbio);
2557 if (ret)
2558 goto cleanup;
2559
5a6ac9ea
MX
2560 atomic_set(&rbio->error, 0);
2561 /*
2562 * build a list of bios to read all the missing parts of this
2563 * stripe
2564 */
2c8cdd6e 2565 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
5a6ac9ea
MX
2566 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2567 struct page *page;
2568 /*
2569 * we want to find all the pages missing from
2570 * the rbio and read them from the disk. If
2571 * page_in_rbio finds a page in the bio list
2572 * we don't need to read it off the stripe.
2573 */
2574 page = page_in_rbio(rbio, stripe, pagenr, 1);
2575 if (page)
2576 continue;
2577
2578 page = rbio_stripe_page(rbio, stripe, pagenr);
2579 /*
2580 * the bio cache may have handed us an uptodate
2581 * page. If so, be happy and use it
2582 */
2583 if (PageUptodate(page))
2584 continue;
2585
2586 ret = rbio_add_io_page(rbio, &bio_list, page,
2587 stripe, pagenr, rbio->stripe_len);
2588 if (ret)
2589 goto cleanup;
2590 }
2591 }
2592
2593 bios_to_read = bio_list_size(&bio_list);
2594 if (!bios_to_read) {
2595 /*
 2596	 * this can happen if others have merged with
 2597	 * us, meaning there is nothing left to read.
2598 * But if there are missing devices it may not be
2599 * safe to do the full stripe write yet.
2600 */
2601 goto finish;
2602 }
2603
2604 /*
4c664611
QW
2605 * The bioc may be freed once we submit the last bio. Make sure not to
2606 * touch it after that.
5a6ac9ea
MX
2607 */
2608 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2609 while ((bio = bio_list_pop(&bio_list))) {
5a6ac9ea
MX
2610 bio->bi_private = rbio;
2611 bio->bi_end_io = raid56_parity_scrub_end_io;
ebcc3263 2612 bio->bi_opf = REQ_OP_READ;
5a6ac9ea 2613
6a258d72 2614 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
5a6ac9ea 2615
4e49ea4a 2616 submit_bio(bio);
5a6ac9ea
MX
2617 }
2618 /* the actual write will happen once the reads are done */
2619 return;
2620
2621cleanup:
58efbc9f 2622 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2623
2624 while ((bio = bio_list_pop(&bio_list)))
2625 bio_put(bio);
2626
5a6ac9ea
MX
2627 return;
2628
2629finish:
2630 validate_rbio_for_parity_scrub(rbio);
2631}
2632
2633static void scrub_parity_work(struct btrfs_work *work)
2634{
2635 struct btrfs_raid_bio *rbio;
2636
2637 rbio = container_of(work, struct btrfs_raid_bio, work);
2638 raid56_parity_scrub_stripe(rbio);
2639}
2640
5a6ac9ea
MX
2641void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2642{
2643 if (!lock_stripe_add(rbio))
a81b747d 2644 start_async_work(rbio, scrub_parity_work);
5a6ac9ea 2645}
b4ee1782
OS
2646
2647/* The following code is used for dev replace of a missing RAID 5/6 device. */
2648
2649struct btrfs_raid_bio *
6a258d72
QW
2650raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2651 u64 length)
b4ee1782 2652{
6a258d72 2653 struct btrfs_fs_info *fs_info = bioc->fs_info;
b4ee1782
OS
2654 struct btrfs_raid_bio *rbio;
2655
4c664611 2656 rbio = alloc_rbio(fs_info, bioc, length);
b4ee1782
OS
2657 if (IS_ERR(rbio))
2658 return NULL;
2659
2660 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2661 bio_list_add(&rbio->bio_list, bio);
2662 /*
2663 * This is a special bio which is used to hold the completion handler
 2664	 * and make this rbio similar to the other types
2665 */
2666 ASSERT(!bio->bi_iter.bi_size);
2667
2668 rbio->faila = find_logical_bio_stripe(rbio, bio);
2669 if (rbio->faila == -1) {
2670 BUG();
2671 kfree(rbio);
2672 return NULL;
2673 }
2674
ae6529c3 2675 /*
4c664611 2676	 * When we get the bioc, we have already increased bio_counter; record
ae6529c3
QW
 2677	 * that here so it can be dropped at rbio_orig_end_io()
2678 */
2679 rbio->generic_bio_cnt = 1;
2680
b4ee1782
OS
2681 return rbio;
2682}
2683
b4ee1782
OS
2684void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2685{
2686 if (!lock_stripe_add(rbio))
e66d8d5a 2687 start_async_work(rbio, read_rebuild_work);
b4ee1782 2688}