btrfs: raid56: introduce btrfs_raid_bio::stripe_sectors
fs/btrfs/raid56.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */
c1d7c514 6
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
22
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE		1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11
40
41/* Used by the raid56 code to lock stripes for read/modify/write */
42struct btrfs_stripe_hash {
43 struct list_head hash_list;
44 spinlock_t lock;
45};
46
47/* Used by the raid56 code to lock stripes for read/modify/write */
48struct btrfs_stripe_hash_table {
49 struct list_head stripe_cache;
50 spinlock_t cache_lock;
51 int cache_size;
52 struct btrfs_stripe_hash table[];
53};
54
/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff;
};
64
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
71
53b381b3 72struct btrfs_raid_bio {
4c664611 73 struct btrfs_io_context *bioc;
53b381b3 74
53b381b3
DW
75 /* while we're doing rmw on a stripe
76 * we put it into a hash table so we can
77 * lock the stripe and merge more rbios
78 * into it.
79 */
80 struct list_head hash_list;
81
4ae10b3a
CM
82 /*
83 * LRU list for the stripe cache
84 */
85 struct list_head stripe_cache;
86
53b381b3
DW
87 /*
88 * for scheduling work in the helper threads
89 */
90 struct btrfs_work work;
91
92 /*
93 * bio list and bio_list_lock are used
94 * to add more bios into the stripe
95 * in hopes of avoiding the full rmw
96 */
97 struct bio_list bio_list;
98 spinlock_t bio_list_lock;
99
6ac0f488
CM
	/*
	 * Also protected by the bio_list_lock, the plug list is used by
	 * the plugging code to collect partial bios while plugged. The
	 * stripe locking code also uses it to hand off the stripe lock
	 * to the next pending IO.
	 */
106 struct list_head plug_list;
107
108 /*
109 * flags that tell us if it is safe to
110 * merge with this bio
111 */
112 unsigned long flags;
113
53b381b3
DW
114 /*
115 * set if we're doing a parity rebuild
116 * for a read from higher up, which is handled
117 * differently from a parity rebuild as part of
118 * rmw
119 */
1b94b556 120 enum btrfs_rbio_ops operation;
53b381b3 121
29b06838
QW
122 /* Size of each individual stripe on disk */
123 u32 stripe_len;
53b381b3 124
29b06838
QW
125 /* How many pages there are for the full stripe including P/Q */
126 u16 nr_pages;
53b381b3 127
94efbe19
QW
128 /* How many sectors there are for the full stripe including P/Q */
129 u16 nr_sectors;
130
29b06838
QW
131 /* Number of data stripes (no p/q) */
132 u8 nr_data;
133
	/* Number of all stripes (including P/Q) */
135 u8 real_stripes;
136
137 /* How many pages there are for each stripe */
138 u8 stripe_npages;
139
94efbe19
QW
140 /* How many sectors there are for each stripe */
141 u8 stripe_nsectors;
142
29b06838
QW
143 /* First bad stripe, -1 means no corruption */
144 s8 faila;
145
146 /* Second bad stripe (for RAID6 use) */
147 s8 failb;
148
149 /* Stripe number that we're scrubbing */
150 u8 scrubp;
53b381b3
DW
151
152 /*
153 * size of all the bios in the bio_list. This
154 * helps us decide if the rbio maps to a full
155 * stripe or not
156 */
157 int bio_list_bytes;
158
4245215d
MX
159 int generic_bio_cnt;
160
dec95574 161 refcount_t refs;
53b381b3 162
b89e1b01
MX
163 atomic_t stripes_pending;
164
165 atomic_t error;
53b381b3
DW
	/*
	 * These are arrays of pointers. We allocate the rbio big enough
	 * to hold them all and set up their locations when the rbio is
	 * allocated.
	 */
171
172 /* pointers to pages that we allocated for
173 * reading/writing stripes directly from the disk (including P/Q)
174 */
175 struct page **stripe_pages;
176
177 /*
178 * pointers to the pages in the bio_list. Stored
179 * here for faster lookup
180 */
181 struct page **bio_pages;
5a6ac9ea
MX
182
	/*
	 * For subpage support, we need to map each sector to the page in
	 * stripe_pages it lives in.
	 */
	struct sector_ptr *stripe_sectors;
188
189 /* Bitmap to record which horizontal stripe has data */
5a6ac9ea 190 unsigned long *dbitmap;
1389053e
KC
191
192 /* allocated with real_stripes-many pointers for finish_*() calls */
193 void **finish_pointers;
194
94efbe19 195 /* Allocated with stripe_nsectors-many bits for finish_*() calls */
1389053e 196 unsigned long *finish_pbitmap;
53b381b3
DW
197};
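
/*
 * Geometry notes, derived from alloc_rbio() below (the example numbers are
 * illustrative only, not fixed by this code):
 *
 *   real_stripes    = bioc->num_stripes - bioc->num_tgtdevs
 *   stripe_npages   = stripe_len >> PAGE_SHIFT
 *   stripe_nsectors = stripe_len >> sectorsize_bits
 *   nr_pages        = stripe_npages * real_stripes
 *   nr_sectors      = stripe_nsectors * real_stripes
 *   nr_data         = real_stripes - 1 (RAID5) or real_stripes - 2 (RAID6)
 *
 * E.g. assuming a 64K stripe_len, 4K pages and a 4K sectorsize on a
 * 4-device RAID5 full stripe: real_stripes = 4, stripe_npages =
 * stripe_nsectors = 16, nr_pages = nr_sectors = 64 and nr_data = 3.
 */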
198
199static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
200static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
201static void rmw_work(struct btrfs_work *work);
202static void read_rebuild_work(struct btrfs_work *work);
53b381b3
DW
203static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
204static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
205static void __free_raid_bio(struct btrfs_raid_bio *rbio);
206static void index_rbio_pages(struct btrfs_raid_bio *rbio);
207static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
208
5a6ac9ea
MX
209static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
210 int need_check);
a81b747d 211static void scrub_parity_work(struct btrfs_work *work);
5a6ac9ea 212
ac638859
DS
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
	btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
218
53b381b3
DW
219/*
220 * the stripe hash table is used for locking, and to collect
221 * bios in hopes of making a full stripe
222 */
223int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
224{
225 struct btrfs_stripe_hash_table *table;
226 struct btrfs_stripe_hash_table *x;
227 struct btrfs_stripe_hash *cur;
228 struct btrfs_stripe_hash *h;
229 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
230 int i;
231
232 if (info->stripe_hash_table)
233 return 0;
234
83c8266a
DS
235 /*
236 * The table is large, starting with order 4 and can go as high as
237 * order 7 in case lock debugging is turned on.
238 *
239 * Try harder to allocate and fallback to vmalloc to lower the chance
240 * of a failing mount.
241 */
ee787f95 242 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
818e010b
DS
243 if (!table)
244 return -ENOMEM;
53b381b3 245
4ae10b3a
CM
246 spin_lock_init(&table->cache_lock);
247 INIT_LIST_HEAD(&table->stripe_cache);
248
53b381b3
DW
249 h = table->table;
250
251 for (i = 0; i < num_entries; i++) {
252 cur = h + i;
253 INIT_LIST_HEAD(&cur->hash_list);
254 spin_lock_init(&cur->lock);
53b381b3
DW
255 }
256
257 x = cmpxchg(&info->stripe_hash_table, NULL, table);
fe3b7bb0 258 kvfree(x);
53b381b3
DW
259 return 0;
260}
261
4ae10b3a
CM
262/*
263 * caching an rbio means to copy anything from the
264 * bio_pages array into the stripe_pages array. We
265 * use the page uptodate bit in the stripe cache array
266 * to indicate if it has valid data
267 *
268 * once the caching is done, we set the cache ready
269 * bit.
270 */
271static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
272{
273 int i;
4ae10b3a
CM
274 int ret;
275
276 ret = alloc_rbio_pages(rbio);
277 if (ret)
278 return;
279
280 for (i = 0; i < rbio->nr_pages; i++) {
281 if (!rbio->bio_pages[i])
282 continue;
283
80cc8384 284 copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
4ae10b3a
CM
285 SetPageUptodate(rbio->stripe_pages[i]);
286 }
287 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
288}
289
53b381b3
DW
290/*
291 * we hash on the first logical address of the stripe
292 */
293static int rbio_bucket(struct btrfs_raid_bio *rbio)
294{
4c664611 295 u64 num = rbio->bioc->raid_map[0];
53b381b3
DW
296
297 /*
298 * we shift down quite a bit. We're using byte
299 * addressing, and most of the lower bits are zeros.
300 * This tends to upset hash_64, and it consistently
301 * returns just one or two different values.
302 *
303 * shifting off the lower bits fixes things.
304 */
305 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
306}
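
/*
 * Worked example of the bucketing above (the address is illustrative only):
 * a full stripe whose raid_map[0] is 1GiB (0x40000000) hashes as
 * hash_64(0x40000000 >> 16, 11) = hash_64(0x4000, 11), picking one of the
 * 1 << BTRFS_STRIPE_HASH_TABLE_BITS = 2048 buckets.
 */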
307
eb357060
QW
308/*
309 * Update the stripe_sectors[] array to use correct page and pgoff
310 *
311 * Should be called every time any page pointer in stripes_pages[] got modified.
312 */
313static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
314{
315 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
316 u32 offset;
317 int i;
318
319 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
320 int page_index = offset >> PAGE_SHIFT;
321
322 ASSERT(page_index < rbio->nr_pages);
323 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
324 rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
325 }
326}
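
/*
 * Illustrative examples of the mapping above (the sizes are assumptions,
 * not requirements): with a 4K sectorsize and 4K pages every sector gets
 * its own page at pgoff 0; with a 4K sectorsize and 64K pages, sectors
 * 0-15 all point at stripe_pages[0] with pgoff 0x0000, 0x1000, ..., 0xf000.
 */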
327
4ae10b3a
CM
328/*
329 * stealing an rbio means taking all the uptodate pages from the stripe
330 * array in the source rbio and putting them into the destination rbio
331 */
332static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
333{
334 int i;
335 struct page *s;
336 struct page *d;
337
338 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
339 return;
340
341 for (i = 0; i < dest->nr_pages; i++) {
342 s = src->stripe_pages[i];
343 if (!s || !PageUptodate(s)) {
344 continue;
345 }
346
347 d = dest->stripe_pages[i];
348 if (d)
349 __free_page(d);
350
351 dest->stripe_pages[i] = s;
352 src->stripe_pages[i] = NULL;
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}
357
53b381b3
DW
/*
 * Merging means we take the bio_list from the victim and
 * splice it into the destination. The victim should
 * be discarded afterwards.
 *
 * Must be called with dest->bio_list_lock held.
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}
373
374/*
4ae10b3a
CM
375 * used to prune items that are in the cache. The caller
376 * must hold the hash table lock.
377 */
378static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
379{
380 int bucket = rbio_bucket(rbio);
381 struct btrfs_stripe_hash_table *table;
382 struct btrfs_stripe_hash *h;
383 int freeit = 0;
384
385 /*
386 * check the bit again under the hash table lock.
387 */
388 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
389 return;
390
6a258d72 391 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
392 h = table->table + bucket;
393
394 /* hold the lock for the bucket because we may be
395 * removing it from the hash table
396 */
397 spin_lock(&h->lock);
398
399 /*
400 * hold the lock for the bio list because we need
401 * to make sure the bio list is empty
402 */
403 spin_lock(&rbio->bio_list_lock);
404
405 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
406 list_del_init(&rbio->stripe_cache);
407 table->cache_size -= 1;
408 freeit = 1;
409
410 /* if the bio list isn't empty, this rbio is
411 * still involved in an IO. We take it out
412 * of the cache list, and drop the ref that
413 * was held for the list.
414 *
415 * If the bio_list was empty, we also remove
416 * the rbio from the hash_table, and drop
417 * the corresponding ref
418 */
419 if (bio_list_empty(&rbio->bio_list)) {
420 if (!list_empty(&rbio->hash_list)) {
421 list_del_init(&rbio->hash_list);
dec95574 422 refcount_dec(&rbio->refs);
4ae10b3a
CM
423 BUG_ON(!list_empty(&rbio->plug_list));
424 }
425 }
426 }
427
428 spin_unlock(&rbio->bio_list_lock);
429 spin_unlock(&h->lock);
430
431 if (freeit)
432 __free_raid_bio(rbio);
433}
434
435/*
436 * prune a given rbio from the cache
437 */
438static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
439{
440 struct btrfs_stripe_hash_table *table;
441 unsigned long flags;
442
443 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
444 return;
445
6a258d72 446 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
447
448 spin_lock_irqsave(&table->cache_lock, flags);
449 __remove_rbio_from_cache(rbio);
450 spin_unlock_irqrestore(&table->cache_lock, flags);
451}
452
453/*
454 * remove everything in the cache
455 */
48a3b636 456static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
4ae10b3a
CM
457{
458 struct btrfs_stripe_hash_table *table;
459 unsigned long flags;
460 struct btrfs_raid_bio *rbio;
461
462 table = info->stripe_hash_table;
463
464 spin_lock_irqsave(&table->cache_lock, flags);
465 while (!list_empty(&table->stripe_cache)) {
466 rbio = list_entry(table->stripe_cache.next,
467 struct btrfs_raid_bio,
468 stripe_cache);
469 __remove_rbio_from_cache(rbio);
470 }
471 spin_unlock_irqrestore(&table->cache_lock, flags);
472}
473
474/*
475 * remove all cached entries and free the hash table
476 * used by unmount
53b381b3
DW
477 */
478void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
479{
480 if (!info->stripe_hash_table)
481 return;
4ae10b3a 482 btrfs_clear_rbio_cache(info);
f749303b 483 kvfree(info->stripe_hash_table);
53b381b3
DW
484 info->stripe_hash_table = NULL;
485}
486
4ae10b3a
CM
487/*
488 * insert an rbio into the stripe cache. It
489 * must have already been prepared by calling
490 * cache_rbio_pages
491 *
492 * If this rbio was already cached, it gets
493 * moved to the front of the lru.
494 *
495 * If the size of the rbio cache is too big, we
496 * prune an item.
497 */
498static void cache_rbio(struct btrfs_raid_bio *rbio)
499{
500 struct btrfs_stripe_hash_table *table;
501 unsigned long flags;
502
503 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
504 return;
505
6a258d72 506 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
507
508 spin_lock_irqsave(&table->cache_lock, flags);
509 spin_lock(&rbio->bio_list_lock);
510
511 /* bump our ref if we were not in the list before */
512 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
dec95574 513 refcount_inc(&rbio->refs);
4ae10b3a
CM
514
515 if (!list_empty(&rbio->stripe_cache)){
516 list_move(&rbio->stripe_cache, &table->stripe_cache);
517 } else {
518 list_add(&rbio->stripe_cache, &table->stripe_cache);
519 table->cache_size += 1;
520 }
521
522 spin_unlock(&rbio->bio_list_lock);
523
524 if (table->cache_size > RBIO_CACHE_SIZE) {
525 struct btrfs_raid_bio *found;
526
527 found = list_entry(table->stripe_cache.prev,
528 struct btrfs_raid_bio,
529 stripe_cache);
530
531 if (found != rbio)
532 __remove_rbio_from_cache(found);
533 }
534
535 spin_unlock_irqrestore(&table->cache_lock, flags);
4ae10b3a
CM
536}
537
53b381b3
DW
538/*
539 * helper function to run the xor_blocks api. It is only
540 * able to do MAX_XOR_BLOCKS at a time, so we need to
541 * loop through.
542 */
543static void run_xor(void **pages, int src_cnt, ssize_t len)
544{
545 int src_off = 0;
546 int xor_src_cnt = 0;
547 void *dest = pages[src_cnt];
548
549 while(src_cnt > 0) {
550 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
551 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
552
553 src_cnt -= xor_src_cnt;
554 src_off += xor_src_cnt;
555 }
556}
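
/*
 * Usage note (see finish_rmw() below): pages[0] .. pages[src_cnt - 1] are
 * the xor sources and pages[src_cnt] is the destination. RAID5 parity is
 * generated by first copying data stripe 0 into the parity page and then
 * xoring the remaining data pages into it with
 * run_xor(pointers + 1, nr_data - 1, PAGE_SIZE).
 */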
557
/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}
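
/*
 * For example (illustrative numbers): with a 64K stripe_len and 3 data
 * stripes, the rbio only counts as full once the queued bios cover all
 * 3 * 64K = 192K of data, in which case full_stripe_write() below can skip
 * the read half of the rmw cycle.
 */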
576
577/*
578 * returns 1 if it is safe to merge two rbios together.
579 * The merging is safe if the two rbios correspond to
580 * the same stripe and if they are both going in the same
581 * direction (read vs write), and if neither one is
582 * locked for final IO
583 *
584 * The caller is responsible for locking such that
585 * rmw_locked is safe to test
586 */
587static int rbio_can_merge(struct btrfs_raid_bio *last,
588 struct btrfs_raid_bio *cur)
589{
590 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
591 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
592 return 0;
593
4ae10b3a
CM
	/*
	 * We can't merge with cached rbios, since the idea is that when
	 * we merge, the destination rbio is going to run our IO for us.
	 * We can steal from cached rbios though; other functions handle
	 * that.
	 */
601 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
602 test_bit(RBIO_CACHE_BIT, &cur->flags))
603 return 0;
604
4c664611 605 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
53b381b3
DW
606 return 0;
607
5a6ac9ea
MX
608 /* we can't merge with different operations */
609 if (last->operation != cur->operation)
610 return 0;
	/*
	 * Parity scrub needs to read the full stripe from the drive,
	 * check and repair the parity, and write the new results back.
	 * We're not allowed to add any new bios to the bio list here;
	 * anyone else that wants to change this stripe needs to do
	 * their own rmw.
	 */
db34be19 619 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
53b381b3 620 return 0;
53b381b3 621
	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;
624
cc54ff62
LB
625 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
626 int fa = last->faila;
627 int fb = last->failb;
628 int cur_fa = cur->faila;
629 int cur_fb = cur->failb;
630
631 if (last->faila >= last->failb) {
632 fa = last->failb;
633 fb = last->faila;
634 }
635
636 if (cur->faila >= cur->failb) {
637 cur_fa = cur->failb;
638 cur_fb = cur->faila;
639 }
640
641 if (fa != cur_fa || fb != cur_fb)
642 return 0;
643 }
53b381b3
DW
644 return 1;
645}
646
b7178a5f
ZL
647static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
648 int index)
649{
650 return stripe * rbio->stripe_npages + index;
651}
652
653/*
654 * these are just the pages from the rbio array, not from anything
655 * the FS sent down to us
656 */
657static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
658 int index)
659{
660 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
661}
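
/*
 * Illustrative example of the indexing above: with stripe_npages = 16,
 * page 3 of stripe 2 lives at stripe_pages[2 * 16 + 3] = stripe_pages[35].
 */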
662
53b381b3
DW
663/*
664 * helper to index into the pstripe
665 */
666static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
667{
b7178a5f 668 return rbio_stripe_page(rbio, rbio->nr_data, index);
53b381b3
DW
669}
670
671/*
672 * helper to index into the qstripe, returns null
673 * if there is no qstripe
674 */
675static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
676{
2c8cdd6e 677 if (rbio->nr_data + 1 == rbio->real_stripes)
53b381b3 678 return NULL;
b7178a5f 679 return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
53b381b3
DW
680}
681
682/*
683 * The first stripe in the table for a logical address
684 * has the lock. rbios are added in one of three ways:
685 *
686 * 1) Nobody has the stripe locked yet. The rbio is given
687 * the lock and 0 is returned. The caller must start the IO
688 * themselves.
689 *
690 * 2) Someone has the stripe locked, but we're able to merge
691 * with the lock owner. The rbio is freed and the IO will
692 * start automatically along with the existing rbio. 1 is returned.
693 *
694 * 3) Someone has the stripe locked, but we're not able to merge.
695 * The rbio is added to the lock owner's plug list, or merged into
696 * an rbio already on the plug list. When the lock owner unlocks,
697 * the next rbio on the list is run and the IO is started automatically.
698 * 1 is returned
699 *
700 * If we return 0, the caller still owns the rbio and must continue with
701 * IO submission. If we return 1, the caller must assume the rbio has
702 * already been freed.
703 */
704static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
705{
721860d5 706 struct btrfs_stripe_hash *h;
53b381b3
DW
707 struct btrfs_raid_bio *cur;
708 struct btrfs_raid_bio *pending;
709 unsigned long flags;
53b381b3 710 struct btrfs_raid_bio *freeit = NULL;
4ae10b3a 711 struct btrfs_raid_bio *cache_drop = NULL;
53b381b3 712 int ret = 0;
53b381b3 713
6a258d72 714 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
721860d5 715
53b381b3
DW
716 spin_lock_irqsave(&h->lock, flags);
717 list_for_each_entry(cur, &h->hash_list, hash_list) {
4c664611 718 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
9d6cb1b0 719 continue;
4ae10b3a 720
9d6cb1b0 721 spin_lock(&cur->bio_list_lock);
4ae10b3a 722
9d6cb1b0
JT
723 /* Can we steal this cached rbio's pages? */
724 if (bio_list_empty(&cur->bio_list) &&
725 list_empty(&cur->plug_list) &&
726 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
727 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
728 list_del_init(&cur->hash_list);
729 refcount_dec(&cur->refs);
53b381b3 730
9d6cb1b0
JT
731 steal_rbio(cur, rbio);
732 cache_drop = cur;
733 spin_unlock(&cur->bio_list_lock);
4ae10b3a 734
9d6cb1b0
JT
735 goto lockit;
736 }
53b381b3 737
9d6cb1b0
JT
738 /* Can we merge into the lock owner? */
739 if (rbio_can_merge(cur, rbio)) {
740 merge_rbio(cur, rbio);
53b381b3 741 spin_unlock(&cur->bio_list_lock);
9d6cb1b0 742 freeit = rbio;
53b381b3
DW
743 ret = 1;
744 goto out;
745 }
9d6cb1b0
JT
746
747
748 /*
749 * We couldn't merge with the running rbio, see if we can merge
750 * with the pending ones. We don't have to check for rmw_locked
751 * because there is no way they are inside finish_rmw right now
752 */
753 list_for_each_entry(pending, &cur->plug_list, plug_list) {
754 if (rbio_can_merge(pending, rbio)) {
755 merge_rbio(pending, rbio);
756 spin_unlock(&cur->bio_list_lock);
757 freeit = rbio;
758 ret = 1;
759 goto out;
760 }
761 }
762
		/*
		 * No merging, put us on the tail of the plug list; our rbio
		 * will be started when the currently running rbio unlocks.
		 */
767 list_add_tail(&rbio->plug_list, &cur->plug_list);
768 spin_unlock(&cur->bio_list_lock);
769 ret = 1;
770 goto out;
53b381b3 771 }
4ae10b3a 772lockit:
dec95574 773 refcount_inc(&rbio->refs);
53b381b3
DW
774 list_add(&rbio->hash_list, &h->hash_list);
775out:
776 spin_unlock_irqrestore(&h->lock, flags);
4ae10b3a
CM
777 if (cache_drop)
778 remove_rbio_from_cache(cache_drop);
53b381b3
DW
779 if (freeit)
780 __free_raid_bio(freeit);
781 return ret;
782}
783
784/*
785 * called as rmw or parity rebuild is completed. If the plug list has more
786 * rbios waiting for this stripe, the next one on the list will be started
787 */
788static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
789{
790 int bucket;
791 struct btrfs_stripe_hash *h;
792 unsigned long flags;
4ae10b3a 793 int keep_cache = 0;
53b381b3
DW
794
795 bucket = rbio_bucket(rbio);
6a258d72 796 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
53b381b3 797
4ae10b3a
CM
798 if (list_empty(&rbio->plug_list))
799 cache_rbio(rbio);
800
53b381b3
DW
801 spin_lock_irqsave(&h->lock, flags);
802 spin_lock(&rbio->bio_list_lock);
803
804 if (!list_empty(&rbio->hash_list)) {
4ae10b3a
CM
805 /*
806 * if we're still cached and there is no other IO
807 * to perform, just leave this rbio here for others
808 * to steal from later
809 */
810 if (list_empty(&rbio->plug_list) &&
811 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
812 keep_cache = 1;
813 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
814 BUG_ON(!bio_list_empty(&rbio->bio_list));
815 goto done;
816 }
53b381b3
DW
817
818 list_del_init(&rbio->hash_list);
dec95574 819 refcount_dec(&rbio->refs);
53b381b3
DW
820
821 /*
822 * we use the plug list to hold all the rbios
823 * waiting for the chance to lock this stripe.
824 * hand the lock over to one of them.
825 */
826 if (!list_empty(&rbio->plug_list)) {
827 struct btrfs_raid_bio *next;
828 struct list_head *head = rbio->plug_list.next;
829
830 next = list_entry(head, struct btrfs_raid_bio,
831 plug_list);
832
833 list_del_init(&rbio->plug_list);
834
835 list_add(&next->hash_list, &h->hash_list);
dec95574 836 refcount_inc(&next->refs);
53b381b3
DW
837 spin_unlock(&rbio->bio_list_lock);
838 spin_unlock_irqrestore(&h->lock, flags);
839
			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}
53b381b3
DW
852
853 goto done_nolock;
53b381b3
DW
854 }
855 }
4ae10b3a 856done:
53b381b3
DW
857 spin_unlock(&rbio->bio_list_lock);
858 spin_unlock_irqrestore(&h->lock, flags);
859
860done_nolock:
4ae10b3a
CM
861 if (!keep_cache)
862 remove_rbio_from_cache(rbio);
53b381b3
DW
863}
864
865static void __free_raid_bio(struct btrfs_raid_bio *rbio)
866{
867 int i;
868
dec95574 869 if (!refcount_dec_and_test(&rbio->refs))
53b381b3
DW
870 return;
871
4ae10b3a 872 WARN_ON(!list_empty(&rbio->stripe_cache));
53b381b3
DW
873 WARN_ON(!list_empty(&rbio->hash_list));
874 WARN_ON(!bio_list_empty(&rbio->bio_list));
875
876 for (i = 0; i < rbio->nr_pages; i++) {
877 if (rbio->stripe_pages[i]) {
878 __free_page(rbio->stripe_pages[i]);
879 rbio->stripe_pages[i] = NULL;
880 }
881 }
af8e2d1d 882
4c664611 883 btrfs_put_bioc(rbio->bioc);
53b381b3
DW
884 kfree(rbio);
885}
886
7583d8d0 887static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
53b381b3 888{
7583d8d0
LB
889 struct bio *next;
890
891 while (cur) {
892 next = cur->bi_next;
893 cur->bi_next = NULL;
894 cur->bi_status = err;
895 bio_endio(cur);
896 cur = next;
897 }
53b381b3
DW
898}
899
900/*
901 * this frees the rbio and runs through all the bios in the
902 * bio_list and calls end_io on them
903 */
4e4cbee9 904static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
53b381b3
DW
905{
906 struct bio *cur = bio_list_get(&rbio->bio_list);
7583d8d0 907 struct bio *extra;
4245215d
MX
908
909 if (rbio->generic_bio_cnt)
6a258d72 910 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
4245215d 911
7583d8d0
LB
912 /*
913 * At this moment, rbio->bio_list is empty, however since rbio does not
914 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
915 * hash list, rbio may be merged with others so that rbio->bio_list
916 * becomes non-empty.
917 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
918 * more and we can call bio_endio() on all queued bios.
919 */
920 unlock_stripe(rbio);
921 extra = bio_list_get(&rbio->bio_list);
922 __free_raid_bio(rbio);
53b381b3 923
7583d8d0
LB
924 rbio_endio_bio_list(cur, err);
925 if (extra)
926 rbio_endio_bio_list(extra, err);
53b381b3
DW
927}
928
929/*
930 * end io function used by finish_rmw. When we finally
931 * get here, we've written a full stripe
932 */
4246a0b6 933static void raid_write_end_io(struct bio *bio)
53b381b3
DW
934{
935 struct btrfs_raid_bio *rbio = bio->bi_private;
4e4cbee9 936 blk_status_t err = bio->bi_status;
a6111d11 937 int max_errors;
53b381b3
DW
938
939 if (err)
940 fail_bio_stripe(rbio, bio);
941
942 bio_put(bio);
943
b89e1b01 944 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
945 return;
946
58efbc9f 947 err = BLK_STS_OK;
53b381b3
DW
948
	/* OK, we have finished writing all the stripes we need to. */
a6111d11 950 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
4c664611 951 0 : rbio->bioc->max_errors;
a6111d11 952 if (atomic_read(&rbio->error) > max_errors)
4e4cbee9 953 err = BLK_STS_IOERR;
53b381b3 954
4246a0b6 955 rbio_orig_end_io(rbio, err);
53b381b3
DW
956}
957
958/*
959 * the read/modify/write code wants to use the original bio for
960 * any pages it included, and then use the rbio for everything
961 * else. This function decides if a given index (stripe number)
962 * and page number in that stripe fall inside the original bio
963 * or the rbio.
964 *
965 * if you set bio_list_only, you'll get a NULL back for any ranges
966 * that are outside the bio_list
967 *
968 * This doesn't take any refs on anything, you get a bare page pointer
969 * and the caller must bump refs as required.
970 *
971 * You must call index_rbio_pages once before you can trust
972 * the answers from this function.
973 */
974static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
975 int index, int pagenr, int bio_list_only)
976{
977 int chunk_page;
978 struct page *p = NULL;
979
980 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
981
982 spin_lock_irq(&rbio->bio_list_lock);
983 p = rbio->bio_pages[chunk_page];
984 spin_unlock_irq(&rbio->bio_list_lock);
985
986 if (p || bio_list_only)
987 return p;
988
989 return rbio->stripe_pages[chunk_page];
990}
991
53b381b3
DW
/*
 * Allocation and initial setup for the btrfs_raid_bio. Note that this
 * does not allocate any pages for rbio->stripe_pages.
 */
2ff7e61e 996static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
4c664611 997 struct btrfs_io_context *bioc,
cc353a8b 998 u32 stripe_len)
53b381b3 999{
843de58b
QW
1000 const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
1001 const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
1002 const unsigned int num_pages = stripe_npages * real_stripes;
94efbe19
QW
1003 const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
1004 const unsigned int num_sectors = stripe_nsectors * real_stripes;
53b381b3
DW
1005 struct btrfs_raid_bio *rbio;
1006 int nr_data = 0;
53b381b3
DW
1007 void *p;
1008
843de58b 1009 ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
94efbe19
QW
1010 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1011 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
843de58b 1012
	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->stripe_sectors) * num_sectors +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
		       sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
		       GFP_NOFS);
af8e2d1d 1021 if (!rbio)
53b381b3 1022 return ERR_PTR(-ENOMEM);
53b381b3
DW
1023
1024 bio_list_init(&rbio->bio_list);
1025 INIT_LIST_HEAD(&rbio->plug_list);
1026 spin_lock_init(&rbio->bio_list_lock);
4ae10b3a 1027 INIT_LIST_HEAD(&rbio->stripe_cache);
53b381b3 1028 INIT_LIST_HEAD(&rbio->hash_list);
4c664611 1029 rbio->bioc = bioc;
53b381b3
DW
1030 rbio->stripe_len = stripe_len;
1031 rbio->nr_pages = num_pages;
94efbe19 1032 rbio->nr_sectors = num_sectors;
2c8cdd6e 1033 rbio->real_stripes = real_stripes;
5a6ac9ea 1034 rbio->stripe_npages = stripe_npages;
94efbe19 1035 rbio->stripe_nsectors = stripe_nsectors;
53b381b3
DW
1036 rbio->faila = -1;
1037 rbio->failb = -1;
dec95574 1038 refcount_set(&rbio->refs, 1);
b89e1b01
MX
1039 atomic_set(&rbio->error, 0);
1040 atomic_set(&rbio->stripes_pending, 0);
53b381b3
DW
1041
1042 /*
1389053e 1043 * the stripe_pages, bio_pages, etc arrays point to the extra
53b381b3
DW
1044 * memory we allocated past the end of the rbio
1045 */
1046 p = rbio + 1;
1389053e
KC
1047#define CONSUME_ALLOC(ptr, count) do { \
1048 ptr = p; \
1049 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1050 } while (0)
1051 CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1052 CONSUME_ALLOC(rbio->bio_pages, num_pages);
eb357060 1053 CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
1389053e 1054 CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
94efbe19
QW
1055 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
1056 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
1389053e 1057#undef CONSUME_ALLOC
53b381b3 1058
4c664611 1059 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
10f11900 1060 nr_data = real_stripes - 1;
4c664611 1061 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
2c8cdd6e 1062 nr_data = real_stripes - 2;
53b381b3 1063 else
10f11900 1064 BUG();
53b381b3
DW
1065
1066 rbio->nr_data = nr_data;
1067 return rbio;
1068}
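
/*
 * Layout note for the allocation above: everything is carved out of a
 * single kzalloc() region by CONSUME_ALLOC(), in this order:
 *
 *   struct btrfs_raid_bio
 *   stripe_pages[nr_pages]
 *   bio_pages[nr_pages]
 *   stripe_sectors[nr_sectors]
 *   finish_pointers[real_stripes]
 *   dbitmap / finish_pbitmap (stripe_nsectors bits each)
 */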
1069
1070/* allocate pages for all the stripes in the bio, including parity */
1071static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1072{
eb357060
QW
1073 int ret;
1074
1075 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
1076 if (ret < 0)
1077 return ret;
1078 /* Mapping all sectors */
1079 index_stripe_sectors(rbio);
1080 return 0;
53b381b3
DW
1081}
1082
b7178a5f 1083/* only allocate pages for p/q stripes */
53b381b3
DW
1084static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1085{
dd137dd1 1086 int data_pages = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
eb357060 1087 int ret;
53b381b3 1088
eb357060
QW
1089 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1090 rbio->stripe_pages + data_pages);
1091 if (ret < 0)
1092 return ret;
1093
1094 index_stripe_sectors(rbio);
1095 return 0;
53b381b3
DW
1096}
1097
/*
 * Add a single page from a specific stripe into our list of bios for IO.
 * This will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len,
			    unsigned int opf)
53b381b3
DW
1110{
1111 struct bio *last = bio_list->tail;
53b381b3
DW
1112 int ret;
1113 struct bio *bio;
4c664611 1114 struct btrfs_io_stripe *stripe;
53b381b3
DW
1115 u64 disk_start;
1116
4c664611 1117 stripe = &rbio->bioc->stripes[stripe_nr];
09cbfeaf 1118 disk_start = stripe->physical + (page_index << PAGE_SHIFT);
53b381b3
DW
1119
1120 /* if the device is missing, just fail this stripe */
1121 if (!stripe->dev->bdev)
1122 return fail_rbio_index(rbio, stripe_nr);
1123
1124 /* see if we can add this page onto our existing bio */
1125 if (last) {
1201b58b 1126 u64 last_end = last->bi_iter.bi_sector << 9;
4f024f37 1127 last_end += last->bi_iter.bi_size;
53b381b3
DW
1128
1129 /*
1130 * we can't merge these if they are from different
1131 * devices or if they are not contiguous
1132 */
f90ae76a 1133 if (last_end == disk_start && !last->bi_status &&
309dca30 1134 last->bi_bdev == stripe->dev->bdev) {
09cbfeaf
KS
1135 ret = bio_add_page(last, page, PAGE_SIZE, 0);
1136 if (ret == PAGE_SIZE)
53b381b3
DW
1137 return 0;
1138 }
1139 }
1140
1141 /* put a new bio on the list */
e1b4b44e
CH
1142 bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
1143 opf, GFP_NOFS);
4f024f37 1144 bio->bi_iter.bi_sector = disk_start >> 9;
e01bf588 1145 bio->bi_private = rbio;
53b381b3 1146
09cbfeaf 1147 bio_add_page(bio, page, PAGE_SIZE, 0);
53b381b3
DW
1148 bio_list_add(bio_list, bio);
1149 return 0;
1150}
1151
1152/*
1153 * while we're doing the read/modify/write cycle, we could
1154 * have errors in reading pages off the disk. This checks
1155 * for errors and if we're not able to read the page it'll
1156 * trigger parity reconstruction. The rmw will be finished
1157 * after we've reconstructed the failed stripes
1158 */
1159static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1160{
1161 if (rbio->faila >= 0 || rbio->failb >= 0) {
2c8cdd6e 1162 BUG_ON(rbio->faila == rbio->real_stripes - 1);
53b381b3
DW
1163 __raid56_parity_recover(rbio);
1164 } else {
1165 finish_rmw(rbio);
1166 }
1167}
1168
53b381b3
DW
1169/*
1170 * helper function to walk our bio list and populate the bio_pages array with
1171 * the result. This seems expensive, but it is faster than constantly
1172 * searching through the bio list as we setup the IO in finish_rmw or stripe
1173 * reconstruction.
1174 *
1175 * This must be called before you trust the answers from page_in_rbio
1176 */
1177static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1178{
1179 struct bio *bio;
1180 u64 start;
1181 unsigned long stripe_offset;
1182 unsigned long page_index;
53b381b3
DW
1183
1184 spin_lock_irq(&rbio->bio_list_lock);
1185 bio_list_for_each(bio, &rbio->bio_list) {
6592e58c
FM
1186 struct bio_vec bvec;
1187 struct bvec_iter iter;
1188 int i = 0;
1189
1201b58b 1190 start = bio->bi_iter.bi_sector << 9;
4c664611 1191 stripe_offset = start - rbio->bioc->raid_map[0];
09cbfeaf 1192 page_index = stripe_offset >> PAGE_SHIFT;
53b381b3 1193
6592e58c
FM
1194 bio_for_each_segment(bvec, bio, iter) {
1195 rbio->bio_pages[page_index + i] = bvec.bv_page;
1196 i++;
1197 }
53b381b3
DW
1198 }
1199 spin_unlock_irq(&rbio->bio_list_lock);
1200}
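
/*
 * Illustrative example of the indexing above (assuming 4K pages): a bio
 * starting at raid_map[0] + 128K begins filling bio_pages[] at slot
 * 128K >> PAGE_SHIFT = 32, and each following bvec page lands in the
 * next slot.
 */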
1201
1202/*
1203 * this is called from one of two situations. We either
1204 * have a full stripe from the higher layers, or we've read all
1205 * the missing bits off disk.
1206 *
1207 * This will calculate the parity and then send down any
1208 * changed blocks.
1209 */
1210static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1211{
4c664611 1212 struct btrfs_io_context *bioc = rbio->bioc;
1389053e 1213 void **pointers = rbio->finish_pointers;
53b381b3
DW
1214 int nr_data = rbio->nr_data;
1215 int stripe;
1216 int pagenr;
c17af965 1217 bool has_qstripe;
53b381b3
DW
1218 struct bio_list bio_list;
1219 struct bio *bio;
53b381b3
DW
1220 int ret;
1221
1222 bio_list_init(&bio_list);
1223
c17af965
DS
1224 if (rbio->real_stripes - rbio->nr_data == 1)
1225 has_qstripe = false;
1226 else if (rbio->real_stripes - rbio->nr_data == 2)
1227 has_qstripe = true;
1228 else
53b381b3 1229 BUG();
53b381b3
DW
1230
1231 /* at this point we either have a full stripe,
1232 * or we've read the full stripe from the drive.
1233 * recalculate the parity and write the new results.
1234 *
1235 * We're not allowed to add any new bios to the
1236 * bio list here, anyone else that wants to
1237 * change this stripe needs to do their own rmw.
1238 */
1239 spin_lock_irq(&rbio->bio_list_lock);
1240 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1241 spin_unlock_irq(&rbio->bio_list_lock);
1242
b89e1b01 1243 atomic_set(&rbio->error, 0);
53b381b3
DW
1244
1245 /*
1246 * now that we've set rmw_locked, run through the
1247 * bio list one last time and map the page pointers
4ae10b3a
CM
1248 *
1249 * We don't cache full rbios because we're assuming
1250 * the higher layers are unlikely to use this area of
1251 * the disk again soon. If they do use it again,
1252 * hopefully they will send another full bio.
53b381b3
DW
1253 */
1254 index_rbio_pages(rbio);
4ae10b3a
CM
1255 if (!rbio_is_full(rbio))
1256 cache_rbio_pages(rbio);
1257 else
1258 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
53b381b3 1259
915e2290 1260 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
1261 struct page *p;
1262 /* first collect one page from each data stripe */
1263 for (stripe = 0; stripe < nr_data; stripe++) {
1264 p = page_in_rbio(rbio, stripe, pagenr, 0);
94a0b58d 1265 pointers[stripe] = kmap_local_page(p);
53b381b3
DW
1266 }
1267
1268 /* then add the parity stripe */
1269 p = rbio_pstripe_page(rbio, pagenr);
1270 SetPageUptodate(p);
94a0b58d 1271 pointers[stripe++] = kmap_local_page(p);
53b381b3 1272
c17af965 1273 if (has_qstripe) {
53b381b3
DW
1274
1275 /*
1276 * raid6, add the qstripe and call the
1277 * library function to fill in our p/q
1278 */
1279 p = rbio_qstripe_page(rbio, pagenr);
1280 SetPageUptodate(p);
94a0b58d 1281 pointers[stripe++] = kmap_local_page(p);
53b381b3 1282
2c8cdd6e 1283 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
53b381b3
DW
1284 pointers);
1285 } else {
1286 /* raid5 */
69d24804 1287 copy_page(pointers[nr_data], pointers[0]);
09cbfeaf 1288 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
53b381b3 1289 }
94a0b58d
IW
1290 for (stripe = stripe - 1; stripe >= 0; stripe--)
1291 kunmap_local(pointers[stripe]);
53b381b3
DW
1292 }
1293
1294 /*
1295 * time to start writing. Make bios for everything from the
1296 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1297 * everything else.
1298 */
2c8cdd6e 1299 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
915e2290 1300 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
1301 struct page *page;
1302 if (stripe < rbio->nr_data) {
1303 page = page_in_rbio(rbio, stripe, pagenr, 1);
1304 if (!page)
1305 continue;
1306 } else {
1307 page = rbio_stripe_page(rbio, stripe, pagenr);
1308 }
1309
			ret = rbio_add_io_page(rbio, &bio_list,
					page, stripe, pagenr, rbio->stripe_len,
					REQ_OP_WRITE);
53b381b3
DW
1313 if (ret)
1314 goto cleanup;
1315 }
1316 }
1317
4c664611 1318 if (likely(!bioc->num_tgtdevs))
2c8cdd6e
MX
1319 goto write_data;
1320
1321 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
4c664611 1322 if (!bioc->tgtdev_map[stripe])
2c8cdd6e
MX
1323 continue;
1324
915e2290 1325 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2c8cdd6e
MX
1326 struct page *page;
1327 if (stripe < rbio->nr_data) {
1328 page = page_in_rbio(rbio, stripe, pagenr, 1);
1329 if (!page)
1330 continue;
1331 } else {
1332 page = rbio_stripe_page(rbio, stripe, pagenr);
1333 }
1334
			ret = rbio_add_io_page(rbio, &bio_list, page,
					rbio->bioc->tgtdev_map[stripe],
					pagenr, rbio->stripe_len,
					REQ_OP_WRITE);
2c8cdd6e
MX
1339 if (ret)
1340 goto cleanup;
1341 }
1342 }
1343
1344write_data:
b89e1b01
MX
1345 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1346 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
53b381b3 1347
bf28a605 1348 while ((bio = bio_list_pop(&bio_list))) {
53b381b3 1349 bio->bi_end_io = raid_write_end_io;
4e49ea4a
MC
1350
1351 submit_bio(bio);
53b381b3
DW
1352 }
1353 return;
1354
1355cleanup:
58efbc9f 1356 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1357
1358 while ((bio = bio_list_pop(&bio_list)))
1359 bio_put(bio);
53b381b3
DW
1360}
1361
1362/*
1363 * helper to find the stripe number for a given bio. Used to figure out which
1364 * stripe has failed. This expects the bio to correspond to a physical disk,
1365 * so it looks up based on physical sector numbers.
1366 */
1367static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1368 struct bio *bio)
1369{
4f024f37 1370 u64 physical = bio->bi_iter.bi_sector;
53b381b3 1371 int i;
4c664611 1372 struct btrfs_io_stripe *stripe;
53b381b3
DW
1373
1374 physical <<= 9;
1375
4c664611
QW
1376 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1377 stripe = &rbio->bioc->stripes[i];
83025863 1378 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
309dca30 1379 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
53b381b3
DW
1380 return i;
1381 }
1382 }
1383 return -1;
1384}
1385
1386/*
1387 * helper to find the stripe number for a given
1388 * bio (before mapping). Used to figure out which stripe has
1389 * failed. This looks up based on logical block numbers.
1390 */
1391static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1392 struct bio *bio)
1393{
1201b58b 1394 u64 logical = bio->bi_iter.bi_sector << 9;
53b381b3
DW
1395 int i;
1396
53b381b3 1397 for (i = 0; i < rbio->nr_data; i++) {
4c664611 1398 u64 stripe_start = rbio->bioc->raid_map[i];
83025863
NB
1399
1400 if (in_range(logical, stripe_start, rbio->stripe_len))
53b381b3 1401 return i;
53b381b3
DW
1402 }
1403 return -1;
1404}
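
/*
 * In other words, data stripe i covers the logical range
 * [raid_map[i], raid_map[i] + stripe_len), so the failed stripe is the one
 * whose range contains the bio's starting logical sector.
 */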
1405
1406/*
1407 * returns -EIO if we had too many failures
1408 */
1409static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1410{
1411 unsigned long flags;
1412 int ret = 0;
1413
1414 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1415
1416 /* we already know this stripe is bad, move on */
1417 if (rbio->faila == failed || rbio->failb == failed)
1418 goto out;
1419
1420 if (rbio->faila == -1) {
1421 /* first failure on this rbio */
1422 rbio->faila = failed;
b89e1b01 1423 atomic_inc(&rbio->error);
53b381b3
DW
1424 } else if (rbio->failb == -1) {
1425 /* second failure on this rbio */
1426 rbio->failb = failed;
b89e1b01 1427 atomic_inc(&rbio->error);
53b381b3
DW
1428 } else {
1429 ret = -EIO;
1430 }
1431out:
1432 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1433
1434 return ret;
1435}
1436
1437/*
1438 * helper to fail a stripe based on a physical disk
1439 * bio.
1440 */
1441static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1442 struct bio *bio)
1443{
1444 int failed = find_bio_stripe(rbio, bio);
1445
1446 if (failed < 0)
1447 return -EIO;
1448
1449 return fail_rbio_index(rbio, failed);
1450}
1451
1452/*
1453 * this sets each page in the bio uptodate. It should only be used on private
1454 * rbio pages, nothing that comes in from the higher layers
1455 */
1456static void set_bio_pages_uptodate(struct bio *bio)
1457{
0198e5b7 1458 struct bio_vec *bvec;
6dc4f100 1459 struct bvec_iter_all iter_all;
6592e58c 1460
0198e5b7 1461 ASSERT(!bio_flagged(bio, BIO_CLONED));
53b381b3 1462
2b070cfe 1463 bio_for_each_segment_all(bvec, bio, iter_all)
0198e5b7 1464 SetPageUptodate(bvec->bv_page);
53b381b3
DW
1465}
1466
1467/*
1468 * end io for the read phase of the rmw cycle. All the bios here are physical
1469 * stripe bios we've read from the disk so we can recalculate the parity of the
1470 * stripe.
1471 *
1472 * This will usually kick off finish_rmw once all the bios are read in, but it
1473 * may trigger parity reconstruction if we had any errors along the way
1474 */
4246a0b6 1475static void raid_rmw_end_io(struct bio *bio)
53b381b3
DW
1476{
1477 struct btrfs_raid_bio *rbio = bio->bi_private;
1478
4e4cbee9 1479 if (bio->bi_status)
53b381b3
DW
1480 fail_bio_stripe(rbio, bio);
1481 else
1482 set_bio_pages_uptodate(bio);
1483
1484 bio_put(bio);
1485
b89e1b01 1486 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
1487 return;
1488
4c664611 1489 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
53b381b3
DW
1490 goto cleanup;
1491
1492 /*
1493 * this will normally call finish_rmw to start our write
1494 * but if there are any failed stripes we'll reconstruct
1495 * from parity first
1496 */
1497 validate_rbio_for_rmw(rbio);
1498 return;
1499
1500cleanup:
1501
58efbc9f 1502 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
1503}
1504
53b381b3
DW
1505/*
1506 * the stripe must be locked by the caller. It will
1507 * unlock after all the writes are done
1508 */
1509static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1510{
1511 int bios_to_read = 0;
53b381b3
DW
1512 struct bio_list bio_list;
1513 int ret;
53b381b3
DW
1514 int pagenr;
1515 int stripe;
1516 struct bio *bio;
1517
1518 bio_list_init(&bio_list);
1519
1520 ret = alloc_rbio_pages(rbio);
1521 if (ret)
1522 goto cleanup;
1523
1524 index_rbio_pages(rbio);
1525
b89e1b01 1526 atomic_set(&rbio->error, 0);
53b381b3
DW
1527 /*
1528 * build a list of bios to read all the missing parts of this
1529 * stripe
1530 */
1531 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
915e2290 1532 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
1533 struct page *page;
1534 /*
1535 * we want to find all the pages missing from
1536 * the rbio and read them from the disk. If
1537 * page_in_rbio finds a page in the bio list
1538 * we don't need to read it off the stripe.
1539 */
1540 page = page_in_rbio(rbio, stripe, pagenr, 1);
1541 if (page)
1542 continue;
1543
1544 page = rbio_stripe_page(rbio, stripe, pagenr);
4ae10b3a
CM
1545 /*
1546 * the bio cache may have handed us an uptodate
1547 * page. If so, be happy and use it
1548 */
1549 if (PageUptodate(page))
1550 continue;
1551
			ret = rbio_add_io_page(rbio, &bio_list, page,
					stripe, pagenr, rbio->stripe_len,
					REQ_OP_READ);
53b381b3
DW
1555 if (ret)
1556 goto cleanup;
1557 }
1558 }
1559
1560 bios_to_read = bio_list_size(&bio_list);
1561 if (!bios_to_read) {
		/*
		 * This can happen if others have merged with us; it means
		 * there is nothing left to read. But if there are missing
		 * devices it may not be safe to do the full stripe write
		 * yet.
		 */
1568 goto finish;
1569 }
1570
1571 /*
4c664611
QW
1572 * The bioc may be freed once we submit the last bio. Make sure not to
1573 * touch it after that.
53b381b3 1574 */
b89e1b01 1575 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 1576 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
1577 bio->bi_end_io = raid_rmw_end_io;
1578
6a258d72 1579 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 1580
4e49ea4a 1581 submit_bio(bio);
53b381b3
DW
1582 }
1583 /* the actual write will happen once the reads are done */
1584 return 0;
1585
1586cleanup:
58efbc9f 1587 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1588
1589 while ((bio = bio_list_pop(&bio_list)))
1590 bio_put(bio);
1591
53b381b3
DW
1592 return -EIO;
1593
1594finish:
1595 validate_rbio_for_rmw(rbio);
1596 return 0;
1597}
1598
1599/*
1600 * if the upper layers pass in a full stripe, we thank them by only allocating
1601 * enough pages to hold the parity, and sending it all down quickly.
1602 */
1603static int full_stripe_write(struct btrfs_raid_bio *rbio)
1604{
1605 int ret;
1606
1607 ret = alloc_rbio_parity_pages(rbio);
3cd846d1
MX
1608 if (ret) {
1609 __free_raid_bio(rbio);
53b381b3 1610 return ret;
3cd846d1 1611 }
53b381b3
DW
1612
1613 ret = lock_stripe_add(rbio);
1614 if (ret == 0)
1615 finish_rmw(rbio);
1616 return 0;
1617}
1618
1619/*
1620 * partial stripe writes get handed over to async helpers.
1621 * We're really hoping to merge a few more writes into this
1622 * rbio before calculating new parity
1623 */
1624static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1625{
1626 int ret;
1627
1628 ret = lock_stripe_add(rbio);
1629 if (ret == 0)
cf6a4a75 1630 start_async_work(rbio, rmw_work);
53b381b3
DW
1631 return 0;
1632}
1633
/*
 * Sometimes while we were reading from the drive to recalculate parity,
 * enough new bios come in to create a full stripe. So we do a check here
 * to see if we can go directly to finish_rmw.
 */
1640static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1641{
1642 /* head off into rmw land if we don't have a full stripe */
1643 if (!rbio_is_full(rbio))
1644 return partial_stripe_write(rbio);
1645 return full_stripe_write(rbio);
1646}
1647
6ac0f488
CM
/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list. When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios.
 */
1655struct btrfs_plug_cb {
1656 struct blk_plug_cb cb;
1657 struct btrfs_fs_info *info;
1658 struct list_head rbio_list;
1659 struct btrfs_work work;
1660};
1661
1662/*
1663 * rbios on the plug list are sorted for easier merging.
1664 */
4f0f586b
ST
1665static int plug_cmp(void *priv, const struct list_head *a,
1666 const struct list_head *b)
6ac0f488 1667{
214cc184
DS
1668 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1669 plug_list);
1670 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1671 plug_list);
4f024f37
KO
1672 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1673 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
6ac0f488
CM
1674
1675 if (a_sector < b_sector)
1676 return -1;
1677 if (a_sector > b_sector)
1678 return 1;
1679 return 0;
1680}
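
/*
 * Sorting by the starting sector of each rbio's first bio means run_plug()
 * below sees adjacent rbios next to each other, which maximizes the chance
 * that rbio_can_merge()/merge_rbio() can combine them into full stripes
 * before parity is calculated.
 */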
1681
1682static void run_plug(struct btrfs_plug_cb *plug)
1683{
1684 struct btrfs_raid_bio *cur;
1685 struct btrfs_raid_bio *last = NULL;
1686
1687 /*
1688 * sort our plug list then try to merge
1689 * everything we can in hopes of creating full
1690 * stripes.
1691 */
1692 list_sort(NULL, &plug->rbio_list, plug_cmp);
1693 while (!list_empty(&plug->rbio_list)) {
1694 cur = list_entry(plug->rbio_list.next,
1695 struct btrfs_raid_bio, plug_list);
1696 list_del_init(&cur->plug_list);
1697
1698 if (rbio_is_full(cur)) {
c7b562c5
DS
1699 int ret;
1700
6ac0f488 1701 /* we have a full stripe, send it down */
c7b562c5
DS
1702 ret = full_stripe_write(cur);
1703 BUG_ON(ret);
6ac0f488
CM
1704 continue;
1705 }
1706 if (last) {
1707 if (rbio_can_merge(last, cur)) {
1708 merge_rbio(last, cur);
1709 __free_raid_bio(cur);
1710 continue;
1711
1712 }
1713 __raid56_parity_write(last);
1714 }
1715 last = cur;
1716 }
1717 if (last) {
1718 __raid56_parity_write(last);
1719 }
1720 kfree(plug);
1721}
1722
1723/*
1724 * if the unplug comes from schedule, we have to push the
1725 * work off to a helper thread
1726 */
1727static void unplug_work(struct btrfs_work *work)
1728{
1729 struct btrfs_plug_cb *plug;
1730 plug = container_of(work, struct btrfs_plug_cb, work);
1731 run_plug(plug);
1732}
1733
1734static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1735{
1736 struct btrfs_plug_cb *plug;
1737 plug = container_of(cb, struct btrfs_plug_cb, cb);
1738
1739 if (from_schedule) {
a0cac0ec 1740 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
d05a33ac
QW
1741 btrfs_queue_work(plug->info->rmw_workers,
1742 &plug->work);
6ac0f488
CM
1743 return;
1744 }
1745 run_plug(plug);
1746}
1747
53b381b3
DW
1748/*
1749 * our main entry point for writes from the rest of the FS.
1750 */
cc353a8b 1751int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
53b381b3 1752{
6a258d72 1753 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 1754 struct btrfs_raid_bio *rbio;
6ac0f488
CM
1755 struct btrfs_plug_cb *plug = NULL;
1756 struct blk_plug_cb *cb;
4245215d 1757 int ret;
53b381b3 1758
4c664611 1759 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 1760 if (IS_ERR(rbio)) {
4c664611 1761 btrfs_put_bioc(bioc);
53b381b3 1762 return PTR_ERR(rbio);
af8e2d1d 1763 }
53b381b3 1764 bio_list_add(&rbio->bio_list, bio);
4f024f37 1765 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1b94b556 1766 rbio->operation = BTRFS_RBIO_WRITE;
6ac0f488 1767
0b246afa 1768 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
1769 rbio->generic_bio_cnt = 1;
1770
6ac0f488
CM
1771 /*
1772 * don't plug on full rbios, just get them out the door
1773 * as quickly as we can
1774 */
4245215d
MX
1775 if (rbio_is_full(rbio)) {
1776 ret = full_stripe_write(rbio);
1777 if (ret)
0b246afa 1778 btrfs_bio_counter_dec(fs_info);
4245215d
MX
1779 return ret;
1780 }
6ac0f488 1781
0b246afa 1782 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
6ac0f488
CM
1783 if (cb) {
1784 plug = container_of(cb, struct btrfs_plug_cb, cb);
1785 if (!plug->info) {
0b246afa 1786 plug->info = fs_info;
6ac0f488
CM
1787 INIT_LIST_HEAD(&plug->rbio_list);
1788 }
1789 list_add_tail(&rbio->plug_list, &plug->rbio_list);
4245215d 1790 ret = 0;
6ac0f488 1791 } else {
4245215d
MX
1792 ret = __raid56_parity_write(rbio);
1793 if (ret)
0b246afa 1794 btrfs_bio_counter_dec(fs_info);
6ac0f488 1795 }
4245215d 1796 return ret;
53b381b3
DW
1797}
1798
1799/*
1800 * all parity reconstruction happens here. We've read in everything
1801 * we can find from the drives and this does the heavy lifting of
1802 * sorting the good from the bad.
1803 */
1804static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1805{
1806 int pagenr, stripe;
1807 void **pointers;
94a0b58d 1808 void **unmap_array;
53b381b3 1809 int faila = -1, failb = -1;
53b381b3 1810 struct page *page;
58efbc9f 1811 blk_status_t err;
53b381b3
DW
1812 int i;
1813
31e818fe 1814 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
53b381b3 1815 if (!pointers) {
58efbc9f 1816 err = BLK_STS_RESOURCE;
53b381b3
DW
1817 goto cleanup_io;
1818 }
1819
94a0b58d
IW
1820 /*
1821 * Store copy of pointers that does not get reordered during
1822 * reconstruction so that kunmap_local works.
1823 */
1824 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1825 if (!unmap_array) {
1826 err = BLK_STS_RESOURCE;
1827 goto cleanup_pointers;
1828 }
1829
53b381b3
DW
1830 faila = rbio->faila;
1831 failb = rbio->failb;
1832
b4ee1782
OS
1833 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1834 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
53b381b3
DW
1835 spin_lock_irq(&rbio->bio_list_lock);
1836 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1837 spin_unlock_irq(&rbio->bio_list_lock);
1838 }
1839
1840 index_rbio_pages(rbio);
1841
915e2290 1842 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
5a6ac9ea
MX
1843 /*
 1844 * Now we just use the dbitmap to mark the horizontal stripes in
 1845 * which we have data when doing parity scrub.
1846 */
1847 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1848 !test_bit(pagenr, rbio->dbitmap))
1849 continue;
1850
94a0b58d
IW
1851 /*
1852 * Setup our array of pointers with pages from each stripe
1853 *
1854 * NOTE: store a duplicate array of pointers to preserve the
1855 * pointer order
53b381b3 1856 */
2c8cdd6e 1857 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
53b381b3
DW
1858 /*
1859 * if we're rebuilding a read, we have to use
1860 * pages from the bio list
1861 */
b4ee1782
OS
1862 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1863 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
53b381b3
DW
1864 (stripe == faila || stripe == failb)) {
1865 page = page_in_rbio(rbio, stripe, pagenr, 0);
1866 } else {
1867 page = rbio_stripe_page(rbio, stripe, pagenr);
1868 }
94a0b58d
IW
1869 pointers[stripe] = kmap_local_page(page);
1870 unmap_array[stripe] = pointers[stripe];
53b381b3
DW
1871 }
1872
1873 /* all raid6 handling here */
4c664611 1874 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
53b381b3
DW
1875 /*
1876 * single failure, rebuild from parity raid5
1877 * style
1878 */
1879 if (failb < 0) {
1880 if (faila == rbio->nr_data) {
1881 /*
1882 * Just the P stripe has failed, without
1883 * a bad data or Q stripe.
1884 * TODO, we should redo the xor here.
1885 */
58efbc9f 1886 err = BLK_STS_IOERR;
53b381b3
DW
1887 goto cleanup;
1888 }
1889 /*
1890 * a single failure in raid6 is rebuilt
1891 * in the pstripe code below
1892 */
1893 goto pstripe;
1894 }
1895
1896 /* make sure our ps and qs are in order */
b7d2083a
NB
1897 if (faila > failb)
1898 swap(faila, failb);
53b381b3
DW
1899
 1900 /* If the Q stripe has failed, do a P stripe reconstruction
 1901 * from the xors.
 1902 * If both the Q stripe and the P stripe have failed, we're
 1903 * here due to a crc mismatch and we can't give them the
 1904 * data they want
1905 */
4c664611
QW
1906 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1907 if (rbio->bioc->raid_map[faila] ==
8e5cfb55 1908 RAID5_P_STRIPE) {
58efbc9f 1909 err = BLK_STS_IOERR;
53b381b3
DW
1910 goto cleanup;
1911 }
1912 /*
1913 * otherwise we have one bad data stripe and
1914 * a good P stripe. raid5!
1915 */
1916 goto pstripe;
1917 }
1918
4c664611 1919 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2c8cdd6e 1920 raid6_datap_recov(rbio->real_stripes,
53b381b3
DW
1921 PAGE_SIZE, faila, pointers);
1922 } else {
2c8cdd6e 1923 raid6_2data_recov(rbio->real_stripes,
53b381b3
DW
1924 PAGE_SIZE, faila, failb,
1925 pointers);
1926 }
1927 } else {
1928 void *p;
1929
1930 /* rebuild from P stripe here (raid5 or raid6) */
1931 BUG_ON(failb != -1);
1932pstripe:
1933 /* Copy parity block into failed block to start with */
69d24804 1934 copy_page(pointers[faila], pointers[rbio->nr_data]);
53b381b3
DW
1935
1936 /* rearrange the pointer array */
1937 p = pointers[faila];
1938 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1939 pointers[stripe] = pointers[stripe + 1];
1940 pointers[rbio->nr_data - 1] = p;
1941
1942 /* xor in the rest */
09cbfeaf 1943 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
53b381b3
DW
1944 }
1945 /* if we're doing this rebuild as part of an rmw, go through
1946 * and set all of our private rbio pages in the
1947 * failed stripes as uptodate. This way finish_rmw will
1948 * know they can be trusted. If this was a read reconstruction,
1949 * other endio functions will fiddle the uptodate bits
1950 */
1b94b556 1951 if (rbio->operation == BTRFS_RBIO_WRITE) {
915e2290 1952 for (i = 0; i < rbio->stripe_npages; i++) {
53b381b3
DW
1953 if (faila != -1) {
1954 page = rbio_stripe_page(rbio, faila, i);
1955 SetPageUptodate(page);
1956 }
1957 if (failb != -1) {
1958 page = rbio_stripe_page(rbio, failb, i);
1959 SetPageUptodate(page);
1960 }
1961 }
1962 }
94a0b58d
IW
1963 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
1964 kunmap_local(unmap_array[stripe]);
53b381b3
DW
1965 }
1966
58efbc9f 1967 err = BLK_STS_OK;
53b381b3 1968cleanup:
94a0b58d
IW
1969 kfree(unmap_array);
1970cleanup_pointers:
53b381b3
DW
1971 kfree(pointers);
1972
1973cleanup_io:
580c6efa
LB
1974 /*
1975 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
 1976 * valid rbio which is consistent with on-disk content, thus such a
1977 * valid rbio can be cached to avoid further disk reads.
1978 */
1979 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1980 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
44ac474d
LB
1981 /*
1982 * - In case of two failures, where rbio->failb != -1:
1983 *
1984 * Do not cache this rbio since the above read reconstruction
1985 * (raid6_datap_recov() or raid6_2data_recov()) may have
1986 * changed some content of stripes which are not identical to
1987 * on-disk content any more, otherwise, a later write/recover
1988 * may steal stripe_pages from this rbio and end up with
1989 * corruptions or rebuild failures.
1990 *
1991 * - In case of single failure, where rbio->failb == -1:
1992 *
1993 * Cache this rbio iff the above read reconstruction is
52042d8e 1994 * executed without problems.
44ac474d
LB
1995 */
1996 if (err == BLK_STS_OK && rbio->failb < 0)
4ae10b3a
CM
1997 cache_rbio_pages(rbio);
1998 else
1999 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2000
4246a0b6 2001 rbio_orig_end_io(rbio, err);
58efbc9f 2002 } else if (err == BLK_STS_OK) {
53b381b3
DW
2003 rbio->faila = -1;
2004 rbio->failb = -1;
5a6ac9ea
MX
2005
2006 if (rbio->operation == BTRFS_RBIO_WRITE)
2007 finish_rmw(rbio);
2008 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2009 finish_parity_scrub(rbio, 0);
2010 else
2011 BUG();
53b381b3 2012 } else {
4246a0b6 2013 rbio_orig_end_io(rbio, err);
53b381b3
DW
2014 }
2015}
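/*
 * Illustrative sketch (not part of raid56.c): the "pstripe" branch above
 * rebuilds a single missing block by copying the parity into the failed
 * slot and xoring in every surviving data block, because for RAID5
 * P = D0 ^ D1 ^ ... ^ D(n-1). A minimal standalone version of that idea:
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * blocks[0..nr_data-1] are the data blocks and blocks[nr_data] is the
 * parity; the block at index 'faila' is reconstructed in place.
 */
static void toy_raid5_rebuild(uint8_t **blocks, int nr_data, int faila,
			      size_t blocksize)
{
	memcpy(blocks[faila], blocks[nr_data], blocksize);

	for (int stripe = 0; stripe < nr_data; stripe++) {
		if (stripe == faila)
			continue;
		for (size_t i = 0; i < blocksize; i++)
			blocks[faila][i] ^= blocks[stripe][i];
	}
}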
2016
2017/*
2018 * This is called only for stripes we've read from disk to
2019 * reconstruct the parity.
2020 */
4246a0b6 2021static void raid_recover_end_io(struct bio *bio)
53b381b3
DW
2022{
2023 struct btrfs_raid_bio *rbio = bio->bi_private;
2024
2025 /*
 2026 * we only read stripe pages off the disk; set them
 2027 * up to date if there were no errors
2028 */
4e4cbee9 2029 if (bio->bi_status)
53b381b3
DW
2030 fail_bio_stripe(rbio, bio);
2031 else
2032 set_bio_pages_uptodate(bio);
2033 bio_put(bio);
2034
b89e1b01 2035 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
2036 return;
2037
4c664611 2038 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
58efbc9f 2039 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
2040 else
2041 __raid_recover_end_io(rbio);
2042}
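/*
 * Illustrative sketch (not part of raid56.c): the end_io handlers in this
 * file share one completion pattern: every submitted bio decrements
 * stripes_pending and only the final completion drives the next phase
 * (reconstruction here, finish_rmw/finish_parity_scrub elsewhere). In
 * userspace terms, with C11 atomics standing in for the kernel's atomic_t:
 */
#include <stdatomic.h>
#include <stdbool.h>

/* Analogue of atomic_dec_and_test(): true only for the final decrement. */
static bool toy_last_completion(atomic_int *pending)
{
	return atomic_fetch_sub(pending, 1) - 1 == 0;
}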
2043
2044/*
2045 * reads everything we need off the disk to reconstruct
2046 * the parity. endio handlers trigger final reconstruction
2047 * when the IO is done.
2048 *
2049 * This is used both for reads from the higher layers and for
 2050 * parity construction required to finish an rmw cycle.
2051 */
2052static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2053{
2054 int bios_to_read = 0;
53b381b3
DW
2055 struct bio_list bio_list;
2056 int ret;
53b381b3
DW
2057 int pagenr;
2058 int stripe;
2059 struct bio *bio;
2060
2061 bio_list_init(&bio_list);
2062
2063 ret = alloc_rbio_pages(rbio);
2064 if (ret)
2065 goto cleanup;
2066
b89e1b01 2067 atomic_set(&rbio->error, 0);
53b381b3
DW
2068
2069 /*
4ae10b3a
CM
2070 * read everything that hasn't failed. Thanks to the
2071 * stripe cache, it is possible that some or all of these
2072 * pages are going to be uptodate.
53b381b3 2073 */
2c8cdd6e 2074 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
5588383e 2075 if (rbio->faila == stripe || rbio->failb == stripe) {
b89e1b01 2076 atomic_inc(&rbio->error);
53b381b3 2077 continue;
5588383e 2078 }
53b381b3 2079
915e2290 2080 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
53b381b3
DW
2081 struct page *p;
2082
2083 /*
2084 * the rmw code may have already read this
2085 * page in
2086 */
2087 p = rbio_stripe_page(rbio, stripe, pagenr);
2088 if (PageUptodate(p))
2089 continue;
2090
2091 ret = rbio_add_io_page(rbio, &bio_list,
2092 rbio_stripe_page(rbio, stripe, pagenr),
e01bf588
CH
2093 stripe, pagenr, rbio->stripe_len,
2094 REQ_OP_READ);
53b381b3
DW
2095 if (ret < 0)
2096 goto cleanup;
2097 }
2098 }
2099
2100 bios_to_read = bio_list_size(&bio_list);
2101 if (!bios_to_read) {
2102 /*
2103 * we might have no bios to read just because the pages
2104 * were up to date, or we might have no bios to read because
2105 * the devices were gone.
2106 */
4c664611 2107 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
53b381b3 2108 __raid_recover_end_io(rbio);
813f8a0e 2109 return 0;
53b381b3
DW
2110 } else {
2111 goto cleanup;
2112 }
2113 }
2114
2115 /*
4c664611
QW
2116 * The bioc may be freed once we submit the last bio. Make sure not to
2117 * touch it after that.
53b381b3 2118 */
b89e1b01 2119 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2120 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
2121 bio->bi_end_io = raid_recover_end_io;
2122
6a258d72 2123 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 2124
4e49ea4a 2125 submit_bio(bio);
53b381b3 2126 }
813f8a0e 2127
53b381b3
DW
2128 return 0;
2129
2130cleanup:
b4ee1782
OS
2131 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2132 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
58efbc9f 2133 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2134
2135 while ((bio = bio_list_pop(&bio_list)))
2136 bio_put(bio);
2137
53b381b3
DW
2138 return -EIO;
2139}
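/*
 * Illustrative sketch (not part of raid56.c): the recovery read loop above
 * skips the failed stripes, counts them as errors up front, and only moves
 * on to reconstruction while the error count stays within what the profile
 * tolerates (bioc->max_errors, typically 1 for RAID5 and 2 for RAID6). A
 * standalone version of that admission check:
 */
#include <stdbool.h>

/* faila/failb are -1 when unused, otherwise the failed stripe numbers. */
static bool toy_can_recover(int faila, int failb, int max_errors)
{
	int errors = (faila >= 0) + (failb >= 0);

	return errors <= max_errors;
}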
2140
2141/*
2142 * the main entry point for reads from the higher layers. This
2143 * is really only called when the normal read path had a failure,
2144 * so we assume the bio they send down corresponds to a failed part
2145 * of the drive.
2146 */
6a258d72 2147int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
cc353a8b 2148 u32 stripe_len, int mirror_num, int generic_io)
53b381b3 2149{
6a258d72 2150 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3
DW
2151 struct btrfs_raid_bio *rbio;
2152 int ret;
2153
abad60c6 2154 if (generic_io) {
4c664611 2155 ASSERT(bioc->mirror_num == mirror_num);
c3a3b19b 2156 btrfs_bio(bio)->mirror_num = mirror_num;
abad60c6
LB
2157 }
2158
4c664611 2159 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 2160 if (IS_ERR(rbio)) {
6e9606d2 2161 if (generic_io)
4c664611 2162 btrfs_put_bioc(bioc);
53b381b3 2163 return PTR_ERR(rbio);
af8e2d1d 2164 }
53b381b3 2165
1b94b556 2166 rbio->operation = BTRFS_RBIO_READ_REBUILD;
53b381b3 2167 bio_list_add(&rbio->bio_list, bio);
4f024f37 2168 rbio->bio_list_bytes = bio->bi_iter.bi_size;
53b381b3
DW
2169
2170 rbio->faila = find_logical_bio_stripe(rbio, bio);
2171 if (rbio->faila == -1) {
0b246afa 2172 btrfs_warn(fs_info,
4c664611 2173"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
1201b58b 2174 __func__, bio->bi_iter.bi_sector << 9,
4c664611 2175 (u64)bio->bi_iter.bi_size, bioc->map_type);
6e9606d2 2176 if (generic_io)
4c664611 2177 btrfs_put_bioc(bioc);
53b381b3
DW
2178 kfree(rbio);
2179 return -EIO;
2180 }
2181
4245215d 2182 if (generic_io) {
0b246afa 2183 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
2184 rbio->generic_bio_cnt = 1;
2185 } else {
4c664611 2186 btrfs_get_bioc(bioc);
4245215d
MX
2187 }
2188
53b381b3 2189 /*
8810f751
LB
2190 * Loop retry:
 2191 * for 'mirror_num == 2', reconstruct from all other stripes.
2192 * for 'mirror_num > 2', select a stripe to fail on every retry.
53b381b3 2193 */
8810f751
LB
2194 if (mirror_num > 2) {
2195 /*
2196 * 'mirror == 3' is to fail the p stripe and
2197 * reconstruct from the q stripe. 'mirror > 3' is to
2198 * fail a data stripe and reconstruct from p+q stripe.
2199 */
2200 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2201 ASSERT(rbio->failb > 0);
2202 if (rbio->failb <= rbio->faila)
2203 rbio->failb--;
2204 }
53b381b3
DW
2205
2206 ret = lock_stripe_add(rbio);
2207
2208 /*
2209 * __raid56_parity_recover will end the bio with
2210 * any errors it hits. We don't want to return
2211 * its error value up the stack because our caller
2212 * will end up calling bio_endio with any nonzero
2213 * return
2214 */
2215 if (ret == 0)
2216 __raid56_parity_recover(rbio);
2217 /*
2218 * our rbio has been added to the list of
2219 * rbios that will be handled after the
 2220 * current lock owner is done
2221 */
2222 return 0;
2223
2224}
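/*
 * Illustrative sketch (not part of raid56.c): the mirror_num handling above
 * turns the retry number into a second stripe to treat as failed. With
 * real_stripes == nr_data + 2 on RAID6, mirror_num == 3 fails the P stripe
 * (index nr_data), larger values fail successive data stripes, and failb is
 * nudged down when it would collide with the already-failed faila:
 */
static int toy_pick_failb(int mirror_num, int real_stripes, int faila)
{
	int failb;

	if (mirror_num <= 2)
		return -1;	/* only the stripe under faila is failed */

	failb = real_stripes - (mirror_num - 1);
	if (failb <= faila)
		failb--;
	return failb;
}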
2225
2226static void rmw_work(struct btrfs_work *work)
2227{
2228 struct btrfs_raid_bio *rbio;
2229
2230 rbio = container_of(work, struct btrfs_raid_bio, work);
2231 raid56_rmw_stripe(rbio);
2232}
2233
2234static void read_rebuild_work(struct btrfs_work *work)
2235{
2236 struct btrfs_raid_bio *rbio;
2237
2238 rbio = container_of(work, struct btrfs_raid_bio, work);
2239 __raid56_parity_recover(rbio);
2240}
5a6ac9ea
MX
2241
2242/*
2243 * The following code is used to scrub/replace the parity stripe
2244 *
4c664611 2245 * Caller must have already increased bio_counter for getting @bioc.
ae6529c3 2246 *
5a6ac9ea
MX
 2247 * Note: We need to make sure all the pages added to the scrub/replace
 2248 * raid bio are correct and are not changed during the scrub/replace. That
 2249 * is, those pages must hold only metadata or file data with checksums.
2250 */
2251
6a258d72
QW
2252struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2253 struct btrfs_io_context *bioc,
cc353a8b 2254 u32 stripe_len, struct btrfs_device *scrub_dev,
6a258d72 2255 unsigned long *dbitmap, int stripe_nsectors)
5a6ac9ea 2256{
6a258d72 2257 struct btrfs_fs_info *fs_info = bioc->fs_info;
5a6ac9ea
MX
2258 struct btrfs_raid_bio *rbio;
2259 int i;
2260
4c664611 2261 rbio = alloc_rbio(fs_info, bioc, stripe_len);
5a6ac9ea
MX
2262 if (IS_ERR(rbio))
2263 return NULL;
2264 bio_list_add(&rbio->bio_list, bio);
2265 /*
2266 * This is a special bio which is used to hold the completion handler
 2267 * and make the scrub rbio similar to the other types
2268 */
2269 ASSERT(!bio->bi_iter.bi_size);
2270 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2271
9cd3a7eb 2272 /*
4c664611 2273 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
9cd3a7eb
LB
2274 * to the end position, so this search can start from the first parity
2275 * stripe.
2276 */
2277 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
4c664611 2278 if (bioc->stripes[i].dev == scrub_dev) {
5a6ac9ea
MX
2279 rbio->scrubp = i;
2280 break;
2281 }
2282 }
9cd3a7eb 2283 ASSERT(i < rbio->real_stripes);
5a6ac9ea
MX
2284
 2285 /* For now we only support sectorsize equal to PAGE_SIZE */
0b246afa 2286 ASSERT(fs_info->sectorsize == PAGE_SIZE);
5a6ac9ea
MX
2287 ASSERT(rbio->stripe_npages == stripe_nsectors);
2288 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2289
ae6529c3 2290 /*
4c664611 2291 * We have already increased bio_counter when getting bioc, record it
ae6529c3
QW
2292 * so we can free it at rbio_orig_end_io().
2293 */
2294 rbio->generic_bio_cnt = 1;
2295
5a6ac9ea
MX
2296 return rbio;
2297}
2298
b4ee1782
OS
2299/* Used for both parity scrub and missing. */
2300void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2301 u64 logical)
5a6ac9ea
MX
2302{
2303 int stripe_offset;
2304 int index;
2305
4c664611
QW
2306 ASSERT(logical >= rbio->bioc->raid_map[0]);
2307 ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
5a6ac9ea 2308 rbio->stripe_len * rbio->nr_data);
4c664611 2309 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
09cbfeaf 2310 index = stripe_offset >> PAGE_SHIFT;
5a6ac9ea
MX
2311 rbio->bio_pages[index] = page;
2312}
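/*
 * Illustrative sketch (not part of raid56.c): raid56_add_scrub_pages() above
 * locates the slot for a page purely from its logical address: the byte
 * offset from the start of the full stripe (bioc->raid_map[0]) shifted down
 * by the page size. Standalone version of that index math:
 */
#include <stdint.h>

static int toy_scrub_page_index(uint64_t logical, uint64_t full_stripe_start,
				unsigned int page_shift)
{
	return (int)((logical - full_stripe_start) >> page_shift);
}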
2313
2314/*
2315 * We just scrub the parity that we have correct data on the same horizontal,
2316 * so we needn't allocate all pages for all the stripes.
2317 */
2318static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2319{
2320 int i;
2321 int bit;
2322 int index;
2323 struct page *page;
2324
2325 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2c8cdd6e 2326 for (i = 0; i < rbio->real_stripes; i++) {
5a6ac9ea
MX
2327 index = i * rbio->stripe_npages + bit;
2328 if (rbio->stripe_pages[index])
2329 continue;
2330
b0ee5e1e 2331 page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2332 if (!page)
2333 return -ENOMEM;
2334 rbio->stripe_pages[index] = page;
5a6ac9ea
MX
2335 }
2336 }
eb357060 2337 index_stripe_sectors(rbio);
5a6ac9ea
MX
2338 return 0;
2339}
2340
5a6ac9ea
MX
2341static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2342 int need_check)
2343{
4c664611 2344 struct btrfs_io_context *bioc = rbio->bioc;
1389053e
KC
2345 void **pointers = rbio->finish_pointers;
2346 unsigned long *pbitmap = rbio->finish_pbitmap;
5a6ac9ea
MX
2347 int nr_data = rbio->nr_data;
2348 int stripe;
2349 int pagenr;
c17af965 2350 bool has_qstripe;
5a6ac9ea
MX
2351 struct page *p_page = NULL;
2352 struct page *q_page = NULL;
2353 struct bio_list bio_list;
2354 struct bio *bio;
76035976 2355 int is_replace = 0;
5a6ac9ea
MX
2356 int ret;
2357
2358 bio_list_init(&bio_list);
2359
c17af965
DS
2360 if (rbio->real_stripes - rbio->nr_data == 1)
2361 has_qstripe = false;
2362 else if (rbio->real_stripes - rbio->nr_data == 2)
2363 has_qstripe = true;
2364 else
5a6ac9ea 2365 BUG();
5a6ac9ea 2366
4c664611 2367 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
76035976
MX
2368 is_replace = 1;
2369 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2370 }
2371
5a6ac9ea
MX
2372 /*
 2373 * The higher layers (the scrubber) are unlikely to
 2374 * use this area of the disk again soon, so don't cache
 2375 * it.
2376 */
2377 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2378
2379 if (!need_check)
2380 goto writeback;
2381
b0ee5e1e 2382 p_page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2383 if (!p_page)
2384 goto cleanup;
2385 SetPageUptodate(p_page);
2386
c17af965 2387 if (has_qstripe) {
d70cef0d 2388 /* RAID6, allocate and map temp space for the Q stripe */
b0ee5e1e 2389 q_page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2390 if (!q_page) {
2391 __free_page(p_page);
2392 goto cleanup;
2393 }
2394 SetPageUptodate(q_page);
94a0b58d 2395 pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
5a6ac9ea
MX
2396 }
2397
2398 atomic_set(&rbio->error, 0);
2399
d70cef0d 2400 /* Map the parity stripe just once */
94a0b58d 2401 pointers[nr_data] = kmap_local_page(p_page);
d70cef0d 2402
5a6ac9ea
MX
2403 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2404 struct page *p;
2405 void *parity;
2406 /* first collect one page from each data stripe */
2407 for (stripe = 0; stripe < nr_data; stripe++) {
2408 p = page_in_rbio(rbio, stripe, pagenr, 0);
94a0b58d 2409 pointers[stripe] = kmap_local_page(p);
5a6ac9ea
MX
2410 }
2411
c17af965 2412 if (has_qstripe) {
d70cef0d 2413 /* RAID6, call the library function to fill in our P/Q */
2c8cdd6e 2414 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
5a6ac9ea
MX
2415 pointers);
2416 } else {
2417 /* raid5 */
69d24804 2418 copy_page(pointers[nr_data], pointers[0]);
09cbfeaf 2419 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
5a6ac9ea
MX
2420 }
2421
01327610 2422 /* Check scrubbing parity and repair it */
5a6ac9ea 2423 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
58c1a35c 2424 parity = kmap_local_page(p);
09cbfeaf 2425 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
69d24804 2426 copy_page(parity, pointers[rbio->scrubp]);
5a6ac9ea
MX
2427 else
2428 /* Parity is right, needn't writeback */
2429 bitmap_clear(rbio->dbitmap, pagenr, 1);
58c1a35c 2430 kunmap_local(parity);
5a6ac9ea 2431
94a0b58d
IW
2432 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2433 kunmap_local(pointers[stripe]);
5a6ac9ea
MX
2434 }
2435
94a0b58d 2436 kunmap_local(pointers[nr_data]);
5a6ac9ea 2437 __free_page(p_page);
d70cef0d 2438 if (q_page) {
94a0b58d 2439 kunmap_local(pointers[rbio->real_stripes - 1]);
5a6ac9ea 2440 __free_page(q_page);
d70cef0d 2441 }
5a6ac9ea
MX
2442
2443writeback:
2444 /*
2445 * time to start writing. Make bios for everything from the
2446 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2447 * everything else.
2448 */
2449 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2450 struct page *page;
2451
2452 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
e01bf588
CH
2453 ret = rbio_add_io_page(rbio, &bio_list, page, rbio->scrubp,
2454 pagenr, rbio->stripe_len, REQ_OP_WRITE);
5a6ac9ea
MX
2455 if (ret)
2456 goto cleanup;
2457 }
2458
76035976
MX
2459 if (!is_replace)
2460 goto submit_write;
2461
2462 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2463 struct page *page;
2464
2465 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2466 ret = rbio_add_io_page(rbio, &bio_list, page,
4c664611 2467 bioc->tgtdev_map[rbio->scrubp],
e01bf588 2468 pagenr, rbio->stripe_len, REQ_OP_WRITE);
76035976
MX
2469 if (ret)
2470 goto cleanup;
2471 }
2472
2473submit_write:
5a6ac9ea
MX
2474 nr_data = bio_list_size(&bio_list);
2475 if (!nr_data) {
2476 /* Every parity is right */
58efbc9f 2477 rbio_orig_end_io(rbio, BLK_STS_OK);
5a6ac9ea
MX
2478 return;
2479 }
2480
2481 atomic_set(&rbio->stripes_pending, nr_data);
2482
bf28a605 2483 while ((bio = bio_list_pop(&bio_list))) {
a6111d11 2484 bio->bi_end_io = raid_write_end_io;
4e49ea4a
MC
2485
2486 submit_bio(bio);
5a6ac9ea
MX
2487 }
2488 return;
2489
2490cleanup:
58efbc9f 2491 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2492
2493 while ((bio = bio_list_pop(&bio_list)))
2494 bio_put(bio);
5a6ac9ea
MX
2495}
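/*
 * Illustrative sketch (not part of raid56.c): the core of the parity scrub
 * above is "recompute the parity from the data pages, compare it with what
 * is on disk, and only queue a writeback when they differ". For the RAID5
 * case that boils down to the standalone check below; the RAID6 case uses
 * raid6_call.gen_syndrome() instead of the xor loop. Names are hypothetical.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Returns true when the on-disk parity was wrong and has been repaired. */
static bool toy_scrub_raid5_parity(uint8_t *const *data, int nr_data,
				   uint8_t *ondisk_parity, uint8_t *scratch,
				   size_t blocksize)
{
	memcpy(scratch, data[0], blocksize);
	for (int stripe = 1; stripe < nr_data; stripe++)
		for (size_t i = 0; i < blocksize; i++)
			scratch[i] ^= data[stripe][i];

	if (memcmp(ondisk_parity, scratch, blocksize) == 0)
		return false;	/* parity is right, nothing to write back */

	memcpy(ondisk_parity, scratch, blocksize);
	return true;		/* caller should write this page back */
}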
2496
2497static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2498{
2499 if (stripe >= 0 && stripe < rbio->nr_data)
2500 return 1;
2501 return 0;
2502}
2503
2504/*
2505 * While we're doing the parity check and repair, we could have errors
2506 * in reading pages off the disk. This checks for errors and if we're
2507 * not able to read the page it'll trigger parity reconstruction. The
2508 * parity scrub will be finished after we've reconstructed the failed
2509 * stripes
2510 */
2511static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2512{
4c664611 2513 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
5a6ac9ea
MX
2514 goto cleanup;
2515
2516 if (rbio->faila >= 0 || rbio->failb >= 0) {
2517 int dfail = 0, failp = -1;
2518
2519 if (is_data_stripe(rbio, rbio->faila))
2520 dfail++;
2521 else if (is_parity_stripe(rbio->faila))
2522 failp = rbio->faila;
2523
2524 if (is_data_stripe(rbio, rbio->failb))
2525 dfail++;
2526 else if (is_parity_stripe(rbio->failb))
2527 failp = rbio->failb;
2528
2529 /*
 2530 * We cannot use the parity that is being scrubbed to repair
 2531 * the data, so our ability to repair is reduced.
 2532 * (In the case of RAID5, we cannot repair anything.)
2533 */
4c664611 2534 if (dfail > rbio->bioc->max_errors - 1)
5a6ac9ea
MX
2535 goto cleanup;
2536
2537 /*
 2538 * If all the data is good and only the parity is bad, just
 2539 * repair the parity.
2540 */
2541 if (dfail == 0) {
2542 finish_parity_scrub(rbio, 0);
2543 return;
2544 }
2545
2546 /*
 2547 * Here we have one corrupted data stripe and one
 2548 * corrupted parity on RAID6. If the corrupted parity is the
01327610 2549 * one being scrubbed, we can luckily use the other parity to
5a6ac9ea
MX
 2550 * repair the data; otherwise we cannot repair the data stripe.
2551 */
2552 if (failp != rbio->scrubp)
2553 goto cleanup;
2554
2555 __raid_recover_end_io(rbio);
2556 } else {
2557 finish_parity_scrub(rbio, 1);
2558 }
2559 return;
2560
2561cleanup:
58efbc9f 2562 rbio_orig_end_io(rbio, BLK_STS_IOERR);
5a6ac9ea
MX
2563}
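/*
 * Illustrative sketch (not part of raid56.c): validate_rbio_for_parity_scrub()
 * above classifies the failures before deciding what to do: count how many of
 * them hit data stripes (dfail) and remember a failed parity stripe (failp).
 * Too many data failures means giving up, zero data failures means only the
 * parity gets repaired, and a data failure is recoverable only when the failed
 * parity is the one being scrubbed. A standalone restatement of that decision:
 */
enum toy_scrub_action { TOY_GIVE_UP, TOY_REPAIR_PARITY_ONLY, TOY_RECONSTRUCT };

static enum toy_scrub_action toy_scrub_decide(int dfail, int failp, int scrubp,
					      int max_errors)
{
	/* The scrubbed parity cannot help repair data, so one tolerated error is lost. */
	if (dfail > max_errors - 1)
		return TOY_GIVE_UP;
	if (dfail == 0)
		return TOY_REPAIR_PARITY_ONLY;
	/* failp stays -1 when no parity stripe failed, which also gives up. */
	if (failp != scrubp)
		return TOY_GIVE_UP;
	return TOY_RECONSTRUCT;
}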
2564
2565/*
 2566 * end io for the read phase of the scrub cycle. All the bios here are physical
 2567 * stripe bios we've read from the disk so we can recalculate the parity of the
 2568 * stripe.
 2569 *
 2570 * This will usually kick off finish_parity_scrub once all the bios are read in,
 2571 * but it may trigger parity reconstruction if we had any errors along the way
2572 */
4246a0b6 2573static void raid56_parity_scrub_end_io(struct bio *bio)
5a6ac9ea
MX
2574{
2575 struct btrfs_raid_bio *rbio = bio->bi_private;
2576
4e4cbee9 2577 if (bio->bi_status)
5a6ac9ea
MX
2578 fail_bio_stripe(rbio, bio);
2579 else
2580 set_bio_pages_uptodate(bio);
2581
2582 bio_put(bio);
2583
2584 if (!atomic_dec_and_test(&rbio->stripes_pending))
2585 return;
2586
2587 /*
 2588 * this will normally call finish_parity_scrub to start our write
2589 * but if there are any failed stripes we'll reconstruct
2590 * from parity first
2591 */
2592 validate_rbio_for_parity_scrub(rbio);
2593}
2594
2595static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2596{
2597 int bios_to_read = 0;
5a6ac9ea
MX
2598 struct bio_list bio_list;
2599 int ret;
2600 int pagenr;
2601 int stripe;
2602 struct bio *bio;
2603
785884fc
LB
2604 bio_list_init(&bio_list);
2605
5a6ac9ea
MX
2606 ret = alloc_rbio_essential_pages(rbio);
2607 if (ret)
2608 goto cleanup;
2609
5a6ac9ea
MX
2610 atomic_set(&rbio->error, 0);
2611 /*
2612 * build a list of bios to read all the missing parts of this
2613 * stripe
2614 */
2c8cdd6e 2615 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
5a6ac9ea
MX
2616 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2617 struct page *page;
2618 /*
2619 * we want to find all the pages missing from
2620 * the rbio and read them from the disk. If
2621 * page_in_rbio finds a page in the bio list
2622 * we don't need to read it off the stripe.
2623 */
2624 page = page_in_rbio(rbio, stripe, pagenr, 1);
2625 if (page)
2626 continue;
2627
2628 page = rbio_stripe_page(rbio, stripe, pagenr);
2629 /*
2630 * the bio cache may have handed us an uptodate
2631 * page. If so, be happy and use it
2632 */
2633 if (PageUptodate(page))
2634 continue;
2635
e01bf588
CH
2636 ret = rbio_add_io_page(rbio, &bio_list, page, stripe,
2637 pagenr, rbio->stripe_len, REQ_OP_READ);
5a6ac9ea
MX
2638 if (ret)
2639 goto cleanup;
2640 }
2641 }
2642
2643 bios_to_read = bio_list_size(&bio_list);
2644 if (!bios_to_read) {
2645 /*
 2646 * this can happen if others have merged with
 2647 * us; it means there is nothing left to read.
2648 * But if there are missing devices it may not be
2649 * safe to do the full stripe write yet.
2650 */
2651 goto finish;
2652 }
2653
2654 /*
4c664611
QW
2655 * The bioc may be freed once we submit the last bio. Make sure not to
2656 * touch it after that.
5a6ac9ea
MX
2657 */
2658 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2659 while ((bio = bio_list_pop(&bio_list))) {
5a6ac9ea
MX
2660 bio->bi_end_io = raid56_parity_scrub_end_io;
2661
6a258d72 2662 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
5a6ac9ea 2663
4e49ea4a 2664 submit_bio(bio);
5a6ac9ea
MX
2665 }
2666 /* the actual write will happen once the reads are done */
2667 return;
2668
2669cleanup:
58efbc9f 2670 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2671
2672 while ((bio = bio_list_pop(&bio_list)))
2673 bio_put(bio);
2674
5a6ac9ea
MX
2675 return;
2676
2677finish:
2678 validate_rbio_for_parity_scrub(rbio);
2679}
2680
2681static void scrub_parity_work(struct btrfs_work *work)
2682{
2683 struct btrfs_raid_bio *rbio;
2684
2685 rbio = container_of(work, struct btrfs_raid_bio, work);
2686 raid56_parity_scrub_stripe(rbio);
2687}
2688
5a6ac9ea
MX
2689void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2690{
2691 if (!lock_stripe_add(rbio))
a81b747d 2692 start_async_work(rbio, scrub_parity_work);
5a6ac9ea 2693}
b4ee1782
OS
2694
2695/* The following code is used for dev replace of a missing RAID 5/6 device. */
2696
2697struct btrfs_raid_bio *
6a258d72
QW
2698raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2699 u64 length)
b4ee1782 2700{
6a258d72 2701 struct btrfs_fs_info *fs_info = bioc->fs_info;
b4ee1782
OS
2702 struct btrfs_raid_bio *rbio;
2703
4c664611 2704 rbio = alloc_rbio(fs_info, bioc, length);
b4ee1782
OS
2705 if (IS_ERR(rbio))
2706 return NULL;
2707
2708 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2709 bio_list_add(&rbio->bio_list, bio);
2710 /*
2711 * This is a special bio which is used to hold the completion handler
 2712 * and make the missing rbio similar to the other types
2713 */
2714 ASSERT(!bio->bi_iter.bi_size);
2715
2716 rbio->faila = find_logical_bio_stripe(rbio, bio);
2717 if (rbio->faila == -1) {
2718 BUG();
2719 kfree(rbio);
2720 return NULL;
2721 }
2722
ae6529c3 2723 /*
4c664611 2724 * When we get bioc, we have already increased bio_counter, record it
ae6529c3
QW
2725 * so we can free it at rbio_orig_end_io()
2726 */
2727 rbio->generic_bio_cnt = 1;
2728
b4ee1782
OS
2729 return rbio;
2730}
2731
b4ee1782
OS
2732void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2733{
2734 if (!lock_stripe_add(rbio))
e66d8d5a 2735 start_async_work(rbio, read_rebuild_work);
b4ee1782 2736}