btrfs: raid56: allocate memory separately for rbio pointers
[linux-block.git] / fs / btrfs / raid56.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
53b381b3
DW
2/*
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
53b381b3 5 */
c1d7c514 6
53b381b3 7#include <linux/sched.h>
53b381b3
DW
8#include <linux/bio.h>
9#include <linux/slab.h>
53b381b3 10#include <linux/blkdev.h>
53b381b3
DW
11#include <linux/raid/pq.h>
12#include <linux/hash.h>
13#include <linux/list_sort.h>
14#include <linux/raid/xor.h>
818e010b 15#include <linux/mm.h>
cea62800 16#include "misc.h"
53b381b3 17#include "ctree.h"
53b381b3 18#include "disk-io.h"
53b381b3
DW
19#include "volumes.h"
20#include "raid56.h"
21#include "async-thread.h"
53b381b3
DW
22
23/* set when additional merges to this rbio are not allowed */
24#define RBIO_RMW_LOCKED_BIT 1
25
4ae10b3a
CM
26/*
27 * set when this rbio is sitting in the hash, but it is just a cache
28 * of past RMW
29 */
30#define RBIO_CACHE_BIT 2
31
32/*
33 * set when it is safe to trust the stripe_pages for caching
34 */
35#define RBIO_CACHE_READY_BIT 3
36
4ae10b3a
CM
37#define RBIO_CACHE_SIZE 1024
38
8a953348
DS
39#define BTRFS_STRIPE_HASH_TABLE_BITS 11
40
41/* Used by the raid56 code to lock stripes for read/modify/write */
42struct btrfs_stripe_hash {
43 struct list_head hash_list;
44 spinlock_t lock;
45};
46
47/* Used by the raid56 code to lock stripes for read/modify/write */
48struct btrfs_stripe_hash_table {
49 struct list_head stripe_cache;
50 spinlock_t cache_lock;
51 int cache_size;
52 struct btrfs_stripe_hash table[];
53};
54
eb357060
QW
55/*
56 * A bvec like structure to present a sector inside a page.
57 *
58 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
59 */
60struct sector_ptr {
61 struct page *page;
00425dd9
QW
62 unsigned int pgoff:24;
63 unsigned int uptodate:8;
eb357060
QW
64};
65
53b381b3
DW
66static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
67static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
385de0ef
CH
68static void rmw_work(struct work_struct *work);
69static void read_rebuild_work(struct work_struct *work);
53b381b3
DW
70static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
71static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
53b381b3
DW
72static void index_rbio_pages(struct btrfs_raid_bio *rbio);
73static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
74
5a6ac9ea
MX
75static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
76 int need_check);
385de0ef 77static void scrub_parity_work(struct work_struct *work);
5a6ac9ea 78
797d74b7
QW
79static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
80{
81 kfree(rbio->stripe_pages);
82 kfree(rbio->bio_sectors);
83 kfree(rbio->stripe_sectors);
84 kfree(rbio->finish_pointers);
85}
86
ff2b64a2
QW
87static void free_raid_bio(struct btrfs_raid_bio *rbio)
88{
89 int i;
90
91 if (!refcount_dec_and_test(&rbio->refs))
92 return;
93
94 WARN_ON(!list_empty(&rbio->stripe_cache));
95 WARN_ON(!list_empty(&rbio->hash_list));
96 WARN_ON(!bio_list_empty(&rbio->bio_list));
97
98 for (i = 0; i < rbio->nr_pages; i++) {
99 if (rbio->stripe_pages[i]) {
100 __free_page(rbio->stripe_pages[i]);
101 rbio->stripe_pages[i] = NULL;
102 }
103 }
104
105 btrfs_put_bioc(rbio->bioc);
797d74b7 106 free_raid_bio_pointers(rbio);
ff2b64a2
QW
107 kfree(rbio);
108}
109
385de0ef 110static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
ac638859 111{
385de0ef
CH
112 INIT_WORK(&rbio->work, work_func);
113 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
ac638859
DS
114}
115
53b381b3
DW
116/*
117 * the stripe hash table is used for locking, and to collect
118 * bios in hopes of making a full stripe
119 */
120int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
121{
122 struct btrfs_stripe_hash_table *table;
123 struct btrfs_stripe_hash_table *x;
124 struct btrfs_stripe_hash *cur;
125 struct btrfs_stripe_hash *h;
126 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
127 int i;
128
129 if (info->stripe_hash_table)
130 return 0;
131
83c8266a
DS
132 /*
133 * The table is large, starting with order 4 and can go as high as
134 * order 7 in case lock debugging is turned on.
135 *
136 * Try harder to allocate and fallback to vmalloc to lower the chance
137 * of a failing mount.
138 */
ee787f95 139 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
818e010b
DS
140 if (!table)
141 return -ENOMEM;
53b381b3 142
4ae10b3a
CM
143 spin_lock_init(&table->cache_lock);
144 INIT_LIST_HEAD(&table->stripe_cache);
145
53b381b3
DW
146 h = table->table;
147
148 for (i = 0; i < num_entries; i++) {
149 cur = h + i;
150 INIT_LIST_HEAD(&cur->hash_list);
151 spin_lock_init(&cur->lock);
53b381b3
DW
152 }
153
154 x = cmpxchg(&info->stripe_hash_table, NULL, table);
fe3b7bb0 155 kvfree(x);
53b381b3
DW
156 return 0;
157}
158
4ae10b3a
CM
159/*
160 * caching an rbio means to copy anything from the
ac26df8b 161 * bio_sectors array into the stripe_pages array. We
4ae10b3a
CM
162 * use the page uptodate bit in the stripe cache array
163 * to indicate if it has valid data
164 *
165 * once the caching is done, we set the cache ready
166 * bit.
167 */
168static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
169{
170 int i;
4ae10b3a
CM
171 int ret;
172
173 ret = alloc_rbio_pages(rbio);
174 if (ret)
175 return;
176
00425dd9
QW
177 for (i = 0; i < rbio->nr_sectors; i++) {
178 /* Some range not covered by bio (partial write), skip it */
179 if (!rbio->bio_sectors[i].page)
180 continue;
181
182 ASSERT(rbio->stripe_sectors[i].page);
183 memcpy_page(rbio->stripe_sectors[i].page,
184 rbio->stripe_sectors[i].pgoff,
185 rbio->bio_sectors[i].page,
186 rbio->bio_sectors[i].pgoff,
187 rbio->bioc->fs_info->sectorsize);
188 rbio->stripe_sectors[i].uptodate = 1;
189 }
4ae10b3a
CM
190 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
191}
192
53b381b3
DW
193/*
194 * we hash on the first logical address of the stripe
195 */
196static int rbio_bucket(struct btrfs_raid_bio *rbio)
197{
4c664611 198 u64 num = rbio->bioc->raid_map[0];
53b381b3
DW
199
200 /*
201 * we shift down quite a bit. We're using byte
202 * addressing, and most of the lower bits are zeros.
203 * This tends to upset hash_64, and it consistently
204 * returns just one or two different values.
205 *
206 * shifting off the lower bits fixes things.
207 */
208 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
209}
210
d4e28d9b
QW
211static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
212 unsigned int page_nr)
213{
214 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
215 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
216 int i;
217
218 ASSERT(page_nr < rbio->nr_pages);
219
220 for (i = sectors_per_page * page_nr;
221 i < sectors_per_page * page_nr + sectors_per_page;
222 i++) {
223 if (!rbio->stripe_sectors[i].uptodate)
224 return false;
225 }
226 return true;
227}
228
eb357060
QW
229/*
230 * Update the stripe_sectors[] array to use correct page and pgoff
231 *
232 * Should be called every time any page pointer in stripes_pages[] got modified.
233 */
234static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
235{
236 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
237 u32 offset;
238 int i;
239
240 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
241 int page_index = offset >> PAGE_SHIFT;
242
243 ASSERT(page_index < rbio->nr_pages);
244 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
245 rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
246 }
247}
248
4d100466
QW
249static void steal_rbio_page(struct btrfs_raid_bio *src,
250 struct btrfs_raid_bio *dest, int page_nr)
251{
252 const u32 sectorsize = src->bioc->fs_info->sectorsize;
253 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
254 int i;
255
256 if (dest->stripe_pages[page_nr])
257 __free_page(dest->stripe_pages[page_nr]);
258 dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
259 src->stripe_pages[page_nr] = NULL;
260
261 /* Also update the sector->uptodate bits. */
262 for (i = sectors_per_page * page_nr;
263 i < sectors_per_page * page_nr + sectors_per_page; i++)
264 dest->stripe_sectors[i].uptodate = true;
265}
266
4ae10b3a 267/*
d4e28d9b
QW
268 * Stealing an rbio means taking all the uptodate pages from the stripe array
269 * in the source rbio and putting them into the destination rbio.
270 *
271 * This will also update the involved stripe_sectors[] which are referring to
272 * the old pages.
4ae10b3a
CM
273 */
274static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
275{
276 int i;
277 struct page *s;
4ae10b3a
CM
278
279 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
280 return;
281
282 for (i = 0; i < dest->nr_pages; i++) {
283 s = src->stripe_pages[i];
d4e28d9b 284 if (!s || !full_page_sectors_uptodate(src, i))
4ae10b3a 285 continue;
4ae10b3a 286
4d100466 287 steal_rbio_page(src, dest, i);
4ae10b3a 288 }
eb357060
QW
289 index_stripe_sectors(dest);
290 index_stripe_sectors(src);
4ae10b3a
CM
291}
292
53b381b3
DW
293/*
294 * merging means we take the bio_list from the victim and
295 * splice it into the destination. The victim should
296 * be discarded afterwards.
297 *
298 * must be called with dest->rbio_list_lock held
299 */
300static void merge_rbio(struct btrfs_raid_bio *dest,
301 struct btrfs_raid_bio *victim)
302{
303 bio_list_merge(&dest->bio_list, &victim->bio_list);
304 dest->bio_list_bytes += victim->bio_list_bytes;
bd8f7e62
QW
305 /* Also inherit the bitmaps from @victim. */
306 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
307 dest->stripe_nsectors);
53b381b3
DW
308 bio_list_init(&victim->bio_list);
309}
310
311/*
4ae10b3a
CM
312 * used to prune items that are in the cache. The caller
313 * must hold the hash table lock.
314 */
315static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
316{
317 int bucket = rbio_bucket(rbio);
318 struct btrfs_stripe_hash_table *table;
319 struct btrfs_stripe_hash *h;
320 int freeit = 0;
321
322 /*
323 * check the bit again under the hash table lock.
324 */
325 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
326 return;
327
6a258d72 328 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
329 h = table->table + bucket;
330
331 /* hold the lock for the bucket because we may be
332 * removing it from the hash table
333 */
334 spin_lock(&h->lock);
335
336 /*
337 * hold the lock for the bio list because we need
338 * to make sure the bio list is empty
339 */
340 spin_lock(&rbio->bio_list_lock);
341
342 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
343 list_del_init(&rbio->stripe_cache);
344 table->cache_size -= 1;
345 freeit = 1;
346
347 /* if the bio list isn't empty, this rbio is
348 * still involved in an IO. We take it out
349 * of the cache list, and drop the ref that
350 * was held for the list.
351 *
352 * If the bio_list was empty, we also remove
353 * the rbio from the hash_table, and drop
354 * the corresponding ref
355 */
356 if (bio_list_empty(&rbio->bio_list)) {
357 if (!list_empty(&rbio->hash_list)) {
358 list_del_init(&rbio->hash_list);
dec95574 359 refcount_dec(&rbio->refs);
4ae10b3a
CM
360 BUG_ON(!list_empty(&rbio->plug_list));
361 }
362 }
363 }
364
365 spin_unlock(&rbio->bio_list_lock);
366 spin_unlock(&h->lock);
367
368 if (freeit)
ff2b64a2 369 free_raid_bio(rbio);
4ae10b3a
CM
370}
371
372/*
373 * prune a given rbio from the cache
374 */
375static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
376{
377 struct btrfs_stripe_hash_table *table;
378 unsigned long flags;
379
380 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
381 return;
382
6a258d72 383 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
384
385 spin_lock_irqsave(&table->cache_lock, flags);
386 __remove_rbio_from_cache(rbio);
387 spin_unlock_irqrestore(&table->cache_lock, flags);
388}
389
390/*
391 * remove everything in the cache
392 */
48a3b636 393static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
4ae10b3a
CM
394{
395 struct btrfs_stripe_hash_table *table;
396 unsigned long flags;
397 struct btrfs_raid_bio *rbio;
398
399 table = info->stripe_hash_table;
400
401 spin_lock_irqsave(&table->cache_lock, flags);
402 while (!list_empty(&table->stripe_cache)) {
403 rbio = list_entry(table->stripe_cache.next,
404 struct btrfs_raid_bio,
405 stripe_cache);
406 __remove_rbio_from_cache(rbio);
407 }
408 spin_unlock_irqrestore(&table->cache_lock, flags);
409}
410
411/*
412 * remove all cached entries and free the hash table
413 * used by unmount
53b381b3
DW
414 */
415void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
416{
417 if (!info->stripe_hash_table)
418 return;
4ae10b3a 419 btrfs_clear_rbio_cache(info);
f749303b 420 kvfree(info->stripe_hash_table);
53b381b3
DW
421 info->stripe_hash_table = NULL;
422}
423
4ae10b3a
CM
424/*
425 * insert an rbio into the stripe cache. It
426 * must have already been prepared by calling
427 * cache_rbio_pages
428 *
429 * If this rbio was already cached, it gets
430 * moved to the front of the lru.
431 *
432 * If the size of the rbio cache is too big, we
433 * prune an item.
434 */
435static void cache_rbio(struct btrfs_raid_bio *rbio)
436{
437 struct btrfs_stripe_hash_table *table;
438 unsigned long flags;
439
440 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
441 return;
442
6a258d72 443 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
444
445 spin_lock_irqsave(&table->cache_lock, flags);
446 spin_lock(&rbio->bio_list_lock);
447
448 /* bump our ref if we were not in the list before */
449 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
dec95574 450 refcount_inc(&rbio->refs);
4ae10b3a
CM
451
452 if (!list_empty(&rbio->stripe_cache)){
453 list_move(&rbio->stripe_cache, &table->stripe_cache);
454 } else {
455 list_add(&rbio->stripe_cache, &table->stripe_cache);
456 table->cache_size += 1;
457 }
458
459 spin_unlock(&rbio->bio_list_lock);
460
461 if (table->cache_size > RBIO_CACHE_SIZE) {
462 struct btrfs_raid_bio *found;
463
464 found = list_entry(table->stripe_cache.prev,
465 struct btrfs_raid_bio,
466 stripe_cache);
467
468 if (found != rbio)
469 __remove_rbio_from_cache(found);
470 }
471
472 spin_unlock_irqrestore(&table->cache_lock, flags);
4ae10b3a
CM
473}
474
53b381b3
DW
475/*
476 * helper function to run the xor_blocks api. It is only
477 * able to do MAX_XOR_BLOCKS at a time, so we need to
478 * loop through.
479 */
480static void run_xor(void **pages, int src_cnt, ssize_t len)
481{
482 int src_off = 0;
483 int xor_src_cnt = 0;
484 void *dest = pages[src_cnt];
485
486 while(src_cnt > 0) {
487 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
488 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
489
490 src_cnt -= xor_src_cnt;
491 src_off += xor_src_cnt;
492 }
493}
494
495/*
176571a1
DS
496 * Returns true if the bio list inside this rbio covers an entire stripe (no
497 * rmw required).
53b381b3 498 */
176571a1 499static int rbio_is_full(struct btrfs_raid_bio *rbio)
53b381b3 500{
176571a1 501 unsigned long flags;
53b381b3
DW
502 unsigned long size = rbio->bio_list_bytes;
503 int ret = 1;
504
176571a1 505 spin_lock_irqsave(&rbio->bio_list_lock, flags);
ff18a4af 506 if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
53b381b3 507 ret = 0;
ff18a4af 508 BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
53b381b3 509 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
176571a1 510
53b381b3
DW
511 return ret;
512}
513
514/*
515 * returns 1 if it is safe to merge two rbios together.
516 * The merging is safe if the two rbios correspond to
517 * the same stripe and if they are both going in the same
518 * direction (read vs write), and if neither one is
519 * locked for final IO
520 *
521 * The caller is responsible for locking such that
522 * rmw_locked is safe to test
523 */
524static int rbio_can_merge(struct btrfs_raid_bio *last,
525 struct btrfs_raid_bio *cur)
526{
527 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
528 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
529 return 0;
530
4ae10b3a
CM
531 /*
532 * we can't merge with cached rbios, since the
533 * idea is that when we merge the destination
534 * rbio is going to run our IO for us. We can
01327610 535 * steal from cached rbios though, other functions
4ae10b3a
CM
536 * handle that.
537 */
538 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
539 test_bit(RBIO_CACHE_BIT, &cur->flags))
540 return 0;
541
4c664611 542 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
53b381b3
DW
543 return 0;
544
5a6ac9ea
MX
545 /* we can't merge with different operations */
546 if (last->operation != cur->operation)
547 return 0;
548 /*
549 * We've need read the full stripe from the drive.
550 * check and repair the parity and write the new results.
551 *
552 * We're not allowed to add any new bios to the
553 * bio list here, anyone else that wants to
554 * change this stripe needs to do their own rmw.
555 */
db34be19 556 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
53b381b3 557 return 0;
53b381b3 558
db34be19 559 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
b4ee1782
OS
560 return 0;
561
cc54ff62
LB
562 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
563 int fa = last->faila;
564 int fb = last->failb;
565 int cur_fa = cur->faila;
566 int cur_fb = cur->failb;
567
568 if (last->faila >= last->failb) {
569 fa = last->failb;
570 fb = last->faila;
571 }
572
573 if (cur->faila >= cur->failb) {
574 cur_fa = cur->failb;
575 cur_fb = cur->faila;
576 }
577
578 if (fa != cur_fa || fb != cur_fb)
579 return 0;
580 }
53b381b3
DW
581 return 1;
582}
583
3e77605d
QW
584static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
585 unsigned int stripe_nr,
586 unsigned int sector_nr)
587{
588 ASSERT(stripe_nr < rbio->real_stripes);
589 ASSERT(sector_nr < rbio->stripe_nsectors);
590
591 return stripe_nr * rbio->stripe_nsectors + sector_nr;
592}
593
594/* Return a sector from rbio->stripe_sectors, not from the bio list */
595static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
596 unsigned int stripe_nr,
597 unsigned int sector_nr)
598{
599 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
600 sector_nr)];
601}
602
1145059a
QW
603/* Grab a sector inside P stripe */
604static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
605 unsigned int sector_nr)
b7178a5f 606{
1145059a 607 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
b7178a5f
ZL
608}
609
1145059a
QW
610/* Grab a sector inside Q stripe, return NULL if not RAID6 */
611static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
612 unsigned int sector_nr)
53b381b3 613{
1145059a
QW
614 if (rbio->nr_data + 1 == rbio->real_stripes)
615 return NULL;
616 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
53b381b3
DW
617}
618
53b381b3
DW
619/*
620 * The first stripe in the table for a logical address
621 * has the lock. rbios are added in one of three ways:
622 *
623 * 1) Nobody has the stripe locked yet. The rbio is given
624 * the lock and 0 is returned. The caller must start the IO
625 * themselves.
626 *
627 * 2) Someone has the stripe locked, but we're able to merge
628 * with the lock owner. The rbio is freed and the IO will
629 * start automatically along with the existing rbio. 1 is returned.
630 *
631 * 3) Someone has the stripe locked, but we're not able to merge.
632 * The rbio is added to the lock owner's plug list, or merged into
633 * an rbio already on the plug list. When the lock owner unlocks,
634 * the next rbio on the list is run and the IO is started automatically.
635 * 1 is returned
636 *
637 * If we return 0, the caller still owns the rbio and must continue with
638 * IO submission. If we return 1, the caller must assume the rbio has
639 * already been freed.
640 */
641static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
642{
721860d5 643 struct btrfs_stripe_hash *h;
53b381b3
DW
644 struct btrfs_raid_bio *cur;
645 struct btrfs_raid_bio *pending;
646 unsigned long flags;
53b381b3 647 struct btrfs_raid_bio *freeit = NULL;
4ae10b3a 648 struct btrfs_raid_bio *cache_drop = NULL;
53b381b3 649 int ret = 0;
53b381b3 650
6a258d72 651 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
721860d5 652
53b381b3
DW
653 spin_lock_irqsave(&h->lock, flags);
654 list_for_each_entry(cur, &h->hash_list, hash_list) {
4c664611 655 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
9d6cb1b0 656 continue;
4ae10b3a 657
9d6cb1b0 658 spin_lock(&cur->bio_list_lock);
4ae10b3a 659
9d6cb1b0
JT
660 /* Can we steal this cached rbio's pages? */
661 if (bio_list_empty(&cur->bio_list) &&
662 list_empty(&cur->plug_list) &&
663 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
664 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
665 list_del_init(&cur->hash_list);
666 refcount_dec(&cur->refs);
53b381b3 667
9d6cb1b0
JT
668 steal_rbio(cur, rbio);
669 cache_drop = cur;
670 spin_unlock(&cur->bio_list_lock);
4ae10b3a 671
9d6cb1b0
JT
672 goto lockit;
673 }
53b381b3 674
9d6cb1b0
JT
675 /* Can we merge into the lock owner? */
676 if (rbio_can_merge(cur, rbio)) {
677 merge_rbio(cur, rbio);
53b381b3 678 spin_unlock(&cur->bio_list_lock);
9d6cb1b0 679 freeit = rbio;
53b381b3
DW
680 ret = 1;
681 goto out;
682 }
9d6cb1b0
JT
683
684
685 /*
686 * We couldn't merge with the running rbio, see if we can merge
687 * with the pending ones. We don't have to check for rmw_locked
688 * because there is no way they are inside finish_rmw right now
689 */
690 list_for_each_entry(pending, &cur->plug_list, plug_list) {
691 if (rbio_can_merge(pending, rbio)) {
692 merge_rbio(pending, rbio);
693 spin_unlock(&cur->bio_list_lock);
694 freeit = rbio;
695 ret = 1;
696 goto out;
697 }
698 }
699
700 /*
701 * No merging, put us on the tail of the plug list, our rbio
702 * will be started with the currently running rbio unlocks
703 */
704 list_add_tail(&rbio->plug_list, &cur->plug_list);
705 spin_unlock(&cur->bio_list_lock);
706 ret = 1;
707 goto out;
53b381b3 708 }
4ae10b3a 709lockit:
dec95574 710 refcount_inc(&rbio->refs);
53b381b3
DW
711 list_add(&rbio->hash_list, &h->hash_list);
712out:
713 spin_unlock_irqrestore(&h->lock, flags);
4ae10b3a
CM
714 if (cache_drop)
715 remove_rbio_from_cache(cache_drop);
53b381b3 716 if (freeit)
ff2b64a2 717 free_raid_bio(freeit);
53b381b3
DW
718 return ret;
719}
720
721/*
722 * called as rmw or parity rebuild is completed. If the plug list has more
723 * rbios waiting for this stripe, the next one on the list will be started
724 */
725static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
726{
727 int bucket;
728 struct btrfs_stripe_hash *h;
729 unsigned long flags;
4ae10b3a 730 int keep_cache = 0;
53b381b3
DW
731
732 bucket = rbio_bucket(rbio);
6a258d72 733 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
53b381b3 734
4ae10b3a
CM
735 if (list_empty(&rbio->plug_list))
736 cache_rbio(rbio);
737
53b381b3
DW
738 spin_lock_irqsave(&h->lock, flags);
739 spin_lock(&rbio->bio_list_lock);
740
741 if (!list_empty(&rbio->hash_list)) {
4ae10b3a
CM
742 /*
743 * if we're still cached and there is no other IO
744 * to perform, just leave this rbio here for others
745 * to steal from later
746 */
747 if (list_empty(&rbio->plug_list) &&
748 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
749 keep_cache = 1;
750 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
751 BUG_ON(!bio_list_empty(&rbio->bio_list));
752 goto done;
753 }
53b381b3
DW
754
755 list_del_init(&rbio->hash_list);
dec95574 756 refcount_dec(&rbio->refs);
53b381b3
DW
757
758 /*
759 * we use the plug list to hold all the rbios
760 * waiting for the chance to lock this stripe.
761 * hand the lock over to one of them.
762 */
763 if (!list_empty(&rbio->plug_list)) {
764 struct btrfs_raid_bio *next;
765 struct list_head *head = rbio->plug_list.next;
766
767 next = list_entry(head, struct btrfs_raid_bio,
768 plug_list);
769
770 list_del_init(&rbio->plug_list);
771
772 list_add(&next->hash_list, &h->hash_list);
dec95574 773 refcount_inc(&next->refs);
53b381b3
DW
774 spin_unlock(&rbio->bio_list_lock);
775 spin_unlock_irqrestore(&h->lock, flags);
776
1b94b556 777 if (next->operation == BTRFS_RBIO_READ_REBUILD)
e66d8d5a 778 start_async_work(next, read_rebuild_work);
b4ee1782
OS
779 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
780 steal_rbio(rbio, next);
e66d8d5a 781 start_async_work(next, read_rebuild_work);
b4ee1782 782 } else if (next->operation == BTRFS_RBIO_WRITE) {
4ae10b3a 783 steal_rbio(rbio, next);
cf6a4a75 784 start_async_work(next, rmw_work);
5a6ac9ea
MX
785 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
786 steal_rbio(rbio, next);
a81b747d 787 start_async_work(next, scrub_parity_work);
4ae10b3a 788 }
53b381b3
DW
789
790 goto done_nolock;
53b381b3
DW
791 }
792 }
4ae10b3a 793done:
53b381b3
DW
794 spin_unlock(&rbio->bio_list_lock);
795 spin_unlock_irqrestore(&h->lock, flags);
796
797done_nolock:
4ae10b3a
CM
798 if (!keep_cache)
799 remove_rbio_from_cache(rbio);
53b381b3
DW
800}
801
7583d8d0 802static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
53b381b3 803{
7583d8d0
LB
804 struct bio *next;
805
806 while (cur) {
807 next = cur->bi_next;
808 cur->bi_next = NULL;
809 cur->bi_status = err;
810 bio_endio(cur);
811 cur = next;
812 }
53b381b3
DW
813}
814
815/*
816 * this frees the rbio and runs through all the bios in the
817 * bio_list and calls end_io on them
818 */
4e4cbee9 819static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
53b381b3
DW
820{
821 struct bio *cur = bio_list_get(&rbio->bio_list);
7583d8d0 822 struct bio *extra;
4245215d 823
bd8f7e62
QW
824 /*
825 * Clear the data bitmap, as the rbio may be cached for later usage.
826 * do this before before unlock_stripe() so there will be no new bio
827 * for this bio.
828 */
829 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
4245215d 830
7583d8d0
LB
831 /*
832 * At this moment, rbio->bio_list is empty, however since rbio does not
833 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
834 * hash list, rbio may be merged with others so that rbio->bio_list
835 * becomes non-empty.
836 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
837 * more and we can call bio_endio() on all queued bios.
838 */
839 unlock_stripe(rbio);
840 extra = bio_list_get(&rbio->bio_list);
ff2b64a2 841 free_raid_bio(rbio);
53b381b3 842
7583d8d0
LB
843 rbio_endio_bio_list(cur, err);
844 if (extra)
845 rbio_endio_bio_list(extra, err);
53b381b3
DW
846}
847
848/*
849 * end io function used by finish_rmw. When we finally
850 * get here, we've written a full stripe
851 */
4246a0b6 852static void raid_write_end_io(struct bio *bio)
53b381b3
DW
853{
854 struct btrfs_raid_bio *rbio = bio->bi_private;
4e4cbee9 855 blk_status_t err = bio->bi_status;
a6111d11 856 int max_errors;
53b381b3
DW
857
858 if (err)
859 fail_bio_stripe(rbio, bio);
860
861 bio_put(bio);
862
b89e1b01 863 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
864 return;
865
58efbc9f 866 err = BLK_STS_OK;
53b381b3
DW
867
868 /* OK, we have read all the stripes we need to. */
a6111d11 869 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
4c664611 870 0 : rbio->bioc->max_errors;
a6111d11 871 if (atomic_read(&rbio->error) > max_errors)
4e4cbee9 872 err = BLK_STS_IOERR;
53b381b3 873
4246a0b6 874 rbio_orig_end_io(rbio, err);
53b381b3
DW
875}
876
3e77605d
QW
877/**
878 * Get a sector pointer specified by its @stripe_nr and @sector_nr
879 *
880 * @rbio: The raid bio
881 * @stripe_nr: Stripe number, valid range [0, real_stripe)
882 * @sector_nr: Sector number inside the stripe,
883 * valid range [0, stripe_nsectors)
884 * @bio_list_only: Whether to use sectors inside the bio list only.
885 *
886 * The read/modify/write code wants to reuse the original bio page as much
887 * as possible, and only use stripe_sectors as fallback.
888 */
889static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
890 int stripe_nr, int sector_nr,
891 bool bio_list_only)
892{
893 struct sector_ptr *sector;
894 int index;
895
896 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
897 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
898
899 index = stripe_nr * rbio->stripe_nsectors + sector_nr;
900 ASSERT(index >= 0 && index < rbio->nr_sectors);
901
902 spin_lock_irq(&rbio->bio_list_lock);
903 sector = &rbio->bio_sectors[index];
904 if (sector->page || bio_list_only) {
905 /* Don't return sector without a valid page pointer */
906 if (!sector->page)
907 sector = NULL;
908 spin_unlock_irq(&rbio->bio_list_lock);
909 return sector;
910 }
911 spin_unlock_irq(&rbio->bio_list_lock);
912
913 return &rbio->stripe_sectors[index];
914}
915
53b381b3
DW
916/*
917 * allocation and initial setup for the btrfs_raid_bio. Not
918 * this does not allocate any pages for rbio->pages.
919 */
2ff7e61e 920static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
ff18a4af 921 struct btrfs_io_context *bioc)
53b381b3 922{
843de58b 923 const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
ff18a4af 924 const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
843de58b 925 const unsigned int num_pages = stripe_npages * real_stripes;
ff18a4af
CH
926 const unsigned int stripe_nsectors =
927 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
94efbe19 928 const unsigned int num_sectors = stripe_nsectors * real_stripes;
53b381b3 929 struct btrfs_raid_bio *rbio;
53b381b3 930
94efbe19
QW
931 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
932 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
c67c68eb
QW
933 /*
934 * Our current stripe len should be fixed to 64k thus stripe_nsectors
935 * (at most 16) should be no larger than BITS_PER_LONG.
936 */
937 ASSERT(stripe_nsectors <= BITS_PER_LONG);
843de58b 938
797d74b7 939 rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
af8e2d1d 940 if (!rbio)
53b381b3 941 return ERR_PTR(-ENOMEM);
797d74b7
QW
942 rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
943 GFP_NOFS);
944 rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
945 GFP_NOFS);
946 rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
947 GFP_NOFS);
948 rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
949
950 if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
951 !rbio->finish_pointers) {
952 free_raid_bio_pointers(rbio);
953 kfree(rbio);
954 return ERR_PTR(-ENOMEM);
955 }
53b381b3
DW
956
957 bio_list_init(&rbio->bio_list);
958 INIT_LIST_HEAD(&rbio->plug_list);
959 spin_lock_init(&rbio->bio_list_lock);
4ae10b3a 960 INIT_LIST_HEAD(&rbio->stripe_cache);
53b381b3 961 INIT_LIST_HEAD(&rbio->hash_list);
f1c29379 962 btrfs_get_bioc(bioc);
4c664611 963 rbio->bioc = bioc;
53b381b3 964 rbio->nr_pages = num_pages;
94efbe19 965 rbio->nr_sectors = num_sectors;
2c8cdd6e 966 rbio->real_stripes = real_stripes;
5a6ac9ea 967 rbio->stripe_npages = stripe_npages;
94efbe19 968 rbio->stripe_nsectors = stripe_nsectors;
53b381b3
DW
969 rbio->faila = -1;
970 rbio->failb = -1;
dec95574 971 refcount_set(&rbio->refs, 1);
b89e1b01
MX
972 atomic_set(&rbio->error, 0);
973 atomic_set(&rbio->stripes_pending, 0);
53b381b3 974
0b30f719
QW
975 ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
976 rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
53b381b3 977
53b381b3
DW
978 return rbio;
979}
980
981/* allocate pages for all the stripes in the bio, including parity */
982static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
983{
eb357060
QW
984 int ret;
985
986 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
987 if (ret < 0)
988 return ret;
989 /* Mapping all sectors */
990 index_stripe_sectors(rbio);
991 return 0;
53b381b3
DW
992}
993
b7178a5f 994/* only allocate pages for p/q stripes */
53b381b3
DW
995static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
996{
f77183dc 997 const int data_pages = rbio->nr_data * rbio->stripe_npages;
eb357060 998 int ret;
53b381b3 999
eb357060
QW
1000 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1001 rbio->stripe_pages + data_pages);
1002 if (ret < 0)
1003 return ret;
1004
1005 index_stripe_sectors(rbio);
1006 return 0;
53b381b3
DW
1007}
1008
1009/*
3e77605d
QW
1010 * Add a single sector @sector into our list of bios for IO.
1011 *
1012 * Return 0 if everything went well.
1013 * Return <0 for error.
53b381b3 1014 */
3e77605d
QW
1015static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1016 struct bio_list *bio_list,
1017 struct sector_ptr *sector,
1018 unsigned int stripe_nr,
1019 unsigned int sector_nr,
bf9486d6 1020 enum req_op op)
53b381b3 1021{
3e77605d 1022 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
53b381b3 1023 struct bio *last = bio_list->tail;
53b381b3
DW
1024 int ret;
1025 struct bio *bio;
4c664611 1026 struct btrfs_io_stripe *stripe;
53b381b3
DW
1027 u64 disk_start;
1028
3e77605d
QW
1029 /*
1030 * Note: here stripe_nr has taken device replace into consideration,
1031 * thus it can be larger than rbio->real_stripe.
1032 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1033 */
1034 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1035 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1036 ASSERT(sector->page);
1037
4c664611 1038 stripe = &rbio->bioc->stripes[stripe_nr];
3e77605d 1039 disk_start = stripe->physical + sector_nr * sectorsize;
53b381b3
DW
1040
1041 /* if the device is missing, just fail this stripe */
1042 if (!stripe->dev->bdev)
1043 return fail_rbio_index(rbio, stripe_nr);
1044
1045 /* see if we can add this page onto our existing bio */
1046 if (last) {
1201b58b 1047 u64 last_end = last->bi_iter.bi_sector << 9;
4f024f37 1048 last_end += last->bi_iter.bi_size;
53b381b3
DW
1049
1050 /*
1051 * we can't merge these if they are from different
1052 * devices or if they are not contiguous
1053 */
f90ae76a 1054 if (last_end == disk_start && !last->bi_status &&
309dca30 1055 last->bi_bdev == stripe->dev->bdev) {
3e77605d
QW
1056 ret = bio_add_page(last, sector->page, sectorsize,
1057 sector->pgoff);
1058 if (ret == sectorsize)
53b381b3
DW
1059 return 0;
1060 }
1061 }
1062
1063 /* put a new bio on the list */
ff18a4af
CH
1064 bio = bio_alloc(stripe->dev->bdev,
1065 max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
bf9486d6 1066 op, GFP_NOFS);
4f024f37 1067 bio->bi_iter.bi_sector = disk_start >> 9;
e01bf588 1068 bio->bi_private = rbio;
53b381b3 1069
3e77605d 1070 bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
53b381b3
DW
1071 bio_list_add(bio_list, bio);
1072 return 0;
1073}
1074
1075/*
1076 * while we're doing the read/modify/write cycle, we could
1077 * have errors in reading pages off the disk. This checks
1078 * for errors and if we're not able to read the page it'll
1079 * trigger parity reconstruction. The rmw will be finished
1080 * after we've reconstructed the failed stripes
1081 */
1082static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1083{
1084 if (rbio->faila >= 0 || rbio->failb >= 0) {
2c8cdd6e 1085 BUG_ON(rbio->faila == rbio->real_stripes - 1);
53b381b3
DW
1086 __raid56_parity_recover(rbio);
1087 } else {
1088 finish_rmw(rbio);
1089 }
1090}
1091
00425dd9
QW
1092static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1093{
1094 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1095 struct bio_vec bvec;
1096 struct bvec_iter iter;
1097 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1098 rbio->bioc->raid_map[0];
1099
00425dd9
QW
1100 bio_for_each_segment(bvec, bio, iter) {
1101 u32 bvec_offset;
1102
1103 for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1104 bvec_offset += sectorsize, offset += sectorsize) {
1105 int index = offset / sectorsize;
1106 struct sector_ptr *sector = &rbio->bio_sectors[index];
1107
1108 sector->page = bvec.bv_page;
1109 sector->pgoff = bvec.bv_offset + bvec_offset;
1110 ASSERT(sector->pgoff < PAGE_SIZE);
1111 }
1112 }
1113}
1114
53b381b3
DW
1115/*
1116 * helper function to walk our bio list and populate the bio_pages array with
1117 * the result. This seems expensive, but it is faster than constantly
1118 * searching through the bio list as we setup the IO in finish_rmw or stripe
1119 * reconstruction.
1120 *
1121 * This must be called before you trust the answers from page_in_rbio
1122 */
1123static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1124{
1125 struct bio *bio;
53b381b3
DW
1126
1127 spin_lock_irq(&rbio->bio_list_lock);
00425dd9
QW
1128 bio_list_for_each(bio, &rbio->bio_list)
1129 index_one_bio(rbio, bio);
1130
53b381b3
DW
1131 spin_unlock_irq(&rbio->bio_list_lock);
1132}
1133
b8bea09a
QW
1134static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1135 struct raid56_bio_trace_info *trace_info)
1136{
1137 const struct btrfs_io_context *bioc = rbio->bioc;
1138 int i;
1139
1140 ASSERT(bioc);
1141
1142 /* We rely on bio->bi_bdev to find the stripe number. */
1143 if (!bio->bi_bdev)
1144 goto not_found;
1145
1146 for (i = 0; i < bioc->num_stripes; i++) {
1147 if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1148 continue;
1149 trace_info->stripe_nr = i;
1150 trace_info->devid = bioc->stripes[i].dev->devid;
1151 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1152 bioc->stripes[i].physical;
1153 return;
1154 }
1155
1156not_found:
1157 trace_info->devid = -1;
1158 trace_info->offset = -1;
1159 trace_info->stripe_nr = -1;
1160}
1161
53b381b3
DW
1162/*
1163 * this is called from one of two situations. We either
1164 * have a full stripe from the higher layers, or we've read all
1165 * the missing bits off disk.
1166 *
1167 * This will calculate the parity and then send down any
1168 * changed blocks.
1169 */
1170static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1171{
4c664611 1172 struct btrfs_io_context *bioc = rbio->bioc;
1145059a 1173 const u32 sectorsize = bioc->fs_info->sectorsize;
1389053e 1174 void **pointers = rbio->finish_pointers;
53b381b3 1175 int nr_data = rbio->nr_data;
36920044
QW
1176 /* The total sector number inside the full stripe. */
1177 int total_sector_nr;
53b381b3 1178 int stripe;
36920044 1179 /* Sector number inside a stripe. */
3e77605d 1180 int sectornr;
c17af965 1181 bool has_qstripe;
53b381b3
DW
1182 struct bio_list bio_list;
1183 struct bio *bio;
53b381b3
DW
1184 int ret;
1185
1186 bio_list_init(&bio_list);
1187
c17af965
DS
1188 if (rbio->real_stripes - rbio->nr_data == 1)
1189 has_qstripe = false;
1190 else if (rbio->real_stripes - rbio->nr_data == 2)
1191 has_qstripe = true;
1192 else
53b381b3 1193 BUG();
53b381b3 1194
bd8f7e62
QW
1195 /* We should have at least one data sector. */
1196 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1197
53b381b3
DW
1198 /* at this point we either have a full stripe,
1199 * or we've read the full stripe from the drive.
1200 * recalculate the parity and write the new results.
1201 *
1202 * We're not allowed to add any new bios to the
1203 * bio list here, anyone else that wants to
1204 * change this stripe needs to do their own rmw.
1205 */
1206 spin_lock_irq(&rbio->bio_list_lock);
1207 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1208 spin_unlock_irq(&rbio->bio_list_lock);
1209
b89e1b01 1210 atomic_set(&rbio->error, 0);
53b381b3
DW
1211
1212 /*
1213 * now that we've set rmw_locked, run through the
1214 * bio list one last time and map the page pointers
4ae10b3a
CM
1215 *
1216 * We don't cache full rbios because we're assuming
1217 * the higher layers are unlikely to use this area of
1218 * the disk again soon. If they do use it again,
1219 * hopefully they will send another full bio.
53b381b3
DW
1220 */
1221 index_rbio_pages(rbio);
4ae10b3a
CM
1222 if (!rbio_is_full(rbio))
1223 cache_rbio_pages(rbio);
1224 else
1225 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
53b381b3 1226
3e77605d 1227 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1145059a
QW
1228 struct sector_ptr *sector;
1229
1230 /* First collect one sector from each data stripe */
53b381b3 1231 for (stripe = 0; stripe < nr_data; stripe++) {
1145059a
QW
1232 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1233 pointers[stripe] = kmap_local_page(sector->page) +
1234 sector->pgoff;
53b381b3
DW
1235 }
1236
1145059a
QW
1237 /* Then add the parity stripe */
1238 sector = rbio_pstripe_sector(rbio, sectornr);
1239 sector->uptodate = 1;
1240 pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
53b381b3 1241
c17af965 1242 if (has_qstripe) {
53b381b3 1243 /*
1145059a
QW
1244 * RAID6, add the qstripe and call the library function
1245 * to fill in our p/q
53b381b3 1246 */
1145059a
QW
1247 sector = rbio_qstripe_sector(rbio, sectornr);
1248 sector->uptodate = 1;
1249 pointers[stripe++] = kmap_local_page(sector->page) +
1250 sector->pgoff;
53b381b3 1251
1145059a 1252 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
53b381b3
DW
1253 pointers);
1254 } else {
1255 /* raid5 */
1145059a
QW
1256 memcpy(pointers[nr_data], pointers[0], sectorsize);
1257 run_xor(pointers + 1, nr_data - 1, sectorsize);
53b381b3 1258 }
94a0b58d
IW
1259 for (stripe = stripe - 1; stripe >= 0; stripe--)
1260 kunmap_local(pointers[stripe]);
53b381b3
DW
1261 }
1262
1263 /*
36920044
QW
1264 * Start writing. Make bios for everything from the higher layers (the
1265 * bio_list in our rbio) and our P/Q. Ignore everything else.
53b381b3 1266 */
36920044
QW
1267 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1268 total_sector_nr++) {
1269 struct sector_ptr *sector;
3e77605d 1270
36920044
QW
1271 stripe = total_sector_nr / rbio->stripe_nsectors;
1272 sectornr = total_sector_nr % rbio->stripe_nsectors;
53b381b3 1273
36920044
QW
1274 /* This vertical stripe has no data, skip it. */
1275 if (!test_bit(sectornr, &rbio->dbitmap))
1276 continue;
53b381b3 1277
36920044
QW
1278 if (stripe < rbio->nr_data) {
1279 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1280 if (!sector)
1281 continue;
1282 } else {
1283 sector = rbio_stripe_sector(rbio, stripe, sectornr);
53b381b3 1284 }
36920044
QW
1285
1286 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
ff18a4af 1287 sectornr, REQ_OP_WRITE);
36920044
QW
1288 if (ret)
1289 goto cleanup;
53b381b3
DW
1290 }
1291
4c664611 1292 if (likely(!bioc->num_tgtdevs))
2c8cdd6e
MX
1293 goto write_data;
1294
36920044
QW
1295 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1296 total_sector_nr++) {
1297 struct sector_ptr *sector;
2c8cdd6e 1298
36920044
QW
1299 stripe = total_sector_nr / rbio->stripe_nsectors;
1300 sectornr = total_sector_nr % rbio->stripe_nsectors;
3e77605d 1301
36920044
QW
1302 if (!bioc->tgtdev_map[stripe]) {
1303 /*
1304 * We can skip the whole stripe completely, note
1305 * total_sector_nr will be increased by one anyway.
1306 */
1307 ASSERT(sectornr == 0);
1308 total_sector_nr += rbio->stripe_nsectors - 1;
1309 continue;
1310 }
2c8cdd6e 1311
36920044
QW
1312 /* This vertical stripe has no data, skip it. */
1313 if (!test_bit(sectornr, &rbio->dbitmap))
1314 continue;
2c8cdd6e 1315
36920044
QW
1316 if (stripe < rbio->nr_data) {
1317 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1318 if (!sector)
1319 continue;
1320 } else {
1321 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2c8cdd6e 1322 }
36920044
QW
1323
1324 ret = rbio_add_io_sector(rbio, &bio_list, sector,
1325 rbio->bioc->tgtdev_map[stripe],
ff18a4af 1326 sectornr, REQ_OP_WRITE);
36920044
QW
1327 if (ret)
1328 goto cleanup;
2c8cdd6e
MX
1329 }
1330
1331write_data:
b89e1b01
MX
1332 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1333 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
53b381b3 1334
bf28a605 1335 while ((bio = bio_list_pop(&bio_list))) {
53b381b3 1336 bio->bi_end_io = raid_write_end_io;
4e49ea4a 1337
b8bea09a
QW
1338 if (trace_raid56_write_stripe_enabled()) {
1339 struct raid56_bio_trace_info trace_info = { 0 };
1340
1341 bio_get_trace_info(rbio, bio, &trace_info);
1342 trace_raid56_write_stripe(rbio, bio, &trace_info);
1343 }
4e49ea4a 1344 submit_bio(bio);
53b381b3
DW
1345 }
1346 return;
1347
1348cleanup:
58efbc9f 1349 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1350
1351 while ((bio = bio_list_pop(&bio_list)))
1352 bio_put(bio);
53b381b3
DW
1353}
1354
1355/*
1356 * helper to find the stripe number for a given bio. Used to figure out which
1357 * stripe has failed. This expects the bio to correspond to a physical disk,
1358 * so it looks up based on physical sector numbers.
1359 */
1360static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1361 struct bio *bio)
1362{
4f024f37 1363 u64 physical = bio->bi_iter.bi_sector;
53b381b3 1364 int i;
4c664611 1365 struct btrfs_io_stripe *stripe;
53b381b3
DW
1366
1367 physical <<= 9;
1368
4c664611
QW
1369 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1370 stripe = &rbio->bioc->stripes[i];
ff18a4af 1371 if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
309dca30 1372 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
53b381b3
DW
1373 return i;
1374 }
1375 }
1376 return -1;
1377}
1378
1379/*
1380 * helper to find the stripe number for a given
1381 * bio (before mapping). Used to figure out which stripe has
1382 * failed. This looks up based on logical block numbers.
1383 */
1384static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1385 struct bio *bio)
1386{
1201b58b 1387 u64 logical = bio->bi_iter.bi_sector << 9;
53b381b3
DW
1388 int i;
1389
53b381b3 1390 for (i = 0; i < rbio->nr_data; i++) {
4c664611 1391 u64 stripe_start = rbio->bioc->raid_map[i];
83025863 1392
ff18a4af 1393 if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
53b381b3 1394 return i;
53b381b3
DW
1395 }
1396 return -1;
1397}
1398
1399/*
1400 * returns -EIO if we had too many failures
1401 */
1402static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1403{
1404 unsigned long flags;
1405 int ret = 0;
1406
1407 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1408
1409 /* we already know this stripe is bad, move on */
1410 if (rbio->faila == failed || rbio->failb == failed)
1411 goto out;
1412
1413 if (rbio->faila == -1) {
1414 /* first failure on this rbio */
1415 rbio->faila = failed;
b89e1b01 1416 atomic_inc(&rbio->error);
53b381b3
DW
1417 } else if (rbio->failb == -1) {
1418 /* second failure on this rbio */
1419 rbio->failb = failed;
b89e1b01 1420 atomic_inc(&rbio->error);
53b381b3
DW
1421 } else {
1422 ret = -EIO;
1423 }
1424out:
1425 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1426
1427 return ret;
1428}
1429
1430/*
1431 * helper to fail a stripe based on a physical disk
1432 * bio.
1433 */
1434static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1435 struct bio *bio)
1436{
1437 int failed = find_bio_stripe(rbio, bio);
1438
1439 if (failed < 0)
1440 return -EIO;
1441
1442 return fail_rbio_index(rbio, failed);
1443}
1444
5fdb7afc
QW
1445/*
1446 * For subpage case, we can no longer set page Uptodate directly for
1447 * stripe_pages[], thus we need to locate the sector.
1448 */
1449static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1450 struct page *page,
1451 unsigned int pgoff)
1452{
1453 int i;
1454
1455 for (i = 0; i < rbio->nr_sectors; i++) {
1456 struct sector_ptr *sector = &rbio->stripe_sectors[i];
1457
1458 if (sector->page == page && sector->pgoff == pgoff)
1459 return sector;
1460 }
1461 return NULL;
1462}
1463
53b381b3
DW
1464/*
1465 * this sets each page in the bio uptodate. It should only be used on private
1466 * rbio pages, nothing that comes in from the higher layers
1467 */
5fdb7afc 1468static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
53b381b3 1469{
5fdb7afc 1470 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
0198e5b7 1471 struct bio_vec *bvec;
6dc4f100 1472 struct bvec_iter_all iter_all;
6592e58c 1473
0198e5b7 1474 ASSERT(!bio_flagged(bio, BIO_CLONED));
53b381b3 1475
5fdb7afc
QW
1476 bio_for_each_segment_all(bvec, bio, iter_all) {
1477 struct sector_ptr *sector;
1478 int pgoff;
1479
1480 for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1481 pgoff += sectorsize) {
1482 sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1483 ASSERT(sector);
1484 if (sector)
1485 sector->uptodate = 1;
1486 }
1487 }
53b381b3
DW
1488}
1489
d34e123d 1490static void raid56_bio_end_io(struct bio *bio)
53b381b3
DW
1491{
1492 struct btrfs_raid_bio *rbio = bio->bi_private;
1493
4e4cbee9 1494 if (bio->bi_status)
53b381b3
DW
1495 fail_bio_stripe(rbio, bio);
1496 else
5fdb7afc 1497 set_bio_pages_uptodate(rbio, bio);
53b381b3
DW
1498
1499 bio_put(bio);
1500
d34e123d
CH
1501 if (atomic_dec_and_test(&rbio->stripes_pending))
1502 queue_work(rbio->bioc->fs_info->endio_raid56_workers,
1503 &rbio->end_io_work);
1504}
53b381b3 1505
d34e123d
CH
1506/*
1507 * End io handler for the read phase of the RMW cycle. All the bios here are
1508 * physical stripe bios we've read from the disk so we can recalculate the
1509 * parity of the stripe.
1510 *
1511 * This will usually kick off finish_rmw once all the bios are read in, but it
1512 * may trigger parity reconstruction if we had any errors along the way
1513 */
1514static void raid56_rmw_end_io_work(struct work_struct *work)
1515{
1516 struct btrfs_raid_bio *rbio =
1517 container_of(work, struct btrfs_raid_bio, end_io_work);
1518
1519 if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
1520 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1521 return;
1522 }
53b381b3
DW
1523
1524 /*
d34e123d
CH
1525 * This will normally call finish_rmw to start our write but if there
1526 * are any failed stripes we'll reconstruct from parity first.
53b381b3
DW
1527 */
1528 validate_rbio_for_rmw(rbio);
53b381b3
DW
1529}
1530
53b381b3
DW
1531/*
1532 * the stripe must be locked by the caller. It will
1533 * unlock after all the writes are done
1534 */
1535static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1536{
1537 int bios_to_read = 0;
53b381b3 1538 struct bio_list bio_list;
550cdeb3 1539 const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
53b381b3 1540 int ret;
550cdeb3 1541 int total_sector_nr;
53b381b3
DW
1542 struct bio *bio;
1543
1544 bio_list_init(&bio_list);
1545
1546 ret = alloc_rbio_pages(rbio);
1547 if (ret)
1548 goto cleanup;
1549
1550 index_rbio_pages(rbio);
1551
b89e1b01 1552 atomic_set(&rbio->error, 0);
550cdeb3
QW
1553 /* Build a list of bios to read all the missing data sectors. */
1554 for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
1555 total_sector_nr++) {
1556 struct sector_ptr *sector;
1557 int stripe = total_sector_nr / rbio->stripe_nsectors;
1558 int sectornr = total_sector_nr % rbio->stripe_nsectors;
3e77605d 1559
550cdeb3
QW
1560 /*
1561 * We want to find all the sectors missing from the rbio and
1562 * read them from the disk. If sector_in_rbio() finds a page
1563 * in the bio list we don't need to read it off the stripe.
1564 */
1565 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1566 if (sector)
1567 continue;
53b381b3 1568
550cdeb3
QW
1569 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1570 /*
1571 * The bio cache may have handed us an uptodate page. If so,
1572 * use it.
1573 */
1574 if (sector->uptodate)
1575 continue;
4ae10b3a 1576
550cdeb3 1577 ret = rbio_add_io_sector(rbio, &bio_list, sector,
ff18a4af 1578 stripe, sectornr, REQ_OP_READ);
550cdeb3
QW
1579 if (ret)
1580 goto cleanup;
53b381b3
DW
1581 }
1582
1583 bios_to_read = bio_list_size(&bio_list);
1584 if (!bios_to_read) {
1585 /*
1586 * this can happen if others have merged with
1587 * us, it means there is nothing left to read.
1588 * But if there are missing devices it may not be
1589 * safe to do the full stripe write yet.
1590 */
1591 goto finish;
1592 }
1593
1594 /*
4c664611
QW
1595 * The bioc may be freed once we submit the last bio. Make sure not to
1596 * touch it after that.
53b381b3 1597 */
b89e1b01 1598 atomic_set(&rbio->stripes_pending, bios_to_read);
d34e123d 1599 INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
bf28a605 1600 while ((bio = bio_list_pop(&bio_list))) {
d34e123d 1601 bio->bi_end_io = raid56_bio_end_io;
53b381b3 1602
b8bea09a
QW
1603 if (trace_raid56_read_partial_enabled()) {
1604 struct raid56_bio_trace_info trace_info = { 0 };
53b381b3 1605
b8bea09a
QW
1606 bio_get_trace_info(rbio, bio, &trace_info);
1607 trace_raid56_read_partial(rbio, bio, &trace_info);
1608 }
4e49ea4a 1609 submit_bio(bio);
53b381b3
DW
1610 }
1611 /* the actual write will happen once the reads are done */
1612 return 0;
1613
1614cleanup:
58efbc9f 1615 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1616
1617 while ((bio = bio_list_pop(&bio_list)))
1618 bio_put(bio);
1619
53b381b3
DW
1620 return -EIO;
1621
1622finish:
1623 validate_rbio_for_rmw(rbio);
1624 return 0;
1625}
1626
1627/*
1628 * if the upper layers pass in a full stripe, we thank them by only allocating
1629 * enough pages to hold the parity, and sending it all down quickly.
1630 */
1631static int full_stripe_write(struct btrfs_raid_bio *rbio)
1632{
1633 int ret;
1634
1635 ret = alloc_rbio_parity_pages(rbio);
ab4c54c6 1636 if (ret)
53b381b3
DW
1637 return ret;
1638
1639 ret = lock_stripe_add(rbio);
1640 if (ret == 0)
1641 finish_rmw(rbio);
1642 return 0;
1643}
1644
1645/*
1646 * partial stripe writes get handed over to async helpers.
1647 * We're really hoping to merge a few more writes into this
1648 * rbio before calculating new parity
1649 */
1650static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1651{
1652 int ret;
1653
1654 ret = lock_stripe_add(rbio);
1655 if (ret == 0)
cf6a4a75 1656 start_async_work(rbio, rmw_work);
53b381b3
DW
1657 return 0;
1658}
1659
1660/*
1661 * sometimes while we were reading from the drive to
1662 * recalculate parity, enough new bios come into create
1663 * a full stripe. So we do a check here to see if we can
1664 * go directly to finish_rmw
1665 */
1666static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1667{
1668 /* head off into rmw land if we don't have a full stripe */
1669 if (!rbio_is_full(rbio))
1670 return partial_stripe_write(rbio);
1671 return full_stripe_write(rbio);
1672}
1673
6ac0f488
CM
1674/*
1675 * We use plugging call backs to collect full stripes.
1676 * Any time we get a partial stripe write while plugged
1677 * we collect it into a list. When the unplug comes down,
1678 * we sort the list by logical block number and merge
1679 * everything we can into the same rbios
1680 */
1681struct btrfs_plug_cb {
1682 struct blk_plug_cb cb;
1683 struct btrfs_fs_info *info;
1684 struct list_head rbio_list;
385de0ef 1685 struct work_struct work;
6ac0f488
CM
1686};
1687
1688/*
1689 * rbios on the plug list are sorted for easier merging.
1690 */
4f0f586b
ST
1691static int plug_cmp(void *priv, const struct list_head *a,
1692 const struct list_head *b)
6ac0f488 1693{
214cc184
DS
1694 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1695 plug_list);
1696 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1697 plug_list);
4f024f37
KO
1698 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1699 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
6ac0f488
CM
1700
1701 if (a_sector < b_sector)
1702 return -1;
1703 if (a_sector > b_sector)
1704 return 1;
1705 return 0;
1706}
1707
1708static void run_plug(struct btrfs_plug_cb *plug)
1709{
1710 struct btrfs_raid_bio *cur;
1711 struct btrfs_raid_bio *last = NULL;
1712
1713 /*
1714 * sort our plug list then try to merge
1715 * everything we can in hopes of creating full
1716 * stripes.
1717 */
1718 list_sort(NULL, &plug->rbio_list, plug_cmp);
1719 while (!list_empty(&plug->rbio_list)) {
1720 cur = list_entry(plug->rbio_list.next,
1721 struct btrfs_raid_bio, plug_list);
1722 list_del_init(&cur->plug_list);
1723
1724 if (rbio_is_full(cur)) {
c7b562c5
DS
1725 int ret;
1726
6ac0f488 1727 /* we have a full stripe, send it down */
c7b562c5
DS
1728 ret = full_stripe_write(cur);
1729 BUG_ON(ret);
6ac0f488
CM
1730 continue;
1731 }
1732 if (last) {
1733 if (rbio_can_merge(last, cur)) {
1734 merge_rbio(last, cur);
ff2b64a2 1735 free_raid_bio(cur);
6ac0f488
CM
1736 continue;
1737
1738 }
1739 __raid56_parity_write(last);
1740 }
1741 last = cur;
1742 }
1743 if (last) {
1744 __raid56_parity_write(last);
1745 }
1746 kfree(plug);
1747}
1748
1749/*
1750 * if the unplug comes from schedule, we have to push the
1751 * work off to a helper thread
1752 */
385de0ef 1753static void unplug_work(struct work_struct *work)
6ac0f488
CM
1754{
1755 struct btrfs_plug_cb *plug;
1756 plug = container_of(work, struct btrfs_plug_cb, work);
1757 run_plug(plug);
1758}
1759
1760static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1761{
1762 struct btrfs_plug_cb *plug;
1763 plug = container_of(cb, struct btrfs_plug_cb, cb);
1764
1765 if (from_schedule) {
385de0ef
CH
1766 INIT_WORK(&plug->work, unplug_work);
1767 queue_work(plug->info->rmw_workers, &plug->work);
6ac0f488
CM
1768 return;
1769 }
1770 run_plug(plug);
1771}
1772
bd8f7e62
QW
1773/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1774static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1775{
1776 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1777 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1778 const u64 full_stripe_start = rbio->bioc->raid_map[0];
1779 const u32 orig_len = orig_bio->bi_iter.bi_size;
1780 const u32 sectorsize = fs_info->sectorsize;
1781 u64 cur_logical;
1782
1783 ASSERT(orig_logical >= full_stripe_start &&
1784 orig_logical + orig_len <= full_stripe_start +
ff18a4af 1785 rbio->nr_data * BTRFS_STRIPE_LEN);
bd8f7e62
QW
1786
1787 bio_list_add(&rbio->bio_list, orig_bio);
1788 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1789
1790 /* Update the dbitmap. */
1791 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1792 cur_logical += sectorsize) {
1793 int bit = ((u32)(cur_logical - full_stripe_start) >>
1794 fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1795
1796 set_bit(bit, &rbio->dbitmap);
1797 }
1798}
1799
53b381b3
DW
1800/*
1801 * our main entry point for writes from the rest of the FS.
1802 */
31683f4a 1803void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
53b381b3 1804{
6a258d72 1805 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 1806 struct btrfs_raid_bio *rbio;
6ac0f488
CM
1807 struct btrfs_plug_cb *plug = NULL;
1808 struct blk_plug_cb *cb;
31683f4a 1809 int ret = 0;
53b381b3 1810
ff18a4af 1811 rbio = alloc_rbio(fs_info, bioc);
af8e2d1d 1812 if (IS_ERR(rbio)) {
31683f4a 1813 ret = PTR_ERR(rbio);
f1c29379 1814 goto fail;
af8e2d1d 1815 }
1b94b556 1816 rbio->operation = BTRFS_RBIO_WRITE;
bd8f7e62 1817 rbio_add_bio(rbio, bio);
6ac0f488
CM
1818
1819 /*
1820 * don't plug on full rbios, just get them out the door
1821 * as quickly as we can
1822 */
4245215d
MX
1823 if (rbio_is_full(rbio)) {
1824 ret = full_stripe_write(rbio);
ab4c54c6 1825 if (ret) {
ff2b64a2 1826 free_raid_bio(rbio);
f1c29379 1827 goto fail;
ab4c54c6 1828 }
31683f4a 1829 return;
4245215d 1830 }
6ac0f488 1831
0b246afa 1832 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
6ac0f488
CM
1833 if (cb) {
1834 plug = container_of(cb, struct btrfs_plug_cb, cb);
1835 if (!plug->info) {
0b246afa 1836 plug->info = fs_info;
6ac0f488
CM
1837 INIT_LIST_HEAD(&plug->rbio_list);
1838 }
1839 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1840 } else {
4245215d 1841 ret = __raid56_parity_write(rbio);
ab4c54c6 1842 if (ret) {
ff2b64a2 1843 free_raid_bio(rbio);
f1c29379 1844 goto fail;
ab4c54c6 1845 }
6ac0f488 1846 }
31683f4a
CH
1847
1848 return;
1849
f1c29379 1850fail:
31683f4a
CH
1851 bio->bi_status = errno_to_blk_status(ret);
1852 bio_endio(bio);
53b381b3
DW
1853}
1854
1855/*
1856 * all parity reconstruction happens here. We've read in everything
1857 * we can find from the drives and this does the heavy lifting of
1858 * sorting the good from the bad.
1859 */
1860static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1861{
07e4d380
QW
1862 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1863 int sectornr, stripe;
53b381b3 1864 void **pointers;
94a0b58d 1865 void **unmap_array;
53b381b3 1866 int faila = -1, failb = -1;
58efbc9f 1867 blk_status_t err;
53b381b3
DW
1868 int i;
1869
07e4d380
QW
1870 /*
1871 * This array stores the pointer for each sector, thus it has the extra
1872 * pgoff value added from each sector
1873 */
31e818fe 1874 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
53b381b3 1875 if (!pointers) {
58efbc9f 1876 err = BLK_STS_RESOURCE;
53b381b3
DW
1877 goto cleanup_io;
1878 }
1879
94a0b58d
IW
1880 /*
1881 * Store copy of pointers that does not get reordered during
1882 * reconstruction so that kunmap_local works.
1883 */
1884 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1885 if (!unmap_array) {
1886 err = BLK_STS_RESOURCE;
1887 goto cleanup_pointers;
1888 }
1889
53b381b3
DW
1890 faila = rbio->faila;
1891 failb = rbio->failb;
1892
b4ee1782
OS
1893 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1894 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
53b381b3
DW
1895 spin_lock_irq(&rbio->bio_list_lock);
1896 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1897 spin_unlock_irq(&rbio->bio_list_lock);
1898 }
1899
1900 index_rbio_pages(rbio);
1901
07e4d380
QW
1902 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1903 struct sector_ptr *sector;
1904
5a6ac9ea
MX
1905 /*
1906 * Now we just use bitmap to mark the horizontal stripes in
1907 * which we have data when doing parity scrub.
1908 */
1909 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
c67c68eb 1910 !test_bit(sectornr, &rbio->dbitmap))
5a6ac9ea
MX
1911 continue;
1912
94a0b58d 1913 /*
07e4d380 1914 * Setup our array of pointers with sectors from each stripe
94a0b58d
IW
1915 *
1916 * NOTE: store a duplicate array of pointers to preserve the
1917 * pointer order
53b381b3 1918 */
2c8cdd6e 1919 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
53b381b3 1920 /*
07e4d380 1921 * If we're rebuilding a read, we have to use
53b381b3
DW
1922 * pages from the bio list
1923 */
b4ee1782
OS
1924 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1925 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
53b381b3 1926 (stripe == faila || stripe == failb)) {
07e4d380 1927 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
53b381b3 1928 } else {
07e4d380 1929 sector = rbio_stripe_sector(rbio, stripe, sectornr);
53b381b3 1930 }
07e4d380
QW
1931 ASSERT(sector->page);
1932 pointers[stripe] = kmap_local_page(sector->page) +
1933 sector->pgoff;
94a0b58d 1934 unmap_array[stripe] = pointers[stripe];
53b381b3
DW
1935 }
1936
07e4d380 1937 /* All raid6 handling here */
4c664611 1938 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
07e4d380 1939 /* Single failure, rebuild from parity raid5 style */
53b381b3
DW
1940 if (failb < 0) {
1941 if (faila == rbio->nr_data) {
1942 /*
1943 * Just the P stripe has failed, without
1944 * a bad data or Q stripe.
1945 * TODO, we should redo the xor here.
1946 */
58efbc9f 1947 err = BLK_STS_IOERR;
53b381b3
DW
1948 goto cleanup;
1949 }
1950 /*
1951 * a single failure in raid6 is rebuilt
1952 * in the pstripe code below
1953 */
1954 goto pstripe;
1955 }
1956
1957 /* make sure our ps and qs are in order */
b7d2083a
NB
1958 if (faila > failb)
1959 swap(faila, failb);
53b381b3
DW
1960
1961 /* if the q stripe is failed, do a pstripe reconstruction
1962 * from the xors.
1963 * If both the q stripe and the P stripe are failed, we're
1964 * here due to a crc mismatch and we can't give them the
1965 * data they want
1966 */
4c664611
QW
1967 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1968 if (rbio->bioc->raid_map[faila] ==
8e5cfb55 1969 RAID5_P_STRIPE) {
58efbc9f 1970 err = BLK_STS_IOERR;
53b381b3
DW
1971 goto cleanup;
1972 }
1973 /*
1974 * otherwise we have one bad data stripe and
1975 * a good P stripe. raid5!
1976 */
1977 goto pstripe;
1978 }
1979
4c664611 1980 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2c8cdd6e 1981 raid6_datap_recov(rbio->real_stripes,
07e4d380 1982 sectorsize, faila, pointers);
53b381b3 1983 } else {
2c8cdd6e 1984 raid6_2data_recov(rbio->real_stripes,
07e4d380 1985 sectorsize, faila, failb,
53b381b3
DW
1986 pointers);
1987 }
1988 } else {
1989 void *p;
1990
1991 /* rebuild from P stripe here (raid5 or raid6) */
1992 BUG_ON(failb != -1);
1993pstripe:
1994 /* Copy parity block into failed block to start with */
07e4d380 1995 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
53b381b3
DW
1996
1997 /* rearrange the pointer array */
1998 p = pointers[faila];
1999 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
2000 pointers[stripe] = pointers[stripe + 1];
2001 pointers[rbio->nr_data - 1] = p;
2002
2003 /* xor in the rest */
07e4d380 2004 run_xor(pointers, rbio->nr_data - 1, sectorsize);
53b381b3
DW
2005 }
2006 /* if we're doing this rebuild as part of an rmw, go through
2007 * and set all of our private rbio pages in the
2008 * failed stripes as uptodate. This way finish_rmw will
2009 * know they can be trusted. If this was a read reconstruction,
2010 * other endio functions will fiddle the uptodate bits
2011 */
1b94b556 2012 if (rbio->operation == BTRFS_RBIO_WRITE) {
07e4d380 2013 for (i = 0; i < rbio->stripe_nsectors; i++) {
53b381b3 2014 if (faila != -1) {
07e4d380
QW
2015 sector = rbio_stripe_sector(rbio, faila, i);
2016 sector->uptodate = 1;
53b381b3
DW
2017 }
2018 if (failb != -1) {
07e4d380
QW
2019 sector = rbio_stripe_sector(rbio, failb, i);
2020 sector->uptodate = 1;
53b381b3
DW
2021 }
2022 }
2023 }
94a0b58d
IW
2024 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
2025 kunmap_local(unmap_array[stripe]);
53b381b3
DW
2026 }
2027
58efbc9f 2028 err = BLK_STS_OK;
53b381b3 2029cleanup:
94a0b58d
IW
2030 kfree(unmap_array);
2031cleanup_pointers:
53b381b3
DW
2032 kfree(pointers);
2033
2034cleanup_io:
580c6efa
LB
2035 /*
2036 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
2037 * valid rbio which is consistent with ondisk content, thus such a
2038 * valid rbio can be cached to avoid further disk reads.
2039 */
2040 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2041 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
44ac474d
LB
2042 /*
2043 * - In case of two failures, where rbio->failb != -1:
2044 *
2045 * Do not cache this rbio since the above read reconstruction
2046 * (raid6_datap_recov() or raid6_2data_recov()) may have
2047 * changed some content of stripes which are not identical to
2048 * on-disk content any more, otherwise, a later write/recover
2049 * may steal stripe_pages from this rbio and end up with
2050 * corruptions or rebuild failures.
2051 *
2052 * - In case of single failure, where rbio->failb == -1:
2053 *
2054 * Cache this rbio iff the above read reconstruction is
52042d8e 2055 * executed without problems.
44ac474d
LB
2056 */
2057 if (err == BLK_STS_OK && rbio->failb < 0)
4ae10b3a
CM
2058 cache_rbio_pages(rbio);
2059 else
2060 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2061
4246a0b6 2062 rbio_orig_end_io(rbio, err);
58efbc9f 2063 } else if (err == BLK_STS_OK) {
53b381b3
DW
2064 rbio->faila = -1;
2065 rbio->failb = -1;
5a6ac9ea
MX
2066
2067 if (rbio->operation == BTRFS_RBIO_WRITE)
2068 finish_rmw(rbio);
2069 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2070 finish_parity_scrub(rbio, 0);
2071 else
2072 BUG();
53b381b3 2073 } else {
4246a0b6 2074 rbio_orig_end_io(rbio, err);
53b381b3
DW
2075 }
2076}
2077
2078/*
d34e123d
CH
2079 * This is called only for stripes we've read from disk to reconstruct the
2080 * parity.
53b381b3 2081 */
d34e123d 2082static void raid_recover_end_io_work(struct work_struct *work)
53b381b3 2083{
d34e123d
CH
2084 struct btrfs_raid_bio *rbio =
2085 container_of(work, struct btrfs_raid_bio, end_io_work);
53b381b3 2086
4c664611 2087 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
58efbc9f 2088 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
2089 else
2090 __raid_recover_end_io(rbio);
2091}
2092
2093/*
2094 * reads everything we need off the disk to reconstruct
2095 * the parity. endio handlers trigger final reconstruction
2096 * when the IO is done.
2097 *
2098 * This is used both for reads from the higher layers and for
2099 * parity construction required to finish a rmw cycle.
2100 */
2101static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2102{
2103 int bios_to_read = 0;
53b381b3
DW
2104 struct bio_list bio_list;
2105 int ret;
ef340fcc 2106 int total_sector_nr;
53b381b3
DW
2107 struct bio *bio;
2108
2109 bio_list_init(&bio_list);
2110
2111 ret = alloc_rbio_pages(rbio);
2112 if (ret)
2113 goto cleanup;
2114
b89e1b01 2115 atomic_set(&rbio->error, 0);
53b381b3
DW
2116
2117 /*
f6065f8e
QW
2118 * Read everything that hasn't failed. However this time we will
2119 * not trust any cached sector.
2120 * As we may read out some stale data but higher layer is not reading
2121 * that stale part.
2122 *
2123 * So here we always re-read everything in recovery path.
53b381b3 2124 */
ef340fcc
QW
2125 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2126 total_sector_nr++) {
2127 int stripe = total_sector_nr / rbio->stripe_nsectors;
2128 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2129 struct sector_ptr *sector;
2130
5588383e 2131 if (rbio->faila == stripe || rbio->failb == stripe) {
b89e1b01 2132 atomic_inc(&rbio->error);
ef340fcc
QW
2133 /* Skip the current stripe. */
2134 ASSERT(sectornr == 0);
2135 total_sector_nr += rbio->stripe_nsectors - 1;
53b381b3 2136 continue;
5588383e 2137 }
ef340fcc 2138 sector = rbio_stripe_sector(rbio, stripe, sectornr);
ef340fcc 2139 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
ff18a4af 2140 sectornr, REQ_OP_READ);
ef340fcc
QW
2141 if (ret < 0)
2142 goto cleanup;
53b381b3
DW
2143 }
2144
2145 bios_to_read = bio_list_size(&bio_list);
2146 if (!bios_to_read) {
2147 /*
2148 * we might have no bios to read just because the pages
2149 * were up to date, or we might have no bios to read because
2150 * the devices were gone.
2151 */
4c664611 2152 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
53b381b3 2153 __raid_recover_end_io(rbio);
813f8a0e 2154 return 0;
53b381b3
DW
2155 } else {
2156 goto cleanup;
2157 }
2158 }
2159
2160 /*
4c664611
QW
2161 * The bioc may be freed once we submit the last bio. Make sure not to
2162 * touch it after that.
53b381b3 2163 */
b89e1b01 2164 atomic_set(&rbio->stripes_pending, bios_to_read);
d34e123d 2165 INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
bf28a605 2166 while ((bio = bio_list_pop(&bio_list))) {
d34e123d 2167 bio->bi_end_io = raid56_bio_end_io;
53b381b3 2168
b8bea09a
QW
2169 if (trace_raid56_scrub_read_recover_enabled()) {
2170 struct raid56_bio_trace_info trace_info = { 0 };
53b381b3 2171
b8bea09a
QW
2172 bio_get_trace_info(rbio, bio, &trace_info);
2173 trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
2174 }
4e49ea4a 2175 submit_bio(bio);
53b381b3 2176 }
813f8a0e 2177
53b381b3
DW
2178 return 0;
2179
2180cleanup:
b4ee1782
OS
2181 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2182 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
58efbc9f 2183 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2184
2185 while ((bio = bio_list_pop(&bio_list)))
2186 bio_put(bio);
2187
53b381b3
DW
2188 return -EIO;
2189}
2190
2191/*
2192 * the main entry point for reads from the higher layers. This
2193 * is really only called when the normal read path had a failure,
2194 * so we assume the bio they send down corresponds to a failed part
2195 * of the drive.
2196 */
6065fd95 2197void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
f1c29379 2198 int mirror_num)
53b381b3 2199{
6a258d72 2200 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 2201 struct btrfs_raid_bio *rbio;
53b381b3 2202
ff18a4af 2203 rbio = alloc_rbio(fs_info, bioc);
af8e2d1d 2204 if (IS_ERR(rbio)) {
6065fd95
CH
2205 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2206 goto out_end_bio;
af8e2d1d 2207 }
53b381b3 2208
1b94b556 2209 rbio->operation = BTRFS_RBIO_READ_REBUILD;
bd8f7e62 2210 rbio_add_bio(rbio, bio);
53b381b3
DW
2211
2212 rbio->faila = find_logical_bio_stripe(rbio, bio);
2213 if (rbio->faila == -1) {
0b246afa 2214 btrfs_warn(fs_info,
4c664611 2215"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
1201b58b 2216 __func__, bio->bi_iter.bi_sector << 9,
4c664611 2217 (u64)bio->bi_iter.bi_size, bioc->map_type);
ff2b64a2 2218 free_raid_bio(rbio);
6065fd95
CH
2219 bio->bi_status = BLK_STS_IOERR;
2220 goto out_end_bio;
53b381b3
DW
2221 }
2222
2223 /*
8810f751
LB
2224 * Loop retry:
2225 * for 'mirror == 2', reconstruct from all other stripes.
2226 * for 'mirror_num > 2', select a stripe to fail on every retry.
53b381b3 2227 */
8810f751
LB
2228 if (mirror_num > 2) {
2229 /*
2230 * 'mirror == 3' is to fail the p stripe and
2231 * reconstruct from the q stripe. 'mirror > 3' is to
2232 * fail a data stripe and reconstruct from p+q stripe.
2233 */
2234 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2235 ASSERT(rbio->failb > 0);
2236 if (rbio->failb <= rbio->faila)
2237 rbio->failb--;
2238 }
53b381b3 2239
6065fd95
CH
2240 if (lock_stripe_add(rbio))
2241 return;
53b381b3
DW
2242
2243 /*
6065fd95
CH
2244 * This adds our rbio to the list of rbios that will be handled after
2245 * the current lock owner is done.
53b381b3 2246 */
6065fd95
CH
2247 __raid56_parity_recover(rbio);
2248 return;
53b381b3 2249
6065fd95 2250out_end_bio:
6065fd95 2251 bio_endio(bio);
53b381b3
DW
2252}
2253
385de0ef 2254static void rmw_work(struct work_struct *work)
53b381b3
DW
2255{
2256 struct btrfs_raid_bio *rbio;
2257
2258 rbio = container_of(work, struct btrfs_raid_bio, work);
2259 raid56_rmw_stripe(rbio);
2260}
2261
385de0ef 2262static void read_rebuild_work(struct work_struct *work)
53b381b3
DW
2263{
2264 struct btrfs_raid_bio *rbio;
2265
2266 rbio = container_of(work, struct btrfs_raid_bio, work);
2267 __raid56_parity_recover(rbio);
2268}
5a6ac9ea
MX
2269
2270/*
2271 * The following code is used to scrub/replace the parity stripe
2272 *
4c664611 2273 * Caller must have already increased bio_counter for getting @bioc.
ae6529c3 2274 *
5a6ac9ea
MX
2275 * Note: We need make sure all the pages that add into the scrub/replace
2276 * raid bio are correct and not be changed during the scrub/replace. That
2277 * is those pages just hold metadata or file data with checksum.
2278 */
2279
6a258d72
QW
2280struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2281 struct btrfs_io_context *bioc,
ff18a4af 2282 struct btrfs_device *scrub_dev,
6a258d72 2283 unsigned long *dbitmap, int stripe_nsectors)
5a6ac9ea 2284{
6a258d72 2285 struct btrfs_fs_info *fs_info = bioc->fs_info;
5a6ac9ea
MX
2286 struct btrfs_raid_bio *rbio;
2287 int i;
2288
ff18a4af 2289 rbio = alloc_rbio(fs_info, bioc);
5a6ac9ea
MX
2290 if (IS_ERR(rbio))
2291 return NULL;
2292 bio_list_add(&rbio->bio_list, bio);
2293 /*
2294 * This is a special bio which is used to hold the completion handler
2295 * and make the scrub rbio is similar to the other types
2296 */
2297 ASSERT(!bio->bi_iter.bi_size);
2298 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2299
9cd3a7eb 2300 /*
4c664611 2301 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
9cd3a7eb
LB
2302 * to the end position, so this search can start from the first parity
2303 * stripe.
2304 */
2305 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
4c664611 2306 if (bioc->stripes[i].dev == scrub_dev) {
5a6ac9ea
MX
2307 rbio->scrubp = i;
2308 break;
2309 }
2310 }
9cd3a7eb 2311 ASSERT(i < rbio->real_stripes);
5a6ac9ea 2312
c67c68eb 2313 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
5a6ac9ea
MX
2314 return rbio;
2315}
2316
b4ee1782
OS
2317/* Used for both parity scrub and missing. */
2318void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
6346f6bf 2319 unsigned int pgoff, u64 logical)
5a6ac9ea 2320{
6346f6bf 2321 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
5a6ac9ea
MX
2322 int stripe_offset;
2323 int index;
2324
4c664611 2325 ASSERT(logical >= rbio->bioc->raid_map[0]);
6346f6bf 2326 ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
ff18a4af 2327 BTRFS_STRIPE_LEN * rbio->nr_data);
4c664611 2328 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
6346f6bf
QW
2329 index = stripe_offset / sectorsize;
2330 rbio->bio_sectors[index].page = page;
2331 rbio->bio_sectors[index].pgoff = pgoff;
5a6ac9ea
MX
2332}
2333
2334/*
2335 * We just scrub the parity that we have correct data on the same horizontal,
2336 * so we needn't allocate all pages for all the stripes.
2337 */
2338static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2339{
3907ce29 2340 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
aee35e4b 2341 int total_sector_nr;
5a6ac9ea 2342
aee35e4b
QW
2343 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2344 total_sector_nr++) {
2345 struct page *page;
2346 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2347 int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
5a6ac9ea 2348
aee35e4b
QW
2349 if (!test_bit(sectornr, &rbio->dbitmap))
2350 continue;
2351 if (rbio->stripe_pages[index])
2352 continue;
2353 page = alloc_page(GFP_NOFS);
2354 if (!page)
2355 return -ENOMEM;
2356 rbio->stripe_pages[index] = page;
5a6ac9ea 2357 }
eb357060 2358 index_stripe_sectors(rbio);
5a6ac9ea
MX
2359 return 0;
2360}
2361
5a6ac9ea
MX
2362static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2363 int need_check)
2364{
4c664611 2365 struct btrfs_io_context *bioc = rbio->bioc;
46900662 2366 const u32 sectorsize = bioc->fs_info->sectorsize;
1389053e 2367 void **pointers = rbio->finish_pointers;
c67c68eb 2368 unsigned long *pbitmap = &rbio->finish_pbitmap;
5a6ac9ea
MX
2369 int nr_data = rbio->nr_data;
2370 int stripe;
3e77605d 2371 int sectornr;
c17af965 2372 bool has_qstripe;
46900662
QW
2373 struct sector_ptr p_sector = { 0 };
2374 struct sector_ptr q_sector = { 0 };
5a6ac9ea
MX
2375 struct bio_list bio_list;
2376 struct bio *bio;
76035976 2377 int is_replace = 0;
5a6ac9ea
MX
2378 int ret;
2379
2380 bio_list_init(&bio_list);
2381
c17af965
DS
2382 if (rbio->real_stripes - rbio->nr_data == 1)
2383 has_qstripe = false;
2384 else if (rbio->real_stripes - rbio->nr_data == 2)
2385 has_qstripe = true;
2386 else
5a6ac9ea 2387 BUG();
5a6ac9ea 2388
4c664611 2389 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
76035976 2390 is_replace = 1;
c67c68eb 2391 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
76035976
MX
2392 }
2393
5a6ac9ea
MX
2394 /*
2395 * Because the higher layers(scrubber) are unlikely to
2396 * use this area of the disk again soon, so don't cache
2397 * it.
2398 */
2399 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2400
2401 if (!need_check)
2402 goto writeback;
2403
46900662
QW
2404 p_sector.page = alloc_page(GFP_NOFS);
2405 if (!p_sector.page)
5a6ac9ea 2406 goto cleanup;
46900662
QW
2407 p_sector.pgoff = 0;
2408 p_sector.uptodate = 1;
5a6ac9ea 2409
c17af965 2410 if (has_qstripe) {
d70cef0d 2411 /* RAID6, allocate and map temp space for the Q stripe */
46900662
QW
2412 q_sector.page = alloc_page(GFP_NOFS);
2413 if (!q_sector.page) {
2414 __free_page(p_sector.page);
2415 p_sector.page = NULL;
5a6ac9ea
MX
2416 goto cleanup;
2417 }
46900662
QW
2418 q_sector.pgoff = 0;
2419 q_sector.uptodate = 1;
2420 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
5a6ac9ea
MX
2421 }
2422
2423 atomic_set(&rbio->error, 0);
2424
d70cef0d 2425 /* Map the parity stripe just once */
46900662 2426 pointers[nr_data] = kmap_local_page(p_sector.page);
d70cef0d 2427
c67c68eb 2428 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
46900662 2429 struct sector_ptr *sector;
5a6ac9ea 2430 void *parity;
46900662 2431
5a6ac9ea
MX
2432 /* first collect one page from each data stripe */
2433 for (stripe = 0; stripe < nr_data; stripe++) {
46900662
QW
2434 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2435 pointers[stripe] = kmap_local_page(sector->page) +
2436 sector->pgoff;
5a6ac9ea
MX
2437 }
2438
c17af965 2439 if (has_qstripe) {
d70cef0d 2440 /* RAID6, call the library function to fill in our P/Q */
46900662 2441 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
5a6ac9ea
MX
2442 pointers);
2443 } else {
2444 /* raid5 */
46900662
QW
2445 memcpy(pointers[nr_data], pointers[0], sectorsize);
2446 run_xor(pointers + 1, nr_data - 1, sectorsize);
5a6ac9ea
MX
2447 }
2448
01327610 2449 /* Check scrubbing parity and repair it */
46900662
QW
2450 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2451 parity = kmap_local_page(sector->page) + sector->pgoff;
2452 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2453 memcpy(parity, pointers[rbio->scrubp], sectorsize);
5a6ac9ea
MX
2454 else
2455 /* Parity is right, needn't writeback */
c67c68eb 2456 bitmap_clear(&rbio->dbitmap, sectornr, 1);
58c1a35c 2457 kunmap_local(parity);
5a6ac9ea 2458
94a0b58d
IW
2459 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2460 kunmap_local(pointers[stripe]);
5a6ac9ea
MX
2461 }
2462
94a0b58d 2463 kunmap_local(pointers[nr_data]);
46900662
QW
2464 __free_page(p_sector.page);
2465 p_sector.page = NULL;
2466 if (q_sector.page) {
94a0b58d 2467 kunmap_local(pointers[rbio->real_stripes - 1]);
46900662
QW
2468 __free_page(q_sector.page);
2469 q_sector.page = NULL;
d70cef0d 2470 }
5a6ac9ea
MX
2471
2472writeback:
2473 /*
2474 * time to start writing. Make bios for everything from the
2475 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2476 * everything else.
2477 */
c67c68eb 2478 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3e77605d 2479 struct sector_ptr *sector;
5a6ac9ea 2480
3e77605d
QW
2481 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2482 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
ff18a4af 2483 sectornr, REQ_OP_WRITE);
5a6ac9ea
MX
2484 if (ret)
2485 goto cleanup;
2486 }
2487
76035976
MX
2488 if (!is_replace)
2489 goto submit_write;
2490
3e77605d
QW
2491 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2492 struct sector_ptr *sector;
76035976 2493
3e77605d
QW
2494 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2495 ret = rbio_add_io_sector(rbio, &bio_list, sector,
4c664611 2496 bioc->tgtdev_map[rbio->scrubp],
ff18a4af 2497 sectornr, REQ_OP_WRITE);
76035976
MX
2498 if (ret)
2499 goto cleanup;
2500 }
2501
2502submit_write:
5a6ac9ea
MX
2503 nr_data = bio_list_size(&bio_list);
2504 if (!nr_data) {
2505 /* Every parity is right */
58efbc9f 2506 rbio_orig_end_io(rbio, BLK_STS_OK);
5a6ac9ea
MX
2507 return;
2508 }
2509
2510 atomic_set(&rbio->stripes_pending, nr_data);
2511
bf28a605 2512 while ((bio = bio_list_pop(&bio_list))) {
a6111d11 2513 bio->bi_end_io = raid_write_end_io;
4e49ea4a 2514
b8bea09a
QW
2515 if (trace_raid56_scrub_write_stripe_enabled()) {
2516 struct raid56_bio_trace_info trace_info = { 0 };
2517
2518 bio_get_trace_info(rbio, bio, &trace_info);
2519 trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
2520 }
4e49ea4a 2521 submit_bio(bio);
5a6ac9ea
MX
2522 }
2523 return;
2524
2525cleanup:
58efbc9f 2526 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2527
2528 while ((bio = bio_list_pop(&bio_list)))
2529 bio_put(bio);
5a6ac9ea
MX
2530}
2531
2532static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2533{
2534 if (stripe >= 0 && stripe < rbio->nr_data)
2535 return 1;
2536 return 0;
2537}
2538
2539/*
2540 * While we're doing the parity check and repair, we could have errors
2541 * in reading pages off the disk. This checks for errors and if we're
2542 * not able to read the page it'll trigger parity reconstruction. The
2543 * parity scrub will be finished after we've reconstructed the failed
2544 * stripes
2545 */
2546static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2547{
4c664611 2548 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
5a6ac9ea
MX
2549 goto cleanup;
2550
2551 if (rbio->faila >= 0 || rbio->failb >= 0) {
2552 int dfail = 0, failp = -1;
2553
2554 if (is_data_stripe(rbio, rbio->faila))
2555 dfail++;
2556 else if (is_parity_stripe(rbio->faila))
2557 failp = rbio->faila;
2558
2559 if (is_data_stripe(rbio, rbio->failb))
2560 dfail++;
2561 else if (is_parity_stripe(rbio->failb))
2562 failp = rbio->failb;
2563
2564 /*
2565 * Because we can not use a scrubbing parity to repair
2566 * the data, so the capability of the repair is declined.
2567 * (In the case of RAID5, we can not repair anything)
2568 */
4c664611 2569 if (dfail > rbio->bioc->max_errors - 1)
5a6ac9ea
MX
2570 goto cleanup;
2571
2572 /*
2573 * If all data is good, only parity is correctly, just
2574 * repair the parity.
2575 */
2576 if (dfail == 0) {
2577 finish_parity_scrub(rbio, 0);
2578 return;
2579 }
2580
2581 /*
2582 * Here means we got one corrupted data stripe and one
2583 * corrupted parity on RAID6, if the corrupted parity
01327610 2584 * is scrubbing parity, luckily, use the other one to repair
5a6ac9ea
MX
2585 * the data, or we can not repair the data stripe.
2586 */
2587 if (failp != rbio->scrubp)
2588 goto cleanup;
2589
2590 __raid_recover_end_io(rbio);
2591 } else {
2592 finish_parity_scrub(rbio, 1);
2593 }
2594 return;
2595
2596cleanup:
58efbc9f 2597 rbio_orig_end_io(rbio, BLK_STS_IOERR);
5a6ac9ea
MX
2598}
2599
2600/*
2601 * end io for the read phase of the rmw cycle. All the bios here are physical
2602 * stripe bios we've read from the disk so we can recalculate the parity of the
2603 * stripe.
2604 *
2605 * This will usually kick off finish_rmw once all the bios are read in, but it
2606 * may trigger parity reconstruction if we had any errors along the way
2607 */
d34e123d 2608static void raid56_parity_scrub_end_io_work(struct work_struct *work)
5a6ac9ea 2609{
d34e123d
CH
2610 struct btrfs_raid_bio *rbio =
2611 container_of(work, struct btrfs_raid_bio, end_io_work);
5a6ac9ea
MX
2612
2613 /*
d34e123d
CH
2614 * This will normally call finish_rmw to start our write, but if there
2615 * are any failed stripes we'll reconstruct from parity first
5a6ac9ea
MX
2616 */
2617 validate_rbio_for_parity_scrub(rbio);
2618}
2619
2620static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2621{
2622 int bios_to_read = 0;
5a6ac9ea
MX
2623 struct bio_list bio_list;
2624 int ret;
1c10702e 2625 int total_sector_nr;
5a6ac9ea
MX
2626 struct bio *bio;
2627
785884fc
LB
2628 bio_list_init(&bio_list);
2629
5a6ac9ea
MX
2630 ret = alloc_rbio_essential_pages(rbio);
2631 if (ret)
2632 goto cleanup;
2633
5a6ac9ea 2634 atomic_set(&rbio->error, 0);
1c10702e
QW
2635 /* Build a list of bios to read all the missing parts. */
2636 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2637 total_sector_nr++) {
2638 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2639 int stripe = total_sector_nr / rbio->stripe_nsectors;
2640 struct sector_ptr *sector;
5a6ac9ea 2641
1c10702e
QW
2642 /* No data in the vertical stripe, no need to read. */
2643 if (!test_bit(sectornr, &rbio->dbitmap))
2644 continue;
5a6ac9ea 2645
1c10702e
QW
2646 /*
2647 * We want to find all the sectors missing from the rbio and
2648 * read them from the disk. If sector_in_rbio() finds a sector
2649 * in the bio list we don't need to read it off the stripe.
2650 */
2651 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2652 if (sector)
2653 continue;
2654
2655 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2656 /*
2657 * The bio cache may have handed us an uptodate sector. If so,
2658 * use it.
2659 */
2660 if (sector->uptodate)
2661 continue;
2662
2663 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
ff18a4af 2664 sectornr, REQ_OP_READ);
1c10702e
QW
2665 if (ret)
2666 goto cleanup;
5a6ac9ea
MX
2667 }
2668
2669 bios_to_read = bio_list_size(&bio_list);
2670 if (!bios_to_read) {
2671 /*
2672 * this can happen if others have merged with
2673 * us, it means there is nothing left to read.
2674 * But if there are missing devices it may not be
2675 * safe to do the full stripe write yet.
2676 */
2677 goto finish;
2678 }
2679
2680 /*
4c664611
QW
2681 * The bioc may be freed once we submit the last bio. Make sure not to
2682 * touch it after that.
5a6ac9ea
MX
2683 */
2684 atomic_set(&rbio->stripes_pending, bios_to_read);
d34e123d 2685 INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
bf28a605 2686 while ((bio = bio_list_pop(&bio_list))) {
d34e123d 2687 bio->bi_end_io = raid56_bio_end_io;
5a6ac9ea 2688
b8bea09a
QW
2689 if (trace_raid56_scrub_read_enabled()) {
2690 struct raid56_bio_trace_info trace_info = { 0 };
5a6ac9ea 2691
b8bea09a
QW
2692 bio_get_trace_info(rbio, bio, &trace_info);
2693 trace_raid56_scrub_read(rbio, bio, &trace_info);
2694 }
4e49ea4a 2695 submit_bio(bio);
5a6ac9ea
MX
2696 }
2697 /* the actual write will happen once the reads are done */
2698 return;
2699
2700cleanup:
58efbc9f 2701 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2702
2703 while ((bio = bio_list_pop(&bio_list)))
2704 bio_put(bio);
2705
5a6ac9ea
MX
2706 return;
2707
2708finish:
2709 validate_rbio_for_parity_scrub(rbio);
2710}
2711
385de0ef 2712static void scrub_parity_work(struct work_struct *work)
5a6ac9ea
MX
2713{
2714 struct btrfs_raid_bio *rbio;
2715
2716 rbio = container_of(work, struct btrfs_raid_bio, work);
2717 raid56_parity_scrub_stripe(rbio);
2718}
2719
5a6ac9ea
MX
2720void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2721{
2722 if (!lock_stripe_add(rbio))
a81b747d 2723 start_async_work(rbio, scrub_parity_work);
5a6ac9ea 2724}
b4ee1782
OS
2725
2726/* The following code is used for dev replace of a missing RAID 5/6 device. */
2727
2728struct btrfs_raid_bio *
ff18a4af 2729raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
b4ee1782 2730{
6a258d72 2731 struct btrfs_fs_info *fs_info = bioc->fs_info;
b4ee1782
OS
2732 struct btrfs_raid_bio *rbio;
2733
ff18a4af 2734 rbio = alloc_rbio(fs_info, bioc);
b4ee1782
OS
2735 if (IS_ERR(rbio))
2736 return NULL;
2737
2738 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2739 bio_list_add(&rbio->bio_list, bio);
2740 /*
2741 * This is a special bio which is used to hold the completion handler
2742 * and make the scrub rbio is similar to the other types
2743 */
2744 ASSERT(!bio->bi_iter.bi_size);
2745
2746 rbio->faila = find_logical_bio_stripe(rbio, bio);
2747 if (rbio->faila == -1) {
f15fb2cd
QW
2748 btrfs_warn_rl(fs_info,
2749 "can not determine the failed stripe number for full stripe %llu",
2750 bioc->raid_map[0]);
ff2b64a2 2751 free_raid_bio(rbio);
b4ee1782
OS
2752 return NULL;
2753 }
2754
2755 return rbio;
2756}
2757
b4ee1782
OS
2758void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2759{
2760 if (!lock_stripe_add(rbio))
e66d8d5a 2761 start_async_work(rbio, read_rebuild_work);
b4ee1782 2762}