btrfs: raid56: make finish_parity_scrub() subpage compatible
[linux-block.git] / fs / btrfs / raid56.c
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
53b381b3
DW
2/*
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
53b381b3 5 */
c1d7c514 6
53b381b3 7#include <linux/sched.h>
53b381b3
DW
8#include <linux/bio.h>
9#include <linux/slab.h>
53b381b3 10#include <linux/blkdev.h>
53b381b3
DW
11#include <linux/raid/pq.h>
12#include <linux/hash.h>
13#include <linux/list_sort.h>
14#include <linux/raid/xor.h>
818e010b 15#include <linux/mm.h>
cea62800 16#include "misc.h"
53b381b3 17#include "ctree.h"
53b381b3 18#include "disk-io.h"
53b381b3
DW
19#include "volumes.h"
20#include "raid56.h"
21#include "async-thread.h"
53b381b3
DW
22
23/* set when additional merges to this rbio are not allowed */
24#define RBIO_RMW_LOCKED_BIT 1
25
4ae10b3a
CM
26/*
27 * set when this rbio is sitting in the hash, but it is just a cache
28 * of past RMW
29 */
30#define RBIO_CACHE_BIT 2
31
32/*
33 * set when it is safe to trust the stripe_pages for caching
34 */
35#define RBIO_CACHE_READY_BIT 3
36
4ae10b3a
CM
37#define RBIO_CACHE_SIZE 1024
38
8a953348
DS
39#define BTRFS_STRIPE_HASH_TABLE_BITS 11
40
41/* Used by the raid56 code to lock stripes for read/modify/write */
42struct btrfs_stripe_hash {
43 struct list_head hash_list;
44 spinlock_t lock;
45};
46
47/* Used by the raid56 code to lock stripes for read/modify/write */
48struct btrfs_stripe_hash_table {
49 struct list_head stripe_cache;
50 spinlock_t cache_lock;
51 int cache_size;
52 struct btrfs_stripe_hash table[];
53};
54
eb357060
QW
55/*
 56 * A bvec-like structure to represent a sector inside a page.
 57 *
 58 * Unlike bvec, we don't need bv_len, as it is fixed to sectorsize.
59 */
60struct sector_ptr {
61 struct page *page;
00425dd9
QW
62 unsigned int pgoff:24;
63 unsigned int uptodate:8;
eb357060
QW
64};
65
1b94b556 66enum btrfs_rbio_ops {
b4ee1782
OS
67 BTRFS_RBIO_WRITE,
68 BTRFS_RBIO_READ_REBUILD,
69 BTRFS_RBIO_PARITY_SCRUB,
70 BTRFS_RBIO_REBUILD_MISSING,
1b94b556
MX
71};
72
53b381b3 73struct btrfs_raid_bio {
4c664611 74 struct btrfs_io_context *bioc;
53b381b3 75
53b381b3
DW
76 /* while we're doing rmw on a stripe
77 * we put it into a hash table so we can
78 * lock the stripe and merge more rbios
79 * into it.
80 */
81 struct list_head hash_list;
82
4ae10b3a
CM
83 /*
84 * LRU list for the stripe cache
85 */
86 struct list_head stripe_cache;
87
53b381b3
DW
88 /*
89 * for scheduling work in the helper threads
90 */
91 struct btrfs_work work;
92
93 /*
94 * bio list and bio_list_lock are used
95 * to add more bios into the stripe
96 * in hopes of avoiding the full rmw
97 */
98 struct bio_list bio_list;
99 spinlock_t bio_list_lock;
100
6ac0f488
CM
101 /* also protected by the bio_list_lock, the
102 * plug list is used by the plugging code
103 * to collect partial bios while plugged. The
104 * stripe locking code also uses it to hand off
53b381b3
DW
105 * the stripe lock to the next pending IO
106 */
107 struct list_head plug_list;
108
109 /*
110 * flags that tell us if it is safe to
111 * merge with this bio
112 */
113 unsigned long flags;
114
53b381b3
DW
115 /*
116 * set if we're doing a parity rebuild
117 * for a read from higher up, which is handled
118 * differently from a parity rebuild as part of
119 * rmw
120 */
1b94b556 121 enum btrfs_rbio_ops operation;
53b381b3 122
29b06838
QW
123 /* Size of each individual stripe on disk */
124 u32 stripe_len;
53b381b3 125
29b06838
QW
126 /* How many pages there are for the full stripe including P/Q */
127 u16 nr_pages;
53b381b3 128
94efbe19
QW
129 /* How many sectors there are for the full stripe including P/Q */
130 u16 nr_sectors;
131
29b06838
QW
132 /* Number of data stripes (no p/q) */
133 u8 nr_data;
134
 135 /* Number of all stripes (including P/Q) */
136 u8 real_stripes;
137
138 /* How many pages there are for each stripe */
139 u8 stripe_npages;
140
94efbe19
QW
141 /* How many sectors there are for each stripe */
142 u8 stripe_nsectors;
143
29b06838
QW
144 /* First bad stripe, -1 means no corruption */
145 s8 faila;
146
147 /* Second bad stripe (for RAID6 use) */
148 s8 failb;
149
150 /* Stripe number that we're scrubbing */
151 u8 scrubp;
53b381b3
DW
152
153 /*
154 * size of all the bios in the bio_list. This
155 * helps us decide if the rbio maps to a full
156 * stripe or not
157 */
158 int bio_list_bytes;
159
4245215d
MX
160 int generic_bio_cnt;
161
dec95574 162 refcount_t refs;
53b381b3 163
b89e1b01
MX
164 atomic_t stripes_pending;
165
166 atomic_t error;
53b381b3
DW
167 /*
168 * these are two arrays of pointers. We allocate the
169 * rbio big enough to hold them both and setup their
170 * locations when the rbio is allocated
171 */
172
173 /* pointers to pages that we allocated for
174 * reading/writing stripes directly from the disk (including P/Q)
175 */
176 struct page **stripe_pages;
177
00425dd9
QW
178 /* Pointers to the sectors in the bio_list, for faster lookup */
179 struct sector_ptr *bio_sectors;
180
53b381b3
DW
181 /*
182 * pointers to the pages in the bio_list. Stored
183 * here for faster lookup
184 */
185 struct page **bio_pages;
5a6ac9ea
MX
186
187 /*
eb357060
QW
188 * For subpage support, we need to map each sector to above
189 * stripe_pages.
5a6ac9ea 190 */
eb357060
QW
191 struct sector_ptr *stripe_sectors;
192
193 /* Bitmap to record which horizontal stripe has data */
5a6ac9ea 194 unsigned long *dbitmap;
1389053e
KC
195
196 /* allocated with real_stripes-many pointers for finish_*() calls */
197 void **finish_pointers;
198
94efbe19 199 /* Allocated with stripe_nsectors-many bits for finish_*() calls */
1389053e 200 unsigned long *finish_pbitmap;
53b381b3
DW
201};
202
203static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
204static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
205static void rmw_work(struct btrfs_work *work);
206static void read_rebuild_work(struct btrfs_work *work);
53b381b3
DW
207static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
208static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
209static void __free_raid_bio(struct btrfs_raid_bio *rbio);
210static void index_rbio_pages(struct btrfs_raid_bio *rbio);
211static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
212
5a6ac9ea
MX
213static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
214 int need_check);
a81b747d 215static void scrub_parity_work(struct btrfs_work *work);
5a6ac9ea 216
ac638859
DS
217static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
218{
a0cac0ec 219 btrfs_init_work(&rbio->work, work_func, NULL, NULL);
6a258d72 220 btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
ac638859
DS
221}
222
53b381b3
DW
223/*
224 * the stripe hash table is used for locking, and to collect
225 * bios in hopes of making a full stripe
226 */
227int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
228{
229 struct btrfs_stripe_hash_table *table;
230 struct btrfs_stripe_hash_table *x;
231 struct btrfs_stripe_hash *cur;
232 struct btrfs_stripe_hash *h;
233 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
234 int i;
235
236 if (info->stripe_hash_table)
237 return 0;
238
83c8266a
DS
239 /*
240 * The table is large, starting with order 4 and can go as high as
241 * order 7 in case lock debugging is turned on.
242 *
243 * Try harder to allocate and fallback to vmalloc to lower the chance
244 * of a failing mount.
245 */
ee787f95 246 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
818e010b
DS
247 if (!table)
248 return -ENOMEM;
53b381b3 249
4ae10b3a
CM
250 spin_lock_init(&table->cache_lock);
251 INIT_LIST_HEAD(&table->stripe_cache);
252
53b381b3
DW
253 h = table->table;
254
255 for (i = 0; i < num_entries; i++) {
256 cur = h + i;
257 INIT_LIST_HEAD(&cur->hash_list);
258 spin_lock_init(&cur->lock);
53b381b3
DW
259 }
260
261 x = cmpxchg(&info->stripe_hash_table, NULL, table);
fe3b7bb0 262 kvfree(x);
53b381b3
DW
263 return 0;
264}
265
4ae10b3a
CM
266/*
 267 * Caching an rbio means copying everything from the
 268 * bio_pages array into the stripe_pages array. We
 269 * use the page uptodate bit in the stripe cache array
 270 * to indicate if it has valid data.
271 *
272 * once the caching is done, we set the cache ready
273 * bit.
274 */
275static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
276{
277 int i;
4ae10b3a
CM
278 int ret;
279
280 ret = alloc_rbio_pages(rbio);
281 if (ret)
282 return;
283
284 for (i = 0; i < rbio->nr_pages; i++) {
285 if (!rbio->bio_pages[i])
286 continue;
287
80cc8384 288 copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
4ae10b3a
CM
289 SetPageUptodate(rbio->stripe_pages[i]);
290 }
00425dd9
QW
291
292 /*
293 * This work is duplicated with the above loop, will be removed when
294 * the switch is done.
295 */
296 for (i = 0; i < rbio->nr_sectors; i++) {
297 /* Some range not covered by bio (partial write), skip it */
298 if (!rbio->bio_sectors[i].page)
299 continue;
300
301 ASSERT(rbio->stripe_sectors[i].page);
302 memcpy_page(rbio->stripe_sectors[i].page,
303 rbio->stripe_sectors[i].pgoff,
304 rbio->bio_sectors[i].page,
305 rbio->bio_sectors[i].pgoff,
306 rbio->bioc->fs_info->sectorsize);
307 rbio->stripe_sectors[i].uptodate = 1;
308 }
4ae10b3a
CM
309 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
310}
311
53b381b3
DW
312/*
313 * we hash on the first logical address of the stripe
314 */
315static int rbio_bucket(struct btrfs_raid_bio *rbio)
316{
4c664611 317 u64 num = rbio->bioc->raid_map[0];
53b381b3
DW
318
319 /*
320 * we shift down quite a bit. We're using byte
321 * addressing, and most of the lower bits are zeros.
322 * This tends to upset hash_64, and it consistently
323 * returns just one or two different values.
324 *
325 * shifting off the lower bits fixes things.
326 */
327 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
328}
329
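/*
 * Illustrative sketch, not part of the original file: why rbio_bucket()
 * shifts the logical address right by 16 before hashing.  Full stripe
 * start addresses are aligned to a large power of two, so the low bits
 * are always zero and would starve the hash of entropy.  The multiplier
 * below merely stands in for hash_64(); the constant and the function
 * name are assumptions made for this userspace-style demo.
 */
static unsigned int demo_stripe_bucket(unsigned long long logical,
                                       unsigned int table_bits)
{
        /* Drop the always-zero low bits of the byte address first. */
        unsigned long long key = logical >> 16;

        /* 64-bit multiplicative hash, keep the top table_bits bits. */
        key *= 0x61C8864680B583EBULL;
        return (unsigned int)(key >> (64 - table_bits));
}
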
eb357060
QW
330/*
331 * Update the stripe_sectors[] array to use correct page and pgoff
332 *
 333 * Should be called every time any page pointer in stripe_pages[] is modified.
334 */
335static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
336{
337 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
338 u32 offset;
339 int i;
340
341 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
342 int page_index = offset >> PAGE_SHIFT;
343
344 ASSERT(page_index < rbio->nr_pages);
345 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
346 rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
347 }
348}
349
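/*
 * Illustrative sketch, not part of the original file: the mapping that
 * index_stripe_sectors() maintains.  Sector i of the full stripe starts
 * at byte offset i * sectorsize; its backing page is that offset divided
 * by the page size and its pgoff is the remainder.  With a 4K sectorsize
 * and 64K pages (the subpage case) 16 sectors share each page.  The
 * struct and function names here are hypothetical.
 */
struct demo_sector_ref {
        unsigned int page_index;        /* index into stripe_pages[] */
        unsigned int pgoff;             /* byte offset inside that page */
};

static struct demo_sector_ref demo_map_sector(unsigned int sector_nr,
                                              unsigned int sectorsize,
                                              unsigned int page_size)
{
        unsigned int offset = sector_nr * sectorsize;
        struct demo_sector_ref ref = {
                .page_index = offset / page_size,
                .pgoff = offset % page_size,
        };

        return ref;
}
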
4ae10b3a
CM
350/*
351 * stealing an rbio means taking all the uptodate pages from the stripe
352 * array in the source rbio and putting them into the destination rbio
353 */
354static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
355{
356 int i;
357 struct page *s;
358 struct page *d;
359
360 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
361 return;
362
363 for (i = 0; i < dest->nr_pages; i++) {
364 s = src->stripe_pages[i];
365 if (!s || !PageUptodate(s)) {
366 continue;
367 }
368
369 d = dest->stripe_pages[i];
370 if (d)
371 __free_page(d);
372
373 dest->stripe_pages[i] = s;
374 src->stripe_pages[i] = NULL;
375 }
eb357060
QW
376 index_stripe_sectors(dest);
377 index_stripe_sectors(src);
4ae10b3a
CM
378}
379
53b381b3
DW
380/*
381 * merging means we take the bio_list from the victim and
382 * splice it into the destination. The victim should
383 * be discarded afterwards.
384 *
 385 * must be called with dest->bio_list_lock held
386 */
387static void merge_rbio(struct btrfs_raid_bio *dest,
388 struct btrfs_raid_bio *victim)
389{
390 bio_list_merge(&dest->bio_list, &victim->bio_list);
391 dest->bio_list_bytes += victim->bio_list_bytes;
4245215d 392 dest->generic_bio_cnt += victim->generic_bio_cnt;
53b381b3
DW
393 bio_list_init(&victim->bio_list);
394}
395
396/*
4ae10b3a
CM
397 * used to prune items that are in the cache. The caller
398 * must hold the hash table lock.
399 */
400static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
401{
402 int bucket = rbio_bucket(rbio);
403 struct btrfs_stripe_hash_table *table;
404 struct btrfs_stripe_hash *h;
405 int freeit = 0;
406
407 /*
408 * check the bit again under the hash table lock.
409 */
410 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
411 return;
412
6a258d72 413 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
414 h = table->table + bucket;
415
416 /* hold the lock for the bucket because we may be
417 * removing it from the hash table
418 */
419 spin_lock(&h->lock);
420
421 /*
422 * hold the lock for the bio list because we need
423 * to make sure the bio list is empty
424 */
425 spin_lock(&rbio->bio_list_lock);
426
427 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
428 list_del_init(&rbio->stripe_cache);
429 table->cache_size -= 1;
430 freeit = 1;
431
432 /* if the bio list isn't empty, this rbio is
433 * still involved in an IO. We take it out
434 * of the cache list, and drop the ref that
435 * was held for the list.
436 *
437 * If the bio_list was empty, we also remove
438 * the rbio from the hash_table, and drop
439 * the corresponding ref
440 */
441 if (bio_list_empty(&rbio->bio_list)) {
442 if (!list_empty(&rbio->hash_list)) {
443 list_del_init(&rbio->hash_list);
dec95574 444 refcount_dec(&rbio->refs);
4ae10b3a
CM
445 BUG_ON(!list_empty(&rbio->plug_list));
446 }
447 }
448 }
449
450 spin_unlock(&rbio->bio_list_lock);
451 spin_unlock(&h->lock);
452
453 if (freeit)
454 __free_raid_bio(rbio);
455}
456
457/*
458 * prune a given rbio from the cache
459 */
460static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
461{
462 struct btrfs_stripe_hash_table *table;
463 unsigned long flags;
464
465 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
466 return;
467
6a258d72 468 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
469
470 spin_lock_irqsave(&table->cache_lock, flags);
471 __remove_rbio_from_cache(rbio);
472 spin_unlock_irqrestore(&table->cache_lock, flags);
473}
474
475/*
476 * remove everything in the cache
477 */
48a3b636 478static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
4ae10b3a
CM
479{
480 struct btrfs_stripe_hash_table *table;
481 unsigned long flags;
482 struct btrfs_raid_bio *rbio;
483
484 table = info->stripe_hash_table;
485
486 spin_lock_irqsave(&table->cache_lock, flags);
487 while (!list_empty(&table->stripe_cache)) {
488 rbio = list_entry(table->stripe_cache.next,
489 struct btrfs_raid_bio,
490 stripe_cache);
491 __remove_rbio_from_cache(rbio);
492 }
493 spin_unlock_irqrestore(&table->cache_lock, flags);
494}
495
496/*
497 * remove all cached entries and free the hash table
498 * used by unmount
53b381b3
DW
499 */
500void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
501{
502 if (!info->stripe_hash_table)
503 return;
4ae10b3a 504 btrfs_clear_rbio_cache(info);
f749303b 505 kvfree(info->stripe_hash_table);
53b381b3
DW
506 info->stripe_hash_table = NULL;
507}
508
4ae10b3a
CM
509/*
510 * insert an rbio into the stripe cache. It
511 * must have already been prepared by calling
512 * cache_rbio_pages
513 *
514 * If this rbio was already cached, it gets
515 * moved to the front of the lru.
516 *
517 * If the size of the rbio cache is too big, we
518 * prune an item.
519 */
520static void cache_rbio(struct btrfs_raid_bio *rbio)
521{
522 struct btrfs_stripe_hash_table *table;
523 unsigned long flags;
524
525 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
526 return;
527
6a258d72 528 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
529
530 spin_lock_irqsave(&table->cache_lock, flags);
531 spin_lock(&rbio->bio_list_lock);
532
533 /* bump our ref if we were not in the list before */
534 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
dec95574 535 refcount_inc(&rbio->refs);
4ae10b3a
CM
536
537 if (!list_empty(&rbio->stripe_cache)){
538 list_move(&rbio->stripe_cache, &table->stripe_cache);
539 } else {
540 list_add(&rbio->stripe_cache, &table->stripe_cache);
541 table->cache_size += 1;
542 }
543
544 spin_unlock(&rbio->bio_list_lock);
545
546 if (table->cache_size > RBIO_CACHE_SIZE) {
547 struct btrfs_raid_bio *found;
548
549 found = list_entry(table->stripe_cache.prev,
550 struct btrfs_raid_bio,
551 stripe_cache);
552
553 if (found != rbio)
554 __remove_rbio_from_cache(found);
555 }
556
557 spin_unlock_irqrestore(&table->cache_lock, flags);
4ae10b3a
CM
558}
559
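/*
 * Illustrative sketch, not part of the original file: the cache policy
 * implemented by cache_rbio() above.  A newly cached rbio goes to the
 * front of the LRU list (or is moved there if it was already cached),
 * and once the cache grows beyond RBIO_CACHE_SIZE the entry at the tail
 * is pruned, unless that tail entry is the rbio that was just inserted.
 * The hash table, reference counting and locking are omitted here.
 */
static int demo_should_prune_tail(int cache_size, int cache_limit,
                                  int tail_id, int just_cached_id)
{
        return cache_size > cache_limit && tail_id != just_cached_id;
}
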
53b381b3
DW
560/*
561 * helper function to run the xor_blocks api. It is only
562 * able to do MAX_XOR_BLOCKS at a time, so we need to
563 * loop through.
564 */
565static void run_xor(void **pages, int src_cnt, ssize_t len)
566{
567 int src_off = 0;
568 int xor_src_cnt = 0;
569 void *dest = pages[src_cnt];
570
571 while(src_cnt > 0) {
572 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
573 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
574
575 src_cnt -= xor_src_cnt;
576 src_off += xor_src_cnt;
577 }
578}
579
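/*
 * Illustrative sketch, not part of the original file: the chunking done
 * by run_xor() above, with a plain byte-wise XOR standing in for the
 * kernel's xor_blocks() (an assumption made for this demo).  As in
 * run_xor(), the destination buffer lives at pages[src_cnt].
 */
#define DEMO_MAX_XOR_BLOCKS 4

static void demo_xor_range(void *dest, void **srcs, int cnt, unsigned long len)
{
        unsigned char *d = dest;
        unsigned long i;
        int s;

        for (s = 0; s < cnt; s++) {
                const unsigned char *src = srcs[s];

                for (i = 0; i < len; i++)
                        d[i] ^= src[i];
        }
}

static void demo_run_xor(void **pages, int src_cnt, unsigned long len)
{
        void *dest = pages[src_cnt];
        int src_off = 0;

        while (src_cnt > 0) {
                int chunk = src_cnt < DEMO_MAX_XOR_BLOCKS ?
                            src_cnt : DEMO_MAX_XOR_BLOCKS;

                demo_xor_range(dest, pages + src_off, chunk, len);
                src_cnt -= chunk;
                src_off += chunk;
        }
}
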
580/*
176571a1
DS
581 * Returns true if the bio list inside this rbio covers an entire stripe (no
582 * rmw required).
53b381b3 583 */
176571a1 584static int rbio_is_full(struct btrfs_raid_bio *rbio)
53b381b3 585{
176571a1 586 unsigned long flags;
53b381b3
DW
587 unsigned long size = rbio->bio_list_bytes;
588 int ret = 1;
589
176571a1 590 spin_lock_irqsave(&rbio->bio_list_lock, flags);
53b381b3
DW
591 if (size != rbio->nr_data * rbio->stripe_len)
592 ret = 0;
53b381b3 593 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
53b381b3 594 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
176571a1 595
53b381b3
DW
596 return ret;
597}
598
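/*
 * Illustrative sketch, not part of the original file: the arithmetic
 * behind rbio_is_full().  A write can skip the read half of RMW only if
 * the queued bios cover every data byte of the full stripe, which is
 * exactly nr_data * stripe_len bytes (parity is computed locally, never
 * submitted by the upper layers).  Locking is deliberately omitted.
 */
static int demo_covers_full_stripe(unsigned long bio_bytes,
                                   unsigned int nr_data,
                                   unsigned int stripe_len)
{
        return bio_bytes == (unsigned long)nr_data * stripe_len;
}
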
599/*
600 * returns 1 if it is safe to merge two rbios together.
601 * The merging is safe if the two rbios correspond to
602 * the same stripe and if they are both going in the same
603 * direction (read vs write), and if neither one is
604 * locked for final IO
605 *
606 * The caller is responsible for locking such that
607 * rmw_locked is safe to test
608 */
609static int rbio_can_merge(struct btrfs_raid_bio *last,
610 struct btrfs_raid_bio *cur)
611{
612 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
613 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
614 return 0;
615
4ae10b3a
CM
616 /*
617 * we can't merge with cached rbios, since the
618 * idea is that when we merge the destination
619 * rbio is going to run our IO for us. We can
01327610 620 * steal from cached rbios though, other functions
4ae10b3a
CM
621 * handle that.
622 */
623 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
624 test_bit(RBIO_CACHE_BIT, &cur->flags))
625 return 0;
626
4c664611 627 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
53b381b3
DW
628 return 0;
629
5a6ac9ea
MX
630 /* we can't merge with different operations */
631 if (last->operation != cur->operation)
632 return 0;
633 /*
 634 * We've read the full stripe from the drive, and will
 635 * check and repair the parity and write the new results.
636 *
637 * We're not allowed to add any new bios to the
638 * bio list here, anyone else that wants to
639 * change this stripe needs to do their own rmw.
640 */
db34be19 641 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
53b381b3 642 return 0;
53b381b3 643
db34be19 644 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
b4ee1782
OS
645 return 0;
646
cc54ff62
LB
647 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
648 int fa = last->faila;
649 int fb = last->failb;
650 int cur_fa = cur->faila;
651 int cur_fb = cur->failb;
652
653 if (last->faila >= last->failb) {
654 fa = last->failb;
655 fb = last->faila;
656 }
657
658 if (cur->faila >= cur->failb) {
659 cur_fa = cur->failb;
660 cur_fb = cur->faila;
661 }
662
663 if (fa != cur_fa || fb != cur_fb)
664 return 0;
665 }
53b381b3
DW
666 return 1;
667}
668
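/*
 * Illustrative sketch, not part of the original file: the order-insensitive
 * comparison of failed stripes that rbio_can_merge() performs for read
 * rebuild rbios.  Two rebuilds may only be merged when they reconstruct
 * the same set of failed stripes, no matter which failure was recorded
 * first, so the (faila, failb) pair is normalized before comparing.
 */
static int demo_same_failures(int a_faila, int a_failb,
                              int b_faila, int b_failb)
{
        int tmp;

        if (a_faila > a_failb) {
                tmp = a_faila;
                a_faila = a_failb;
                a_failb = tmp;
        }
        if (b_faila > b_failb) {
                tmp = b_faila;
                b_faila = b_failb;
                b_failb = tmp;
        }
        return a_faila == b_faila && a_failb == b_failb;
}
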
3e77605d
QW
669static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
670 unsigned int stripe_nr,
671 unsigned int sector_nr)
672{
673 ASSERT(stripe_nr < rbio->real_stripes);
674 ASSERT(sector_nr < rbio->stripe_nsectors);
675
676 return stripe_nr * rbio->stripe_nsectors + sector_nr;
677}
678
679/* Return a sector from rbio->stripe_sectors, not from the bio list */
680static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
681 unsigned int stripe_nr,
682 unsigned int sector_nr)
683{
684 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
685 sector_nr)];
686}
687
b7178a5f
ZL
688static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
689 int index)
690{
691 return stripe * rbio->stripe_npages + index;
692}
693
694/*
695 * these are just the pages from the rbio array, not from anything
696 * the FS sent down to us
697 */
698static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
699 int index)
700{
701 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
702}
703
53b381b3
DW
704/*
705 * helper to index into the pstripe
706 */
707static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
708{
b7178a5f 709 return rbio_stripe_page(rbio, rbio->nr_data, index);
53b381b3
DW
710}
711
712/*
713 * helper to index into the qstripe, returns null
714 * if there is no qstripe
715 */
716static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
717{
2c8cdd6e 718 if (rbio->nr_data + 1 == rbio->real_stripes)
53b381b3 719 return NULL;
b7178a5f 720 return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
53b381b3
DW
721}
722
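/*
 * Illustrative sketch, not part of the original file: the vertical layout
 * assumed by rbio_pstripe_page() and rbio_qstripe_page().  Data stripes
 * occupy indexes 0 .. nr_data - 1, P always lives at index nr_data, and
 * Q (RAID6 only) at nr_data + 1.  Returns -1 where the real helper
 * returns NULL.
 */
static int demo_pstripe_index(int nr_data)
{
        return nr_data;
}

static int demo_qstripe_index(int nr_data, int real_stripes)
{
        /* RAID5 has a single parity stripe, so there is no Q. */
        if (nr_data + 1 == real_stripes)
                return -1;
        return nr_data + 1;
}
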
723/*
724 * The first stripe in the table for a logical address
725 * has the lock. rbios are added in one of three ways:
726 *
727 * 1) Nobody has the stripe locked yet. The rbio is given
728 * the lock and 0 is returned. The caller must start the IO
729 * themselves.
730 *
731 * 2) Someone has the stripe locked, but we're able to merge
732 * with the lock owner. The rbio is freed and the IO will
733 * start automatically along with the existing rbio. 1 is returned.
734 *
735 * 3) Someone has the stripe locked, but we're not able to merge.
736 * The rbio is added to the lock owner's plug list, or merged into
737 * an rbio already on the plug list. When the lock owner unlocks,
738 * the next rbio on the list is run and the IO is started automatically.
739 * 1 is returned
740 *
741 * If we return 0, the caller still owns the rbio and must continue with
742 * IO submission. If we return 1, the caller must assume the rbio has
743 * already been freed.
744 */
745static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
746{
721860d5 747 struct btrfs_stripe_hash *h;
53b381b3
DW
748 struct btrfs_raid_bio *cur;
749 struct btrfs_raid_bio *pending;
750 unsigned long flags;
53b381b3 751 struct btrfs_raid_bio *freeit = NULL;
4ae10b3a 752 struct btrfs_raid_bio *cache_drop = NULL;
53b381b3 753 int ret = 0;
53b381b3 754
6a258d72 755 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
721860d5 756
53b381b3
DW
757 spin_lock_irqsave(&h->lock, flags);
758 list_for_each_entry(cur, &h->hash_list, hash_list) {
4c664611 759 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
9d6cb1b0 760 continue;
4ae10b3a 761
9d6cb1b0 762 spin_lock(&cur->bio_list_lock);
4ae10b3a 763
9d6cb1b0
JT
764 /* Can we steal this cached rbio's pages? */
765 if (bio_list_empty(&cur->bio_list) &&
766 list_empty(&cur->plug_list) &&
767 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
768 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
769 list_del_init(&cur->hash_list);
770 refcount_dec(&cur->refs);
53b381b3 771
9d6cb1b0
JT
772 steal_rbio(cur, rbio);
773 cache_drop = cur;
774 spin_unlock(&cur->bio_list_lock);
4ae10b3a 775
9d6cb1b0
JT
776 goto lockit;
777 }
53b381b3 778
9d6cb1b0
JT
779 /* Can we merge into the lock owner? */
780 if (rbio_can_merge(cur, rbio)) {
781 merge_rbio(cur, rbio);
53b381b3 782 spin_unlock(&cur->bio_list_lock);
9d6cb1b0 783 freeit = rbio;
53b381b3
DW
784 ret = 1;
785 goto out;
786 }
9d6cb1b0
JT
787
788
789 /*
790 * We couldn't merge with the running rbio, see if we can merge
791 * with the pending ones. We don't have to check for rmw_locked
792 * because there is no way they are inside finish_rmw right now
793 */
794 list_for_each_entry(pending, &cur->plug_list, plug_list) {
795 if (rbio_can_merge(pending, rbio)) {
796 merge_rbio(pending, rbio);
797 spin_unlock(&cur->bio_list_lock);
798 freeit = rbio;
799 ret = 1;
800 goto out;
801 }
802 }
803
804 /*
805 * No merging, put us on the tail of the plug list, our rbio
 806 * will be started when the currently running rbio unlocks
807 */
808 list_add_tail(&rbio->plug_list, &cur->plug_list);
809 spin_unlock(&cur->bio_list_lock);
810 ret = 1;
811 goto out;
53b381b3 812 }
4ae10b3a 813lockit:
dec95574 814 refcount_inc(&rbio->refs);
53b381b3
DW
815 list_add(&rbio->hash_list, &h->hash_list);
816out:
817 spin_unlock_irqrestore(&h->lock, flags);
4ae10b3a
CM
818 if (cache_drop)
819 remove_rbio_from_cache(cache_drop);
53b381b3
DW
820 if (freeit)
821 __free_raid_bio(freeit);
822 return ret;
823}
824
825/*
826 * called as rmw or parity rebuild is completed. If the plug list has more
827 * rbios waiting for this stripe, the next one on the list will be started
828 */
829static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
830{
831 int bucket;
832 struct btrfs_stripe_hash *h;
833 unsigned long flags;
4ae10b3a 834 int keep_cache = 0;
53b381b3
DW
835
836 bucket = rbio_bucket(rbio);
6a258d72 837 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
53b381b3 838
4ae10b3a
CM
839 if (list_empty(&rbio->plug_list))
840 cache_rbio(rbio);
841
53b381b3
DW
842 spin_lock_irqsave(&h->lock, flags);
843 spin_lock(&rbio->bio_list_lock);
844
845 if (!list_empty(&rbio->hash_list)) {
4ae10b3a
CM
846 /*
847 * if we're still cached and there is no other IO
848 * to perform, just leave this rbio here for others
849 * to steal from later
850 */
851 if (list_empty(&rbio->plug_list) &&
852 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
853 keep_cache = 1;
854 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
855 BUG_ON(!bio_list_empty(&rbio->bio_list));
856 goto done;
857 }
53b381b3
DW
858
859 list_del_init(&rbio->hash_list);
dec95574 860 refcount_dec(&rbio->refs);
53b381b3
DW
861
862 /*
863 * we use the plug list to hold all the rbios
864 * waiting for the chance to lock this stripe.
865 * hand the lock over to one of them.
866 */
867 if (!list_empty(&rbio->plug_list)) {
868 struct btrfs_raid_bio *next;
869 struct list_head *head = rbio->plug_list.next;
870
871 next = list_entry(head, struct btrfs_raid_bio,
872 plug_list);
873
874 list_del_init(&rbio->plug_list);
875
876 list_add(&next->hash_list, &h->hash_list);
dec95574 877 refcount_inc(&next->refs);
53b381b3
DW
878 spin_unlock(&rbio->bio_list_lock);
879 spin_unlock_irqrestore(&h->lock, flags);
880
1b94b556 881 if (next->operation == BTRFS_RBIO_READ_REBUILD)
e66d8d5a 882 start_async_work(next, read_rebuild_work);
b4ee1782
OS
883 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
884 steal_rbio(rbio, next);
e66d8d5a 885 start_async_work(next, read_rebuild_work);
b4ee1782 886 } else if (next->operation == BTRFS_RBIO_WRITE) {
4ae10b3a 887 steal_rbio(rbio, next);
cf6a4a75 888 start_async_work(next, rmw_work);
5a6ac9ea
MX
889 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
890 steal_rbio(rbio, next);
a81b747d 891 start_async_work(next, scrub_parity_work);
4ae10b3a 892 }
53b381b3
DW
893
894 goto done_nolock;
53b381b3
DW
895 }
896 }
4ae10b3a 897done:
53b381b3
DW
898 spin_unlock(&rbio->bio_list_lock);
899 spin_unlock_irqrestore(&h->lock, flags);
900
901done_nolock:
4ae10b3a
CM
902 if (!keep_cache)
903 remove_rbio_from_cache(rbio);
53b381b3
DW
904}
905
906static void __free_raid_bio(struct btrfs_raid_bio *rbio)
907{
908 int i;
909
dec95574 910 if (!refcount_dec_and_test(&rbio->refs))
53b381b3
DW
911 return;
912
4ae10b3a 913 WARN_ON(!list_empty(&rbio->stripe_cache));
53b381b3
DW
914 WARN_ON(!list_empty(&rbio->hash_list));
915 WARN_ON(!bio_list_empty(&rbio->bio_list));
916
917 for (i = 0; i < rbio->nr_pages; i++) {
918 if (rbio->stripe_pages[i]) {
919 __free_page(rbio->stripe_pages[i]);
920 rbio->stripe_pages[i] = NULL;
921 }
922 }
af8e2d1d 923
4c664611 924 btrfs_put_bioc(rbio->bioc);
53b381b3
DW
925 kfree(rbio);
926}
927
7583d8d0 928static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
53b381b3 929{
7583d8d0
LB
930 struct bio *next;
931
932 while (cur) {
933 next = cur->bi_next;
934 cur->bi_next = NULL;
935 cur->bi_status = err;
936 bio_endio(cur);
937 cur = next;
938 }
53b381b3
DW
939}
940
941/*
942 * this frees the rbio and runs through all the bios in the
943 * bio_list and calls end_io on them
944 */
4e4cbee9 945static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
53b381b3
DW
946{
947 struct bio *cur = bio_list_get(&rbio->bio_list);
7583d8d0 948 struct bio *extra;
4245215d
MX
949
950 if (rbio->generic_bio_cnt)
6a258d72 951 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
4245215d 952
7583d8d0
LB
953 /*
954 * At this moment, rbio->bio_list is empty, however since rbio does not
955 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
956 * hash list, rbio may be merged with others so that rbio->bio_list
957 * becomes non-empty.
958 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
959 * more and we can call bio_endio() on all queued bios.
960 */
961 unlock_stripe(rbio);
962 extra = bio_list_get(&rbio->bio_list);
963 __free_raid_bio(rbio);
53b381b3 964
7583d8d0
LB
965 rbio_endio_bio_list(cur, err);
966 if (extra)
967 rbio_endio_bio_list(extra, err);
53b381b3
DW
968}
969
970/*
971 * end io function used by finish_rmw. When we finally
972 * get here, we've written a full stripe
973 */
4246a0b6 974static void raid_write_end_io(struct bio *bio)
53b381b3
DW
975{
976 struct btrfs_raid_bio *rbio = bio->bi_private;
4e4cbee9 977 blk_status_t err = bio->bi_status;
a6111d11 978 int max_errors;
53b381b3
DW
979
980 if (err)
981 fail_bio_stripe(rbio, bio);
982
983 bio_put(bio);
984
b89e1b01 985 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
986 return;
987
58efbc9f 988 err = BLK_STS_OK;
53b381b3
DW
989
990 /* OK, we have read all the stripes we need to. */
a6111d11 991 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
4c664611 992 0 : rbio->bioc->max_errors;
a6111d11 993 if (atomic_read(&rbio->error) > max_errors)
4e4cbee9 994 err = BLK_STS_IOERR;
53b381b3 995
4246a0b6 996 rbio_orig_end_io(rbio, err);
53b381b3
DW
997}
998
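/*
 * Illustrative sketch, not part of the original file: the error budget
 * that raid_write_end_io() applies.  An ordinary write may lose up to
 * bioc->max_errors devices (1 for RAID5, 2 for RAID6) and still succeed,
 * while a parity scrub write tolerates no failed stripes at all.  Returns
 * 0 for success and -1 where the kernel reports BLK_STS_IOERR.
 */
static int demo_write_status(int errors, int max_errors, int is_parity_scrub)
{
        int budget = is_parity_scrub ? 0 : max_errors;

        return errors > budget ? -1 : 0;
}
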
3e77605d
QW
999/**
1000 * Get a sector pointer specified by its @stripe_nr and @sector_nr
1001 *
1002 * @rbio: The raid bio
1003 * @stripe_nr: Stripe number, valid range [0, real_stripe)
1004 * @sector_nr: Sector number inside the stripe,
1005 * valid range [0, stripe_nsectors)
1006 * @bio_list_only: Whether to use sectors inside the bio list only.
1007 *
1008 * The read/modify/write code wants to reuse the original bio page as much
1009 * as possible, and only use stripe_sectors as fallback.
1010 */
1011static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
1012 int stripe_nr, int sector_nr,
1013 bool bio_list_only)
1014{
1015 struct sector_ptr *sector;
1016 int index;
1017
1018 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
1019 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1020
1021 index = stripe_nr * rbio->stripe_nsectors + sector_nr;
1022 ASSERT(index >= 0 && index < rbio->nr_sectors);
1023
1024 spin_lock_irq(&rbio->bio_list_lock);
1025 sector = &rbio->bio_sectors[index];
1026 if (sector->page || bio_list_only) {
1027 /* Don't return sector without a valid page pointer */
1028 if (!sector->page)
1029 sector = NULL;
1030 spin_unlock_irq(&rbio->bio_list_lock);
1031 return sector;
1032 }
1033 spin_unlock_irq(&rbio->bio_list_lock);
1034
1035 return &rbio->stripe_sectors[index];
1036}
1037
53b381b3
DW
1038/*
1039 * the read/modify/write code wants to use the original bio for
1040 * any pages it included, and then use the rbio for everything
1041 * else. This function decides if a given index (stripe number)
1042 * and page number in that stripe fall inside the original bio
1043 * or the rbio.
1044 *
1045 * if you set bio_list_only, you'll get a NULL back for any ranges
1046 * that are outside the bio_list
1047 *
1048 * This doesn't take any refs on anything, you get a bare page pointer
1049 * and the caller must bump refs as required.
1050 *
1051 * You must call index_rbio_pages once before you can trust
1052 * the answers from this function.
1053 */
1054static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
1055 int index, int pagenr, int bio_list_only)
1056{
1057 int chunk_page;
1058 struct page *p = NULL;
1059
1060 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
1061
1062 spin_lock_irq(&rbio->bio_list_lock);
1063 p = rbio->bio_pages[chunk_page];
1064 spin_unlock_irq(&rbio->bio_list_lock);
1065
1066 if (p || bio_list_only)
1067 return p;
1068
1069 return rbio->stripe_pages[chunk_page];
1070}
1071
53b381b3
DW
1072/*
 1073 * Allocation and initial setup for the btrfs_raid_bio.  Note
 1074 * that this does not allocate any pages for rbio->stripe_pages.
1075 */
2ff7e61e 1076static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
4c664611 1077 struct btrfs_io_context *bioc,
cc353a8b 1078 u32 stripe_len)
53b381b3 1079{
843de58b
QW
1080 const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
1081 const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
1082 const unsigned int num_pages = stripe_npages * real_stripes;
94efbe19
QW
1083 const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
1084 const unsigned int num_sectors = stripe_nsectors * real_stripes;
53b381b3
DW
1085 struct btrfs_raid_bio *rbio;
1086 int nr_data = 0;
53b381b3
DW
1087 void *p;
1088
843de58b 1089 ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
94efbe19
QW
1090 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1091 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
843de58b 1092
1389053e
KC
1093 rbio = kzalloc(sizeof(*rbio) +
1094 sizeof(*rbio->stripe_pages) * num_pages +
00425dd9 1095 sizeof(*rbio->bio_sectors) * num_sectors +
eb357060 1096 sizeof(*rbio->stripe_sectors) * num_sectors +
1389053e 1097 sizeof(*rbio->finish_pointers) * real_stripes +
94efbe19
QW
1098 sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
1099 sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
1389053e 1100 GFP_NOFS);
af8e2d1d 1101 if (!rbio)
53b381b3 1102 return ERR_PTR(-ENOMEM);
53b381b3
DW
1103
1104 bio_list_init(&rbio->bio_list);
1105 INIT_LIST_HEAD(&rbio->plug_list);
1106 spin_lock_init(&rbio->bio_list_lock);
4ae10b3a 1107 INIT_LIST_HEAD(&rbio->stripe_cache);
53b381b3 1108 INIT_LIST_HEAD(&rbio->hash_list);
4c664611 1109 rbio->bioc = bioc;
53b381b3
DW
1110 rbio->stripe_len = stripe_len;
1111 rbio->nr_pages = num_pages;
94efbe19 1112 rbio->nr_sectors = num_sectors;
2c8cdd6e 1113 rbio->real_stripes = real_stripes;
5a6ac9ea 1114 rbio->stripe_npages = stripe_npages;
94efbe19 1115 rbio->stripe_nsectors = stripe_nsectors;
53b381b3
DW
1116 rbio->faila = -1;
1117 rbio->failb = -1;
dec95574 1118 refcount_set(&rbio->refs, 1);
b89e1b01
MX
1119 atomic_set(&rbio->error, 0);
1120 atomic_set(&rbio->stripes_pending, 0);
53b381b3
DW
1121
1122 /*
1389053e 1123 * the stripe_pages, bio_pages, etc arrays point to the extra
53b381b3
DW
1124 * memory we allocated past the end of the rbio
1125 */
1126 p = rbio + 1;
1389053e
KC
1127#define CONSUME_ALLOC(ptr, count) do { \
1128 ptr = p; \
1129 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1130 } while (0)
1131 CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1132 CONSUME_ALLOC(rbio->bio_pages, num_pages);
00425dd9 1133 CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
eb357060 1134 CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
1389053e 1135 CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
94efbe19
QW
1136 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
1137 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
1389053e 1138#undef CONSUME_ALLOC
53b381b3 1139
4c664611 1140 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
10f11900 1141 nr_data = real_stripes - 1;
4c664611 1142 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
2c8cdd6e 1143 nr_data = real_stripes - 2;
53b381b3 1144 else
10f11900 1145 BUG();
53b381b3
DW
1146
1147 rbio->nr_data = nr_data;
1148 return rbio;
1149}
1150
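/*
 * Illustrative sketch, not part of the original file: the CONSUME_ALLOC
 * pattern used by alloc_rbio().  A single allocation holds the fixed
 * struct plus every variable-length array, and the arrays are then
 * carved out of the tail in order.  This cut-down struct is hypothetical
 * and the caller is assumed to hand in one buffer big enough for the
 * struct plus both arrays (a single kzalloc() in the real code).
 */
struct demo_rbio {
        unsigned int nr_pages;
        unsigned int nr_sectors;
        void **stripe_pages;            /* nr_pages entries */
        unsigned int *sector_uptodate;  /* nr_sectors entries */
};

static struct demo_rbio *demo_layout_rbio(void *buf, unsigned int nr_pages,
                                          unsigned int nr_sectors)
{
        struct demo_rbio *rbio = buf;
        unsigned char *p = (unsigned char *)(rbio + 1);

        rbio->nr_pages = nr_pages;
        rbio->nr_sectors = nr_sectors;

        /* Carve each array out of the memory following the struct. */
        rbio->stripe_pages = (void **)p;
        p += sizeof(*rbio->stripe_pages) * nr_pages;
        rbio->sector_uptodate = (unsigned int *)p;

        return rbio;
}
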
1151/* allocate pages for all the stripes in the bio, including parity */
1152static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1153{
eb357060
QW
1154 int ret;
1155
1156 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
1157 if (ret < 0)
1158 return ret;
1159 /* Mapping all sectors */
1160 index_stripe_sectors(rbio);
1161 return 0;
53b381b3
DW
1162}
1163
b7178a5f 1164/* only allocate pages for p/q stripes */
53b381b3
DW
1165static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1166{
dd137dd1 1167 int data_pages = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
eb357060 1168 int ret;
53b381b3 1169
eb357060
QW
1170 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1171 rbio->stripe_pages + data_pages);
1172 if (ret < 0)
1173 return ret;
1174
1175 index_stripe_sectors(rbio);
1176 return 0;
53b381b3
DW
1177}
1178
1179/*
3e77605d
QW
1180 * Add a single sector @sector into our list of bios for IO.
1181 *
1182 * Return 0 if everything went well.
1183 * Return <0 for error.
53b381b3 1184 */
3e77605d
QW
1185static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1186 struct bio_list *bio_list,
1187 struct sector_ptr *sector,
1188 unsigned int stripe_nr,
1189 unsigned int sector_nr,
1190 unsigned long bio_max_len,
1191 unsigned int opf)
53b381b3 1192{
3e77605d 1193 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
53b381b3 1194 struct bio *last = bio_list->tail;
53b381b3
DW
1195 int ret;
1196 struct bio *bio;
4c664611 1197 struct btrfs_io_stripe *stripe;
53b381b3
DW
1198 u64 disk_start;
1199
3e77605d
QW
1200 /*
1201 * Note: here stripe_nr has taken device replace into consideration,
 1202 * thus it can be larger than rbio->real_stripes.
1203 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1204 */
1205 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1206 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1207 ASSERT(sector->page);
1208
1209 /* We don't yet support subpage, thus pgoff should always be 0 */
1210 ASSERT(sector->pgoff == 0);
1211
4c664611 1212 stripe = &rbio->bioc->stripes[stripe_nr];
3e77605d 1213 disk_start = stripe->physical + sector_nr * sectorsize;
53b381b3
DW
1214
1215 /* if the device is missing, just fail this stripe */
1216 if (!stripe->dev->bdev)
1217 return fail_rbio_index(rbio, stripe_nr);
1218
1219 /* see if we can add this page onto our existing bio */
1220 if (last) {
1201b58b 1221 u64 last_end = last->bi_iter.bi_sector << 9;
4f024f37 1222 last_end += last->bi_iter.bi_size;
53b381b3
DW
1223
1224 /*
1225 * we can't merge these if they are from different
1226 * devices or if they are not contiguous
1227 */
f90ae76a 1228 if (last_end == disk_start && !last->bi_status &&
309dca30 1229 last->bi_bdev == stripe->dev->bdev) {
3e77605d
QW
1230 ret = bio_add_page(last, sector->page, sectorsize,
1231 sector->pgoff);
1232 if (ret == sectorsize)
53b381b3
DW
1233 return 0;
1234 }
1235 }
1236
1237 /* put a new bio on the list */
e1b4b44e
CH
1238 bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
1239 opf, GFP_NOFS);
4f024f37 1240 bio->bi_iter.bi_sector = disk_start >> 9;
e01bf588 1241 bio->bi_private = rbio;
53b381b3 1242
3e77605d 1243 bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
53b381b3
DW
1244 bio_list_add(bio_list, bio);
1245 return 0;
1246}
1247
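/*
 * Illustrative sketch, not part of the original file: the two decisions
 * made by rbio_add_io_sector().  The on-disk start of a sector is the
 * stripe's physical offset plus sector_nr * sectorsize, and the sector is
 * appended to the bio at the tail of the list only if it lands exactly at
 * the end of that bio, targets the same device and the bio has not
 * already failed; otherwise a new bio is started.  Device identity is
 * reduced to an integer id for this demo.
 */
static unsigned long long demo_disk_start(unsigned long long stripe_physical,
                                          unsigned int sector_nr,
                                          unsigned int sectorsize)
{
        return stripe_physical + (unsigned long long)sector_nr * sectorsize;
}

static int demo_can_append_to_tail(unsigned long long tail_end,
                                   int tail_dev, int tail_has_error,
                                   unsigned long long disk_start, int dev)
{
        return tail_end == disk_start && !tail_has_error && tail_dev == dev;
}
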
1248/*
1249 * while we're doing the read/modify/write cycle, we could
1250 * have errors in reading pages off the disk. This checks
1251 * for errors and if we're not able to read the page it'll
1252 * trigger parity reconstruction. The rmw will be finished
1253 * after we've reconstructed the failed stripes
1254 */
1255static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1256{
1257 if (rbio->faila >= 0 || rbio->failb >= 0) {
2c8cdd6e 1258 BUG_ON(rbio->faila == rbio->real_stripes - 1);
53b381b3
DW
1259 __raid56_parity_recover(rbio);
1260 } else {
1261 finish_rmw(rbio);
1262 }
1263}
1264
00425dd9
QW
1265static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1266{
1267 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1268 struct bio_vec bvec;
1269 struct bvec_iter iter;
1270 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1271 rbio->bioc->raid_map[0];
1272
1273 if (bio_flagged(bio, BIO_CLONED))
1274 bio->bi_iter = btrfs_bio(bio)->iter;
1275
1276 bio_for_each_segment(bvec, bio, iter) {
1277 u32 bvec_offset;
1278
1279 for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1280 bvec_offset += sectorsize, offset += sectorsize) {
1281 int index = offset / sectorsize;
1282 struct sector_ptr *sector = &rbio->bio_sectors[index];
1283
1284 sector->page = bvec.bv_page;
1285 sector->pgoff = bvec.bv_offset + bvec_offset;
1286 ASSERT(sector->pgoff < PAGE_SIZE);
1287 }
1288 }
1289}
1290
53b381b3
DW
1291/*
1292 * helper function to walk our bio list and populate the bio_pages array with
1293 * the result. This seems expensive, but it is faster than constantly
1294 * searching through the bio list as we setup the IO in finish_rmw or stripe
1295 * reconstruction.
1296 *
1297 * This must be called before you trust the answers from page_in_rbio
1298 */
1299static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1300{
1301 struct bio *bio;
1302 u64 start;
1303 unsigned long stripe_offset;
1304 unsigned long page_index;
53b381b3
DW
1305
1306 spin_lock_irq(&rbio->bio_list_lock);
1307 bio_list_for_each(bio, &rbio->bio_list) {
6592e58c
FM
1308 struct bio_vec bvec;
1309 struct bvec_iter iter;
1310 int i = 0;
1311
1201b58b 1312 start = bio->bi_iter.bi_sector << 9;
4c664611 1313 stripe_offset = start - rbio->bioc->raid_map[0];
09cbfeaf 1314 page_index = stripe_offset >> PAGE_SHIFT;
53b381b3 1315
6592e58c
FM
1316 bio_for_each_segment(bvec, bio, iter) {
1317 rbio->bio_pages[page_index + i] = bvec.bv_page;
1318 i++;
1319 }
53b381b3 1320 }
00425dd9
QW
1321 /* This loop will replace above loop when the full switch is done */
1322 bio_list_for_each(bio, &rbio->bio_list)
1323 index_one_bio(rbio, bio);
1324
53b381b3
DW
1325 spin_unlock_irq(&rbio->bio_list_lock);
1326}
1327
1328/*
1329 * this is called from one of two situations. We either
1330 * have a full stripe from the higher layers, or we've read all
1331 * the missing bits off disk.
1332 *
1333 * This will calculate the parity and then send down any
1334 * changed blocks.
1335 */
1336static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1337{
4c664611 1338 struct btrfs_io_context *bioc = rbio->bioc;
1389053e 1339 void **pointers = rbio->finish_pointers;
53b381b3
DW
1340 int nr_data = rbio->nr_data;
1341 int stripe;
3e77605d 1342 int sectornr;
c17af965 1343 bool has_qstripe;
53b381b3
DW
1344 struct bio_list bio_list;
1345 struct bio *bio;
53b381b3
DW
1346 int ret;
1347
1348 bio_list_init(&bio_list);
1349
c17af965
DS
1350 if (rbio->real_stripes - rbio->nr_data == 1)
1351 has_qstripe = false;
1352 else if (rbio->real_stripes - rbio->nr_data == 2)
1353 has_qstripe = true;
1354 else
53b381b3 1355 BUG();
53b381b3
DW
1356
1357 /* at this point we either have a full stripe,
1358 * or we've read the full stripe from the drive.
1359 * recalculate the parity and write the new results.
1360 *
1361 * We're not allowed to add any new bios to the
1362 * bio list here, anyone else that wants to
1363 * change this stripe needs to do their own rmw.
1364 */
1365 spin_lock_irq(&rbio->bio_list_lock);
1366 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1367 spin_unlock_irq(&rbio->bio_list_lock);
1368
b89e1b01 1369 atomic_set(&rbio->error, 0);
53b381b3
DW
1370
1371 /*
1372 * now that we've set rmw_locked, run through the
1373 * bio list one last time and map the page pointers
4ae10b3a
CM
1374 *
1375 * We don't cache full rbios because we're assuming
1376 * the higher layers are unlikely to use this area of
1377 * the disk again soon. If they do use it again,
1378 * hopefully they will send another full bio.
53b381b3
DW
1379 */
1380 index_rbio_pages(rbio);
4ae10b3a
CM
1381 if (!rbio_is_full(rbio))
1382 cache_rbio_pages(rbio);
1383 else
1384 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
53b381b3 1385
3e77605d 1386 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
53b381b3
DW
1387 struct page *p;
1388 /* first collect one page from each data stripe */
1389 for (stripe = 0; stripe < nr_data; stripe++) {
3e77605d 1390 p = page_in_rbio(rbio, stripe, sectornr, 0);
94a0b58d 1391 pointers[stripe] = kmap_local_page(p);
53b381b3
DW
1392 }
1393
1394 /* then add the parity stripe */
3e77605d 1395 p = rbio_pstripe_page(rbio, sectornr);
53b381b3 1396 SetPageUptodate(p);
94a0b58d 1397 pointers[stripe++] = kmap_local_page(p);
53b381b3 1398
c17af965 1399 if (has_qstripe) {
53b381b3
DW
1400
1401 /*
1402 * raid6, add the qstripe and call the
1403 * library function to fill in our p/q
1404 */
3e77605d 1405 p = rbio_qstripe_page(rbio, sectornr);
53b381b3 1406 SetPageUptodate(p);
94a0b58d 1407 pointers[stripe++] = kmap_local_page(p);
53b381b3 1408
2c8cdd6e 1409 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
53b381b3
DW
1410 pointers);
1411 } else {
1412 /* raid5 */
69d24804 1413 copy_page(pointers[nr_data], pointers[0]);
09cbfeaf 1414 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
53b381b3 1415 }
94a0b58d
IW
1416 for (stripe = stripe - 1; stripe >= 0; stripe--)
1417 kunmap_local(pointers[stripe]);
53b381b3
DW
1418 }
1419
1420 /*
1421 * time to start writing. Make bios for everything from the
1422 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1423 * everything else.
1424 */
2c8cdd6e 1425 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
3e77605d
QW
1426 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1427 struct sector_ptr *sector;
1428
53b381b3 1429 if (stripe < rbio->nr_data) {
3e77605d
QW
1430 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1431 if (!sector)
53b381b3
DW
1432 continue;
1433 } else {
3e77605d 1434 sector = rbio_stripe_sector(rbio, stripe, sectornr);
53b381b3
DW
1435 }
1436
3e77605d
QW
1437 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1438 sectornr, rbio->stripe_len,
1439 REQ_OP_WRITE);
53b381b3
DW
1440 if (ret)
1441 goto cleanup;
1442 }
1443 }
1444
4c664611 1445 if (likely(!bioc->num_tgtdevs))
2c8cdd6e
MX
1446 goto write_data;
1447
1448 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
4c664611 1449 if (!bioc->tgtdev_map[stripe])
2c8cdd6e
MX
1450 continue;
1451
3e77605d
QW
1452 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1453 struct sector_ptr *sector;
1454
2c8cdd6e 1455 if (stripe < rbio->nr_data) {
3e77605d
QW
1456 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1457 if (!sector)
2c8cdd6e
MX
1458 continue;
1459 } else {
3e77605d 1460 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2c8cdd6e
MX
1461 }
1462
3e77605d 1463 ret = rbio_add_io_sector(rbio, &bio_list, sector,
4c664611 1464 rbio->bioc->tgtdev_map[stripe],
3e77605d 1465 sectornr, rbio->stripe_len,
e01bf588 1466 REQ_OP_WRITE);
2c8cdd6e
MX
1467 if (ret)
1468 goto cleanup;
1469 }
1470 }
1471
1472write_data:
b89e1b01
MX
1473 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1474 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
53b381b3 1475
bf28a605 1476 while ((bio = bio_list_pop(&bio_list))) {
53b381b3 1477 bio->bi_end_io = raid_write_end_io;
4e49ea4a
MC
1478
1479 submit_bio(bio);
53b381b3
DW
1480 }
1481 return;
1482
1483cleanup:
58efbc9f 1484 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1485
1486 while ((bio = bio_list_pop(&bio_list)))
1487 bio_put(bio);
53b381b3
DW
1488}
1489
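/*
 * Illustrative sketch, not part of the original file: the RAID5 half of
 * the per-sector parity loop in finish_rmw(), on plain byte buffers.  For
 * each sector column, P starts as a copy of the first data sector and the
 * remaining data sectors are XORed in, mirroring the copy_page() plus
 * run_xor() pair above.  The RAID6 Q computation is delegated to the
 * raid6 library in the real code and is not reproduced here.
 */
static void demo_raid5_gen_parity(unsigned char **data, unsigned char *parity,
                                  int nr_data, unsigned int sectorsize)
{
        unsigned int i;
        int stripe;

        for (i = 0; i < sectorsize; i++)
                parity[i] = data[0][i];

        for (stripe = 1; stripe < nr_data; stripe++) {
                for (i = 0; i < sectorsize; i++)
                        parity[i] ^= data[stripe][i];
        }
}
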
1490/*
1491 * helper to find the stripe number for a given bio. Used to figure out which
1492 * stripe has failed. This expects the bio to correspond to a physical disk,
1493 * so it looks up based on physical sector numbers.
1494 */
1495static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1496 struct bio *bio)
1497{
4f024f37 1498 u64 physical = bio->bi_iter.bi_sector;
53b381b3 1499 int i;
4c664611 1500 struct btrfs_io_stripe *stripe;
53b381b3
DW
1501
1502 physical <<= 9;
1503
4c664611
QW
1504 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1505 stripe = &rbio->bioc->stripes[i];
83025863 1506 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
309dca30 1507 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
53b381b3
DW
1508 return i;
1509 }
1510 }
1511 return -1;
1512}
1513
1514/*
1515 * helper to find the stripe number for a given
1516 * bio (before mapping). Used to figure out which stripe has
1517 * failed. This looks up based on logical block numbers.
1518 */
1519static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1520 struct bio *bio)
1521{
1201b58b 1522 u64 logical = bio->bi_iter.bi_sector << 9;
53b381b3
DW
1523 int i;
1524
53b381b3 1525 for (i = 0; i < rbio->nr_data; i++) {
4c664611 1526 u64 stripe_start = rbio->bioc->raid_map[i];
83025863
NB
1527
1528 if (in_range(logical, stripe_start, rbio->stripe_len))
53b381b3 1529 return i;
53b381b3
DW
1530 }
1531 return -1;
1532}
1533
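/*
 * Illustrative sketch, not part of the original file: the range check in
 * find_logical_bio_stripe().  Data stripe i owns the logical byte range
 * [raid_map[i], raid_map[i] + stripe_len); the first range that contains
 * the bio's starting logical address identifies the failed stripe.
 */
static int demo_find_logical_stripe(const unsigned long long *raid_map,
                                    int nr_data, unsigned int stripe_len,
                                    unsigned long long logical)
{
        int i;

        for (i = 0; i < nr_data; i++) {
                if (logical >= raid_map[i] &&
                    logical < raid_map[i] + stripe_len)
                        return i;
        }
        return -1;
}
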
1534/*
1535 * returns -EIO if we had too many failures
1536 */
1537static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1538{
1539 unsigned long flags;
1540 int ret = 0;
1541
1542 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1543
1544 /* we already know this stripe is bad, move on */
1545 if (rbio->faila == failed || rbio->failb == failed)
1546 goto out;
1547
1548 if (rbio->faila == -1) {
1549 /* first failure on this rbio */
1550 rbio->faila = failed;
b89e1b01 1551 atomic_inc(&rbio->error);
53b381b3
DW
1552 } else if (rbio->failb == -1) {
1553 /* second failure on this rbio */
1554 rbio->failb = failed;
b89e1b01 1555 atomic_inc(&rbio->error);
53b381b3
DW
1556 } else {
1557 ret = -EIO;
1558 }
1559out:
1560 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1561
1562 return ret;
1563}
1564
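/*
 * Illustrative sketch, not part of the original file: the bookkeeping in
 * fail_rbio_index().  At most two distinct failed stripes are recorded,
 * which is all that RAID6 parity can reconstruct; a repeated failure of
 * the same stripe is ignored and a third distinct failure is fatal.
 * Locking and the atomic error counter are omitted, and -5 stands in
 * for -EIO.
 */
static int demo_record_failure(signed char *faila, signed char *failb,
                               int failed)
{
        if (*faila == failed || *failb == failed)
                return 0;
        if (*faila == -1) {
                *faila = (signed char)failed;
                return 0;
        }
        if (*failb == -1) {
                *failb = (signed char)failed;
                return 0;
        }
        return -5;
}
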
1565/*
1566 * helper to fail a stripe based on a physical disk
1567 * bio.
1568 */
1569static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1570 struct bio *bio)
1571{
1572 int failed = find_bio_stripe(rbio, bio);
1573
1574 if (failed < 0)
1575 return -EIO;
1576
1577 return fail_rbio_index(rbio, failed);
1578}
1579
1580/*
1581 * this sets each page in the bio uptodate. It should only be used on private
1582 * rbio pages, nothing that comes in from the higher layers
1583 */
1584static void set_bio_pages_uptodate(struct bio *bio)
1585{
0198e5b7 1586 struct bio_vec *bvec;
6dc4f100 1587 struct bvec_iter_all iter_all;
6592e58c 1588
0198e5b7 1589 ASSERT(!bio_flagged(bio, BIO_CLONED));
53b381b3 1590
2b070cfe 1591 bio_for_each_segment_all(bvec, bio, iter_all)
0198e5b7 1592 SetPageUptodate(bvec->bv_page);
53b381b3
DW
1593}
1594
1595/*
1596 * end io for the read phase of the rmw cycle. All the bios here are physical
1597 * stripe bios we've read from the disk so we can recalculate the parity of the
1598 * stripe.
1599 *
1600 * This will usually kick off finish_rmw once all the bios are read in, but it
1601 * may trigger parity reconstruction if we had any errors along the way
1602 */
4246a0b6 1603static void raid_rmw_end_io(struct bio *bio)
53b381b3
DW
1604{
1605 struct btrfs_raid_bio *rbio = bio->bi_private;
1606
4e4cbee9 1607 if (bio->bi_status)
53b381b3
DW
1608 fail_bio_stripe(rbio, bio);
1609 else
1610 set_bio_pages_uptodate(bio);
1611
1612 bio_put(bio);
1613
b89e1b01 1614 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
1615 return;
1616
4c664611 1617 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
53b381b3
DW
1618 goto cleanup;
1619
1620 /*
1621 * this will normally call finish_rmw to start our write
1622 * but if there are any failed stripes we'll reconstruct
1623 * from parity first
1624 */
1625 validate_rbio_for_rmw(rbio);
1626 return;
1627
1628cleanup:
1629
58efbc9f 1630 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
1631}
1632
53b381b3
DW
1633/*
1634 * the stripe must be locked by the caller. It will
1635 * unlock after all the writes are done
1636 */
1637static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1638{
1639 int bios_to_read = 0;
53b381b3
DW
1640 struct bio_list bio_list;
1641 int ret;
3e77605d 1642 int sectornr;
53b381b3
DW
1643 int stripe;
1644 struct bio *bio;
1645
1646 bio_list_init(&bio_list);
1647
1648 ret = alloc_rbio_pages(rbio);
1649 if (ret)
1650 goto cleanup;
1651
1652 index_rbio_pages(rbio);
1653
b89e1b01 1654 atomic_set(&rbio->error, 0);
53b381b3
DW
1655 /*
1656 * build a list of bios to read all the missing parts of this
1657 * stripe
1658 */
1659 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
3e77605d
QW
1660 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1661 struct sector_ptr *sector;
1662
53b381b3 1663 /*
3e77605d
QW
1664 * We want to find all the sectors missing from the
 1665 * rbio and read them from the disk. If sector_in_rbio()
1666 * finds a page in the bio list we don't need to read
1667 * it off the stripe.
53b381b3 1668 */
3e77605d
QW
1669 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1670 if (sector)
53b381b3
DW
1671 continue;
1672
3e77605d 1673 sector = rbio_stripe_sector(rbio, stripe, sectornr);
4ae10b3a 1674 /*
3e77605d
QW
1675 * The bio cache may have handed us an uptodate page.
1676 * If so, be happy and use it.
4ae10b3a 1677 */
3e77605d 1678 if (sector->uptodate)
4ae10b3a
CM
1679 continue;
1680
3e77605d
QW
1681 ret = rbio_add_io_sector(rbio, &bio_list, sector,
1682 stripe, sectornr, rbio->stripe_len,
e01bf588 1683 REQ_OP_READ);
53b381b3
DW
1684 if (ret)
1685 goto cleanup;
1686 }
1687 }
1688
1689 bios_to_read = bio_list_size(&bio_list);
1690 if (!bios_to_read) {
1691 /*
1692 * this can happen if others have merged with
1693 * us, it means there is nothing left to read.
1694 * But if there are missing devices it may not be
1695 * safe to do the full stripe write yet.
1696 */
1697 goto finish;
1698 }
1699
1700 /*
4c664611
QW
1701 * The bioc may be freed once we submit the last bio. Make sure not to
1702 * touch it after that.
53b381b3 1703 */
b89e1b01 1704 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 1705 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
1706 bio->bi_end_io = raid_rmw_end_io;
1707
6a258d72 1708 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 1709
4e49ea4a 1710 submit_bio(bio);
53b381b3
DW
1711 }
1712 /* the actual write will happen once the reads are done */
1713 return 0;
1714
1715cleanup:
58efbc9f 1716 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1717
1718 while ((bio = bio_list_pop(&bio_list)))
1719 bio_put(bio);
1720
53b381b3
DW
1721 return -EIO;
1722
1723finish:
1724 validate_rbio_for_rmw(rbio);
1725 return 0;
1726}
1727
1728/*
1729 * if the upper layers pass in a full stripe, we thank them by only allocating
1730 * enough pages to hold the parity, and sending it all down quickly.
1731 */
1732static int full_stripe_write(struct btrfs_raid_bio *rbio)
1733{
1734 int ret;
1735
1736 ret = alloc_rbio_parity_pages(rbio);
3cd846d1
MX
1737 if (ret) {
1738 __free_raid_bio(rbio);
53b381b3 1739 return ret;
3cd846d1 1740 }
53b381b3
DW
1741
1742 ret = lock_stripe_add(rbio);
1743 if (ret == 0)
1744 finish_rmw(rbio);
1745 return 0;
1746}
1747
1748/*
1749 * partial stripe writes get handed over to async helpers.
1750 * We're really hoping to merge a few more writes into this
1751 * rbio before calculating new parity
1752 */
1753static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1754{
1755 int ret;
1756
1757 ret = lock_stripe_add(rbio);
1758 if (ret == 0)
cf6a4a75 1759 start_async_work(rbio, rmw_work);
53b381b3
DW
1760 return 0;
1761}
1762
1763/*
1764 * sometimes while we were reading from the drive to
 1765 * recalculate parity, enough new bios come in to create
1766 * a full stripe. So we do a check here to see if we can
1767 * go directly to finish_rmw
1768 */
1769static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1770{
1771 /* head off into rmw land if we don't have a full stripe */
1772 if (!rbio_is_full(rbio))
1773 return partial_stripe_write(rbio);
1774 return full_stripe_write(rbio);
1775}
1776
6ac0f488
CM
1777/*
1778 * We use plugging call backs to collect full stripes.
1779 * Any time we get a partial stripe write while plugged
1780 * we collect it into a list. When the unplug comes down,
1781 * we sort the list by logical block number and merge
1782 * everything we can into the same rbios
1783 */
1784struct btrfs_plug_cb {
1785 struct blk_plug_cb cb;
1786 struct btrfs_fs_info *info;
1787 struct list_head rbio_list;
1788 struct btrfs_work work;
1789};
1790
1791/*
1792 * rbios on the plug list are sorted for easier merging.
1793 */
4f0f586b
ST
1794static int plug_cmp(void *priv, const struct list_head *a,
1795 const struct list_head *b)
6ac0f488 1796{
214cc184
DS
1797 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1798 plug_list);
1799 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1800 plug_list);
4f024f37
KO
1801 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1802 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
6ac0f488
CM
1803
1804 if (a_sector < b_sector)
1805 return -1;
1806 if (a_sector > b_sector)
1807 return 1;
1808 return 0;
1809}
1810
1811static void run_plug(struct btrfs_plug_cb *plug)
1812{
1813 struct btrfs_raid_bio *cur;
1814 struct btrfs_raid_bio *last = NULL;
1815
1816 /*
1817 * sort our plug list then try to merge
1818 * everything we can in hopes of creating full
1819 * stripes.
1820 */
1821 list_sort(NULL, &plug->rbio_list, plug_cmp);
1822 while (!list_empty(&plug->rbio_list)) {
1823 cur = list_entry(plug->rbio_list.next,
1824 struct btrfs_raid_bio, plug_list);
1825 list_del_init(&cur->plug_list);
1826
1827 if (rbio_is_full(cur)) {
c7b562c5
DS
1828 int ret;
1829
6ac0f488 1830 /* we have a full stripe, send it down */
c7b562c5
DS
1831 ret = full_stripe_write(cur);
1832 BUG_ON(ret);
6ac0f488
CM
1833 continue;
1834 }
1835 if (last) {
1836 if (rbio_can_merge(last, cur)) {
1837 merge_rbio(last, cur);
1838 __free_raid_bio(cur);
1839 continue;
1840
1841 }
1842 __raid56_parity_write(last);
1843 }
1844 last = cur;
1845 }
1846 if (last) {
1847 __raid56_parity_write(last);
1848 }
1849 kfree(plug);
1850}
1851
1852/*
1853 * if the unplug comes from schedule, we have to push the
1854 * work off to a helper thread
1855 */
1856static void unplug_work(struct btrfs_work *work)
1857{
1858 struct btrfs_plug_cb *plug;
1859 plug = container_of(work, struct btrfs_plug_cb, work);
1860 run_plug(plug);
1861}
1862
1863static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1864{
1865 struct btrfs_plug_cb *plug;
1866 plug = container_of(cb, struct btrfs_plug_cb, cb);
1867
1868 if (from_schedule) {
a0cac0ec 1869 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
d05a33ac
QW
1870 btrfs_queue_work(plug->info->rmw_workers,
1871 &plug->work);
6ac0f488
CM
1872 return;
1873 }
1874 run_plug(plug);
1875}
1876
53b381b3
DW
1877/*
1878 * our main entry point for writes from the rest of the FS.
1879 */
cc353a8b 1880int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
53b381b3 1881{
6a258d72 1882 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 1883 struct btrfs_raid_bio *rbio;
6ac0f488
CM
1884 struct btrfs_plug_cb *plug = NULL;
1885 struct blk_plug_cb *cb;
4245215d 1886 int ret;
53b381b3 1887
4c664611 1888 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 1889 if (IS_ERR(rbio)) {
4c664611 1890 btrfs_put_bioc(bioc);
53b381b3 1891 return PTR_ERR(rbio);
af8e2d1d 1892 }
53b381b3 1893 bio_list_add(&rbio->bio_list, bio);
4f024f37 1894 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1b94b556 1895 rbio->operation = BTRFS_RBIO_WRITE;
6ac0f488 1896
0b246afa 1897 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
1898 rbio->generic_bio_cnt = 1;
1899
6ac0f488
CM
1900 /*
1901 * don't plug on full rbios, just get them out the door
1902 * as quickly as we can
1903 */
4245215d
MX
1904 if (rbio_is_full(rbio)) {
1905 ret = full_stripe_write(rbio);
1906 if (ret)
0b246afa 1907 btrfs_bio_counter_dec(fs_info);
4245215d
MX
1908 return ret;
1909 }
6ac0f488 1910
0b246afa 1911 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
6ac0f488
CM
1912 if (cb) {
1913 plug = container_of(cb, struct btrfs_plug_cb, cb);
1914 if (!plug->info) {
0b246afa 1915 plug->info = fs_info;
6ac0f488
CM
1916 INIT_LIST_HEAD(&plug->rbio_list);
1917 }
1918 list_add_tail(&rbio->plug_list, &plug->rbio_list);
4245215d 1919 ret = 0;
6ac0f488 1920 } else {
4245215d
MX
1921 ret = __raid56_parity_write(rbio);
1922 if (ret)
0b246afa 1923 btrfs_bio_counter_dec(fs_info);
6ac0f488 1924 }
4245215d 1925 return ret;
53b381b3
DW
1926}
1927
1928/*
1929 * all parity reconstruction happens here. We've read in everything
1930 * we can find from the drives and this does the heavy lifting of
1931 * sorting the good from the bad.
1932 */
1933static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1934{
1935 int pagenr, stripe;
1936 void **pointers;
94a0b58d 1937 void **unmap_array;
53b381b3 1938 int faila = -1, failb = -1;
53b381b3 1939 struct page *page;
58efbc9f 1940 blk_status_t err;
53b381b3
DW
1941 int i;
1942
31e818fe 1943 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
53b381b3 1944 if (!pointers) {
58efbc9f 1945 err = BLK_STS_RESOURCE;
53b381b3
DW
1946 goto cleanup_io;
1947 }
1948
94a0b58d
IW
1949 /*
1950 * Store copy of pointers that does not get reordered during
1951 * reconstruction so that kunmap_local works.
1952 */
1953 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1954 if (!unmap_array) {
1955 err = BLK_STS_RESOURCE;
1956 goto cleanup_pointers;
1957 }
1958
53b381b3
DW
1959 faila = rbio->faila;
1960 failb = rbio->failb;
1961
b4ee1782
OS
1962 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1963 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
53b381b3
DW
1964 spin_lock_irq(&rbio->bio_list_lock);
1965 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1966 spin_unlock_irq(&rbio->bio_list_lock);
1967 }
1968
1969 index_rbio_pages(rbio);
1970
915e2290 1971 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
5a6ac9ea
MX
1972 /*
1973 * Now we just use the bitmap to mark the horizontal stripes in
1974 * which we have data when doing parity scrub.
1975 */
1976 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1977 !test_bit(pagenr, rbio->dbitmap))
1978 continue;
1979
94a0b58d
IW
1980 /*
1981 * Setup our array of pointers with pages from each stripe
1982 *
1983 * NOTE: store a duplicate array of pointers to preserve the
1984 * pointer order
53b381b3 1985 */
2c8cdd6e 1986 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
53b381b3
DW
1987 /*
1988 * if we're rebuilding a read, we have to use
1989 * pages from the bio list
1990 */
b4ee1782
OS
1991 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1992 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
53b381b3
DW
1993 (stripe == faila || stripe == failb)) {
1994 page = page_in_rbio(rbio, stripe, pagenr, 0);
1995 } else {
1996 page = rbio_stripe_page(rbio, stripe, pagenr);
1997 }
94a0b58d
IW
1998 pointers[stripe] = kmap_local_page(page);
1999 unmap_array[stripe] = pointers[stripe];
53b381b3
DW
2000 }
2001
2002 /* all raid6 handling here */
4c664611 2003 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
53b381b3
DW
2004 /*
2005 * single failure, rebuild from parity raid5
2006 * style
2007 */
2008 if (failb < 0) {
2009 if (faila == rbio->nr_data) {
2010 /*
2011 * Just the P stripe has failed, without
2012 * a bad data or Q stripe.
2013 * TODO, we should redo the xor here.
2014 */
58efbc9f 2015 err = BLK_STS_IOERR;
53b381b3
DW
2016 goto cleanup;
2017 }
2018 /*
2019 * a single failure in raid6 is rebuilt
2020 * in the pstripe code below
2021 */
2022 goto pstripe;
2023 }
2024
2025 /* make sure our ps and qs are in order */
b7d2083a
NB
2026 if (faila > failb)
2027 swap(faila, failb);
53b381b3
DW
2028
2029 /* if the q stripe is failed, do a pstripe reconstruction
2030 * from the xors.
2031 * If both the q stripe and the P stripe are failed, we're
2032 * here due to a crc mismatch and we can't give them the
2033 * data they want
2034 */
4c664611
QW
2035 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
2036 if (rbio->bioc->raid_map[faila] ==
8e5cfb55 2037 RAID5_P_STRIPE) {
58efbc9f 2038 err = BLK_STS_IOERR;
53b381b3
DW
2039 goto cleanup;
2040 }
2041 /*
2042 * otherwise we have one bad data stripe and
2043 * a good P stripe. raid5!
2044 */
2045 goto pstripe;
2046 }
2047
4c664611 2048 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2c8cdd6e 2049 raid6_datap_recov(rbio->real_stripes,
53b381b3
DW
2050 PAGE_SIZE, faila, pointers);
2051 } else {
2c8cdd6e 2052 raid6_2data_recov(rbio->real_stripes,
53b381b3
DW
2053 PAGE_SIZE, faila, failb,
2054 pointers);
2055 }
2056 } else {
2057 void *p;
2058
2059 /* rebuild from P stripe here (raid5 or raid6) */
2060 BUG_ON(failb != -1);
2061pstripe:
2062 /* Copy parity block into failed block to start with */
69d24804 2063 copy_page(pointers[faila], pointers[rbio->nr_data]);
53b381b3
DW
2064
2065 /* rearrange the pointer array */
2066 p = pointers[faila];
2067 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
2068 pointers[stripe] = pointers[stripe + 1];
2069 pointers[rbio->nr_data - 1] = p;
2070
2071 /* xor in the rest */
09cbfeaf 2072 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
53b381b3
DW
2073 }
2074 /* if we're doing this rebuild as part of an rmw, go through
2075 * and set all of our private rbio pages in the
2076 * failed stripes as uptodate. This way finish_rmw will
2077 * know they can be trusted. If this was a read reconstruction,
2078 * other endio functions will fiddle the uptodate bits
2079 */
1b94b556 2080 if (rbio->operation == BTRFS_RBIO_WRITE) {
915e2290 2081 for (i = 0; i < rbio->stripe_npages; i++) {
53b381b3
DW
2082 if (faila != -1) {
2083 page = rbio_stripe_page(rbio, faila, i);
2084 SetPageUptodate(page);
2085 }
2086 if (failb != -1) {
2087 page = rbio_stripe_page(rbio, failb, i);
2088 SetPageUptodate(page);
2089 }
2090 }
2091 }
94a0b58d
IW
2092 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
2093 kunmap_local(unmap_array[stripe]);
53b381b3
DW
2094 }
2095
58efbc9f 2096 err = BLK_STS_OK;
53b381b3 2097cleanup:
94a0b58d
IW
2098 kfree(unmap_array);
2099cleanup_pointers:
53b381b3
DW
2100 kfree(pointers);
2101
2102cleanup_io:
580c6efa
LB
2103 /*
2104 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
2105 * valid rbio which is consistent with ondisk content, thus such a
2106 * valid rbio can be cached to avoid further disk reads.
2107 */
2108 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2109 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
44ac474d
LB
2110 /*
2111 * - In case of two failures, where rbio->failb != -1:
2112 *
2113 * Do not cache this rbio since the above read reconstruction
2114 * (raid6_datap_recov() or raid6_2data_recov()) may have
2115 * changed some content of stripes which are not identical to
2116 * on-disk content any more, otherwise, a later write/recover
2117 * may steal stripe_pages from this rbio and end up with
2118 * corruptions or rebuild failures.
2119 *
2120 * - In case of single failure, where rbio->failb == -1:
2121 *
2122 * Cache this rbio iff the above read reconstruction is
52042d8e 2123 * executed without problems.
44ac474d
LB
2124 */
2125 if (err == BLK_STS_OK && rbio->failb < 0)
4ae10b3a
CM
2126 cache_rbio_pages(rbio);
2127 else
2128 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2129
4246a0b6 2130 rbio_orig_end_io(rbio, err);
58efbc9f 2131 } else if (err == BLK_STS_OK) {
53b381b3
DW
2132 rbio->faila = -1;
2133 rbio->failb = -1;
5a6ac9ea
MX
2134
2135 if (rbio->operation == BTRFS_RBIO_WRITE)
2136 finish_rmw(rbio);
2137 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2138 finish_parity_scrub(rbio, 0);
2139 else
2140 BUG();
53b381b3 2141 } else {
4246a0b6 2142 rbio_orig_end_io(rbio, err);
53b381b3
DW
2143 }
2144}
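/*
 * The pstripe path above rebuilds a single failed data stripe by copying the
 * parity block over the failed block, rotating it to the end of the pointer
 * array and XOR-ing the surviving data into it. Below is a standalone sketch
 * of the underlying RAID5 relation (P = D0 ^ D1 ^ ..., so the lost block is P
 * XOR all surviving blocks), using made-up 4-byte "sectors"; rebuild_raid5()
 * is a hypothetical helper, not a btrfs function.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NR_DATA   3
#define SECTOR_SZ 4

/* Rebuild data stripe 'faila' from parity plus the surviving data. */
static void rebuild_raid5(uint8_t data[NR_DATA][SECTOR_SZ],
			  const uint8_t parity[SECTOR_SZ], int faila)
{
	memcpy(data[faila], parity, SECTOR_SZ);
	for (int d = 0; d < NR_DATA; d++) {
		if (d == faila)
			continue;
		for (int i = 0; i < SECTOR_SZ; i++)
			data[faila][i] ^= data[d][i];
	}
}

int main(void)
{
	uint8_t data[NR_DATA][SECTOR_SZ] = {
		{ 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 },
	};
	uint8_t parity[SECTOR_SZ];

	/* P = D0 ^ D1 ^ D2 */
	for (int i = 0; i < SECTOR_SZ; i++)
		parity[i] = data[0][i] ^ data[1][i] ^ data[2][i];

	memset(data[1], 0, SECTOR_SZ);	/* "lose" data stripe 1 */
	rebuild_raid5(data, parity, 1);
	printf("%u %u %u %u\n", data[1][0], data[1][1], data[1][2], data[1][3]);
	return 0;
}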
2145
2146/*
2147 * This is called only for stripes we've read from disk to
2148 * reconstruct the parity.
2149 */
4246a0b6 2150static void raid_recover_end_io(struct bio *bio)
53b381b3
DW
2151{
2152 struct btrfs_raid_bio *rbio = bio->bi_private;
2153
2154 /*
2155 * we only read stripe pages off the disk, set them
2156 * up to date if there were no errors
2157 */
4e4cbee9 2158 if (bio->bi_status)
53b381b3
DW
2159 fail_bio_stripe(rbio, bio);
2160 else
2161 set_bio_pages_uptodate(bio);
2162 bio_put(bio);
2163
b89e1b01 2164 if (!atomic_dec_and_test(&rbio->stripes_pending))
53b381b3
DW
2165 return;
2166
4c664611 2167 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
58efbc9f 2168 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
2169 else
2170 __raid_recover_end_io(rbio);
2171}
2172
2173/*
2174 * reads everything we need off the disk to reconstruct
2175 * the parity. endio handlers trigger final reconstruction
2176 * when the IO is done.
2177 *
2178 * This is used both for reads from the higher layers and for
2179 * parity construction required to finish a rmw cycle.
2180 */
2181static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2182{
2183 int bios_to_read = 0;
53b381b3
DW
2184 struct bio_list bio_list;
2185 int ret;
3e77605d 2186 int sectornr;
53b381b3
DW
2187 int stripe;
2188 struct bio *bio;
2189
2190 bio_list_init(&bio_list);
2191
2192 ret = alloc_rbio_pages(rbio);
2193 if (ret)
2194 goto cleanup;
2195
b89e1b01 2196 atomic_set(&rbio->error, 0);
53b381b3
DW
2197
2198 /*
4ae10b3a
CM
2199 * read everything that hasn't failed. Thanks to the
2200 * stripe cache, it is possible that some or all of these
2201 * pages are going to be uptodate.
53b381b3 2202 */
2c8cdd6e 2203 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
5588383e 2204 if (rbio->faila == stripe || rbio->failb == stripe) {
b89e1b01 2205 atomic_inc(&rbio->error);
53b381b3 2206 continue;
5588383e 2207 }
53b381b3 2208
3e77605d
QW
2209 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2210 struct sector_ptr *sector;
53b381b3
DW
2211
2212 /*
2213 * the rmw code may have already read this
2214 * page in
2215 */
3e77605d
QW
2216 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2217 if (sector->uptodate)
53b381b3
DW
2218 continue;
2219
3e77605d
QW
2220 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2221 stripe, sectornr, rbio->stripe_len,
2222 REQ_OP_READ);
53b381b3
DW
2223 if (ret < 0)
2224 goto cleanup;
2225 }
2226 }
2227
2228 bios_to_read = bio_list_size(&bio_list);
2229 if (!bios_to_read) {
2230 /*
2231 * we might have no bios to read just because the pages
2232 * were up to date, or we might have no bios to read because
2233 * the devices were gone.
2234 */
4c664611 2235 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
53b381b3 2236 __raid_recover_end_io(rbio);
813f8a0e 2237 return 0;
53b381b3
DW
2238 } else {
2239 goto cleanup;
2240 }
2241 }
2242
2243 /*
4c664611
QW
2244 * The bioc may be freed once we submit the last bio. Make sure not to
2245 * touch it after that.
53b381b3 2246 */
b89e1b01 2247 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2248 while ((bio = bio_list_pop(&bio_list))) {
53b381b3
DW
2249 bio->bi_end_io = raid_recover_end_io;
2250
6a258d72 2251 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
53b381b3 2252
4e49ea4a 2253 submit_bio(bio);
53b381b3 2254 }
813f8a0e 2255
53b381b3
DW
2256 return 0;
2257
2258cleanup:
b4ee1782
OS
2259 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2260 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
58efbc9f 2261 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2262
2263 while ((bio = bio_list_pop(&bio_list)))
2264 bio_put(bio);
2265
53b381b3
DW
2266 return -EIO;
2267}
2268
2269/*
2270 * the main entry point for reads from the higher layers. This
2271 * is really only called when the normal read path had a failure,
2272 * so we assume the bio they send down corresponds to a failed part
2273 * of the drive.
2274 */
6a258d72 2275int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
cc353a8b 2276 u32 stripe_len, int mirror_num, int generic_io)
53b381b3 2277{
6a258d72 2278 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3
DW
2279 struct btrfs_raid_bio *rbio;
2280 int ret;
2281
abad60c6 2282 if (generic_io) {
4c664611 2283 ASSERT(bioc->mirror_num == mirror_num);
c3a3b19b 2284 btrfs_bio(bio)->mirror_num = mirror_num;
abad60c6
LB
2285 }
2286
4c664611 2287 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 2288 if (IS_ERR(rbio)) {
6e9606d2 2289 if (generic_io)
4c664611 2290 btrfs_put_bioc(bioc);
53b381b3 2291 return PTR_ERR(rbio);
af8e2d1d 2292 }
53b381b3 2293
1b94b556 2294 rbio->operation = BTRFS_RBIO_READ_REBUILD;
53b381b3 2295 bio_list_add(&rbio->bio_list, bio);
4f024f37 2296 rbio->bio_list_bytes = bio->bi_iter.bi_size;
53b381b3
DW
2297
2298 rbio->faila = find_logical_bio_stripe(rbio, bio);
2299 if (rbio->faila == -1) {
0b246afa 2300 btrfs_warn(fs_info,
4c664611 2301"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
1201b58b 2302 __func__, bio->bi_iter.bi_sector << 9,
4c664611 2303 (u64)bio->bi_iter.bi_size, bioc->map_type);
6e9606d2 2304 if (generic_io)
4c664611 2305 btrfs_put_bioc(bioc);
53b381b3
DW
2306 kfree(rbio);
2307 return -EIO;
2308 }
2309
4245215d 2310 if (generic_io) {
0b246afa 2311 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
2312 rbio->generic_bio_cnt = 1;
2313 } else {
4c664611 2314 btrfs_get_bioc(bioc);
4245215d
MX
2315 }
2316
53b381b3 2317 /*
8810f751
LB
2318 * Loop retry:
2319 * for 'mirror == 2', reconstruct from all other stripes.
2320 * for 'mirror_num > 2', select a stripe to fail on every retry.
53b381b3 2321 */
8810f751
LB
2322 if (mirror_num > 2) {
2323 /*
2324 * 'mirror == 3' is to fail the p stripe and
2325 * reconstruct from the q stripe. 'mirror > 3' is to
2326 * fail a data stripe and reconstruct from p+q stripe.
2327 */
2328 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2329 ASSERT(rbio->failb > 0);
2330 if (rbio->failb <= rbio->faila)
2331 rbio->failb--;
2332 }
53b381b3
DW
2333
2334 ret = lock_stripe_add(rbio);
2335
2336 /*
2337 * __raid56_parity_recover will end the bio with
2338 * any errors it hits. We don't want to return
2339 * its error value up the stack because our caller
2340 * will end up calling bio_endio with any nonzero
2341 * return
2342 */
2343 if (ret == 0)
2344 __raid56_parity_recover(rbio);
2345 /*
2346 * our rbio has been added to the list of
2347 * rbios that will be handled after the
2348 * current lock owner is done
2349 */
2350 return 0;
2351
2352}
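/*
 * The 'mirror_num > 2' branch above turns the retry number into an extra
 * stripe to treat as failed: failb = real_stripes - (mirror_num - 1), nudged
 * down by one when it would collide with faila. Below is a standalone sketch
 * of just that arithmetic for a hypothetical 4-stripe RAID6 layout (two data
 * stripes plus P and Q); pick_failb() is an illustrative helper, not a btrfs
 * function.
 */
#include <assert.h>
#include <stdio.h>

/* Mirrors the 'mirror_num > 2' branch: pick the extra stripe to fail. */
static int pick_failb(int real_stripes, int mirror_num, int faila)
{
	int failb = real_stripes - (mirror_num - 1);

	assert(failb > 0);
	if (failb <= faila)
		failb--;
	return failb;
}

int main(void)
{
	/* 4 stripes total: data 0, data 1, P (2), Q (3); faila = 0. */
	printf("mirror 3 -> failb %d\n", pick_failb(4, 3, 0)); /* 2: fail P, rebuild from Q */
	printf("mirror 4 -> failb %d\n", pick_failb(4, 4, 0)); /* 1: fail the other data stripe */
	return 0;
}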
2353
2354static void rmw_work(struct btrfs_work *work)
2355{
2356 struct btrfs_raid_bio *rbio;
2357
2358 rbio = container_of(work, struct btrfs_raid_bio, work);
2359 raid56_rmw_stripe(rbio);
2360}
2361
2362static void read_rebuild_work(struct btrfs_work *work)
2363{
2364 struct btrfs_raid_bio *rbio;
2365
2366 rbio = container_of(work, struct btrfs_raid_bio, work);
2367 __raid56_parity_recover(rbio);
2368}
5a6ac9ea
MX
2369
2370/*
2371 * The following code is used to scrub/replace the parity stripe
2372 *
4c664611 2373 * Caller must have already increased bio_counter for getting @bioc.
ae6529c3 2374 *
5a6ac9ea
MX
2375 * Note: We need to make sure that all the pages added into the scrub/replace
2376 * raid bio are correct and will not be changed during the scrub/replace. That
2377 * is, those pages only hold metadata or file data with checksums.
2378 */
2379
6a258d72
QW
2380struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2381 struct btrfs_io_context *bioc,
cc353a8b 2382 u32 stripe_len, struct btrfs_device *scrub_dev,
6a258d72 2383 unsigned long *dbitmap, int stripe_nsectors)
5a6ac9ea 2384{
6a258d72 2385 struct btrfs_fs_info *fs_info = bioc->fs_info;
5a6ac9ea
MX
2386 struct btrfs_raid_bio *rbio;
2387 int i;
2388
4c664611 2389 rbio = alloc_rbio(fs_info, bioc, stripe_len);
5a6ac9ea
MX
2390 if (IS_ERR(rbio))
2391 return NULL;
2392 bio_list_add(&rbio->bio_list, bio);
2393 /*
2394 * This is a special bio which is used to hold the completion handler
2395 * and make the scrub rbio similar to the other types
2396 */
2397 ASSERT(!bio->bi_iter.bi_size);
2398 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2399
9cd3a7eb 2400 /*
4c664611 2401 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
9cd3a7eb
LB
2402 * to the end position, so this search can start from the first parity
2403 * stripe.
2404 */
2405 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
4c664611 2406 if (bioc->stripes[i].dev == scrub_dev) {
5a6ac9ea
MX
2407 rbio->scrubp = i;
2408 break;
2409 }
2410 }
9cd3a7eb 2411 ASSERT(i < rbio->real_stripes);
5a6ac9ea
MX
2412
2413 /* For now we only support the case where sectorsize equals page size */
0b246afa 2414 ASSERT(fs_info->sectorsize == PAGE_SIZE);
5a6ac9ea
MX
2415 ASSERT(rbio->stripe_npages == stripe_nsectors);
2416 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2417
ae6529c3 2418 /*
4c664611 2419 * We have already increased bio_counter when getting bioc, record it
ae6529c3
QW
2420 * so we can free it at rbio_orig_end_io().
2421 */
2422 rbio->generic_bio_cnt = 1;
2423
5a6ac9ea
MX
2424 return rbio;
2425}
2426
b4ee1782
OS
2427/* Used for both parity scrub and missing. */
2428void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2429 u64 logical)
5a6ac9ea
MX
2430{
2431 int stripe_offset;
2432 int index;
2433
4c664611
QW
2434 ASSERT(logical >= rbio->bioc->raid_map[0]);
2435 ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
5a6ac9ea 2436 rbio->stripe_len * rbio->nr_data);
4c664611 2437 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
09cbfeaf 2438 index = stripe_offset >> PAGE_SHIFT;
5a6ac9ea
MX
2439 rbio->bio_pages[index] = page;
2440}
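/*
 * raid56_add_scrub_pages() above turns a logical address into an index in the
 * flat bio_pages[] array: the offset from the start of the full stripe,
 * shifted down by the page size. Below is a standalone sketch of that mapping
 * with hypothetical numbers (4K pages, full stripe starting at 1MiB);
 * scrub_page_index() is an illustrative helper, not a btrfs function.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_SKETCH 12	/* assume 4K pages for the example */

/* Page index of a logical address inside the data area of the full stripe. */
static int scrub_page_index(uint64_t logical, uint64_t raid_map0)
{
	uint64_t stripe_offset = logical - raid_map0;

	return (int)(stripe_offset >> PAGE_SHIFT_SKETCH);
}

int main(void)
{
	uint64_t raid_map0 = 1ULL << 20;	/* full stripe starts at 1MiB */

	printf("%d\n", scrub_page_index(raid_map0, raid_map0));		/* 0 */
	printf("%d\n", scrub_page_index(raid_map0 + 0x6000, raid_map0));	/* 6 */
	return 0;
}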
2441
2442/*
2443 * We only scrub the parity for which we have correct data on the same
2444 * horizontal stripe, so we don't need to allocate pages for every stripe.
2445 */
2446static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2447{
2448 int i;
2449 int bit;
2450 int index;
2451 struct page *page;
2452
2453 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2c8cdd6e 2454 for (i = 0; i < rbio->real_stripes; i++) {
5a6ac9ea
MX
2455 index = i * rbio->stripe_npages + bit;
2456 if (rbio->stripe_pages[index])
2457 continue;
2458
b0ee5e1e 2459 page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2460 if (!page)
2461 return -ENOMEM;
2462 rbio->stripe_pages[index] = page;
5a6ac9ea
MX
2463 }
2464 }
eb357060 2465 index_stripe_sectors(rbio);
5a6ac9ea
MX
2466 return 0;
2467}
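/*
 * alloc_rbio_essential_pages() above only allocates pages for the horizontal
 * stripes flagged in dbitmap, and addresses stripe_pages[] as a flattened 2-D
 * array: index = stripe * stripe_npages + bit. Below is a standalone sketch
 * of that indexing for a hypothetical layout of 3 stripes with 4 pages each;
 * the constants and stripe_page_index() are illustrative only.
 */
#include <stdio.h>

#define REAL_STRIPES  3
#define STRIPE_NPAGES 4

/* Flattened 2-D index, as used for rbio->stripe_pages[]. */
static int stripe_page_index(int stripe, int pagenr)
{
	return stripe * STRIPE_NPAGES + pagenr;
}

int main(void)
{
	unsigned long dbitmap = 0x5;	/* only horizontal stripes 0 and 2 carry data */

	for (int bit = 0; bit < STRIPE_NPAGES; bit++) {
		if (!(dbitmap & (1UL << bit)))
			continue;
		for (int i = 0; i < REAL_STRIPES; i++)
			printf("stripe %d, page %d -> index %d\n",
			       i, bit, stripe_page_index(i, bit));
	}
	return 0;
}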
2468
5a6ac9ea
MX
2469static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2470 int need_check)
2471{
4c664611 2472 struct btrfs_io_context *bioc = rbio->bioc;
46900662 2473 const u32 sectorsize = bioc->fs_info->sectorsize;
1389053e
KC
2474 void **pointers = rbio->finish_pointers;
2475 unsigned long *pbitmap = rbio->finish_pbitmap;
5a6ac9ea
MX
2476 int nr_data = rbio->nr_data;
2477 int stripe;
3e77605d 2478 int sectornr;
c17af965 2479 bool has_qstripe;
46900662
QW
2480 struct sector_ptr p_sector = { 0 };
2481 struct sector_ptr q_sector = { 0 };
5a6ac9ea
MX
2482 struct bio_list bio_list;
2483 struct bio *bio;
76035976 2484 int is_replace = 0;
5a6ac9ea
MX
2485 int ret;
2486
2487 bio_list_init(&bio_list);
2488
c17af965
DS
2489 if (rbio->real_stripes - rbio->nr_data == 1)
2490 has_qstripe = false;
2491 else if (rbio->real_stripes - rbio->nr_data == 2)
2492 has_qstripe = true;
2493 else
5a6ac9ea 2494 BUG();
5a6ac9ea 2495
4c664611 2496 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
76035976 2497 is_replace = 1;
3e77605d 2498 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
76035976
MX
2499 }
2500
5a6ac9ea
MX
2501 /*
2502 * The higher layers (scrubber) are unlikely to use
2503 * this area of the disk again soon, so don't cache
2504 * it.
2505 */
2506 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2507
2508 if (!need_check)
2509 goto writeback;
2510
46900662
QW
2511 p_sector.page = alloc_page(GFP_NOFS);
2512 if (!p_sector.page)
5a6ac9ea 2513 goto cleanup;
46900662
QW
2514 p_sector.pgoff = 0;
2515 p_sector.uptodate = 1;
5a6ac9ea 2516
c17af965 2517 if (has_qstripe) {
d70cef0d 2518 /* RAID6, allocate and map temp space for the Q stripe */
46900662
QW
2519 q_sector.page = alloc_page(GFP_NOFS);
2520 if (!q_sector.page) {
2521 __free_page(p_sector.page);
2522 p_sector.page = NULL;
5a6ac9ea
MX
2523 goto cleanup;
2524 }
46900662
QW
2525 q_sector.pgoff = 0;
2526 q_sector.uptodate = 1;
2527 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
5a6ac9ea
MX
2528 }
2529
2530 atomic_set(&rbio->error, 0);
2531
d70cef0d 2532 /* Map the parity stripe just once */
46900662 2533 pointers[nr_data] = kmap_local_page(p_sector.page);
d70cef0d 2534
3e77605d 2535 for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
46900662 2536 struct sector_ptr *sector;
5a6ac9ea 2537 void *parity;
46900662 2538
5a6ac9ea
MX
2539 /* first collect one page from each data stripe */
2540 for (stripe = 0; stripe < nr_data; stripe++) {
46900662
QW
2541 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2542 pointers[stripe] = kmap_local_page(sector->page) +
2543 sector->pgoff;
5a6ac9ea
MX
2544 }
2545
c17af965 2546 if (has_qstripe) {
d70cef0d 2547 /* RAID6, call the library function to fill in our P/Q */
46900662 2548 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
5a6ac9ea
MX
2549 pointers);
2550 } else {
2551 /* raid5 */
46900662
QW
2552 memcpy(pointers[nr_data], pointers[0], sectorsize);
2553 run_xor(pointers + 1, nr_data - 1, sectorsize);
5a6ac9ea
MX
2554 }
2555
01327610 2556 /* Check scrubbing parity and repair it */
46900662
QW
2557 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2558 parity = kmap_local_page(sector->page) + sector->pgoff;
2559 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2560 memcpy(parity, pointers[rbio->scrubp], sectorsize);
5a6ac9ea
MX
2561 else
2562 /* Parity is right, no need to write it back */
3e77605d 2563 bitmap_clear(rbio->dbitmap, sectornr, 1);
58c1a35c 2564 kunmap_local(parity);
5a6ac9ea 2565
94a0b58d
IW
2566 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2567 kunmap_local(pointers[stripe]);
5a6ac9ea
MX
2568 }
2569
94a0b58d 2570 kunmap_local(pointers[nr_data]);
46900662
QW
2571 __free_page(p_sector.page);
2572 p_sector.page = NULL;
2573 if (q_sector.page) {
94a0b58d 2574 kunmap_local(pointers[rbio->real_stripes - 1]);
46900662
QW
2575 __free_page(q_sector.page);
2576 q_sector.page = NULL;
d70cef0d 2577 }
5a6ac9ea
MX
2578
2579writeback:
2580 /*
2581 * time to start writing. Make bios for everything from the
2582 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2583 * everything else.
2584 */
3e77605d
QW
2585 for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2586 struct sector_ptr *sector;
5a6ac9ea 2587
3e77605d
QW
2588 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2589 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2590 sectornr, rbio->stripe_len, REQ_OP_WRITE);
5a6ac9ea
MX
2591 if (ret)
2592 goto cleanup;
2593 }
2594
76035976
MX
2595 if (!is_replace)
2596 goto submit_write;
2597
3e77605d
QW
2598 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2599 struct sector_ptr *sector;
76035976 2600
3e77605d
QW
2601 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2602 ret = rbio_add_io_sector(rbio, &bio_list, sector,
4c664611 2603 bioc->tgtdev_map[rbio->scrubp],
3e77605d 2604 sectornr, rbio->stripe_len, REQ_OP_WRITE);
76035976
MX
2605 if (ret)
2606 goto cleanup;
2607 }
2608
2609submit_write:
5a6ac9ea
MX
2610 nr_data = bio_list_size(&bio_list);
2611 if (!nr_data) {
2612 /* Every parity is right */
58efbc9f 2613 rbio_orig_end_io(rbio, BLK_STS_OK);
5a6ac9ea
MX
2614 return;
2615 }
2616
2617 atomic_set(&rbio->stripes_pending, nr_data);
2618
bf28a605 2619 while ((bio = bio_list_pop(&bio_list))) {
a6111d11 2620 bio->bi_end_io = raid_write_end_io;
4e49ea4a
MC
2621
2622 submit_bio(bio);
5a6ac9ea
MX
2623 }
2624 return;
2625
2626cleanup:
58efbc9f 2627 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2628
2629 while ((bio = bio_list_pop(&bio_list)))
2630 bio_put(bio);
5a6ac9ea
MX
2631}
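/*
 * For each sector set in dbitmap, finish_parity_scrub() above recomputes the
 * parity from the data sectors (XOR for RAID5, the raid6 library for RAID6),
 * compares it with the on-disk parity and only queues a write when they
 * differ. Below is a standalone sketch of the RAID5 check-and-repair step for
 * one sector with made-up 4-byte sectors; recompute_and_check() is a
 * hypothetical helper, not a btrfs function.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NR_DATA   2
#define SECTOR_SZ 4

/* Recompute RAID5 parity for one sector; return true when the on-disk
 * parity had to be rewritten (i.e. it did not match). */
static bool recompute_and_check(uint8_t data[NR_DATA][SECTOR_SZ],
				uint8_t ondisk_parity[SECTOR_SZ])
{
	uint8_t expected[SECTOR_SZ];

	memcpy(expected, data[0], SECTOR_SZ);
	for (int d = 1; d < NR_DATA; d++)
		for (int i = 0; i < SECTOR_SZ; i++)
			expected[i] ^= data[d][i];

	if (memcmp(ondisk_parity, expected, SECTOR_SZ) == 0)
		return false;			/* parity is right, skip writeback */
	memcpy(ondisk_parity, expected, SECTOR_SZ);
	return true;				/* parity repaired, needs writeback */
}

int main(void)
{
	uint8_t data[NR_DATA][SECTOR_SZ] = { { 1, 2, 3, 4 }, { 4, 3, 2, 1 } };
	uint8_t parity[SECTOR_SZ] = { 0, 0, 0, 0 };	/* deliberately wrong */

	printf("rewritten: %d\n", recompute_and_check(data, parity));	/* 1 */
	printf("rewritten: %d\n", recompute_and_check(data, parity));	/* 0 */
	return 0;
}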
2632
2633static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2634{
2635 if (stripe >= 0 && stripe < rbio->nr_data)
2636 return 1;
2637 return 0;
2638}
2639
2640/*
2641 * While we're doing the parity check and repair, we could have errors
2642 * in reading pages off the disk. This checks for errors and if we're
2643 * not able to read the page it'll trigger parity reconstruction. The
2644 * parity scrub will be finished after we've reconstructed the failed
2645 * stripes
2646 */
2647static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2648{
4c664611 2649 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
5a6ac9ea
MX
2650 goto cleanup;
2651
2652 if (rbio->faila >= 0 || rbio->failb >= 0) {
2653 int dfail = 0, failp = -1;
2654
2655 if (is_data_stripe(rbio, rbio->faila))
2656 dfail++;
2657 else if (is_parity_stripe(rbio->faila))
2658 failp = rbio->faila;
2659
2660 if (is_data_stripe(rbio, rbio->failb))
2661 dfail++;
2662 else if (is_parity_stripe(rbio->failb))
2663 failp = rbio->failb;
2664
2665 /*
2666 * Because we can not use a scrubbing parity to repair
2667 * the data, so the capability of the repair is declined.
2668 * (In the case of RAID5, we can not repair anything)
2669 */
4c664611 2670 if (dfail > rbio->bioc->max_errors - 1)
5a6ac9ea
MX
2671 goto cleanup;
2672
2673 /*
2674 * If all data is good, only parity is correctly, just
2675 * repair the parity.
2676 */
2677 if (dfail == 0) {
2678 finish_parity_scrub(rbio, 0);
2679 return;
2680 }
2681
2682 /*
2683 * Here we have one corrupted data stripe and one
2684 * corrupted parity on RAID6. If the corrupted parity
01327610 2685 * is the one being scrubbed, we can luckily use the other one to
5a6ac9ea
MX
2686 * repair the data; otherwise we cannot repair the data stripe.
2687 */
2688 if (failp != rbio->scrubp)
2689 goto cleanup;
2690
2691 __raid_recover_end_io(rbio);
2692 } else {
2693 finish_parity_scrub(rbio, 1);
2694 }
2695 return;
2696
2697cleanup:
58efbc9f 2698 rbio_orig_end_io(rbio, BLK_STS_IOERR);
5a6ac9ea
MX
2699}
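/*
 * The decision above reads as a small table over the failure counts: too many
 * failures means give up, no data failure means just rewrite the parity, and
 * with a data failure we only recover first when the other failed stripe is
 * the parity being scrubbed itself. Below is a standalone sketch of that
 * classification; the enum and classify() are made-up names for illustration,
 * not btrfs code.
 */
#include <stdio.h>

enum scrub_action { SCRUB_FAIL, SCRUB_WRITE_PARITY, SCRUB_RECOVER_FIRST };

/*
 * dfail:      number of failed data stripes
 * failp:      index of the failed parity stripe, or -1 if none failed
 * scrubp:     index of the parity stripe being scrubbed
 * max_errors: tolerable failures (1 for RAID5, 2 for RAID6)
 */
static enum scrub_action classify(int dfail, int failp, int scrubp,
				  int max_errors)
{
	if (dfail > max_errors - 1)
		return SCRUB_FAIL;
	if (dfail == 0)
		return SCRUB_WRITE_PARITY;
	if (failp != scrubp)
		return SCRUB_FAIL;
	return SCRUB_RECOVER_FIRST;
}

int main(void)
{
	/* RAID6 (max_errors == 2), scrubbing the parity at index 3. */
	printf("%d\n", classify(0, 3, 3, 2));	/* only parity bad: rewrite it */
	printf("%d\n", classify(1, 3, 3, 2));	/* data + scrubbed parity: recover first */
	printf("%d\n", classify(1, 2, 3, 2));	/* data + the other parity: give up */
	return 0;
}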
2700
2701/*
2702 * end io for the read phase of the parity scrub. All the bios here are physical
2703 * stripe bios we've read from the disk so we can recalculate the parity of the
2704 * stripe.
2705 *
2706 * This will usually kick off finish_parity_scrub once all the bios are read
2707 * in, but it may trigger parity reconstruction if we had any errors along the way
2708 */
4246a0b6 2709static void raid56_parity_scrub_end_io(struct bio *bio)
5a6ac9ea
MX
2710{
2711 struct btrfs_raid_bio *rbio = bio->bi_private;
2712
4e4cbee9 2713 if (bio->bi_status)
5a6ac9ea
MX
2714 fail_bio_stripe(rbio, bio);
2715 else
2716 set_bio_pages_uptodate(bio);
2717
2718 bio_put(bio);
2719
2720 if (!atomic_dec_and_test(&rbio->stripes_pending))
2721 return;
2722
2723 /*
2724 * this will normally call finish_rmw to start our write
2725 * but if there are any failed stripes we'll reconstruct
2726 * from parity first
2727 */
2728 validate_rbio_for_parity_scrub(rbio);
2729}
2730
2731static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2732{
2733 int bios_to_read = 0;
5a6ac9ea
MX
2734 struct bio_list bio_list;
2735 int ret;
3e77605d 2736 int sectornr;
5a6ac9ea
MX
2737 int stripe;
2738 struct bio *bio;
2739
785884fc
LB
2740 bio_list_init(&bio_list);
2741
5a6ac9ea
MX
2742 ret = alloc_rbio_essential_pages(rbio);
2743 if (ret)
2744 goto cleanup;
2745
5a6ac9ea
MX
2746 atomic_set(&rbio->error, 0);
2747 /*
2748 * build a list of bios to read all the missing parts of this
2749 * stripe
2750 */
2c8cdd6e 2751 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
3e77605d
QW
2752 for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2753 struct sector_ptr *sector;
5a6ac9ea 2754 /*
3e77605d
QW
2755 * We want to find all the sectors missing from the
2756 * rbio and read them from the disk. If sector_in_rbio()
2757 * finds a sector in the bio list we don't need to read
2758 * it off the stripe.
5a6ac9ea 2759 */
3e77605d
QW
2760 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2761 if (sector)
5a6ac9ea
MX
2762 continue;
2763
3e77605d 2764 sector = rbio_stripe_sector(rbio, stripe, sectornr);
5a6ac9ea 2765 /*
3e77605d
QW
2766 * The bio cache may have handed us an uptodate sector.
2767 * If so, be happy and use it.
5a6ac9ea 2768 */
3e77605d 2769 if (sector->uptodate)
5a6ac9ea
MX
2770 continue;
2771
3e77605d
QW
2772 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2773 stripe, sectornr, rbio->stripe_len,
2774 REQ_OP_READ);
5a6ac9ea
MX
2775 if (ret)
2776 goto cleanup;
2777 }
2778 }
2779
2780 bios_to_read = bio_list_size(&bio_list);
2781 if (!bios_to_read) {
2782 /*
2783 * this can happen if others have merged with
2784 * us, it means there is nothing left to read.
2785 * But if there are missing devices it may not be
2786 * safe to do the full stripe write yet.
2787 */
2788 goto finish;
2789 }
2790
2791 /*
4c664611
QW
2792 * The bioc may be freed once we submit the last bio. Make sure not to
2793 * touch it after that.
5a6ac9ea
MX
2794 */
2795 atomic_set(&rbio->stripes_pending, bios_to_read);
bf28a605 2796 while ((bio = bio_list_pop(&bio_list))) {
5a6ac9ea
MX
2797 bio->bi_end_io = raid56_parity_scrub_end_io;
2798
6a258d72 2799 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
5a6ac9ea 2800
4e49ea4a 2801 submit_bio(bio);
5a6ac9ea
MX
2802 }
2803 /* the actual write will happen once the reads are done */
2804 return;
2805
2806cleanup:
58efbc9f 2807 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2808
2809 while ((bio = bio_list_pop(&bio_list)))
2810 bio_put(bio);
2811
5a6ac9ea
MX
2812 return;
2813
2814finish:
2815 validate_rbio_for_parity_scrub(rbio);
2816}
2817
2818static void scrub_parity_work(struct btrfs_work *work)
2819{
2820 struct btrfs_raid_bio *rbio;
2821
2822 rbio = container_of(work, struct btrfs_raid_bio, work);
2823 raid56_parity_scrub_stripe(rbio);
2824}
2825
5a6ac9ea
MX
2826void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2827{
2828 if (!lock_stripe_add(rbio))
a81b747d 2829 start_async_work(rbio, scrub_parity_work);
5a6ac9ea 2830}
b4ee1782
OS
2831
2832/* The following code is used for dev replace of a missing RAID 5/6 device. */
2833
2834struct btrfs_raid_bio *
6a258d72
QW
2835raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2836 u64 length)
b4ee1782 2837{
6a258d72 2838 struct btrfs_fs_info *fs_info = bioc->fs_info;
b4ee1782
OS
2839 struct btrfs_raid_bio *rbio;
2840
4c664611 2841 rbio = alloc_rbio(fs_info, bioc, length);
b4ee1782
OS
2842 if (IS_ERR(rbio))
2843 return NULL;
2844
2845 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2846 bio_list_add(&rbio->bio_list, bio);
2847 /*
2848 * This is a special bio which is used to hold the completion handler
2849 * and make the scrub rbio similar to the other types
2850 */
2851 ASSERT(!bio->bi_iter.bi_size);
2852
2853 rbio->faila = find_logical_bio_stripe(rbio, bio);
2854 if (rbio->faila == -1) {
2855 BUG();
2856 kfree(rbio);
2857 return NULL;
2858 }
2859
ae6529c3 2860 /*
4c664611 2861 * When we get bioc, we have already increased bio_counter, record it
ae6529c3
QW
2862 * so we can free it at rbio_orig_end_io()
2863 */
2864 rbio->generic_bio_cnt = 1;
2865
b4ee1782
OS
2866 return rbio;
2867}
2868
b4ee1782
OS
2869void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2870{
2871 if (!lock_stripe_add(rbio))
e66d8d5a 2872 start_async_work(rbio, read_rebuild_work);
b4ee1782 2873}