1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
7 #include <linux/sched.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
20 #include "async-thread.h"
22 /* set when additional merges to this rbio are not allowed */
23 #define RBIO_RMW_LOCKED_BIT 1
26 * set when this rbio is sitting in the hash, but it is just a cache
29 #define RBIO_CACHE_BIT 2
32 * set when it is safe to trust the stripe_pages for caching
34 #define RBIO_CACHE_READY_BIT 3
36 #define RBIO_CACHE_SIZE 1024
38 #define BTRFS_STRIPE_HASH_TABLE_BITS 11
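/*
 * 11 bits means the hash table below holds 1 << 11 == 2048 buckets (see
 * num_entries in btrfs_alloc_stripe_hash_table()).
 */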
40 /* Used by the raid56 code to lock stripes for read/modify/write */
41 struct btrfs_stripe_hash {
42 struct list_head hash_list;
46 /* Used by the raid56 code to lock stripes for read/modify/write */
47 struct btrfs_stripe_hash_table {
48 struct list_head stripe_cache;
49 spinlock_t cache_lock;
51 struct btrfs_stripe_hash table[];
56 BTRFS_RBIO_READ_REBUILD,
57 BTRFS_RBIO_PARITY_SCRUB,
58 BTRFS_RBIO_REBUILD_MISSING,
61 struct btrfs_raid_bio {
62 struct btrfs_fs_info *fs_info;
63 struct btrfs_bio *bbio;
65 /* while we're doing rmw on a stripe
66 * we put it into a hash table so we can
67 * lock the stripe and merge more rbios
70 struct list_head hash_list;
73 * LRU list for the stripe cache
75 struct list_head stripe_cache;
78 * for scheduling work in the helper threads
80 struct btrfs_work work;
83 * bio list and bio_list_lock are used
84 * to add more bios into the stripe
85 * in hopes of avoiding the full rmw
87 struct bio_list bio_list;
88 spinlock_t bio_list_lock;
90 /* also protected by the bio_list_lock, the
91 * plug list is used by the plugging code
92 * to collect partial bios while plugged. The
93 * stripe locking code also uses it to hand off
94 * the stripe lock to the next pending IO
96 struct list_head plug_list;
99 * flags that tell us if it is safe to
100 * merge with this bio
104 /* size of each individual stripe on disk */
107 /* number of data stripes (no p/q) */
114 * set if we're doing a parity rebuild
115 * for a read from higher up, which is handled
116 * differently from a parity rebuild as part of
119 enum btrfs_rbio_ops operation;
121 /* first bad stripe */
124 /* second bad stripe (for raid6 use) */
129 * number of pages needed to represent the full
135 * size of all the bios in the bio_list. This
136 * helps us decide if the rbio maps to a full
145 atomic_t stripes_pending;
149 * these are two arrays of pointers. We allocate the
150 * rbio big enough to hold them both and setup their
151 * locations when the rbio is allocated
154 /* pointers to pages that we allocated for
155 * reading/writing stripes directly from the disk (including P/Q)
157 struct page **stripe_pages;
160 * pointers to the pages in the bio_list. Stored
161 * here for faster lookup
163 struct page **bio_pages;
166 * bitmap to record which horizontal stripe has data
168 unsigned long *dbitmap;
170 /* allocated with real_stripes-many pointers for finish_*() calls */
171 void **finish_pointers;
173 /* allocated with stripe_npages-many bits for finish_*() calls */
174 unsigned long *finish_pbitmap;
177 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
179 static void rmw_work(struct btrfs_work *work);
180 static void read_rebuild_work(struct btrfs_work *work);
181 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
182 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
183 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
184 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
185 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
189 static void scrub_parity_work(struct btrfs_work *work);
191 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
193 btrfs_init_work(&rbio->work, work_func, NULL, NULL);
194 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
198 * the stripe hash table is used for locking, and to collect
199 * bios in hopes of making a full stripe
201 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
203 struct btrfs_stripe_hash_table *table;
204 struct btrfs_stripe_hash_table *x;
205 struct btrfs_stripe_hash *cur;
206 struct btrfs_stripe_hash *h;
207 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
210 if (info->stripe_hash_table)
214 * The table is large, starting with order 4 and can go as high as
215 * order 7 in case lock debugging is turned on.
217 * Try harder to allocate and fall back to vmalloc to lower the chance
218 * of a failing mount.
220 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
224 spin_lock_init(&table->cache_lock);
225 INIT_LIST_HEAD(&table->stripe_cache);
229 for (i = 0; i < num_entries; i++) {
231 INIT_LIST_HEAD(&cur->hash_list);
232 spin_lock_init(&cur->lock);
235 x = cmpxchg(&info->stripe_hash_table, NULL, table);
241 * caching an rbio means to copy anything from the
242 * bio_pages array into the stripe_pages array. We
243 * use the page uptodate bit in the stripe cache array
244 * to indicate if it has valid data
246 * once the caching is done, we set the cache ready
249 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
254 ret = alloc_rbio_pages(rbio);
258 for (i = 0; i < rbio->nr_pages; i++) {
259 if (!rbio->bio_pages[i])
262 copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
263 SetPageUptodate(rbio->stripe_pages[i]);
265 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
269 * we hash on the first logical address of the stripe
271 static int rbio_bucket(struct btrfs_raid_bio *rbio)
273 u64 num = rbio->bbio->raid_map[0];
276 * we shift down quite a bit. We're using byte
277 * addressing, and most of the lower bits are zeros.
278 * This tends to upset hash_64, and it consistently
279 * returns just one or two different values.
281 * shifting off the lower bits fixes things.
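 * e.g. with the default 64KiB stripe length the logical start of a
 * full stripe typically has the low 16 bits clear, hence the shift
 * by 16 below.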
283 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
287 * stealing an rbio means taking all the uptodate pages from the stripe
288 * array in the source rbio and putting them into the destination rbio
290 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
296 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
299 for (i = 0; i < dest->nr_pages; i++) {
300 s = src->stripe_pages[i];
301 if (!s || !PageUptodate(s)) {
305 d = dest->stripe_pages[i];
309 dest->stripe_pages[i] = s;
310 src->stripe_pages[i] = NULL;
315 * merging means we take the bio_list from the victim and
316 * splice it into the destination. The victim should
317 * be discarded afterwards.
319 * must be called with dest->rbio_list_lock held
321 static void merge_rbio(struct btrfs_raid_bio *dest,
322 struct btrfs_raid_bio *victim)
324 bio_list_merge(&dest->bio_list, &victim->bio_list);
325 dest->bio_list_bytes += victim->bio_list_bytes;
326 dest->generic_bio_cnt += victim->generic_bio_cnt;
327 bio_list_init(&victim->bio_list);
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
342 * check the bit again under the hash table lock.
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
347 table = rbio->fs_info->stripe_hash_table;
348 h = table->table + bucket;
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
359 spin_lock(&rbio->bio_list_lock);
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
378 refcount_dec(&rbio->refs);
379 BUG_ON(!list_empty(&rbio->plug_list));
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
388 __free_raid_bio(rbio);
392 * prune a given rbio from the cache
394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
396 struct btrfs_stripe_hash_table *table;
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
402 table = rbio->fs_info->stripe_hash_table;
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
410 * remove everything in the cache
412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
414 struct btrfs_stripe_hash_table *table;
416 struct btrfs_raid_bio *rbio;
418 table = info->stripe_hash_table;
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
425 __remove_rbio_from_cache(rbio);
427 spin_unlock_irqrestore(&table->cache_lock, flags);
431 * remove all cached entries and free the hash table
434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
436 if (!info->stripe_hash_table)
438 btrfs_clear_rbio_cache(info);
439 kvfree(info->stripe_hash_table);
440 info->stripe_hash_table = NULL;
444 * insert an rbio into the stripe cache. It
445 * must have already been prepared by calling
448 * If this rbio was already cached, it gets
449 * moved to the front of the lru.
451 * If the size of the rbio cache is too big, we
454 static void cache_rbio(struct btrfs_raid_bio *rbio)
456 struct btrfs_stripe_hash_table *table;
459 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
462 table = rbio->fs_info->stripe_hash_table;
464 spin_lock_irqsave(&table->cache_lock, flags);
465 spin_lock(&rbio->bio_list_lock);
467 /* bump our ref if we were not in the list before */
468 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
469 refcount_inc(&rbio->refs);
471 if (!list_empty(&rbio->stripe_cache)){
472 list_move(&rbio->stripe_cache, &table->stripe_cache);
474 list_add(&rbio->stripe_cache, &table->stripe_cache);
475 table->cache_size += 1;
478 spin_unlock(&rbio->bio_list_lock);
480 if (table->cache_size > RBIO_CACHE_SIZE) {
481 struct btrfs_raid_bio *found;
483 found = list_entry(table->stripe_cache.prev,
484 struct btrfs_raid_bio,
488 __remove_rbio_from_cache(found);
491 spin_unlock_irqrestore(&table->cache_lock, flags);
495 * helper function to run the xor_blocks api. It is only
496 * able to do MAX_XOR_BLOCKS at a time, so we need to
499 static void run_xor(void **pages, int src_cnt, ssize_t len)
503 void *dest = pages[src_cnt];
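/*
 * By convention the destination is the last pointer: e.g. with
 * pages[] = { D0, D1, P } and src_cnt == 2 this computes P ^= D0 ^ D1
 * over len bytes, in batches of at most MAX_XOR_BLOCKS sources.
 */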
506 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
507 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
509 src_cnt -= xor_src_cnt;
510 src_off += xor_src_cnt;
515 * Returns true if the bio list inside this rbio covers an entire stripe (no
518 static int rbio_is_full(struct btrfs_raid_bio *rbio)
521 unsigned long size = rbio->bio_list_bytes;
524 spin_lock_irqsave(&rbio->bio_list_lock, flags);
525 if (size != rbio->nr_data * rbio->stripe_len)
527 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
528 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
534 * returns 1 if it is safe to merge two rbios together.
535 * The merging is safe if the two rbios correspond to
536 * the same stripe and if they are both going in the same
537 * direction (read vs write), and if neither one is
538 * locked for final IO
540 * The caller is responsible for locking such that
541 * rmw_locked is safe to test
543 static int rbio_can_merge(struct btrfs_raid_bio *last,
544 struct btrfs_raid_bio *cur)
546 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
547 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
551 * we can't merge with cached rbios, since the
552 * idea is that when we merge the destination
553 * rbio is going to run our IO for us. We can
554 * steal from cached rbios though, other functions
557 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
558 test_bit(RBIO_CACHE_BIT, &cur->flags))
561 if (last->bbio->raid_map[0] !=
562 cur->bbio->raid_map[0])
565 /* we can't merge with different operations */
566 if (last->operation != cur->operation)
569 * We have to read the full stripe from the drive, then check and
570 * repair the parity and write the new results.
572 * We're not allowed to add any new bios to the
573 * bio list here, anyone else that wants to
574 * change this stripe needs to do their own rmw.
576 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
579 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
582 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
583 int fa = last->faila;
584 int fb = last->failb;
585 int cur_fa = cur->faila;
586 int cur_fb = cur->failb;
588 if (last->faila >= last->failb) {
593 if (cur->faila >= cur->failb) {
598 if (fa != cur_fa || fb != cur_fb)
604 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
607 return stripe * rbio->stripe_npages + index;
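/*
 * stripe_pages is laid out as stripe_npages consecutive pages per stripe:
 * data stripes first, then P at index nr_data and, for raid6, Q at
 * nr_data + 1 (see rbio_pstripe_page() and rbio_qstripe_page() below).
 */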
611 * these are just the pages from the rbio array, not from anything
612 * the FS sent down to us
614 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
617 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
621 * helper to index into the pstripe
623 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
625 return rbio_stripe_page(rbio, rbio->nr_data, index);
629 * helper to index into the qstripe, returns NULL
630 * if there is no qstripe
632 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
634 if (rbio->nr_data + 1 == rbio->real_stripes)
636 return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
640 * The first stripe in the table for a logical address
641 * has the lock. rbios are added in one of three ways:
643 * 1) Nobody has the stripe locked yet. The rbio is given
644 * the lock and 0 is returned. The caller must start the IO
647 * 2) Someone has the stripe locked, but we're able to merge
648 * with the lock owner. The rbio is freed and the IO will
649 * start automatically along with the existing rbio. 1 is returned.
651 * 3) Someone has the stripe locked, but we're not able to merge.
652 * The rbio is added to the lock owner's plug list, or merged into
653 * an rbio already on the plug list. When the lock owner unlocks,
654 * the next rbio on the list is run and the IO is started automatically.
657 * If we return 0, the caller still owns the rbio and must continue with
658 * IO submission. If we return 1, the caller must assume the rbio has
659 * already been freed.
661 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
663 struct btrfs_stripe_hash *h;
664 struct btrfs_raid_bio *cur;
665 struct btrfs_raid_bio *pending;
667 struct btrfs_raid_bio *freeit = NULL;
668 struct btrfs_raid_bio *cache_drop = NULL;
671 h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
673 spin_lock_irqsave(&h->lock, flags);
674 list_for_each_entry(cur, &h->hash_list, hash_list) {
675 if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
678 spin_lock(&cur->bio_list_lock);
680 /* Can we steal this cached rbio's pages? */
681 if (bio_list_empty(&cur->bio_list) &&
682 list_empty(&cur->plug_list) &&
683 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
684 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
685 list_del_init(&cur->hash_list);
686 refcount_dec(&cur->refs);
688 steal_rbio(cur, rbio);
690 spin_unlock(&cur->bio_list_lock);
695 /* Can we merge into the lock owner? */
696 if (rbio_can_merge(cur, rbio)) {
697 merge_rbio(cur, rbio);
698 spin_unlock(&cur->bio_list_lock);
706 * We couldn't merge with the running rbio, see if we can merge
707 * with the pending ones. We don't have to check for rmw_locked
708 * because there is no way they are inside finish_rmw right now
710 list_for_each_entry(pending, &cur->plug_list, plug_list) {
711 if (rbio_can_merge(pending, rbio)) {
712 merge_rbio(pending, rbio);
713 spin_unlock(&cur->bio_list_lock);
721 * No merging, put us on the tail of the plug list, our rbio
722 * will be started when the currently running rbio unlocks
724 list_add_tail(&rbio->plug_list, &cur->plug_list);
725 spin_unlock(&cur->bio_list_lock);
730 refcount_inc(&rbio->refs);
731 list_add(&rbio->hash_list, &h->hash_list);
733 spin_unlock_irqrestore(&h->lock, flags);
735 remove_rbio_from_cache(cache_drop);
737 __free_raid_bio(freeit);
742 * called as rmw or parity rebuild is completed. If the plug list has more
743 * rbios waiting for this stripe, the next one on the list will be started
745 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
748 struct btrfs_stripe_hash *h;
752 bucket = rbio_bucket(rbio);
753 h = rbio->fs_info->stripe_hash_table->table + bucket;
755 if (list_empty(&rbio->plug_list))
758 spin_lock_irqsave(&h->lock, flags);
759 spin_lock(&rbio->bio_list_lock);
761 if (!list_empty(&rbio->hash_list)) {
763 * if we're still cached and there is no other IO
764 * to perform, just leave this rbio here for others
765 * to steal from later
767 if (list_empty(&rbio->plug_list) &&
768 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
770 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
771 BUG_ON(!bio_list_empty(&rbio->bio_list));
775 list_del_init(&rbio->hash_list);
776 refcount_dec(&rbio->refs);
779 * we use the plug list to hold all the rbios
780 * waiting for the chance to lock this stripe.
781 * hand the lock over to one of them.
783 if (!list_empty(&rbio->plug_list)) {
784 struct btrfs_raid_bio *next;
785 struct list_head *head = rbio->plug_list.next;
787 next = list_entry(head, struct btrfs_raid_bio,
790 list_del_init(&rbio->plug_list);
792 list_add(&next->hash_list, &h->hash_list);
793 refcount_inc(&next->refs);
794 spin_unlock(&rbio->bio_list_lock);
795 spin_unlock_irqrestore(&h->lock, flags);
797 if (next->operation == BTRFS_RBIO_READ_REBUILD)
798 start_async_work(next, read_rebuild_work);
799 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
800 steal_rbio(rbio, next);
801 start_async_work(next, read_rebuild_work);
802 } else if (next->operation == BTRFS_RBIO_WRITE) {
803 steal_rbio(rbio, next);
804 start_async_work(next, rmw_work);
805 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
806 steal_rbio(rbio, next);
807 start_async_work(next, scrub_parity_work);
814 spin_unlock(&rbio->bio_list_lock);
815 spin_unlock_irqrestore(&h->lock, flags);
819 remove_rbio_from_cache(rbio);
822 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
826 if (!refcount_dec_and_test(&rbio->refs))
829 WARN_ON(!list_empty(&rbio->stripe_cache));
830 WARN_ON(!list_empty(&rbio->hash_list));
831 WARN_ON(!bio_list_empty(&rbio->bio_list));
833 for (i = 0; i < rbio->nr_pages; i++) {
834 if (rbio->stripe_pages[i]) {
835 __free_page(rbio->stripe_pages[i]);
836 rbio->stripe_pages[i] = NULL;
840 btrfs_put_bbio(rbio->bbio);
844 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
851 cur->bi_status = err;
858 * this frees the rbio and runs through all the bios in the
859 * bio_list and calls end_io on them
861 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
863 struct bio *cur = bio_list_get(&rbio->bio_list);
866 if (rbio->generic_bio_cnt)
867 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
870 * At this moment, rbio->bio_list is empty, however since rbio does not
871 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
872 * hash list, rbio may be merged with others so that rbio->bio_list
874 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
875 * more and we can call bio_endio() on all queued bios.
878 extra = bio_list_get(&rbio->bio_list);
879 __free_raid_bio(rbio);
881 rbio_endio_bio_list(cur, err);
883 rbio_endio_bio_list(extra, err);
887 * end io function used by finish_rmw. When we finally
888 * get here, we've written a full stripe
890 static void raid_write_end_io(struct bio *bio)
892 struct btrfs_raid_bio *rbio = bio->bi_private;
893 blk_status_t err = bio->bi_status;
897 fail_bio_stripe(rbio, bio);
901 if (!atomic_dec_and_test(&rbio->stripes_pending))
906 /* OK, we have written all the stripes we need to. */
907 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
908 0 : rbio->bbio->max_errors;
909 if (atomic_read(&rbio->error) > max_errors)
912 rbio_orig_end_io(rbio, err);
916 * the read/modify/write code wants to use the original bio for
917 * any pages it included, and then use the rbio for everything
918 * else. This function decides if a given index (stripe number)
919 * and page number in that stripe fall inside the original bio
922 * if you set bio_list_only, you'll get a NULL back for any ranges
923 * that are outside the bio_list
925 * This doesn't take any refs on anything, you get a bare page pointer
926 * and the caller must bump refs as required.
928 * You must call index_rbio_pages once before you can trust
929 * the answers from this function.
931 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
932 int index, int pagenr, int bio_list_only)
935 struct page *p = NULL;
937 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
939 spin_lock_irq(&rbio->bio_list_lock);
940 p = rbio->bio_pages[chunk_page];
941 spin_unlock_irq(&rbio->bio_list_lock);
943 if (p || bio_list_only)
946 return rbio->stripe_pages[chunk_page];
950 * number of pages we need for the entire stripe across all the
953 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
955 return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
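/*
 * e.g. with the default 64KiB stripe_len and 4KiB pages this is 16 pages
 * per stripe, times the number of stripes (data plus parity).
 */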
959 * allocation and initial setup for the btrfs_raid_bio. Note that
960 * this does not allocate any pages for rbio->pages.
962 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
963 struct btrfs_bio *bbio,
966 struct btrfs_raid_bio *rbio;
968 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
969 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
970 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
973 rbio = kzalloc(sizeof(*rbio) +
974 sizeof(*rbio->stripe_pages) * num_pages +
975 sizeof(*rbio->bio_pages) * num_pages +
976 sizeof(*rbio->finish_pointers) * real_stripes +
977 sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
978 sizeof(*rbio->finish_pbitmap) *
979 BITS_TO_LONGS(stripe_npages),
982 return ERR_PTR(-ENOMEM);
984 bio_list_init(&rbio->bio_list);
985 INIT_LIST_HEAD(&rbio->plug_list);
986 spin_lock_init(&rbio->bio_list_lock);
987 INIT_LIST_HEAD(&rbio->stripe_cache);
988 INIT_LIST_HEAD(&rbio->hash_list);
990 rbio->fs_info = fs_info;
991 rbio->stripe_len = stripe_len;
992 rbio->nr_pages = num_pages;
993 rbio->real_stripes = real_stripes;
994 rbio->stripe_npages = stripe_npages;
997 refcount_set(&rbio->refs, 1);
998 atomic_set(&rbio->error, 0);
999 atomic_set(&rbio->stripes_pending, 0);
1002 * the stripe_pages, bio_pages, etc arrays point to the extra
1003 * memory we allocated past the end of the rbio
1006 #define CONSUME_ALLOC(ptr, count) do { \
1008 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1010 CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1011 CONSUME_ALLOC(rbio->bio_pages, num_pages);
1012 CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1013 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1014 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1015 #undef CONSUME_ALLOC
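/*
 * The single kzalloc above thus ends up laid out as:
 *   [struct btrfs_raid_bio][stripe_pages ptrs][bio_pages ptrs]
 *   [finish_pointers][dbitmap longs][finish_pbitmap longs]
 * with each array carved out in the CONSUME_ALLOC order above.
 */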
1017 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1018 nr_data = real_stripes - 1;
1019 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1020 nr_data = real_stripes - 2;
1024 rbio->nr_data = nr_data;
1028 /* allocate pages for all the stripes in the bio, including parity */
1029 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1034 for (i = 0; i < rbio->nr_pages; i++) {
1035 if (rbio->stripe_pages[i])
1037 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1040 rbio->stripe_pages[i] = page;
1045 /* only allocate pages for p/q stripes */
1046 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1051 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1053 for (; i < rbio->nr_pages; i++) {
1054 if (rbio->stripe_pages[i])
1056 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1059 rbio->stripe_pages[i] = page;
1065 * add a single page from a specific stripe into our list of bios for IO
1066 * this will try to merge into existing bios if possible, and returns
1067 * zero if all went well.
1069 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1070 struct bio_list *bio_list,
1073 unsigned long page_index,
1074 unsigned long bio_max_len)
1076 struct bio *last = bio_list->tail;
1079 struct btrfs_bio_stripe *stripe;
1082 stripe = &rbio->bbio->stripes[stripe_nr];
1083 disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1085 /* if the device is missing, just fail this stripe */
1086 if (!stripe->dev->bdev)
1087 return fail_rbio_index(rbio, stripe_nr);
1089 /* see if we can add this page onto our existing bio */
1091 u64 last_end = last->bi_iter.bi_sector << 9;
1092 last_end += last->bi_iter.bi_size;
1095 * we can't merge these if they are from different
1096 * devices or if they are not contiguous
1098 if (last_end == disk_start && !last->bi_status &&
1099 last->bi_bdev == stripe->dev->bdev) {
1100 ret = bio_add_page(last, page, PAGE_SIZE, 0);
1101 if (ret == PAGE_SIZE)
1106 /* put a new bio on the list */
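/* the "?: 1" below guarantees at least one bio_vec even when bio_max_len is smaller than a page */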
1107 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1108 btrfs_io_bio(bio)->device = stripe->dev;
1109 bio->bi_iter.bi_size = 0;
1110 bio_set_dev(bio, stripe->dev->bdev);
1111 bio->bi_iter.bi_sector = disk_start >> 9;
1113 bio_add_page(bio, page, PAGE_SIZE, 0);
1114 bio_list_add(bio_list, bio);
1119 * while we're doing the read/modify/write cycle, we could
1120 * have errors in reading pages off the disk. This checks
1121 * for errors and if we're not able to read the page it'll
1122 * trigger parity reconstruction. The rmw will be finished
1123 * after we've reconstructed the failed stripes
1125 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1127 if (rbio->faila >= 0 || rbio->failb >= 0) {
1128 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1129 __raid56_parity_recover(rbio);
1136 * helper function to walk our bio list and populate the bio_pages array with
1137 * the result. This seems expensive, but it is faster than constantly
1138 * searching through the bio list as we setup the IO in finish_rmw or stripe
1141 * This must be called before you trust the answers from page_in_rbio
1143 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1147 unsigned long stripe_offset;
1148 unsigned long page_index;
1150 spin_lock_irq(&rbio->bio_list_lock);
1151 bio_list_for_each(bio, &rbio->bio_list) {
1152 struct bio_vec bvec;
1153 struct bvec_iter iter;
1156 start = bio->bi_iter.bi_sector << 9;
1157 stripe_offset = start - rbio->bbio->raid_map[0];
1158 page_index = stripe_offset >> PAGE_SHIFT;
1160 if (bio_flagged(bio, BIO_CLONED))
1161 bio->bi_iter = btrfs_io_bio(bio)->iter;
1163 bio_for_each_segment(bvec, bio, iter) {
1164 rbio->bio_pages[page_index + i] = bvec.bv_page;
1168 spin_unlock_irq(&rbio->bio_list_lock);
1172 * this is called from one of two situations. We either
1173 * have a full stripe from the higher layers, or we've read all
1174 * the missing bits off disk.
1176 * This will calculate the parity and then send down any
1179 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1181 struct btrfs_bio *bbio = rbio->bbio;
1182 void **pointers = rbio->finish_pointers;
1183 int nr_data = rbio->nr_data;
1187 struct bio_list bio_list;
1191 bio_list_init(&bio_list);
1193 if (rbio->real_stripes - rbio->nr_data == 1)
1194 has_qstripe = false;
1195 else if (rbio->real_stripes - rbio->nr_data == 2)
1200 /* at this point we either have a full stripe,
1201 * or we've read the full stripe from the drive.
1202 * recalculate the parity and write the new results.
1204 * We're not allowed to add any new bios to the
1205 * bio list here, anyone else that wants to
1206 * change this stripe needs to do their own rmw.
1208 spin_lock_irq(&rbio->bio_list_lock);
1209 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1210 spin_unlock_irq(&rbio->bio_list_lock);
1212 atomic_set(&rbio->error, 0);
1215 * now that we've set rmw_locked, run through the
1216 * bio list one last time and map the page pointers
1218 * We don't cache full rbios because we're assuming
1219 * the higher layers are unlikely to use this area of
1220 * the disk again soon. If they do use it again,
1221 * hopefully they will send another full bio.
1223 index_rbio_pages(rbio);
1224 if (!rbio_is_full(rbio))
1225 cache_rbio_pages(rbio);
1227 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1229 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1231 /* first collect one page from each data stripe */
1232 for (stripe = 0; stripe < nr_data; stripe++) {
1233 p = page_in_rbio(rbio, stripe, pagenr, 0);
1234 pointers[stripe] = kmap(p);
1237 /* then add the parity stripe */
1238 p = rbio_pstripe_page(rbio, pagenr);
1240 pointers[stripe++] = kmap(p);
1245 * raid6, add the qstripe and call the
1246 * library function to fill in our p/q
1248 p = rbio_qstripe_page(rbio, pagenr);
1250 pointers[stripe++] = kmap(p);
1252 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1256 copy_page(pointers[nr_data], pointers[0]);
1257 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
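/*
 * raid5: the copy_page() and run_xor() above leave
 * P = D0 ^ D1 ^ ... ^ D(nr_data - 1) for this page of the stripe.
 */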
1261 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1262 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1266 * time to start writing. Make bios for everything from the
1267 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1270 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1271 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1273 if (stripe < rbio->nr_data) {
1274 page = page_in_rbio(rbio, stripe, pagenr, 1);
1278 page = rbio_stripe_page(rbio, stripe, pagenr);
1281 ret = rbio_add_io_page(rbio, &bio_list,
1282 page, stripe, pagenr, rbio->stripe_len);
1288 if (likely(!bbio->num_tgtdevs))
1291 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1292 if (!bbio->tgtdev_map[stripe])
1295 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1297 if (stripe < rbio->nr_data) {
1298 page = page_in_rbio(rbio, stripe, pagenr, 1);
1302 page = rbio_stripe_page(rbio, stripe, pagenr);
1305 ret = rbio_add_io_page(rbio, &bio_list, page,
1306 rbio->bbio->tgtdev_map[stripe],
1307 pagenr, rbio->stripe_len);
1314 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1315 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1317 while ((bio = bio_list_pop(&bio_list))) {
1318 bio->bi_private = rbio;
1319 bio->bi_end_io = raid_write_end_io;
1320 bio->bi_opf = REQ_OP_WRITE;
1327 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1329 while ((bio = bio_list_pop(&bio_list)))
1334 * helper to find the stripe number for a given bio. Used to figure out which
1335 * stripe has failed. This expects the bio to correspond to a physical disk,
1336 * so it looks up based on physical sector numbers.
1338 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1341 u64 physical = bio->bi_iter.bi_sector;
1343 struct btrfs_bio_stripe *stripe;
1347 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1348 stripe = &rbio->bbio->stripes[i];
1349 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1350 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1358 * helper to find the stripe number for a given
1359 * bio (before mapping). Used to figure out which stripe has
1360 * failed. This looks up based on logical block numbers.
1362 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1365 u64 logical = bio->bi_iter.bi_sector << 9;
1368 for (i = 0; i < rbio->nr_data; i++) {
1369 u64 stripe_start = rbio->bbio->raid_map[i];
1371 if (in_range(logical, stripe_start, rbio->stripe_len))
1378 * returns -EIO if we had too many failures
1380 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1382 unsigned long flags;
1385 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1387 /* we already know this stripe is bad, move on */
1388 if (rbio->faila == failed || rbio->failb == failed)
1391 if (rbio->faila == -1) {
1392 /* first failure on this rbio */
1393 rbio->faila = failed;
1394 atomic_inc(&rbio->error);
1395 } else if (rbio->failb == -1) {
1396 /* second failure on this rbio */
1397 rbio->failb = failed;
1398 atomic_inc(&rbio->error);
1403 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1409 * helper to fail a stripe based on a physical disk
1412 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1415 int failed = find_bio_stripe(rbio, bio);
1420 return fail_rbio_index(rbio, failed);
1424 * this sets each page in the bio uptodate. It should only be used on private
1425 * rbio pages, nothing that comes in from the higher layers
1427 static void set_bio_pages_uptodate(struct bio *bio)
1429 struct bio_vec *bvec;
1430 struct bvec_iter_all iter_all;
1432 ASSERT(!bio_flagged(bio, BIO_CLONED));
1434 bio_for_each_segment_all(bvec, bio, iter_all)
1435 SetPageUptodate(bvec->bv_page);
1439 * end io for the read phase of the rmw cycle. All the bios here are physical
1440 * stripe bios we've read from the disk so we can recalculate the parity of the
1443 * This will usually kick off finish_rmw once all the bios are read in, but it
1444 * may trigger parity reconstruction if we had any errors along the way
1446 static void raid_rmw_end_io(struct bio *bio)
1448 struct btrfs_raid_bio *rbio = bio->bi_private;
1451 fail_bio_stripe(rbio, bio);
1453 set_bio_pages_uptodate(bio);
1457 if (!atomic_dec_and_test(&rbio->stripes_pending))
1460 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1464 * this will normally call finish_rmw to start our write
1465 * but if there are any failed stripes we'll reconstruct
1468 validate_rbio_for_rmw(rbio);
1473 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1477 * the stripe must be locked by the caller. It will
1478 * unlock after all the writes are done
1480 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1482 int bios_to_read = 0;
1483 struct bio_list bio_list;
1489 bio_list_init(&bio_list);
1491 ret = alloc_rbio_pages(rbio);
1495 index_rbio_pages(rbio);
1497 atomic_set(&rbio->error, 0);
1499 * build a list of bios to read all the missing parts of this
1502 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1503 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1506 * we want to find all the pages missing from
1507 * the rbio and read them from the disk. If
1508 * page_in_rbio finds a page in the bio list
1509 * we don't need to read it off the stripe.
1511 page = page_in_rbio(rbio, stripe, pagenr, 1);
1515 page = rbio_stripe_page(rbio, stripe, pagenr);
1517 * the bio cache may have handed us an uptodate
1518 * page. If so, be happy and use it
1520 if (PageUptodate(page))
1523 ret = rbio_add_io_page(rbio, &bio_list, page,
1524 stripe, pagenr, rbio->stripe_len);
1530 bios_to_read = bio_list_size(&bio_list);
1531 if (!bios_to_read) {
1533 * this can happen if others have merged with
1534 * us, it means there is nothing left to read.
1535 * But if there are missing devices it may not be
1536 * safe to do the full stripe write yet.
1542 * the bbio may be freed once we submit the last bio. Make sure
1543 * not to touch it after that
1545 atomic_set(&rbio->stripes_pending, bios_to_read);
1546 while ((bio = bio_list_pop(&bio_list))) {
1547 bio->bi_private = rbio;
1548 bio->bi_end_io = raid_rmw_end_io;
1549 bio->bi_opf = REQ_OP_READ;
1551 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1555 /* the actual write will happen once the reads are done */
1559 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1561 while ((bio = bio_list_pop(&bio_list)))
1567 validate_rbio_for_rmw(rbio);
1572 * if the upper layers pass in a full stripe, we thank them by only allocating
1573 * enough pages to hold the parity, and sending it all down quickly.
1575 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1579 ret = alloc_rbio_parity_pages(rbio);
1581 __free_raid_bio(rbio);
1585 ret = lock_stripe_add(rbio);
1592 * partial stripe writes get handed over to async helpers.
1593 * We're really hoping to merge a few more writes into this
1594 * rbio before calculating new parity
1596 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1600 ret = lock_stripe_add(rbio);
1602 start_async_work(rbio, rmw_work);
1607 * sometimes while we were reading from the drive to
1608 * recalculate parity, enough new bios come in to create
1609 * a full stripe. So we do a check here to see if we can
1610 * go directly to finish_rmw
1612 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1614 /* head off into rmw land if we don't have a full stripe */
1615 if (!rbio_is_full(rbio))
1616 return partial_stripe_write(rbio);
1617 return full_stripe_write(rbio);
1621 * We use plugging callbacks to collect full stripes.
1622 * Any time we get a partial stripe write while plugged
1623 * we collect it into a list. When the unplug comes down,
1624 * we sort the list by logical block number and merge
1625 * everything we can into the same rbios
1627 struct btrfs_plug_cb {
1628 struct blk_plug_cb cb;
1629 struct btrfs_fs_info *info;
1630 struct list_head rbio_list;
1631 struct btrfs_work work;
1635 * rbios on the plug list are sorted for easier merging.
1637 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1639 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1641 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1643 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1644 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1646 if (a_sector < b_sector)
1648 if (a_sector > b_sector)
1653 static void run_plug(struct btrfs_plug_cb *plug)
1655 struct btrfs_raid_bio *cur;
1656 struct btrfs_raid_bio *last = NULL;
1659 * sort our plug list then try to merge
1660 * everything we can in hopes of creating full
1663 list_sort(NULL, &plug->rbio_list, plug_cmp);
1664 while (!list_empty(&plug->rbio_list)) {
1665 cur = list_entry(plug->rbio_list.next,
1666 struct btrfs_raid_bio, plug_list);
1667 list_del_init(&cur->plug_list);
1669 if (rbio_is_full(cur)) {
1672 /* we have a full stripe, send it down */
1673 ret = full_stripe_write(cur);
1678 if (rbio_can_merge(last, cur)) {
1679 merge_rbio(last, cur);
1680 __free_raid_bio(cur);
1684 __raid56_parity_write(last);
1689 __raid56_parity_write(last);
1695 * if the unplug comes from schedule, we have to push the
1696 * work off to a helper thread
1698 static void unplug_work(struct btrfs_work *work)
1700 struct btrfs_plug_cb *plug;
1701 plug = container_of(work, struct btrfs_plug_cb, work);
1705 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1707 struct btrfs_plug_cb *plug;
1708 plug = container_of(cb, struct btrfs_plug_cb, cb);
1710 if (from_schedule) {
1711 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1712 btrfs_queue_work(plug->info->rmw_workers,
1720 * our main entry point for writes from the rest of the FS.
1722 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
1723 struct btrfs_bio *bbio, u64 stripe_len)
1725 struct btrfs_raid_bio *rbio;
1726 struct btrfs_plug_cb *plug = NULL;
1727 struct blk_plug_cb *cb;
1730 rbio = alloc_rbio(fs_info, bbio, stripe_len);
1732 btrfs_put_bbio(bbio);
1733 return PTR_ERR(rbio);
1735 bio_list_add(&rbio->bio_list, bio);
1736 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1737 rbio->operation = BTRFS_RBIO_WRITE;
1739 btrfs_bio_counter_inc_noblocked(fs_info);
1740 rbio->generic_bio_cnt = 1;
1743 * don't plug on full rbios, just get them out the door
1744 * as quickly as we can
1746 if (rbio_is_full(rbio)) {
1747 ret = full_stripe_write(rbio);
1749 btrfs_bio_counter_dec(fs_info);
1753 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1755 plug = container_of(cb, struct btrfs_plug_cb, cb);
1757 plug->info = fs_info;
1758 INIT_LIST_HEAD(&plug->rbio_list);
1760 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1763 ret = __raid56_parity_write(rbio);
1765 btrfs_bio_counter_dec(fs_info);
1771 * all parity reconstruction happens here. We've read in everything
1772 * we can find from the drives and this does the heavy lifting of
1773 * sorting the good from the bad.
1775 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1779 int faila = -1, failb = -1;
1784 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1786 err = BLK_STS_RESOURCE;
1790 faila = rbio->faila;
1791 failb = rbio->failb;
1793 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1794 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1795 spin_lock_irq(&rbio->bio_list_lock);
1796 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1797 spin_unlock_irq(&rbio->bio_list_lock);
1800 index_rbio_pages(rbio);
1802 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1804 * Now we just use the bitmap to mark the horizontal stripes in
1805 * which we have data when doing parity scrub.
1807 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1808 !test_bit(pagenr, rbio->dbitmap))
1811 /* setup our array of pointers with pages
1814 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1816 * if we're rebuilding a read, we have to use
1817 * pages from the bio list
1819 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1820 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1821 (stripe == faila || stripe == failb)) {
1822 page = page_in_rbio(rbio, stripe, pagenr, 0);
1824 page = rbio_stripe_page(rbio, stripe, pagenr);
1826 pointers[stripe] = kmap(page);
1829 /* all raid6 handling here */
1830 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1832 * single failure, rebuild from parity raid5
1836 if (faila == rbio->nr_data) {
1838 * Just the P stripe has failed, without
1839 * a bad data or Q stripe.
1840 * TODO, we should redo the xor here.
1842 err = BLK_STS_IOERR;
1846 * a single failure in raid6 is rebuilt
1847 * in the pstripe code below
1852 /* make sure our ps and qs are in order */
1856 /* if the q stripe is failed, do a pstripe reconstruction
1858 * If both the q stripe and the P stripe are failed, we're
1859 * here due to a crc mismatch and we can't give them the
1862 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1863 if (rbio->bbio->raid_map[faila] ==
1865 err = BLK_STS_IOERR;
1869 * otherwise we have one bad data stripe and
1870 * a good P stripe. raid5!
1875 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1876 raid6_datap_recov(rbio->real_stripes,
1877 PAGE_SIZE, faila, pointers);
1879 raid6_2data_recov(rbio->real_stripes,
1880 PAGE_SIZE, faila, failb,
1886 /* rebuild from P stripe here (raid5 or raid6) */
1887 BUG_ON(failb != -1);
1889 /* Copy parity block into failed block to start with */
1890 copy_page(pointers[faila], pointers[rbio->nr_data]);
1892 /* rearrange the pointer array */
1893 p = pointers[faila];
1894 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1895 pointers[stripe] = pointers[stripe + 1];
1896 pointers[rbio->nr_data - 1] = p;
1898 /* xor in the rest */
1899 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
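/*
 * After the rotation above the failed slot sits at the end holding a
 * copy of P, so this xor of the surviving data pages rebuilds the
 * missing data in place.
 */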
1901 /* if we're doing this rebuild as part of an rmw, go through
1902 * and set all of our private rbio pages in the
1903 * failed stripes as uptodate. This way finish_rmw will
1904 * know they can be trusted. If this was a read reconstruction,
1905 * other endio functions will fiddle the uptodate bits
1907 if (rbio->operation == BTRFS_RBIO_WRITE) {
1908 for (i = 0; i < rbio->stripe_npages; i++) {
1910 page = rbio_stripe_page(rbio, faila, i);
1911 SetPageUptodate(page);
1914 page = rbio_stripe_page(rbio, failb, i);
1915 SetPageUptodate(page);
1919 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1921 * if we're rebuilding a read, we have to use
1922 * pages from the bio list
1924 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1925 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1926 (stripe == faila || stripe == failb)) {
1927 page = page_in_rbio(rbio, stripe, pagenr, 0);
1929 page = rbio_stripe_page(rbio, stripe, pagenr);
1941 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1942 * valid rbio which is consistent with ondisk content, thus such a
1943 * valid rbio can be cached to avoid further disk reads.
1945 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1946 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1948 * - In case of two failures, where rbio->failb != -1:
1950 * Do not cache this rbio since the above read reconstruction
1951 * (raid6_datap_recov() or raid6_2data_recov()) may have
1952 * changed some content of stripes which are not identical to
1953 * on-disk content any more, otherwise, a later write/recover
1954 * may steal stripe_pages from this rbio and end up with
1955 * corruptions or rebuild failures.
1957 * - In case of single failure, where rbio->failb == -1:
1959 * Cache this rbio iff the above read reconstruction is
1960 * executed without problems.
1962 if (err == BLK_STS_OK && rbio->failb < 0)
1963 cache_rbio_pages(rbio);
1965 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1967 rbio_orig_end_io(rbio, err);
1968 } else if (err == BLK_STS_OK) {
1972 if (rbio->operation == BTRFS_RBIO_WRITE)
1974 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1975 finish_parity_scrub(rbio, 0);
1979 rbio_orig_end_io(rbio, err);
1984 * This is called only for stripes we've read from disk to
1985 * reconstruct the parity.
1987 static void raid_recover_end_io(struct bio *bio)
1989 struct btrfs_raid_bio *rbio = bio->bi_private;
1992 * we only read stripe pages off the disk, set them
1993 * up to date if there were no errors
1996 fail_bio_stripe(rbio, bio);
1998 set_bio_pages_uptodate(bio);
2001 if (!atomic_dec_and_test(&rbio->stripes_pending))
2004 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2005 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2007 __raid_recover_end_io(rbio);
2011 * reads everything we need off the disk to reconstruct
2012 * the parity. endio handlers trigger final reconstruction
2013 * when the IO is done.
2015 * This is used both for reads from the higher layers and for
2016 * parity construction required to finish a rmw cycle.
2018 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2020 int bios_to_read = 0;
2021 struct bio_list bio_list;
2027 bio_list_init(&bio_list);
2029 ret = alloc_rbio_pages(rbio);
2033 atomic_set(&rbio->error, 0);
2036 * read everything that hasn't failed. Thanks to the
2037 * stripe cache, it is possible that some or all of these
2038 * pages are going to be uptodate.
2040 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2041 if (rbio->faila == stripe || rbio->failb == stripe) {
2042 atomic_inc(&rbio->error);
2046 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2050 * the rmw code may have already read this
2053 p = rbio_stripe_page(rbio, stripe, pagenr);
2054 if (PageUptodate(p))
2057 ret = rbio_add_io_page(rbio, &bio_list,
2058 rbio_stripe_page(rbio, stripe, pagenr),
2059 stripe, pagenr, rbio->stripe_len);
2065 bios_to_read = bio_list_size(&bio_list);
2066 if (!bios_to_read) {
2068 * we might have no bios to read just because the pages
2069 * were up to date, or we might have no bios to read because
2070 * the devices were gone.
2072 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2073 __raid_recover_end_io(rbio);
2081 * the bbio may be freed once we submit the last bio. Make sure
2082 * not to touch it after that
2084 atomic_set(&rbio->stripes_pending, bios_to_read);
2085 while ((bio = bio_list_pop(&bio_list))) {
2086 bio->bi_private = rbio;
2087 bio->bi_end_io = raid_recover_end_io;
2088 bio->bi_opf = REQ_OP_READ;
2090 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2098 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2099 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2100 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2102 while ((bio = bio_list_pop(&bio_list)))
2109 * the main entry point for reads from the higher layers. This
2110 * is really only called when the normal read path had a failure,
2111 * so we assume the bio they send down corresponds to a failed part
2114 int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
2115 struct btrfs_bio *bbio, u64 stripe_len,
2116 int mirror_num, int generic_io)
2118 struct btrfs_raid_bio *rbio;
2122 ASSERT(bbio->mirror_num == mirror_num);
2123 btrfs_io_bio(bio)->mirror_num = mirror_num;
2126 rbio = alloc_rbio(fs_info, bbio, stripe_len);
2129 btrfs_put_bbio(bbio);
2130 return PTR_ERR(rbio);
2133 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2134 bio_list_add(&rbio->bio_list, bio);
2135 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2137 rbio->faila = find_logical_bio_stripe(rbio, bio);
2138 if (rbio->faila == -1) {
2140 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
2141 __func__, bio->bi_iter.bi_sector << 9,
2142 (u64)bio->bi_iter.bi_size, bbio->map_type);
2144 btrfs_put_bbio(bbio);
2150 btrfs_bio_counter_inc_noblocked(fs_info);
2151 rbio->generic_bio_cnt = 1;
2153 btrfs_get_bbio(bbio);
2158 * for 'mirror_num == 2', reconstruct from all other stripes.
2159 * for 'mirror_num > 2', select a stripe to fail on every retry.
2161 if (mirror_num > 2) {
2163 * 'mirror == 3' is to fail the p stripe and
2164 * reconstruct from the q stripe. 'mirror > 3' is to
2165 * fail a data stripe and reconstruct from p+q stripe.
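 * e.g. on raid6 with real_stripes == nr_data + 2, mirror_num 3 makes
 * failb == nr_data (the P stripe) and mirror_num 4 fails the last data
 * stripe; failb is then adjusted below so it never collides with faila.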
2167 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2168 ASSERT(rbio->failb > 0);
2169 if (rbio->failb <= rbio->faila)
2173 ret = lock_stripe_add(rbio);
2176 * __raid56_parity_recover will end the bio with
2177 * any errors it hits. We don't want to return
2178 * its error value up the stack because our caller
2179 * will end up calling bio_endio with any nonzero
2183 __raid56_parity_recover(rbio);
2185 * our rbio has been added to the list of
2186 * rbios that will be handled after the
2187 * currently lock owner is done
2193 static void rmw_work(struct btrfs_work *work)
2195 struct btrfs_raid_bio *rbio;
2197 rbio = container_of(work, struct btrfs_raid_bio, work);
2198 raid56_rmw_stripe(rbio);
2201 static void read_rebuild_work(struct btrfs_work *work)
2203 struct btrfs_raid_bio *rbio;
2205 rbio = container_of(work, struct btrfs_raid_bio, work);
2206 __raid56_parity_recover(rbio);
2210 * The following code is used to scrub/replace the parity stripe
2212 * Caller must have already increased bio_counter for getting @bbio.
2214 * Note: We need to make sure that all the pages added into the scrub/replace
2215 * raid bio are correct and will not be changed during the scrub/replace. That
2216 * is, those pages just hold metadata or file data with checksum.
2219 struct btrfs_raid_bio *
2220 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2221 struct btrfs_bio *bbio, u64 stripe_len,
2222 struct btrfs_device *scrub_dev,
2223 unsigned long *dbitmap, int stripe_nsectors)
2225 struct btrfs_raid_bio *rbio;
2228 rbio = alloc_rbio(fs_info, bbio, stripe_len);
2231 bio_list_add(&rbio->bio_list, bio);
2233 * This is a special bio which is used to hold the completion handler
2234 * and make the scrub rbio similar to the other types
2236 ASSERT(!bio->bi_iter.bi_size);
2237 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2240 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2241 * to the end position, so this search can start from the first parity
2244 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2245 if (bbio->stripes[i].dev == scrub_dev) {
2250 ASSERT(i < rbio->real_stripes);
2252 /* For now we only support the case where sectorsize equals page size */
2253 ASSERT(fs_info->sectorsize == PAGE_SIZE);
2254 ASSERT(rbio->stripe_npages == stripe_nsectors);
2255 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2258 * We have already increased bio_counter when getting bbio, record it
2259 * so we can free it at rbio_orig_end_io().
2261 rbio->generic_bio_cnt = 1;
2266 /* Used for both parity scrub and missing. */
2267 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2273 ASSERT(logical >= rbio->bbio->raid_map[0]);
2274 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2275 rbio->stripe_len * rbio->nr_data);
2276 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2277 index = stripe_offset >> PAGE_SHIFT;
2278 rbio->bio_pages[index] = page;
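/*
 * The data stripes of a full stripe are contiguous in logical address
 * space (see raid_map), so the page offset from raid_map[0] indexes
 * directly into bio_pages[].
 */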
2282 * We only scrub the parity for the horizontal stripes where we have
2283 * correct data, so we don't need to allocate pages for every stripe.
2285 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2292 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2293 for (i = 0; i < rbio->real_stripes; i++) {
2294 index = i * rbio->stripe_npages + bit;
2295 if (rbio->stripe_pages[index])
2298 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2301 rbio->stripe_pages[index] = page;
2307 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2310 struct btrfs_bio *bbio = rbio->bbio;
2311 void **pointers = rbio->finish_pointers;
2312 unsigned long *pbitmap = rbio->finish_pbitmap;
2313 int nr_data = rbio->nr_data;
2317 struct page *p_page = NULL;
2318 struct page *q_page = NULL;
2319 struct bio_list bio_list;
2324 bio_list_init(&bio_list);
2326 if (rbio->real_stripes - rbio->nr_data == 1)
2327 has_qstripe = false;
2328 else if (rbio->real_stripes - rbio->nr_data == 2)
2333 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2335 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2339 * The higher layers (the scrubber) are unlikely to
2340 * use this area of the disk again soon, so don't cache
2343 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2348 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2351 SetPageUptodate(p_page);
2354 /* RAID6, allocate and map temp space for the Q stripe */
2355 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2357 __free_page(p_page);
2360 SetPageUptodate(q_page);
2361 pointers[rbio->real_stripes - 1] = kmap(q_page);
2364 atomic_set(&rbio->error, 0);
2366 /* Map the parity stripe just once */
2367 pointers[nr_data] = kmap(p_page);
2369 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2372 /* first collect one page from each data stripe */
2373 for (stripe = 0; stripe < nr_data; stripe++) {
2374 p = page_in_rbio(rbio, stripe, pagenr, 0);
2375 pointers[stripe] = kmap(p);
2379 /* RAID6, call the library function to fill in our P/Q */
2380 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2384 copy_page(pointers[nr_data], pointers[0]);
2385 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2388 /* Check scrubbing parity and repair it */
2389 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2391 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2392 copy_page(parity, pointers[rbio->scrubp]);
2394 /* Parity is right, no need to write it back */
2395 bitmap_clear(rbio->dbitmap, pagenr, 1);
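/*
 * Pages whose on-disk parity already matched are cleared from dbitmap
 * above, so the writeback loop below only touches the pages that were
 * actually repaired.
 */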
2398 for (stripe = 0; stripe < nr_data; stripe++)
2399 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2403 __free_page(p_page);
2406 __free_page(q_page);
2411 * time to start writing. Make bios for everything from the
2412 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2415 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2418 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2419 ret = rbio_add_io_page(rbio, &bio_list,
2420 page, rbio->scrubp, pagenr, rbio->stripe_len);
2428 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2431 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2432 ret = rbio_add_io_page(rbio, &bio_list, page,
2433 bbio->tgtdev_map[rbio->scrubp],
2434 pagenr, rbio->stripe_len);
2440 nr_data = bio_list_size(&bio_list);
2442 /* Every parity was correct, nothing to write back */
2443 rbio_orig_end_io(rbio, BLK_STS_OK);
2447 atomic_set(&rbio->stripes_pending, nr_data);
2449 while ((bio = bio_list_pop(&bio_list))) {
2450 bio->bi_private = rbio;
2451 bio->bi_end_io = raid_write_end_io;
2452 bio->bi_opf = REQ_OP_WRITE;
2459 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2461 while ((bio = bio_list_pop(&bio_list)))
2465 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2467 if (stripe >= 0 && stripe < rbio->nr_data)
2473 * While we're doing the parity check and repair, we could have errors
2474 * in reading pages off the disk. This checks for errors and if we're
2475 * not able to read the page it'll trigger parity reconstruction. The
2476 * parity scrub will be finished after we've reconstructed the failed
2479 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2481 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2484 if (rbio->faila >= 0 || rbio->failb >= 0) {
2485 int dfail = 0, failp = -1;
2487 if (is_data_stripe(rbio, rbio->faila))
2489 else if (is_parity_stripe(rbio->faila))
2490 failp = rbio->faila;
2492 if (is_data_stripe(rbio, rbio->failb))
2494 else if (is_parity_stripe(rbio->failb))
2495 failp = rbio->failb;
2498 * Because we cannot use the parity that is being scrubbed to
2499 * repair the data, our repair capability is reduced.
2500 * (In the case of RAID5, we cannot repair anything.)
2502 if (dfail > rbio->bbio->max_errors - 1)
2506 * If all the data is good and only the parity is wrong, just
2507 * repair the parity.
2510 finish_parity_scrub(rbio, 0);
2515 * Here we have one corrupted data stripe and one corrupted
2516 * parity on RAID6. If the corrupted parity is the one being
2517 * scrubbed, luckily we can use the other parity to repair the
2518 * data; otherwise we cannot repair the data stripe.
2520 if (failp != rbio->scrubp)
2523 __raid_recover_end_io(rbio);
2525 finish_parity_scrub(rbio, 1);
2530 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2534 * end io for the read phase of the parity scrub cycle. All the bios here are
2535 * physical stripe bios we've read from the disk so we can recalculate the parity of the
2538 * This will usually kick off finish_parity_scrub once all the bios are read in, but
2539 * it may trigger parity reconstruction if we had any errors along the way
2541 static void raid56_parity_scrub_end_io(struct bio *bio)
2543 struct btrfs_raid_bio *rbio = bio->bi_private;
2546 fail_bio_stripe(rbio, bio);
2548 set_bio_pages_uptodate(bio);
2552 if (!atomic_dec_and_test(&rbio->stripes_pending))
2556 * this will normally call finish_parity_scrub to start our write
2557 * but if there are any failed stripes we'll reconstruct
2560 validate_rbio_for_parity_scrub(rbio);
2563 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2565 int bios_to_read = 0;
2566 struct bio_list bio_list;
2572 bio_list_init(&bio_list);
2574 ret = alloc_rbio_essential_pages(rbio);
2578 atomic_set(&rbio->error, 0);
2580 * build a list of bios to read all the missing parts of this
2583 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2584 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2587 * we want to find all the pages missing from
2588 * the rbio and read them from the disk. If
2589 * page_in_rbio finds a page in the bio list
2590 * we don't need to read it off the stripe.
2592 page = page_in_rbio(rbio, stripe, pagenr, 1);
2596 page = rbio_stripe_page(rbio, stripe, pagenr);
2598 * the bio cache may have handed us an uptodate
2599 * page. If so, be happy and use it
2601 if (PageUptodate(page))
2604 ret = rbio_add_io_page(rbio, &bio_list, page,
2605 stripe, pagenr, rbio->stripe_len);
2611 bios_to_read = bio_list_size(&bio_list);
2612 if (!bios_to_read) {
2614 * this can happen if others have merged with
2615 * us, it means there is nothing left to read.
2616 * But if there are missing devices it may not be
2617 * safe to do the full stripe write yet.
2623 * the bbio may be freed once we submit the last bio. Make sure
2624 * not to touch it after that
2626 atomic_set(&rbio->stripes_pending, bios_to_read);
2627 while ((bio = bio_list_pop(&bio_list))) {
2628 bio->bi_private = rbio;
2629 bio->bi_end_io = raid56_parity_scrub_end_io;
2630 bio->bi_opf = REQ_OP_READ;
2632 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2636 /* the actual write will happen once the reads are done */
2640 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2642 while ((bio = bio_list_pop(&bio_list)))
2648 validate_rbio_for_parity_scrub(rbio);
2651 static void scrub_parity_work(struct btrfs_work *work)
2653 struct btrfs_raid_bio *rbio;
2655 rbio = container_of(work, struct btrfs_raid_bio, work);
2656 raid56_parity_scrub_stripe(rbio);
2659 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2661 if (!lock_stripe_add(rbio))
2662 start_async_work(rbio, scrub_parity_work);
2665 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2667 struct btrfs_raid_bio *
2668 raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2669 struct btrfs_bio *bbio, u64 length)
2671 struct btrfs_raid_bio *rbio;
2673 rbio = alloc_rbio(fs_info, bbio, length);
2677 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2678 bio_list_add(&rbio->bio_list, bio);
2680 * This is a special bio which is used to hold the completion handler
2681 * and make the scrub rbio similar to the other types
2683 ASSERT(!bio->bi_iter.bi_size);
2685 rbio->faila = find_logical_bio_stripe(rbio, bio);
2686 if (rbio->faila == -1) {
2693 * When we get bbio, we have already increased bio_counter, record it
2694 * so we can free it at rbio_orig_end_io()
2696 rbio->generic_bio_cnt = 1;
2701 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2703 if (!lock_stripe_add(rbio))
2704 start_async_work(rbio, read_rebuild_work);