// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct work_struct *work);
static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct work_struct *work);

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page)
			continue;

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

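/*
 * Check if all sectors covered by the given stripe page are uptodate.
 *
 * With subpage sector size one stripe page backs several sectors, and the
 * page can only be reused as a whole (e.g. stolen by steal_rbio()) when
 * every sector in it is uptodate.
 */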
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripes_pages[] got modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}

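/*
 * Move the page at @page_nr from @src to @dest, freeing whatever page @dest
 * previously held at that slot, and mark every sector covered by that page
 * as uptodate in @dest.
 */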
static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !full_page_sectors_uptodate(src, i))
			continue;

		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub reads the full stripe from the drive, then
	 * checks and repairs the parity and writes the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

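/*
 * Map a (stripe_nr, sector_nr) pair to the flat index used by the
 * stripe_sectors[] array.
 */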
static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	kfree(rbio);
}

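/* Complete every bio on the singly linked list @cur with status @err. */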
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have read all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bioc->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/**
 * Get a sector pointer specified by its @stripe_nr and @sector_nr
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock_irq(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock_irq(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock_irq(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc,
					 u32 stripe_len)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	void *p;

	ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_sectors) * num_sectors +
		       sizeof(*rbio->stripe_sectors) * num_sectors +
		       sizeof(*rbio->finish_pointers) * real_stripes,
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
	 * we allocated past the end of the rbio.
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
#undef CONSUME_ALLOC

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      unsigned long bio_max_len,
			      unsigned int opf)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripe.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
			opf, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> 9;
	bio->bi_private = rbio;

	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->raid_map[0];

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock_irq(&rbio->bio_list_lock);
}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		struct sector_ptr *sector;

		/* First collect one sector from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		/* Then add the parity stripe */
		sector = rbio_pstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

		if (has_qstripe) {
			/*
			 * RAID6, add the qstripe and call the library function
			 * to fill in our p/q
			 */
			sector = rbio_qstripe_sector(rbio, sectornr);
			sector->uptodate = 1;
			pointers[stripe++] = kmap_local_page(sector->page) +
					     sector->pgoff;

			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}
		for (stripe = stripe - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
			struct sector_ptr *sector;

			/* This vertical stripe has no data, skip it. */
			if (!test_bit(sectornr, &rbio->dbitmap))
				continue;

			if (stripe < rbio->nr_data) {
				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
				if (!sector)
					continue;
			} else {
				sector = rbio_stripe_sector(rbio, stripe, sectornr);
			}

			ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
						 sectornr, rbio->stripe_len,
						 REQ_OP_WRITE);
			if (ret)
				goto cleanup;
		}
	}

	if (likely(!bioc->num_tgtdevs))
		goto write_data;

	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (!bioc->tgtdev_map[stripe])
			continue;

		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
			struct sector_ptr *sector;

			/* This vertical stripe has no data, skip it. */
			if (!test_bit(sectornr, &rbio->dbitmap))
				continue;

			if (stripe < rbio->nr_data) {
				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
				if (!sector)
					continue;
			} else {
				sector = rbio_stripe_sector(rbio, stripe, sectornr);
			}

			ret = rbio_add_io_sector(rbio, &bio_list, sector,
						 rbio->bioc->tgtdev_map[stripe],
						 sectornr, rbio->stripe_len,
						 REQ_OP_WRITE);
			if (ret)
				goto cleanup;
		}
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid_write_end_io;

		if (trace_raid56_write_stripe_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write_stripe(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	int i;
	struct btrfs_io_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bioc->num_stripes; i++) {
		stripe = &rbio->bioc->stripes[i];
		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector << 9;
	int i;

	for (i = 0; i < rbio->nr_data; i++) {
		u64 stripe_start = rbio->bioc->raid_map[i];

		if (in_range(logical, stripe_start, rbio->stripe_len))
			return i;
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * For subpage case, we can no longer set page Uptodate directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		if (sector->page == page && sector->pgoff == pgoff)
			return sector;
	}
	return NULL;
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct sector_ptr *sector;
		int pgoff;

		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
		     pgoff += sectorsize) {
			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
			ASSERT(sector);
			if (sector)
				sector->uptodate = 1;
		}
	}
}

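/*
 * Shared per-bio completion for the read phases: record the failed stripe or
 * mark the bio's sectors uptodate, then queue the rbio's end_io work once the
 * last outstanding bio completes.
 */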
d34e123d 1488static void raid56_bio_end_io(struct bio *bio)
53b381b3
DW
1489{
1490 struct btrfs_raid_bio *rbio = bio->bi_private;
1491
4e4cbee9 1492 if (bio->bi_status)
53b381b3
DW
1493 fail_bio_stripe(rbio, bio);
1494 else
5fdb7afc 1495 set_bio_pages_uptodate(rbio, bio);
53b381b3
DW
1496
1497 bio_put(bio);
1498
d34e123d
CH
1499 if (atomic_dec_and_test(&rbio->stripes_pending))
1500 queue_work(rbio->bioc->fs_info->endio_raid56_workers,
1501 &rbio->end_io_work);
1502}
53b381b3 1503
d34e123d
CH
1504/*
1505 * End io handler for the read phase of the RMW cycle. All the bios here are
1506 * physical stripe bios we've read from the disk so we can recalculate the
1507 * parity of the stripe.
1508 *
1509 * This will usually kick off finish_rmw once all the bios are read in, but it
1510 * may trigger parity reconstruction if we had any errors along the way
1511 */
1512static void raid56_rmw_end_io_work(struct work_struct *work)
1513{
1514 struct btrfs_raid_bio *rbio =
1515 container_of(work, struct btrfs_raid_bio, end_io_work);
1516
1517 if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
1518 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1519 return;
1520 }
53b381b3
DW
1521
1522 /*
d34e123d
CH
1523 * This will normally call finish_rmw to start our write but if there
1524 * are any failed stripes we'll reconstruct from parity first.
53b381b3
DW
1525 */
1526 validate_rbio_for_rmw(rbio);
53b381b3
DW
1527}
1528
53b381b3
DW
1529/*
1530 * the stripe must be locked by the caller. It will
1531 * unlock after all the writes are done
1532 */
1533static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1534{
1535 int bios_to_read = 0;
53b381b3
DW
1536 struct bio_list bio_list;
1537 int ret;
3e77605d 1538 int sectornr;
53b381b3
DW
1539 int stripe;
1540 struct bio *bio;
1541
1542 bio_list_init(&bio_list);
1543
1544 ret = alloc_rbio_pages(rbio);
1545 if (ret)
1546 goto cleanup;
1547
1548 index_rbio_pages(rbio);
1549
b89e1b01 1550 atomic_set(&rbio->error, 0);
53b381b3
DW
1551 /*
1552 * build a list of bios to read all the missing parts of this
1553 * stripe
1554 */
1555 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
3e77605d
QW
1556 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1557 struct sector_ptr *sector;
1558
53b381b3 1559 /*
3e77605d
QW
1560 * We want to find all the sectors missing from the
1561 * rbio and read them from the disk. If * sector_in_rbio()
1562 * finds a page in the bio list we don't need to read
1563 * it off the stripe.
53b381b3 1564 */
3e77605d
QW
1565 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1566 if (sector)
53b381b3
DW
1567 continue;
1568
3e77605d 1569 sector = rbio_stripe_sector(rbio, stripe, sectornr);
4ae10b3a 1570 /*
3e77605d
QW
1571 * The bio cache may have handed us an uptodate page.
1572 * If so, be happy and use it.
4ae10b3a 1573 */
3e77605d 1574 if (sector->uptodate)
4ae10b3a
CM
1575 continue;
1576
3e77605d
QW
1577 ret = rbio_add_io_sector(rbio, &bio_list, sector,
1578 stripe, sectornr, rbio->stripe_len,
e01bf588 1579 REQ_OP_READ);
53b381b3
DW
1580 if (ret)
1581 goto cleanup;
1582 }
1583 }
1584
1585 bios_to_read = bio_list_size(&bio_list);
1586 if (!bios_to_read) {
1587 /*
1588 * this can happen if others have merged with
1589 * us, it means there is nothing left to read.
1590 * But if there are missing devices it may not be
1591 * safe to do the full stripe write yet.
1592 */
1593 goto finish;
1594 }
1595
1596 /*
4c664611
QW
1597 * The bioc may be freed once we submit the last bio. Make sure not to
1598 * touch it after that.
53b381b3 1599 */
b89e1b01 1600 atomic_set(&rbio->stripes_pending, bios_to_read);
d34e123d 1601 INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
bf28a605 1602 while ((bio = bio_list_pop(&bio_list))) {
d34e123d 1603 bio->bi_end_io = raid56_bio_end_io;
53b381b3 1604
b8bea09a
QW
1605 if (trace_raid56_read_partial_enabled()) {
1606 struct raid56_bio_trace_info trace_info = { 0 };
1607
1608 bio_get_trace_info(rbio, bio, &trace_info);
1609 trace_raid56_read_partial(rbio, bio, &trace_info);
1610 }
4e49ea4a 1611 submit_bio(bio);
53b381b3
DW
1612 }
1613 /* the actual write will happen once the reads are done */
1614 return 0;
1615
1616cleanup:
58efbc9f 1617 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
1618
1619 while ((bio = bio_list_pop(&bio_list)))
1620 bio_put(bio);
1621
53b381b3
DW
1622 return -EIO;
1623
1624finish:
1625 validate_rbio_for_rmw(rbio);
1626 return 0;
1627}
1628
1629/*
1630 * if the upper layers pass in a full stripe, we thank them by only allocating
1631 * enough pages to hold the parity, and sending it all down quickly.
1632 */
1633static int full_stripe_write(struct btrfs_raid_bio *rbio)
1634{
1635 int ret;
1636
1637 ret = alloc_rbio_parity_pages(rbio);
3cd846d1
MX
1638 if (ret) {
1639 __free_raid_bio(rbio);
53b381b3 1640 return ret;
3cd846d1 1641 }
53b381b3
DW
1642
1643 ret = lock_stripe_add(rbio);
1644 if (ret == 0)
1645 finish_rmw(rbio);
1646 return 0;
1647}
1648
1649/*
1650 * partial stripe writes get handed over to async helpers.
1651 * We're really hoping to merge a few more writes into this
1652 * rbio before calculating new parity
1653 */
1654static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1655{
1656 int ret;
1657
1658 ret = lock_stripe_add(rbio);
1659 if (ret == 0)
cf6a4a75 1660 start_async_work(rbio, rmw_work);
53b381b3
DW
1661 return 0;
1662}
1663
1664/*
1665 * sometimes while we were reading from the drive to
1666 * recalculate parity, enough new bios come into create
1667 * a full stripe. So we do a check here to see if we can
1668 * go directly to finish_rmw
1669 */
1670static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1671{
1672 /* head off into rmw land if we don't have a full stripe */
1673 if (!rbio_is_full(rbio))
1674 return partial_stripe_write(rbio);
1675 return full_stripe_write(rbio);
1676}
1677
6ac0f488
CM
1678/*
1679 * We use plugging call backs to collect full stripes.
1680 * Any time we get a partial stripe write while plugged
1681 * we collect it into a list. When the unplug comes down,
1682 * we sort the list by logical block number and merge
1683 * everything we can into the same rbios
1684 */
1685struct btrfs_plug_cb {
1686 struct blk_plug_cb cb;
1687 struct btrfs_fs_info *info;
1688 struct list_head rbio_list;
385de0ef 1689 struct work_struct work;
6ac0f488
CM
1690};
1691
1692/*
1693 * rbios on the plug list are sorted for easier merging.
1694 */
4f0f586b
ST
1695static int plug_cmp(void *priv, const struct list_head *a,
1696 const struct list_head *b)
6ac0f488 1697{
214cc184
DS
1698 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1699 plug_list);
1700 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1701 plug_list);
4f024f37
KO
1702 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1703 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
6ac0f488
CM
1704
1705 if (a_sector < b_sector)
1706 return -1;
1707 if (a_sector > b_sector)
1708 return 1;
1709 return 0;
1710}
1711
1712static void run_plug(struct btrfs_plug_cb *plug)
1713{
1714 struct btrfs_raid_bio *cur;
1715 struct btrfs_raid_bio *last = NULL;
1716
1717 /*
1718 * sort our plug list then try to merge
1719 * everything we can in hopes of creating full
1720 * stripes.
1721 */
1722 list_sort(NULL, &plug->rbio_list, plug_cmp);
1723 while (!list_empty(&plug->rbio_list)) {
1724 cur = list_entry(plug->rbio_list.next,
1725 struct btrfs_raid_bio, plug_list);
1726 list_del_init(&cur->plug_list);
1727
1728 if (rbio_is_full(cur)) {
c7b562c5
DS
1729 int ret;
1730
6ac0f488 1731 /* we have a full stripe, send it down */
c7b562c5
DS
1732 ret = full_stripe_write(cur);
1733 BUG_ON(ret);
6ac0f488
CM
1734 continue;
1735 }
1736 if (last) {
1737 if (rbio_can_merge(last, cur)) {
1738 merge_rbio(last, cur);
1739 __free_raid_bio(cur);
1740 continue;
1741
1742 }
1743 __raid56_parity_write(last);
1744 }
1745 last = cur;
1746 }
1747 if (last) {
1748 __raid56_parity_write(last);
1749 }
1750 kfree(plug);
1751}
1752
1753/*
1754 * if the unplug comes from schedule, we have to push the
1755 * work off to a helper thread
1756 */
385de0ef 1757static void unplug_work(struct work_struct *work)
6ac0f488
CM
1758{
1759 struct btrfs_plug_cb *plug;
1760 plug = container_of(work, struct btrfs_plug_cb, work);
1761 run_plug(plug);
1762}
1763
1764static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1765{
1766 struct btrfs_plug_cb *plug;
1767 plug = container_of(cb, struct btrfs_plug_cb, cb);
1768
1769 if (from_schedule) {
385de0ef
CH
1770 INIT_WORK(&plug->work, unplug_work);
1771 queue_work(plug->info->rmw_workers, &plug->work);
6ac0f488
CM
1772 return;
1773 }
1774 run_plug(plug);
1775}
1776
bd8f7e62
QW
1777/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1778static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1779{
1780 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1781 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1782 const u64 full_stripe_start = rbio->bioc->raid_map[0];
1783 const u32 orig_len = orig_bio->bi_iter.bi_size;
1784 const u32 sectorsize = fs_info->sectorsize;
1785 u64 cur_logical;
1786
1787 ASSERT(orig_logical >= full_stripe_start &&
1788 orig_logical + orig_len <= full_stripe_start +
1789 rbio->nr_data * rbio->stripe_len);
1790
1791 bio_list_add(&rbio->bio_list, orig_bio);
1792 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1793
1794 /* Update the dbitmap. */
1795 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1796 cur_logical += sectorsize) {
1797 int bit = ((u32)(cur_logical - full_stripe_start) >>
1798 fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1799
1800 set_bit(bit, &rbio->dbitmap);
1801 }
1802}
1803
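/*
 * Illustrative userspace sketch, not part of raid56.c or the kernel build:
 * how rbio_add_bio() above maps a bio's logical range onto dbitmap bits.
 * The 4K sectorsize, 64K stripe length and 1M full-stripe start used here
 * are hypothetical example values.
 */
#include <stdint.h>
#include <stdio.h>

static void mark_dbitmap(uint64_t full_stripe_start, uint64_t logical,
			 uint32_t len, uint32_t sectorsize,
			 uint32_t stripe_nsectors, unsigned long *dbitmap)
{
	uint64_t cur;

	for (cur = logical; cur < logical + len; cur += sectorsize) {
		/* Sector index inside the full stripe, wrapped per stripe. */
		int bit = (uint32_t)((cur - full_stripe_start) / sectorsize) %
			  stripe_nsectors;

		*dbitmap |= 1UL << bit;
	}
}

int main(void)
{
	unsigned long dbitmap = 0;

	/* A 16K write starting 64K into the full stripe (second data stripe). */
	mark_dbitmap(1 << 20, (1 << 20) + (64 << 10), 16 << 10, 4096, 16,
		     &dbitmap);
	printf("dbitmap: 0x%lx\n", dbitmap);	/* bits 0-3 set -> 0xf */
	return 0;
}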
53b381b3
DW
1804/*
1805 * our main entry point for writes from the rest of the FS.
1806 */
cc353a8b 1807int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
53b381b3 1808{
6a258d72 1809 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 1810 struct btrfs_raid_bio *rbio;
6ac0f488
CM
1811 struct btrfs_plug_cb *plug = NULL;
1812 struct blk_plug_cb *cb;
4245215d 1813 int ret;
53b381b3 1814
4c664611 1815 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 1816 if (IS_ERR(rbio)) {
4c664611 1817 btrfs_put_bioc(bioc);
53b381b3 1818 return PTR_ERR(rbio);
af8e2d1d 1819 }
1b94b556 1820 rbio->operation = BTRFS_RBIO_WRITE;
bd8f7e62 1821 rbio_add_bio(rbio, bio);
6ac0f488 1822
0b246afa 1823 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
1824 rbio->generic_bio_cnt = 1;
1825
6ac0f488
CM
1826 /*
1827 * don't plug on full rbios, just get them out the door
1828 * as quickly as we can
1829 */
4245215d
MX
1830 if (rbio_is_full(rbio)) {
1831 ret = full_stripe_write(rbio);
1832 if (ret)
0b246afa 1833 btrfs_bio_counter_dec(fs_info);
4245215d
MX
1834 return ret;
1835 }
6ac0f488 1836
0b246afa 1837 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
6ac0f488
CM
1838 if (cb) {
1839 plug = container_of(cb, struct btrfs_plug_cb, cb);
1840 if (!plug->info) {
0b246afa 1841 plug->info = fs_info;
6ac0f488
CM
1842 INIT_LIST_HEAD(&plug->rbio_list);
1843 }
1844 list_add_tail(&rbio->plug_list, &plug->rbio_list);
4245215d 1845 ret = 0;
6ac0f488 1846 } else {
4245215d
MX
1847 ret = __raid56_parity_write(rbio);
1848 if (ret)
0b246afa 1849 btrfs_bio_counter_dec(fs_info);
6ac0f488 1850 }
4245215d 1851 return ret;
53b381b3
DW
1852}
1853
1854/*
1855 * all parity reconstruction happens here. We've read in everything
1856 * we can find from the drives and this does the heavy lifting of
1857 * sorting the good from the bad.
1858 */
1859static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1860{
07e4d380
QW
1861 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1862 int sectornr, stripe;
53b381b3 1863 void **pointers;
94a0b58d 1864 void **unmap_array;
53b381b3 1865 int faila = -1, failb = -1;
58efbc9f 1866 blk_status_t err;
53b381b3
DW
1867 int i;
1868
07e4d380
QW
1869 /*
 1870 * This array stores the pointer for each sector; each pointer already
 1871 * has that sector's pgoff added to it.
1872 */
31e818fe 1873 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
53b381b3 1874 if (!pointers) {
58efbc9f 1875 err = BLK_STS_RESOURCE;
53b381b3
DW
1876 goto cleanup_io;
1877 }
1878
94a0b58d
IW
1879 /*
 1880 * Store a copy of the pointers that does not get reordered during
 1881 * reconstruction, so that kunmap_local() works.
1882 */
1883 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1884 if (!unmap_array) {
1885 err = BLK_STS_RESOURCE;
1886 goto cleanup_pointers;
1887 }
1888
53b381b3
DW
1889 faila = rbio->faila;
1890 failb = rbio->failb;
1891
b4ee1782
OS
1892 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1893 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
53b381b3
DW
1894 spin_lock_irq(&rbio->bio_list_lock);
1895 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1896 spin_unlock_irq(&rbio->bio_list_lock);
1897 }
1898
1899 index_rbio_pages(rbio);
1900
07e4d380
QW
1901 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1902 struct sector_ptr *sector;
1903
5a6ac9ea
MX
1904 /*
 1905 * When doing a parity scrub, we just use the bitmap to mark the
 1906 * horizontal stripes in which we have data.
1907 */
1908 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
c67c68eb 1909 !test_bit(sectornr, &rbio->dbitmap))
5a6ac9ea
MX
1910 continue;
1911
94a0b58d 1912 /*
07e4d380 1913 * Set up our array of pointers with sectors from each stripe
94a0b58d
IW
1914 *
1915 * NOTE: store a duplicate array of pointers to preserve the
1916 * pointer order
53b381b3 1917 */
2c8cdd6e 1918 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
53b381b3 1919 /*
07e4d380 1920 * If we're rebuilding a read, we have to use
53b381b3
DW
1921 * pages from the bio list
1922 */
b4ee1782
OS
1923 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1924 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
53b381b3 1925 (stripe == faila || stripe == failb)) {
07e4d380 1926 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
53b381b3 1927 } else {
07e4d380 1928 sector = rbio_stripe_sector(rbio, stripe, sectornr);
53b381b3 1929 }
07e4d380
QW
1930 ASSERT(sector->page);
1931 pointers[stripe] = kmap_local_page(sector->page) +
1932 sector->pgoff;
94a0b58d 1933 unmap_array[stripe] = pointers[stripe];
53b381b3
DW
1934 }
1935
07e4d380 1936 /* All raid6 handling here */
4c664611 1937 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
07e4d380 1938 /* Single failure, rebuild from parity raid5 style */
53b381b3
DW
1939 if (failb < 0) {
1940 if (faila == rbio->nr_data) {
1941 /*
1942 * Just the P stripe has failed, without
1943 * a bad data or Q stripe.
1944 * TODO, we should redo the xor here.
1945 */
58efbc9f 1946 err = BLK_STS_IOERR;
53b381b3
DW
1947 goto cleanup;
1948 }
1949 /*
1950 * a single failure in raid6 is rebuilt
1951 * in the pstripe code below
1952 */
1953 goto pstripe;
1954 }
1955
1956 /* make sure our ps and qs are in order */
b7d2083a
NB
1957 if (faila > failb)
1958 swap(faila, failb);
53b381b3
DW
1959
 1960 /* If the Q stripe has failed, do a P stripe reconstruction
 1961 * from the xors.
 1962 * If both the Q stripe and the P stripe have failed, we're
 1963 * here due to a crc mismatch and we can't give them the
 1964 * data they want.
 1965 */
4c664611
QW
1966 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1967 if (rbio->bioc->raid_map[faila] ==
8e5cfb55 1968 RAID5_P_STRIPE) {
58efbc9f 1969 err = BLK_STS_IOERR;
53b381b3
DW
1970 goto cleanup;
1971 }
1972 /*
1973 * otherwise we have one bad data stripe and
1974 * a good P stripe. raid5!
1975 */
1976 goto pstripe;
1977 }
1978
4c664611 1979 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2c8cdd6e 1980 raid6_datap_recov(rbio->real_stripes,
07e4d380 1981 sectorsize, faila, pointers);
53b381b3 1982 } else {
2c8cdd6e 1983 raid6_2data_recov(rbio->real_stripes,
07e4d380 1984 sectorsize, faila, failb,
53b381b3
DW
1985 pointers);
1986 }
1987 } else {
1988 void *p;
1989
1990 /* rebuild from P stripe here (raid5 or raid6) */
1991 BUG_ON(failb != -1);
1992pstripe:
1993 /* Copy parity block into failed block to start with */
07e4d380 1994 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
53b381b3
DW
1995
1996 /* rearrange the pointer array */
1997 p = pointers[faila];
1998 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1999 pointers[stripe] = pointers[stripe + 1];
2000 pointers[rbio->nr_data - 1] = p;
2001
2002 /* xor in the rest */
07e4d380 2003 run_xor(pointers, rbio->nr_data - 1, sectorsize);
53b381b3
DW
2004 }
2005 /* if we're doing this rebuild as part of an rmw, go through
2006 * and set all of our private rbio pages in the
2007 * failed stripes as uptodate. This way finish_rmw will
2008 * know they can be trusted. If this was a read reconstruction,
2009 * other endio functions will fiddle the uptodate bits
2010 */
1b94b556 2011 if (rbio->operation == BTRFS_RBIO_WRITE) {
07e4d380 2012 for (i = 0; i < rbio->stripe_nsectors; i++) {
53b381b3 2013 if (faila != -1) {
07e4d380
QW
2014 sector = rbio_stripe_sector(rbio, faila, i);
2015 sector->uptodate = 1;
53b381b3
DW
2016 }
2017 if (failb != -1) {
07e4d380
QW
2018 sector = rbio_stripe_sector(rbio, failb, i);
2019 sector->uptodate = 1;
53b381b3
DW
2020 }
2021 }
2022 }
94a0b58d
IW
2023 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
2024 kunmap_local(unmap_array[stripe]);
53b381b3
DW
2025 }
2026
58efbc9f 2027 err = BLK_STS_OK;
53b381b3 2028cleanup:
94a0b58d
IW
2029 kfree(unmap_array);
2030cleanup_pointers:
53b381b3
DW
2031 kfree(pointers);
2032
2033cleanup_io:
580c6efa
LB
2034 /*
2035 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
 2036 * valid rbio which is consistent with on-disk content, thus such a
2037 * valid rbio can be cached to avoid further disk reads.
2038 */
2039 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2040 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
44ac474d
LB
2041 /*
2042 * - In case of two failures, where rbio->failb != -1:
2043 *
2044 * Do not cache this rbio since the above read reconstruction
2045 * (raid6_datap_recov() or raid6_2data_recov()) may have
2046 * changed some content of stripes which are not identical to
2047 * on-disk content any more, otherwise, a later write/recover
2048 * may steal stripe_pages from this rbio and end up with
2049 * corruptions or rebuild failures.
2050 *
2051 * - In case of single failure, where rbio->failb == -1:
2052 *
2053 * Cache this rbio iff the above read reconstruction is
52042d8e 2054 * executed without problems.
44ac474d
LB
2055 */
2056 if (err == BLK_STS_OK && rbio->failb < 0)
4ae10b3a
CM
2057 cache_rbio_pages(rbio);
2058 else
2059 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2060
4246a0b6 2061 rbio_orig_end_io(rbio, err);
58efbc9f 2062 } else if (err == BLK_STS_OK) {
53b381b3
DW
2063 rbio->faila = -1;
2064 rbio->failb = -1;
5a6ac9ea
MX
2065
2066 if (rbio->operation == BTRFS_RBIO_WRITE)
2067 finish_rmw(rbio);
2068 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2069 finish_parity_scrub(rbio, 0);
2070 else
2071 BUG();
53b381b3 2072 } else {
4246a0b6 2073 rbio_orig_end_io(rbio, err);
53b381b3
DW
2074 }
2075}
2076
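/*
 * Illustrative sketch, not part of raid56.c or the kernel build: the
 * decision tree __raid_recover_end_io() walks for a RAID6 stripe, written
 * out as a standalone function.  The failb_is_q/failb_is_p/faila_is_p flags
 * are hypothetical stand-ins for comparing bioc->raid_map[] entries against
 * RAID6_Q_STRIPE / RAID5_P_STRIPE.
 */
#include <stdio.h>

enum recover_path {
	RECOVER_FAIL,		/* cannot rebuild the requested data */
	RECOVER_PSTRIPE,	/* xor-based rebuild from P (raid5 style) */
	RECOVER_DATAP,		/* one data stripe plus P lost */
	RECOVER_2DATA,		/* two data stripes lost */
};

static enum recover_path pick_recover_path(int faila, int failb, int nr_data,
					   int failb_is_q, int failb_is_p,
					   int faila_is_p)
{
	if (failb < 0)
		/* Single failure; a lone P failure is not rebuilt here. */
		return (faila == nr_data) ? RECOVER_FAIL : RECOVER_PSTRIPE;
	if (failb_is_q)
		/* Q lost: data (if any) comes back from plain xor with P. */
		return faila_is_p ? RECOVER_FAIL : RECOVER_PSTRIPE;
	if (failb_is_p)
		return RECOVER_DATAP;
	return RECOVER_2DATA;
}

int main(void)
{
	/* Two data stripes lost on a 4-data-disk RAID6. */
	printf("%d\n", pick_recover_path(1, 3, 4, 0, 0, 0)); /* RECOVER_2DATA */
	/* One data stripe plus Q lost: xor with P is enough. */
	printf("%d\n", pick_recover_path(1, 5, 4, 1, 0, 0)); /* RECOVER_PSTRIPE */
	return 0;
}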
2077/*
d34e123d
CH
2078 * This is called only for stripes we've read from disk to reconstruct the
2079 * parity.
53b381b3 2080 */
d34e123d 2081static void raid_recover_end_io_work(struct work_struct *work)
53b381b3 2082{
d34e123d
CH
2083 struct btrfs_raid_bio *rbio =
2084 container_of(work, struct btrfs_raid_bio, end_io_work);
53b381b3 2085
4c664611 2086 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
58efbc9f 2087 rbio_orig_end_io(rbio, BLK_STS_IOERR);
53b381b3
DW
2088 else
2089 __raid_recover_end_io(rbio);
2090}
2091
2092/*
2093 * reads everything we need off the disk to reconstruct
2094 * the parity. endio handlers trigger final reconstruction
2095 * when the IO is done.
2096 *
2097 * This is used both for reads from the higher layers and for
 2098 * parity construction required to finish an rmw cycle.
2099 */
2100static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2101{
2102 int bios_to_read = 0;
53b381b3
DW
2103 struct bio_list bio_list;
2104 int ret;
3e77605d 2105 int sectornr;
53b381b3
DW
2106 int stripe;
2107 struct bio *bio;
2108
2109 bio_list_init(&bio_list);
2110
2111 ret = alloc_rbio_pages(rbio);
2112 if (ret)
2113 goto cleanup;
2114
b89e1b01 2115 atomic_set(&rbio->error, 0);
53b381b3
DW
2116
2117 /*
4ae10b3a
CM
2118 * read everything that hasn't failed. Thanks to the
2119 * stripe cache, it is possible that some or all of these
2120 * pages are going to be uptodate.
53b381b3 2121 */
2c8cdd6e 2122 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
5588383e 2123 if (rbio->faila == stripe || rbio->failb == stripe) {
b89e1b01 2124 atomic_inc(&rbio->error);
53b381b3 2125 continue;
5588383e 2126 }
53b381b3 2127
3e77605d
QW
2128 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2129 struct sector_ptr *sector;
53b381b3
DW
2130
2131 /*
2132 * the rmw code may have already read this
2133 * page in
2134 */
3e77605d
QW
2135 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2136 if (sector->uptodate)
53b381b3
DW
2137 continue;
2138
3e77605d
QW
2139 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2140 stripe, sectornr, rbio->stripe_len,
2141 REQ_OP_READ);
53b381b3
DW
2142 if (ret < 0)
2143 goto cleanup;
2144 }
2145 }
2146
2147 bios_to_read = bio_list_size(&bio_list);
2148 if (!bios_to_read) {
2149 /*
2150 * we might have no bios to read just because the pages
2151 * were up to date, or we might have no bios to read because
2152 * the devices were gone.
2153 */
4c664611 2154 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
53b381b3 2155 __raid_recover_end_io(rbio);
813f8a0e 2156 return 0;
53b381b3
DW
2157 } else {
2158 goto cleanup;
2159 }
2160 }
2161
2162 /*
4c664611
QW
2163 * The bioc may be freed once we submit the last bio. Make sure not to
2164 * touch it after that.
53b381b3 2165 */
b89e1b01 2166 atomic_set(&rbio->stripes_pending, bios_to_read);
d34e123d 2167 INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
bf28a605 2168 while ((bio = bio_list_pop(&bio_list))) {
d34e123d 2169 bio->bi_end_io = raid56_bio_end_io;
53b381b3 2170
b8bea09a
QW
2171 if (trace_raid56_scrub_read_recover_enabled()) {
2172 struct raid56_bio_trace_info trace_info = { 0 };
2173
2174 bio_get_trace_info(rbio, bio, &trace_info);
2175 trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
2176 }
4e49ea4a 2177 submit_bio(bio);
53b381b3 2178 }
813f8a0e 2179
53b381b3
DW
2180 return 0;
2181
2182cleanup:
b4ee1782
OS
2183 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2184 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
58efbc9f 2185 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2186
2187 while ((bio = bio_list_pop(&bio_list)))
2188 bio_put(bio);
2189
53b381b3
DW
2190 return -EIO;
2191}
2192
2193/*
2194 * the main entry point for reads from the higher layers. This
2195 * is really only called when the normal read path had a failure,
2196 * so we assume the bio they send down corresponds to a failed part
2197 * of the drive.
2198 */
6a258d72 2199int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
cc353a8b 2200 u32 stripe_len, int mirror_num, int generic_io)
53b381b3 2201{
6a258d72 2202 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3
DW
2203 struct btrfs_raid_bio *rbio;
2204 int ret;
2205
abad60c6 2206 if (generic_io) {
4c664611 2207 ASSERT(bioc->mirror_num == mirror_num);
c3a3b19b 2208 btrfs_bio(bio)->mirror_num = mirror_num;
abad60c6
LB
2209 }
2210
4c664611 2211 rbio = alloc_rbio(fs_info, bioc, stripe_len);
af8e2d1d 2212 if (IS_ERR(rbio)) {
6e9606d2 2213 if (generic_io)
4c664611 2214 btrfs_put_bioc(bioc);
53b381b3 2215 return PTR_ERR(rbio);
af8e2d1d 2216 }
53b381b3 2217
1b94b556 2218 rbio->operation = BTRFS_RBIO_READ_REBUILD;
bd8f7e62 2219 rbio_add_bio(rbio, bio);
53b381b3
DW
2220
2221 rbio->faila = find_logical_bio_stripe(rbio, bio);
2222 if (rbio->faila == -1) {
0b246afa 2223 btrfs_warn(fs_info,
4c664611 2224"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
1201b58b 2225 __func__, bio->bi_iter.bi_sector << 9,
4c664611 2226 (u64)bio->bi_iter.bi_size, bioc->map_type);
6e9606d2 2227 if (generic_io)
4c664611 2228 btrfs_put_bioc(bioc);
53b381b3
DW
2229 kfree(rbio);
2230 return -EIO;
2231 }
2232
4245215d 2233 if (generic_io) {
0b246afa 2234 btrfs_bio_counter_inc_noblocked(fs_info);
4245215d
MX
2235 rbio->generic_bio_cnt = 1;
2236 } else {
4c664611 2237 btrfs_get_bioc(bioc);
4245215d
MX
2238 }
2239
53b381b3 2240 /*
8810f751
LB
2241 * Loop retry:
2242 * for 'mirror == 2', reconstruct from all other stripes.
2243 * for 'mirror_num > 2', select a stripe to fail on every retry.
53b381b3 2244 */
8810f751
LB
2245 if (mirror_num > 2) {
2246 /*
2247 * 'mirror == 3' is to fail the p stripe and
2248 * reconstruct from the q stripe. 'mirror > 3' is to
2249 * fail a data stripe and reconstruct from p+q stripe.
2250 */
2251 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2252 ASSERT(rbio->failb > 0);
2253 if (rbio->failb <= rbio->faila)
2254 rbio->failb--;
2255 }
53b381b3
DW
2256
2257 ret = lock_stripe_add(rbio);
2258
2259 /*
2260 * __raid56_parity_recover will end the bio with
2261 * any errors it hits. We don't want to return
2262 * its error value up the stack because our caller
2263 * will end up calling bio_endio with any nonzero
2264 * return
2265 */
2266 if (ret == 0)
2267 __raid56_parity_recover(rbio);
2268 /*
2269 * our rbio has been added to the list of
2270 * rbios that will be handled after the
 2271 * current lock owner is done
2272 */
2273 return 0;
2274
2275}
2276
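/*
 * Illustrative userspace sketch, not part of raid56.c or the kernel build:
 * how the retry mirror_num is turned into an extra stripe to fail, as done
 * above for 'mirror_num > 2'.  The 6-stripe layout (4 data + P + Q) and the
 * already-failed stripe index are hypothetical example values.
 */
#include <stdio.h>

static int pick_failb(int real_stripes, int mirror_num, int faila)
{
	/* mirror 3 fails the P stripe, mirror 4 the last data stripe, ... */
	int failb = real_stripes - (mirror_num - 1);

	/* Skip over the stripe that already failed the read. */
	if (failb <= faila)
		failb--;
	return failb;
}

int main(void)
{
	int mirror;

	/* 4 data + P + Q (real_stripes = 6), data stripe 1 failed the read. */
	for (mirror = 3; mirror <= 6; mirror++)
		printf("mirror %d -> also fail stripe %d\n",
		       mirror, pick_failb(6, mirror, 1));
	return 0;
}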
385de0ef 2277static void rmw_work(struct work_struct *work)
53b381b3
DW
2278{
2279 struct btrfs_raid_bio *rbio;
2280
2281 rbio = container_of(work, struct btrfs_raid_bio, work);
2282 raid56_rmw_stripe(rbio);
2283}
2284
385de0ef 2285static void read_rebuild_work(struct work_struct *work)
53b381b3
DW
2286{
2287 struct btrfs_raid_bio *rbio;
2288
2289 rbio = container_of(work, struct btrfs_raid_bio, work);
2290 __raid56_parity_recover(rbio);
2291}
5a6ac9ea
MX
2292
2293/*
2294 * The following code is used to scrub/replace the parity stripe
2295 *
4c664611 2296 * Caller must have already increased bio_counter for getting @bioc.
ae6529c3 2297 *
5a6ac9ea
MX
 2298 * Note: We need to make sure all the pages added into the scrub/replace
 2299 * raid bio are correct and will not be changed during the scrub/replace.
 2300 * That is, those pages just hold metadata or file data with checksums.
2301 */
2302
6a258d72
QW
2303struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2304 struct btrfs_io_context *bioc,
cc353a8b 2305 u32 stripe_len, struct btrfs_device *scrub_dev,
6a258d72 2306 unsigned long *dbitmap, int stripe_nsectors)
5a6ac9ea 2307{
6a258d72 2308 struct btrfs_fs_info *fs_info = bioc->fs_info;
5a6ac9ea
MX
2309 struct btrfs_raid_bio *rbio;
2310 int i;
2311
4c664611 2312 rbio = alloc_rbio(fs_info, bioc, stripe_len);
5a6ac9ea
MX
2313 if (IS_ERR(rbio))
2314 return NULL;
2315 bio_list_add(&rbio->bio_list, bio);
2316 /*
2317 * This is a special bio which is used to hold the completion handler
 2318 * and make the scrub rbio similar to the other types
2319 */
2320 ASSERT(!bio->bi_iter.bi_size);
2321 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2322
9cd3a7eb 2323 /*
4c664611 2324 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
9cd3a7eb
LB
2325 * to the end position, so this search can start from the first parity
2326 * stripe.
2327 */
2328 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
4c664611 2329 if (bioc->stripes[i].dev == scrub_dev) {
5a6ac9ea
MX
2330 rbio->scrubp = i;
2331 break;
2332 }
2333 }
9cd3a7eb 2334 ASSERT(i < rbio->real_stripes);
5a6ac9ea 2335
c67c68eb 2336 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
5a6ac9ea 2337
ae6529c3 2338 /*
4c664611 2339 * We have already increased bio_counter when getting bioc, record it
ae6529c3
QW
2340 * so we can free it at rbio_orig_end_io().
2341 */
2342 rbio->generic_bio_cnt = 1;
2343
5a6ac9ea
MX
2344 return rbio;
2345}
2346
b4ee1782
OS
2347/* Used for both parity scrub and missing. */
2348void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
6346f6bf 2349 unsigned int pgoff, u64 logical)
5a6ac9ea 2350{
6346f6bf 2351 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
5a6ac9ea
MX
2352 int stripe_offset;
2353 int index;
2354
4c664611 2355 ASSERT(logical >= rbio->bioc->raid_map[0]);
6346f6bf 2356 ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
5a6ac9ea 2357 rbio->stripe_len * rbio->nr_data);
4c664611 2358 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
6346f6bf
QW
2359 index = stripe_offset / sectorsize;
2360 rbio->bio_sectors[index].page = page;
2361 rbio->bio_sectors[index].pgoff = pgoff;
5a6ac9ea
MX
2362}
2363
2364/*
 2365 * We only scrub the parity for horizontal stripes where we have correct
 2366 * data, so we don't need to allocate pages for all the stripes.
2367 */
2368static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2369{
3907ce29
QW
2370 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2371 int stripe;
2372 int sectornr;
2373
c67c68eb 2374 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3907ce29
QW
2375 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2376 struct page *page;
2377 int index = (stripe * rbio->stripe_nsectors + sectornr) *
2378 sectorsize >> PAGE_SHIFT;
5a6ac9ea 2379
5a6ac9ea
MX
2380 if (rbio->stripe_pages[index])
2381 continue;
2382
b0ee5e1e 2383 page = alloc_page(GFP_NOFS);
5a6ac9ea
MX
2384 if (!page)
2385 return -ENOMEM;
2386 rbio->stripe_pages[index] = page;
5a6ac9ea
MX
2387 }
2388 }
eb357060 2389 index_stripe_sectors(rbio);
5a6ac9ea
MX
2390 return 0;
2391}
2392
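/*
 * Illustrative userspace sketch, not part of raid56.c or the kernel build:
 * the stripe_pages index computation used in alloc_rbio_essential_pages()
 * above, worked for a hypothetical 64K page / 4K sector setup where several
 * sectors share one page.
 */
#include <stdio.h>

int main(void)
{
	const int page_shift = 16;	/* hypothetical 64K pages */
	const int sectorsize = 4096;
	const int stripe_nsectors = 16;	/* 64K stripe / 4K sectors */
	int stripe, sectornr;

	for (stripe = 0; stripe < 2; stripe++)
		for (sectornr = 0; sectornr < stripe_nsectors; sectornr += 8) {
			int index = (stripe * stripe_nsectors + sectornr) *
				    sectorsize >> page_shift;

			printf("stripe %d sector %2d -> stripe_pages[%d]\n",
			       stripe, sectornr, index);
		}
	return 0;
}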
5a6ac9ea
MX
2393static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2394 int need_check)
2395{
4c664611 2396 struct btrfs_io_context *bioc = rbio->bioc;
46900662 2397 const u32 sectorsize = bioc->fs_info->sectorsize;
1389053e 2398 void **pointers = rbio->finish_pointers;
c67c68eb 2399 unsigned long *pbitmap = &rbio->finish_pbitmap;
5a6ac9ea
MX
2400 int nr_data = rbio->nr_data;
2401 int stripe;
3e77605d 2402 int sectornr;
c17af965 2403 bool has_qstripe;
46900662
QW
2404 struct sector_ptr p_sector = { 0 };
2405 struct sector_ptr q_sector = { 0 };
5a6ac9ea
MX
2406 struct bio_list bio_list;
2407 struct bio *bio;
76035976 2408 int is_replace = 0;
5a6ac9ea
MX
2409 int ret;
2410
2411 bio_list_init(&bio_list);
2412
c17af965
DS
2413 if (rbio->real_stripes - rbio->nr_data == 1)
2414 has_qstripe = false;
2415 else if (rbio->real_stripes - rbio->nr_data == 2)
2416 has_qstripe = true;
2417 else
5a6ac9ea 2418 BUG();
5a6ac9ea 2419
4c664611 2420 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
76035976 2421 is_replace = 1;
c67c68eb 2422 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
76035976
MX
2423 }
2424
5a6ac9ea
MX
2425 /*
 2426 * The higher layers (the scrubber) are unlikely to
 2427 * use this area of the disk again soon, so don't
 2428 * cache it.
2429 */
2430 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2431
2432 if (!need_check)
2433 goto writeback;
2434
46900662
QW
2435 p_sector.page = alloc_page(GFP_NOFS);
2436 if (!p_sector.page)
5a6ac9ea 2437 goto cleanup;
46900662
QW
2438 p_sector.pgoff = 0;
2439 p_sector.uptodate = 1;
5a6ac9ea 2440
c17af965 2441 if (has_qstripe) {
d70cef0d 2442 /* RAID6, allocate and map temp space for the Q stripe */
46900662
QW
2443 q_sector.page = alloc_page(GFP_NOFS);
2444 if (!q_sector.page) {
2445 __free_page(p_sector.page);
2446 p_sector.page = NULL;
5a6ac9ea
MX
2447 goto cleanup;
2448 }
46900662
QW
2449 q_sector.pgoff = 0;
2450 q_sector.uptodate = 1;
2451 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
5a6ac9ea
MX
2452 }
2453
2454 atomic_set(&rbio->error, 0);
2455
d70cef0d 2456 /* Map the parity stripe just once */
46900662 2457 pointers[nr_data] = kmap_local_page(p_sector.page);
d70cef0d 2458
c67c68eb 2459 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
46900662 2460 struct sector_ptr *sector;
5a6ac9ea 2461 void *parity;
46900662 2462
5a6ac9ea
MX
2463 /* first collect one page from each data stripe */
2464 for (stripe = 0; stripe < nr_data; stripe++) {
46900662
QW
2465 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2466 pointers[stripe] = kmap_local_page(sector->page) +
2467 sector->pgoff;
5a6ac9ea
MX
2468 }
2469
c17af965 2470 if (has_qstripe) {
d70cef0d 2471 /* RAID6, call the library function to fill in our P/Q */
46900662 2472 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
5a6ac9ea
MX
2473 pointers);
2474 } else {
2475 /* raid5 */
46900662
QW
2476 memcpy(pointers[nr_data], pointers[0], sectorsize);
2477 run_xor(pointers + 1, nr_data - 1, sectorsize);
5a6ac9ea
MX
2478 }
2479
01327610 2480 /* Check scrubbing parity and repair it */
46900662
QW
2481 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2482 parity = kmap_local_page(sector->page) + sector->pgoff;
2483 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2484 memcpy(parity, pointers[rbio->scrubp], sectorsize);
5a6ac9ea
MX
2485 else
2486 /* Parity is right, needn't writeback */
c67c68eb 2487 bitmap_clear(&rbio->dbitmap, sectornr, 1);
58c1a35c 2488 kunmap_local(parity);
5a6ac9ea 2489
94a0b58d
IW
2490 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2491 kunmap_local(pointers[stripe]);
5a6ac9ea
MX
2492 }
2493
94a0b58d 2494 kunmap_local(pointers[nr_data]);
46900662
QW
2495 __free_page(p_sector.page);
2496 p_sector.page = NULL;
2497 if (q_sector.page) {
94a0b58d 2498 kunmap_local(pointers[rbio->real_stripes - 1]);
46900662
QW
2499 __free_page(q_sector.page);
2500 q_sector.page = NULL;
d70cef0d 2501 }
5a6ac9ea
MX
2502
2503writeback:
2504 /*
2505 * time to start writing. Make bios for everything from the
2506 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2507 * everything else.
2508 */
c67c68eb 2509 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3e77605d 2510 struct sector_ptr *sector;
5a6ac9ea 2511
3e77605d
QW
2512 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2513 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2514 sectornr, rbio->stripe_len, REQ_OP_WRITE);
5a6ac9ea
MX
2515 if (ret)
2516 goto cleanup;
2517 }
2518
76035976
MX
2519 if (!is_replace)
2520 goto submit_write;
2521
3e77605d
QW
2522 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2523 struct sector_ptr *sector;
76035976 2524
3e77605d
QW
2525 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2526 ret = rbio_add_io_sector(rbio, &bio_list, sector,
4c664611 2527 bioc->tgtdev_map[rbio->scrubp],
3e77605d 2528 sectornr, rbio->stripe_len, REQ_OP_WRITE);
76035976
MX
2529 if (ret)
2530 goto cleanup;
2531 }
2532
2533submit_write:
5a6ac9ea
MX
2534 nr_data = bio_list_size(&bio_list);
2535 if (!nr_data) {
2536 /* Every parity is right */
58efbc9f 2537 rbio_orig_end_io(rbio, BLK_STS_OK);
5a6ac9ea
MX
2538 return;
2539 }
2540
2541 atomic_set(&rbio->stripes_pending, nr_data);
2542
bf28a605 2543 while ((bio = bio_list_pop(&bio_list))) {
a6111d11 2544 bio->bi_end_io = raid_write_end_io;
4e49ea4a 2545
b8bea09a
QW
2546 if (trace_raid56_scrub_write_stripe_enabled()) {
2547 struct raid56_bio_trace_info trace_info = { 0 };
2548
2549 bio_get_trace_info(rbio, bio, &trace_info);
2550 trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
2551 }
4e49ea4a 2552 submit_bio(bio);
5a6ac9ea
MX
2553 }
2554 return;
2555
2556cleanup:
58efbc9f 2557 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2558
2559 while ((bio = bio_list_pop(&bio_list)))
2560 bio_put(bio);
5a6ac9ea
MX
2561}
2562
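/*
 * Illustrative userspace sketch, not part of raid56.c or the kernel build:
 * the RAID5 half of the parity check done in finish_parity_scrub() above -
 * regenerate P as the xor of the data blocks and rewrite it only when it
 * does not match what is on disk.  Buffer sizes, names and the test data
 * are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SECTORSIZE 4096

/* Returns 1 if the on-disk parity had to be corrected, 0 if it was right. */
static int check_and_repair_p(uint8_t *data[], int nr_data,
			      uint8_t ondisk_parity[SECTORSIZE])
{
	uint8_t expected[SECTORSIZE];
	int i, d;

	memcpy(expected, data[0], SECTORSIZE);
	for (d = 1; d < nr_data; d++)
		for (i = 0; i < SECTORSIZE; i++)
			expected[i] ^= data[d][i];

	if (memcmp(ondisk_parity, expected, SECTORSIZE) == 0)
		return 0;	/* parity is right, nothing to write back */

	memcpy(ondisk_parity, expected, SECTORSIZE);
	return 1;
}

int main(void)
{
	static uint8_t d0[SECTORSIZE], d1[SECTORSIZE], p[SECTORSIZE];
	uint8_t *data[] = { d0, d1 };

	memset(d0, 0xaa, SECTORSIZE);
	memset(d1, 0x55, SECTORSIZE);
	memset(p, 0x00, SECTORSIZE);	/* stale parity: should be 0xff */

	printf("parity rewritten: %d\n", check_and_repair_p(data, 2, p));
	return 0;
}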
2563static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2564{
2565 if (stripe >= 0 && stripe < rbio->nr_data)
2566 return 1;
2567 return 0;
2568}
2569
2570/*
2571 * While we're doing the parity check and repair, we could have errors
2572 * in reading pages off the disk. This checks for errors and if we're
2573 * not able to read the page it'll trigger parity reconstruction. The
2574 * parity scrub will be finished after we've reconstructed the failed
2575 * stripes
2576 */
2577static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2578{
4c664611 2579 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
5a6ac9ea
MX
2580 goto cleanup;
2581
2582 if (rbio->faila >= 0 || rbio->failb >= 0) {
2583 int dfail = 0, failp = -1;
2584
2585 if (is_data_stripe(rbio, rbio->faila))
2586 dfail++;
2587 else if (is_parity_stripe(rbio->faila))
2588 failp = rbio->faila;
2589
2590 if (is_data_stripe(rbio, rbio->failb))
2591 dfail++;
2592 else if (is_parity_stripe(rbio->failb))
2593 failp = rbio->failb;
2594
2595 /*
 2596 * Because we cannot use the parity that is being scrubbed to
 2597 * repair the data, our repair capability is reduced.
 2598 * (In the case of RAID5, we cannot repair anything.)
2599 */
4c664611 2600 if (dfail > rbio->bioc->max_errors - 1)
5a6ac9ea
MX
2601 goto cleanup;
2602
2603 /*
 2604 * If all the data is good, then only the parity is bad;
 2605 * just repair the parity.
2606 */
2607 if (dfail == 0) {
2608 finish_parity_scrub(rbio, 0);
2609 return;
2610 }
2611
2612 /*
 2613 * Here we have one corrupted data stripe and one corrupted
 2614 * parity on RAID6. If the corrupted parity is the one being
 2615 * scrubbed, we can luckily use the other parity to repair the
 2616 * data; otherwise we cannot repair the data stripe.
2617 */
2618 if (failp != rbio->scrubp)
2619 goto cleanup;
2620
2621 __raid_recover_end_io(rbio);
2622 } else {
2623 finish_parity_scrub(rbio, 1);
2624 }
2625 return;
2626
2627cleanup:
58efbc9f 2628 rbio_orig_end_io(rbio, BLK_STS_IOERR);
5a6ac9ea
MX
2629}
2630
2631/*
2632 * end io for the read phase of the rmw cycle. All the bios here are physical
2633 * stripe bios we've read from the disk so we can recalculate the parity of the
2634 * stripe.
2635 *
2636 * This will usually kick off finish_rmw once all the bios are read in, but it
2637 * may trigger parity reconstruction if we had any errors along the way
2638 */
d34e123d 2639static void raid56_parity_scrub_end_io_work(struct work_struct *work)
5a6ac9ea 2640{
d34e123d
CH
2641 struct btrfs_raid_bio *rbio =
2642 container_of(work, struct btrfs_raid_bio, end_io_work);
5a6ac9ea
MX
2643
2644 /*
d34e123d
CH
2645 * This will normally call finish_rmw to start our write, but if there
2646 * are any failed stripes we'll reconstruct from parity first
5a6ac9ea
MX
2647 */
2648 validate_rbio_for_parity_scrub(rbio);
2649}
2650
2651static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2652{
2653 int bios_to_read = 0;
5a6ac9ea
MX
2654 struct bio_list bio_list;
2655 int ret;
3e77605d 2656 int sectornr;
5a6ac9ea
MX
2657 int stripe;
2658 struct bio *bio;
2659
785884fc
LB
2660 bio_list_init(&bio_list);
2661
5a6ac9ea
MX
2662 ret = alloc_rbio_essential_pages(rbio);
2663 if (ret)
2664 goto cleanup;
2665
5a6ac9ea
MX
2666 atomic_set(&rbio->error, 0);
2667 /*
2668 * build a list of bios to read all the missing parts of this
2669 * stripe
2670 */
2c8cdd6e 2671 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
c67c68eb 2672 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3e77605d 2673 struct sector_ptr *sector;
5a6ac9ea 2674 /*
3e77605d
QW
2675 * We want to find all the sectors missing from the
 2676 * rbio and read them from the disk. If sector_in_rbio()
2677 * finds a sector in the bio list we don't need to read
2678 * it off the stripe.
5a6ac9ea 2679 */
3e77605d
QW
2680 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2681 if (sector)
5a6ac9ea
MX
2682 continue;
2683
3e77605d 2684 sector = rbio_stripe_sector(rbio, stripe, sectornr);
5a6ac9ea 2685 /*
3e77605d
QW
2686 * The bio cache may have handed us an uptodate sector.
2687 * If so, be happy and use it.
5a6ac9ea 2688 */
3e77605d 2689 if (sector->uptodate)
5a6ac9ea
MX
2690 continue;
2691
3e77605d
QW
2692 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2693 stripe, sectornr, rbio->stripe_len,
2694 REQ_OP_READ);
5a6ac9ea
MX
2695 if (ret)
2696 goto cleanup;
2697 }
2698 }
2699
2700 bios_to_read = bio_list_size(&bio_list);
2701 if (!bios_to_read) {
2702 /*
 2703 * This can happen if others have merged with
 2704 * us; it means there is nothing left to read.
2705 * But if there are missing devices it may not be
2706 * safe to do the full stripe write yet.
2707 */
2708 goto finish;
2709 }
2710
2711 /*
4c664611
QW
2712 * The bioc may be freed once we submit the last bio. Make sure not to
2713 * touch it after that.
5a6ac9ea
MX
2714 */
2715 atomic_set(&rbio->stripes_pending, bios_to_read);
d34e123d 2716 INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
bf28a605 2717 while ((bio = bio_list_pop(&bio_list))) {
d34e123d 2718 bio->bi_end_io = raid56_bio_end_io;
5a6ac9ea 2719
b8bea09a
QW
2720 if (trace_raid56_scrub_read_enabled()) {
2721 struct raid56_bio_trace_info trace_info = { 0 };
2722
2723 bio_get_trace_info(rbio, bio, &trace_info);
2724 trace_raid56_scrub_read(rbio, bio, &trace_info);
2725 }
4e49ea4a 2726 submit_bio(bio);
5a6ac9ea
MX
2727 }
2728 /* the actual write will happen once the reads are done */
2729 return;
2730
2731cleanup:
58efbc9f 2732 rbio_orig_end_io(rbio, BLK_STS_IOERR);
785884fc
LB
2733
2734 while ((bio = bio_list_pop(&bio_list)))
2735 bio_put(bio);
2736
5a6ac9ea
MX
2737 return;
2738
2739finish:
2740 validate_rbio_for_parity_scrub(rbio);
2741}
2742
385de0ef 2743static void scrub_parity_work(struct work_struct *work)
5a6ac9ea
MX
2744{
2745 struct btrfs_raid_bio *rbio;
2746
2747 rbio = container_of(work, struct btrfs_raid_bio, work);
2748 raid56_parity_scrub_stripe(rbio);
2749}
2750
5a6ac9ea
MX
2751void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2752{
2753 if (!lock_stripe_add(rbio))
a81b747d 2754 start_async_work(rbio, scrub_parity_work);
5a6ac9ea 2755}
b4ee1782
OS
2756
2757/* The following code is used for dev replace of a missing RAID 5/6 device. */
2758
2759struct btrfs_raid_bio *
6a258d72
QW
2760raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2761 u64 length)
b4ee1782 2762{
6a258d72 2763 struct btrfs_fs_info *fs_info = bioc->fs_info;
b4ee1782
OS
2764 struct btrfs_raid_bio *rbio;
2765
4c664611 2766 rbio = alloc_rbio(fs_info, bioc, length);
b4ee1782
OS
2767 if (IS_ERR(rbio))
2768 return NULL;
2769
2770 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2771 bio_list_add(&rbio->bio_list, bio);
2772 /*
2773 * This is a special bio which is used to hold the completion handler
 2775 * and make the scrub rbio similar to the other types
2775 */
2776 ASSERT(!bio->bi_iter.bi_size);
2777
2778 rbio->faila = find_logical_bio_stripe(rbio, bio);
2779 if (rbio->faila == -1) {
2780 BUG();
2781 kfree(rbio);
2782 return NULL;
2783 }
2784
ae6529c3 2785 /*
4c664611 2786 * When we get bioc, we have already increased bio_counter, record it
ae6529c3
QW
2787 * so we can free it at rbio_orig_end_io()
2788 */
2789 rbio->generic_bio_cnt = 1;
2790
b4ee1782
OS
2791 return rbio;
2792}
2793
b4ee1782
OS
2794void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2795{
2796 if (!lock_stripe_add(rbio))
e66d8d5a 2797 start_async_work(rbio, read_rebuild_work);
b4ee1782 2798}