Merge tag 'pci-v6.16-fixes-3' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci
[linux-block.git] / fs / btrfs / raid56.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
53b381b3
DW
2/*
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
53b381b3 5 */
c1d7c514 6
53b381b3 7#include <linux/sched.h>
53b381b3
DW
8#include <linux/bio.h>
9#include <linux/slab.h>
53b381b3 10#include <linux/blkdev.h>
53b381b3
DW
11#include <linux/raid/pq.h>
12#include <linux/hash.h>
13#include <linux/list_sort.h>
14#include <linux/raid/xor.h>
818e010b 15#include <linux/mm.h>
9b569ea0 16#include "messages.h"
53b381b3 17#include "ctree.h"
53b381b3 18#include "disk-io.h"
53b381b3
DW
19#include "volumes.h"
20#include "raid56.h"
21#include "async-thread.h"
c5a41562 22#include "file-item.h"
7a315072 23#include "btrfs_inode.h"
53b381b3
DW
24
/* Bit numbers used with set_bit()/test_bit() on btrfs_raid_bio::flags. */

/* Once set, no further rbios may be merged into this one. */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * The rbio is still sitting in the stripe hash, but only as a cache of a
 * past RMW.
 */
#define RBIO_CACHE_BIT		2

/* The stripe_pages can be trusted for caching. */
#define RBIO_CACHE_READY_BIT	3

/* Number of cached rbios above which cache_rbio() prunes an entry. */
#define RBIO_CACHE_SIZE		1024

/* The stripe hash table has (1 << BTRFS_STRIPE_HASH_TABLE_BITS) buckets. */
#define BTRFS_STRIPE_HASH_TABLE_BITS	11
42
bbbee460
QW
43static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
44{
45 if (unlikely(!bioc)) {
46 btrfs_crit(fs_info, "bioc=NULL");
47 return;
48 }
49 btrfs_crit(fs_info,
50"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
51 bioc->logical, bioc->full_stripe_logical, bioc->size,
52 bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
53 bioc->replace_stripe_src, bioc->num_stripes);
54 for (int i = 0; i < bioc->num_stripes; i++) {
55 btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
56 i, bioc->stripes[i].dev->devid,
57 bioc->stripes[i].physical);
58 }
59}
60
61static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
62 const struct btrfs_raid_bio *rbio)
63{
64 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
65 return;
66
67 dump_bioc(fs_info, rbio->bioc);
68 btrfs_crit(fs_info,
69"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
70 rbio->flags, rbio->nr_sectors, rbio->nr_data,
71 rbio->real_stripes, rbio->stripe_nsectors,
72 rbio->scrubp, rbio->dbitmap);
73}
74
/*
 * ASSERT() on @expr.  With CONFIG_BTRFS_ASSERT enabled and @expr false, @rbio
 * is dumped first so the assertion failure comes with context.
 */
#define ASSERT_RBIO(expr, rbio)						\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = NULL;		\
									\
		if ((rbio)->bioc)					\
			__fs_info = (rbio)->bioc->fs_info;		\
		btrfs_dump_rbio(__fs_info, (rbio));			\
	}								\
	ASSERT((expr));							\
})
85
/*
 * ASSERT() on @expr.  With CONFIG_BTRFS_ASSERT enabled and @expr false, @rbio
 * and the offending @stripe_nr are logged first.
 */
#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = NULL;		\
									\
		if ((rbio)->bioc)					\
			__fs_info = (rbio)->bioc->fs_info;		\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
	}								\
	ASSERT((expr));							\
})
97
/*
 * ASSERT() on @expr.  With CONFIG_BTRFS_ASSERT enabled and @expr false, @rbio
 * and the offending @sector_nr are logged first.
 */
#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = NULL;		\
									\
		if ((rbio)->bioc)					\
			__fs_info = (rbio)->bioc->fs_info;		\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
	}								\
	ASSERT((expr));							\
})
109
/*
 * ASSERT() on @expr.  With CONFIG_BTRFS_ASSERT enabled and @expr false, @rbio
 * and the offending @logical address are logged first.
 */
#define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = NULL;		\
									\
		if ((rbio)->bioc)					\
			__fs_info = (rbio)->bioc->fs_info;		\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "logical=%llu", (logical));	\
	}								\
	ASSERT((expr));							\
})
121
8a953348
DS
122/* Used by the raid56 code to lock stripes for read/modify/write */
123struct btrfs_stripe_hash {
124 struct list_head hash_list;
125 spinlock_t lock;
126};
127
128/* Used by the raid56 code to lock stripes for read/modify/write */
129struct btrfs_stripe_hash_table {
130 struct list_head stripe_cache;
131 spinlock_t cache_lock;
132 int cache_size;
133 struct btrfs_stripe_hash table[];
134};
135
eb357060 136/*
cd678925
QW
137 * A structure to present a sector inside a page, the length is fixed to
138 * sectorsize;
eb357060
QW
139 */
140struct sector_ptr {
cd678925
QW
141 /*
142 * Blocks from the bio list can still be highmem.
143 * So here we use physical address to present a page and the offset inside it.
144 */
145 phys_addr_t paddr;
146 bool has_paddr;
147 bool uptodate;
eb357060
QW
148};
149
93723095
QW
150static void rmw_rbio_work(struct work_struct *work);
151static void rmw_rbio_work_locked(struct work_struct *work);
53b381b3
DW
152static void index_rbio_pages(struct btrfs_raid_bio *rbio);
153static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
154
486c737f 155static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
6bfd0133 156static void scrub_rbio_work_locked(struct work_struct *work);
5a6ac9ea 157
797d74b7
QW
158static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
159{
2942a50d 160 bitmap_free(rbio->error_bitmap);
797d74b7
QW
161 kfree(rbio->stripe_pages);
162 kfree(rbio->bio_sectors);
163 kfree(rbio->stripe_sectors);
164 kfree(rbio->finish_pointers);
165}
166
ff2b64a2
QW
167static void free_raid_bio(struct btrfs_raid_bio *rbio)
168{
169 int i;
170
171 if (!refcount_dec_and_test(&rbio->refs))
172 return;
173
174 WARN_ON(!list_empty(&rbio->stripe_cache));
175 WARN_ON(!list_empty(&rbio->hash_list));
176 WARN_ON(!bio_list_empty(&rbio->bio_list));
177
178 for (i = 0; i < rbio->nr_pages; i++) {
179 if (rbio->stripe_pages[i]) {
180 __free_page(rbio->stripe_pages[i]);
181 rbio->stripe_pages[i] = NULL;
182 }
183 }
184
185 btrfs_put_bioc(rbio->bioc);
797d74b7 186 free_raid_bio_pointers(rbio);
ff2b64a2
QW
187 kfree(rbio);
188}
189
385de0ef 190static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
ac638859 191{
385de0ef
CH
192 INIT_WORK(&rbio->work, work_func);
193 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
ac638859
DS
194}
195
53b381b3
DW
196/*
197 * the stripe hash table is used for locking, and to collect
198 * bios in hopes of making a full stripe
199 */
200int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
201{
202 struct btrfs_stripe_hash_table *table;
203 struct btrfs_stripe_hash_table *x;
204 struct btrfs_stripe_hash *cur;
205 struct btrfs_stripe_hash *h;
05a6ec86 206 unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
53b381b3
DW
207
208 if (info->stripe_hash_table)
209 return 0;
210
83c8266a
DS
211 /*
212 * The table is large, starting with order 4 and can go as high as
213 * order 7 in case lock debugging is turned on.
214 *
215 * Try harder to allocate and fallback to vmalloc to lower the chance
216 * of a failing mount.
217 */
ee787f95 218 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
818e010b
DS
219 if (!table)
220 return -ENOMEM;
53b381b3 221
4ae10b3a
CM
222 spin_lock_init(&table->cache_lock);
223 INIT_LIST_HEAD(&table->stripe_cache);
224
53b381b3
DW
225 h = table->table;
226
05a6ec86 227 for (unsigned int i = 0; i < num_entries; i++) {
53b381b3
DW
228 cur = h + i;
229 INIT_LIST_HEAD(&cur->hash_list);
230 spin_lock_init(&cur->lock);
53b381b3
DW
231 }
232
233 x = cmpxchg(&info->stripe_hash_table, NULL, table);
fe3b7bb0 234 kvfree(x);
53b381b3
DW
235 return 0;
236}
237
cd678925
QW
238static void memcpy_sectors(const struct sector_ptr *dst,
239 const struct sector_ptr *src, u32 blocksize)
240{
241 memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr),
242 phys_to_page(src->paddr), offset_in_page(src->paddr),
243 blocksize);
244}
245
4ae10b3a
CM
246/*
247 * caching an rbio means to copy anything from the
ac26df8b 248 * bio_sectors array into the stripe_pages array. We
4ae10b3a
CM
249 * use the page uptodate bit in the stripe cache array
250 * to indicate if it has valid data
251 *
252 * once the caching is done, we set the cache ready
253 * bit.
254 */
255static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
256{
257 int i;
4ae10b3a
CM
258 int ret;
259
260 ret = alloc_rbio_pages(rbio);
261 if (ret)
262 return;
263
00425dd9
QW
264 for (i = 0; i < rbio->nr_sectors; i++) {
265 /* Some range not covered by bio (partial write), skip it */
cd678925 266 if (!rbio->bio_sectors[i].has_paddr) {
88074c8b
QW
267 /*
268 * Even if the sector is not covered by bio, if it is
269 * a data sector it should still be uptodate as it is
270 * read from disk.
271 */
272 if (i < rbio->nr_data * rbio->stripe_nsectors)
273 ASSERT(rbio->stripe_sectors[i].uptodate);
00425dd9 274 continue;
88074c8b 275 }
00425dd9 276
cd678925
QW
277 memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i],
278 rbio->bioc->fs_info->sectorsize);
00425dd9
QW
279 rbio->stripe_sectors[i].uptodate = 1;
280 }
4ae10b3a
CM
281 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
282}
283
53b381b3
DW
284/*
285 * we hash on the first logical address of the stripe
286 */
287static int rbio_bucket(struct btrfs_raid_bio *rbio)
288{
18d758a2 289 u64 num = rbio->bioc->full_stripe_logical;
53b381b3
DW
290
291 /*
292 * we shift down quite a bit. We're using byte
293 * addressing, and most of the lower bits are zeros.
294 * This tends to upset hash_64, and it consistently
295 * returns just one or two different values.
296 *
297 * shifting off the lower bits fixes things.
298 */
299 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
300}
301
d4e28d9b
QW
302static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
303 unsigned int page_nr)
304{
305 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
306 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
307 int i;
308
309 ASSERT(page_nr < rbio->nr_pages);
310
311 for (i = sectors_per_page * page_nr;
312 i < sectors_per_page * page_nr + sectors_per_page;
313 i++) {
314 if (!rbio->stripe_sectors[i].uptodate)
315 return false;
316 }
317 return true;
318}
319
eb357060
QW
320/*
321 * Update the stripe_sectors[] array to use correct page and pgoff
322 *
323 * Should be called every time any page pointer in stripes_pages[] got modified.
324 */
325static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
326{
327 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
328 u32 offset;
329 int i;
330
331 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
332 int page_index = offset >> PAGE_SHIFT;
333
334 ASSERT(page_index < rbio->nr_pages);
cd678925
QW
335 if (!rbio->stripe_pages[page_index])
336 continue;
337
338 rbio->stripe_sectors[i].has_paddr = true;
339 rbio->stripe_sectors[i].paddr =
340 page_to_phys(rbio->stripe_pages[page_index]) +
341 offset_in_page(offset);
eb357060
QW
342 }
343}
344
4d100466
QW
345static void steal_rbio_page(struct btrfs_raid_bio *src,
346 struct btrfs_raid_bio *dest, int page_nr)
347{
348 const u32 sectorsize = src->bioc->fs_info->sectorsize;
349 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
350 int i;
351
352 if (dest->stripe_pages[page_nr])
353 __free_page(dest->stripe_pages[page_nr]);
354 dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
355 src->stripe_pages[page_nr] = NULL;
356
357 /* Also update the sector->uptodate bits. */
358 for (i = sectors_per_page * page_nr;
359 i < sectors_per_page * page_nr + sectors_per_page; i++)
360 dest->stripe_sectors[i].uptodate = true;
361}
362
88074c8b
QW
363static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
364{
365 const int sector_nr = (page_nr << PAGE_SHIFT) >>
366 rbio->bioc->fs_info->sectorsize_bits;
367
368 /*
369 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
370 * we won't have a page which is half data half parity.
371 *
372 * Thus if the first sector of the page belongs to data stripes, then
373 * the full page belongs to data stripes.
374 */
375 return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
376}
377
4ae10b3a 378/*
d4e28d9b
QW
379 * Stealing an rbio means taking all the uptodate pages from the stripe array
380 * in the source rbio and putting them into the destination rbio.
381 *
382 * This will also update the involved stripe_sectors[] which are referring to
383 * the old pages.
4ae10b3a
CM
384 */
385static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
386{
387 int i;
4ae10b3a
CM
388
389 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
390 return;
391
392 for (i = 0; i < dest->nr_pages; i++) {
88074c8b
QW
393 struct page *p = src->stripe_pages[i];
394
395 /*
396 * We don't need to steal P/Q pages as they will always be
397 * regenerated for RMW or full write anyway.
398 */
399 if (!is_data_stripe_page(src, i))
4ae10b3a 400 continue;
4ae10b3a 401
88074c8b
QW
402 /*
403 * If @src already has RBIO_CACHE_READY_BIT, it should have
404 * all data stripe pages present and uptodate.
405 */
406 ASSERT(p);
407 ASSERT(full_page_sectors_uptodate(src, i));
4d100466 408 steal_rbio_page(src, dest, i);
4ae10b3a 409 }
eb357060
QW
410 index_stripe_sectors(dest);
411 index_stripe_sectors(src);
4ae10b3a
CM
412}
413
53b381b3
DW
414/*
415 * merging means we take the bio_list from the victim and
416 * splice it into the destination. The victim should
417 * be discarded afterwards.
418 *
419 * must be called with dest->rbio_list_lock held
420 */
421static void merge_rbio(struct btrfs_raid_bio *dest,
422 struct btrfs_raid_bio *victim)
423{
fa1af65b 424 bio_list_merge_init(&dest->bio_list, &victim->bio_list);
53b381b3 425 dest->bio_list_bytes += victim->bio_list_bytes;
bd8f7e62
QW
426 /* Also inherit the bitmaps from @victim. */
427 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
428 dest->stripe_nsectors);
53b381b3
DW
429}
430
431/*
4ae10b3a
CM
432 * used to prune items that are in the cache. The caller
433 * must hold the hash table lock.
434 */
435static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
436{
437 int bucket = rbio_bucket(rbio);
438 struct btrfs_stripe_hash_table *table;
439 struct btrfs_stripe_hash *h;
440 int freeit = 0;
441
442 /*
443 * check the bit again under the hash table lock.
444 */
445 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
446 return;
447
6a258d72 448 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a
CM
449 h = table->table + bucket;
450
451 /* hold the lock for the bucket because we may be
452 * removing it from the hash table
453 */
454 spin_lock(&h->lock);
455
456 /*
457 * hold the lock for the bio list because we need
458 * to make sure the bio list is empty
459 */
460 spin_lock(&rbio->bio_list_lock);
461
462 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
463 list_del_init(&rbio->stripe_cache);
464 table->cache_size -= 1;
465 freeit = 1;
466
467 /* if the bio list isn't empty, this rbio is
468 * still involved in an IO. We take it out
469 * of the cache list, and drop the ref that
470 * was held for the list.
471 *
472 * If the bio_list was empty, we also remove
473 * the rbio from the hash_table, and drop
474 * the corresponding ref
475 */
476 if (bio_list_empty(&rbio->bio_list)) {
477 if (!list_empty(&rbio->hash_list)) {
478 list_del_init(&rbio->hash_list);
dec95574 479 refcount_dec(&rbio->refs);
4ae10b3a
CM
480 BUG_ON(!list_empty(&rbio->plug_list));
481 }
482 }
483 }
484
485 spin_unlock(&rbio->bio_list_lock);
486 spin_unlock(&h->lock);
487
488 if (freeit)
ff2b64a2 489 free_raid_bio(rbio);
4ae10b3a
CM
490}
491
492/*
493 * prune a given rbio from the cache
494 */
495static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
496{
497 struct btrfs_stripe_hash_table *table;
4ae10b3a
CM
498
499 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
500 return;
501
6a258d72 502 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a 503
74cc3600 504 spin_lock(&table->cache_lock);
4ae10b3a 505 __remove_rbio_from_cache(rbio);
74cc3600 506 spin_unlock(&table->cache_lock);
4ae10b3a
CM
507}
508
509/*
510 * remove everything in the cache
511 */
48a3b636 512static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
4ae10b3a
CM
513{
514 struct btrfs_stripe_hash_table *table;
4ae10b3a
CM
515 struct btrfs_raid_bio *rbio;
516
517 table = info->stripe_hash_table;
518
74cc3600 519 spin_lock(&table->cache_lock);
4ae10b3a 520 while (!list_empty(&table->stripe_cache)) {
2d44a15a
DS
521 rbio = list_first_entry(&table->stripe_cache,
522 struct btrfs_raid_bio, stripe_cache);
4ae10b3a
CM
523 __remove_rbio_from_cache(rbio);
524 }
74cc3600 525 spin_unlock(&table->cache_lock);
4ae10b3a
CM
526}
527
528/*
529 * remove all cached entries and free the hash table
530 * used by unmount
53b381b3
DW
531 */
532void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
533{
534 if (!info->stripe_hash_table)
535 return;
4ae10b3a 536 btrfs_clear_rbio_cache(info);
f749303b 537 kvfree(info->stripe_hash_table);
53b381b3
DW
538 info->stripe_hash_table = NULL;
539}
540
4ae10b3a
CM
541/*
542 * insert an rbio into the stripe cache. It
543 * must have already been prepared by calling
544 * cache_rbio_pages
545 *
546 * If this rbio was already cached, it gets
547 * moved to the front of the lru.
548 *
549 * If the size of the rbio cache is too big, we
550 * prune an item.
551 */
552static void cache_rbio(struct btrfs_raid_bio *rbio)
553{
554 struct btrfs_stripe_hash_table *table;
4ae10b3a
CM
555
556 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
557 return;
558
6a258d72 559 table = rbio->bioc->fs_info->stripe_hash_table;
4ae10b3a 560
74cc3600 561 spin_lock(&table->cache_lock);
4ae10b3a
CM
562 spin_lock(&rbio->bio_list_lock);
563
564 /* bump our ref if we were not in the list before */
565 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
dec95574 566 refcount_inc(&rbio->refs);
4ae10b3a
CM
567
568 if (!list_empty(&rbio->stripe_cache)){
569 list_move(&rbio->stripe_cache, &table->stripe_cache);
570 } else {
571 list_add(&rbio->stripe_cache, &table->stripe_cache);
572 table->cache_size += 1;
573 }
574
575 spin_unlock(&rbio->bio_list_lock);
576
577 if (table->cache_size > RBIO_CACHE_SIZE) {
578 struct btrfs_raid_bio *found;
579
c5d12d5b
FM
580 found = list_last_entry(&table->stripe_cache,
581 struct btrfs_raid_bio,
582 stripe_cache);
4ae10b3a
CM
583
584 if (found != rbio)
585 __remove_rbio_from_cache(found);
586 }
587
74cc3600 588 spin_unlock(&table->cache_lock);
4ae10b3a
CM
589}
590
53b381b3
DW
591/*
592 * helper function to run the xor_blocks api. It is only
593 * able to do MAX_XOR_BLOCKS at a time, so we need to
594 * loop through.
595 */
596static void run_xor(void **pages, int src_cnt, ssize_t len)
597{
598 int src_off = 0;
599 int xor_src_cnt = 0;
600 void *dest = pages[src_cnt];
601
602 while(src_cnt > 0) {
603 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
604 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
605
606 src_cnt -= xor_src_cnt;
607 src_off += xor_src_cnt;
608 }
609}
610
611/*
176571a1
DS
612 * Returns true if the bio list inside this rbio covers an entire stripe (no
613 * rmw required).
53b381b3 614 */
176571a1 615static int rbio_is_full(struct btrfs_raid_bio *rbio)
53b381b3
DW
616{
617 unsigned long size = rbio->bio_list_bytes;
618 int ret = 1;
619
74cc3600 620 spin_lock(&rbio->bio_list_lock);
ff18a4af 621 if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
53b381b3 622 ret = 0;
ff18a4af 623 BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
74cc3600 624 spin_unlock(&rbio->bio_list_lock);
176571a1 625
53b381b3
DW
626 return ret;
627}
628
629/*
630 * returns 1 if it is safe to merge two rbios together.
631 * The merging is safe if the two rbios correspond to
632 * the same stripe and if they are both going in the same
633 * direction (read vs write), and if neither one is
634 * locked for final IO
635 *
636 * The caller is responsible for locking such that
637 * rmw_locked is safe to test
638 */
639static int rbio_can_merge(struct btrfs_raid_bio *last,
640 struct btrfs_raid_bio *cur)
641{
642 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
643 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
644 return 0;
645
4ae10b3a
CM
646 /*
647 * we can't merge with cached rbios, since the
648 * idea is that when we merge the destination
649 * rbio is going to run our IO for us. We can
01327610 650 * steal from cached rbios though, other functions
4ae10b3a
CM
651 * handle that.
652 */
653 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
654 test_bit(RBIO_CACHE_BIT, &cur->flags))
655 return 0;
656
18d758a2 657 if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
53b381b3
DW
658 return 0;
659
5a6ac9ea
MX
660 /* we can't merge with different operations */
661 if (last->operation != cur->operation)
662 return 0;
663 /*
664 * We've need read the full stripe from the drive.
665 * check and repair the parity and write the new results.
666 *
667 * We're not allowed to add any new bios to the
668 * bio list here, anyone else that wants to
669 * change this stripe needs to do their own rmw.
670 */
db34be19 671 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
53b381b3 672 return 0;
53b381b3 673
3a3c7a7f 674 if (last->operation == BTRFS_RBIO_READ_REBUILD)
b4ee1782
OS
675 return 0;
676
53b381b3
DW
677 return 1;
678}
679
3e77605d
QW
680static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
681 unsigned int stripe_nr,
682 unsigned int sector_nr)
683{
bbbee460
QW
684 ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
685 ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
3e77605d
QW
686
687 return stripe_nr * rbio->stripe_nsectors + sector_nr;
688}
689
690/* Return a sector from rbio->stripe_sectors, not from the bio list */
691static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
692 unsigned int stripe_nr,
693 unsigned int sector_nr)
694{
695 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
696 sector_nr)];
697}
698
1145059a
QW
699/* Grab a sector inside P stripe */
700static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
701 unsigned int sector_nr)
b7178a5f 702{
1145059a 703 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
b7178a5f
ZL
704}
705
1145059a
QW
706/* Grab a sector inside Q stripe, return NULL if not RAID6 */
707static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
708 unsigned int sector_nr)
53b381b3 709{
1145059a
QW
710 if (rbio->nr_data + 1 == rbio->real_stripes)
711 return NULL;
712 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
53b381b3
DW
713}
714
53b381b3
DW
715/*
716 * The first stripe in the table for a logical address
717 * has the lock. rbios are added in one of three ways:
718 *
719 * 1) Nobody has the stripe locked yet. The rbio is given
720 * the lock and 0 is returned. The caller must start the IO
721 * themselves.
722 *
723 * 2) Someone has the stripe locked, but we're able to merge
724 * with the lock owner. The rbio is freed and the IO will
725 * start automatically along with the existing rbio. 1 is returned.
726 *
727 * 3) Someone has the stripe locked, but we're not able to merge.
728 * The rbio is added to the lock owner's plug list, or merged into
729 * an rbio already on the plug list. When the lock owner unlocks,
730 * the next rbio on the list is run and the IO is started automatically.
731 * 1 is returned
732 *
733 * If we return 0, the caller still owns the rbio and must continue with
734 * IO submission. If we return 1, the caller must assume the rbio has
735 * already been freed.
736 */
737static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
738{
721860d5 739 struct btrfs_stripe_hash *h;
53b381b3
DW
740 struct btrfs_raid_bio *cur;
741 struct btrfs_raid_bio *pending;
53b381b3 742 struct btrfs_raid_bio *freeit = NULL;
4ae10b3a 743 struct btrfs_raid_bio *cache_drop = NULL;
53b381b3 744 int ret = 0;
53b381b3 745
6a258d72 746 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
721860d5 747
74cc3600 748 spin_lock(&h->lock);
53b381b3 749 list_for_each_entry(cur, &h->hash_list, hash_list) {
18d758a2 750 if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
9d6cb1b0 751 continue;
4ae10b3a 752
9d6cb1b0 753 spin_lock(&cur->bio_list_lock);
4ae10b3a 754
9d6cb1b0
JT
755 /* Can we steal this cached rbio's pages? */
756 if (bio_list_empty(&cur->bio_list) &&
757 list_empty(&cur->plug_list) &&
758 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
759 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
760 list_del_init(&cur->hash_list);
761 refcount_dec(&cur->refs);
53b381b3 762
9d6cb1b0
JT
763 steal_rbio(cur, rbio);
764 cache_drop = cur;
765 spin_unlock(&cur->bio_list_lock);
4ae10b3a 766
9d6cb1b0
JT
767 goto lockit;
768 }
53b381b3 769
9d6cb1b0
JT
770 /* Can we merge into the lock owner? */
771 if (rbio_can_merge(cur, rbio)) {
772 merge_rbio(cur, rbio);
53b381b3 773 spin_unlock(&cur->bio_list_lock);
9d6cb1b0 774 freeit = rbio;
53b381b3
DW
775 ret = 1;
776 goto out;
777 }
9d6cb1b0
JT
778
779
780 /*
781 * We couldn't merge with the running rbio, see if we can merge
782 * with the pending ones. We don't have to check for rmw_locked
783 * because there is no way they are inside finish_rmw right now
784 */
785 list_for_each_entry(pending, &cur->plug_list, plug_list) {
786 if (rbio_can_merge(pending, rbio)) {
787 merge_rbio(pending, rbio);
788 spin_unlock(&cur->bio_list_lock);
789 freeit = rbio;
790 ret = 1;
791 goto out;
792 }
793 }
794
795 /*
796 * No merging, put us on the tail of the plug list, our rbio
797 * will be started with the currently running rbio unlocks
798 */
799 list_add_tail(&rbio->plug_list, &cur->plug_list);
800 spin_unlock(&cur->bio_list_lock);
801 ret = 1;
802 goto out;
53b381b3 803 }
4ae10b3a 804lockit:
dec95574 805 refcount_inc(&rbio->refs);
53b381b3
DW
806 list_add(&rbio->hash_list, &h->hash_list);
807out:
74cc3600 808 spin_unlock(&h->lock);
4ae10b3a
CM
809 if (cache_drop)
810 remove_rbio_from_cache(cache_drop);
53b381b3 811 if (freeit)
ff2b64a2 812 free_raid_bio(freeit);
53b381b3
DW
813 return ret;
814}
815
d817ce35
QW
816static void recover_rbio_work_locked(struct work_struct *work);
817
53b381b3
DW
818/*
819 * called as rmw or parity rebuild is completed. If the plug list has more
820 * rbios waiting for this stripe, the next one on the list will be started
821 */
822static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
823{
824 int bucket;
825 struct btrfs_stripe_hash *h;
4ae10b3a 826 int keep_cache = 0;
53b381b3
DW
827
828 bucket = rbio_bucket(rbio);
6a258d72 829 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
53b381b3 830
4ae10b3a
CM
831 if (list_empty(&rbio->plug_list))
832 cache_rbio(rbio);
833
74cc3600 834 spin_lock(&h->lock);
53b381b3
DW
835 spin_lock(&rbio->bio_list_lock);
836
837 if (!list_empty(&rbio->hash_list)) {
4ae10b3a
CM
838 /*
839 * if we're still cached and there is no other IO
840 * to perform, just leave this rbio here for others
841 * to steal from later
842 */
843 if (list_empty(&rbio->plug_list) &&
844 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
845 keep_cache = 1;
846 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
847 BUG_ON(!bio_list_empty(&rbio->bio_list));
848 goto done;
849 }
53b381b3
DW
850
851 list_del_init(&rbio->hash_list);
dec95574 852 refcount_dec(&rbio->refs);
53b381b3
DW
853
854 /*
855 * we use the plug list to hold all the rbios
856 * waiting for the chance to lock this stripe.
857 * hand the lock over to one of them.
858 */
859 if (!list_empty(&rbio->plug_list)) {
860 struct btrfs_raid_bio *next;
861 struct list_head *head = rbio->plug_list.next;
862
863 next = list_entry(head, struct btrfs_raid_bio,
864 plug_list);
865
866 list_del_init(&rbio->plug_list);
867
868 list_add(&next->hash_list, &h->hash_list);
dec95574 869 refcount_inc(&next->refs);
53b381b3 870 spin_unlock(&rbio->bio_list_lock);
74cc3600 871 spin_unlock(&h->lock);
53b381b3 872
3a3c7a7f 873 if (next->operation == BTRFS_RBIO_READ_REBUILD) {
d817ce35 874 start_async_work(next, recover_rbio_work_locked);
b4ee1782 875 } else if (next->operation == BTRFS_RBIO_WRITE) {
4ae10b3a 876 steal_rbio(rbio, next);
93723095 877 start_async_work(next, rmw_rbio_work_locked);
5a6ac9ea
MX
878 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
879 steal_rbio(rbio, next);
6bfd0133 880 start_async_work(next, scrub_rbio_work_locked);
4ae10b3a 881 }
53b381b3
DW
882
883 goto done_nolock;
53b381b3
DW
884 }
885 }
4ae10b3a 886done:
53b381b3 887 spin_unlock(&rbio->bio_list_lock);
74cc3600 888 spin_unlock(&h->lock);
53b381b3
DW
889
890done_nolock:
4ae10b3a
CM
891 if (!keep_cache)
892 remove_rbio_from_cache(rbio);
53b381b3
DW
893}
894
c779b798 895static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
53b381b3 896{
7583d8d0
LB
897 struct bio *next;
898
899 while (cur) {
900 next = cur->bi_next;
901 cur->bi_next = NULL;
c779b798 902 cur->bi_status = status;
7583d8d0
LB
903 bio_endio(cur);
904 cur = next;
905 }
53b381b3
DW
906}
907
908/*
909 * this frees the rbio and runs through all the bios in the
910 * bio_list and calls end_io on them
911 */
c779b798 912static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
53b381b3
DW
913{
914 struct bio *cur = bio_list_get(&rbio->bio_list);
7583d8d0 915 struct bio *extra;
4245215d 916
c5a41562
QW
917 kfree(rbio->csum_buf);
918 bitmap_free(rbio->csum_bitmap);
919 rbio->csum_buf = NULL;
920 rbio->csum_bitmap = NULL;
921
bd8f7e62
QW
922 /*
923 * Clear the data bitmap, as the rbio may be cached for later usage.
924 * do this before before unlock_stripe() so there will be no new bio
925 * for this bio.
926 */
927 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
4245215d 928
7583d8d0
LB
929 /*
930 * At this moment, rbio->bio_list is empty, however since rbio does not
931 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
932 * hash list, rbio may be merged with others so that rbio->bio_list
933 * becomes non-empty.
934 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
935 * more and we can call bio_endio() on all queued bios.
936 */
937 unlock_stripe(rbio);
938 extra = bio_list_get(&rbio->bio_list);
ff2b64a2 939 free_raid_bio(rbio);
53b381b3 940
c779b798 941 rbio_endio_bio_list(cur, status);
7583d8d0 942 if (extra)
c779b798 943 rbio_endio_bio_list(extra, status);
53b381b3
DW
944}
945
43dd529a
DS
946/*
947 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
3e77605d
QW
948 *
949 * @rbio: The raid bio
950 * @stripe_nr: Stripe number, valid range [0, real_stripe)
951 * @sector_nr: Sector number inside the stripe,
952 * valid range [0, stripe_nsectors)
953 * @bio_list_only: Whether to use sectors inside the bio list only.
954 *
955 * The read/modify/write code wants to reuse the original bio page as much
956 * as possible, and only use stripe_sectors as fallback.
957 */
958static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
959 int stripe_nr, int sector_nr,
960 bool bio_list_only)
961{
962 struct sector_ptr *sector;
963 int index;
964
bbbee460
QW
965 ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
966 rbio, stripe_nr);
967 ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
968 rbio, sector_nr);
3e77605d
QW
969
970 index = stripe_nr * rbio->stripe_nsectors + sector_nr;
971 ASSERT(index >= 0 && index < rbio->nr_sectors);
972
74cc3600 973 spin_lock(&rbio->bio_list_lock);
3e77605d 974 sector = &rbio->bio_sectors[index];
cd678925 975 if (sector->has_paddr || bio_list_only) {
3e77605d 976 /* Don't return sector without a valid page pointer */
cd678925 977 if (!sector->has_paddr)
3e77605d 978 sector = NULL;
74cc3600 979 spin_unlock(&rbio->bio_list_lock);
3e77605d
QW
980 return sector;
981 }
74cc3600 982 spin_unlock(&rbio->bio_list_lock);
3e77605d
QW
983
984 return &rbio->stripe_sectors[index];
985}
986
53b381b3
DW
987/*
988 * allocation and initial setup for the btrfs_raid_bio. Not
989 * this does not allocate any pages for rbio->pages.
990 */
2ff7e61e 991static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
ff18a4af 992 struct btrfs_io_context *bioc)
53b381b3 993{
1faf3885 994 const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
ff18a4af 995 const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
843de58b 996 const unsigned int num_pages = stripe_npages * real_stripes;
ff18a4af
CH
997 const unsigned int stripe_nsectors =
998 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
94efbe19 999 const unsigned int num_sectors = stripe_nsectors * real_stripes;
53b381b3 1000 struct btrfs_raid_bio *rbio;
53b381b3 1001
94efbe19
QW
1002 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1003 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
c67c68eb
QW
1004 /*
1005 * Our current stripe len should be fixed to 64k thus stripe_nsectors
1006 * (at most 16) should be no larger than BITS_PER_LONG.
1007 */
1008 ASSERT(stripe_nsectors <= BITS_PER_LONG);
843de58b 1009
b2324e08
QW
1010 /*
1011 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
1012 * (limited by u8).
1013 */
1014 ASSERT(real_stripes >= 2);
1015 ASSERT(real_stripes <= U8_MAX);
1016
797d74b7 1017 rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
af8e2d1d 1018 if (!rbio)
53b381b3 1019 return ERR_PTR(-ENOMEM);
797d74b7
QW
1020 rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
1021 GFP_NOFS);
1022 rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1023 GFP_NOFS);
1024 rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1025 GFP_NOFS);
1026 rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
2942a50d 1027 rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
797d74b7
QW
1028
1029 if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
2942a50d 1030 !rbio->finish_pointers || !rbio->error_bitmap) {
797d74b7
QW
1031 free_raid_bio_pointers(rbio);
1032 kfree(rbio);
1033 return ERR_PTR(-ENOMEM);
1034 }
53b381b3
DW
1035
1036 bio_list_init(&rbio->bio_list);
d817ce35 1037 init_waitqueue_head(&rbio->io_wait);
53b381b3
DW
1038 INIT_LIST_HEAD(&rbio->plug_list);
1039 spin_lock_init(&rbio->bio_list_lock);
4ae10b3a 1040 INIT_LIST_HEAD(&rbio->stripe_cache);
53b381b3 1041 INIT_LIST_HEAD(&rbio->hash_list);
f1c29379 1042 btrfs_get_bioc(bioc);
4c664611 1043 rbio->bioc = bioc;
53b381b3 1044 rbio->nr_pages = num_pages;
94efbe19 1045 rbio->nr_sectors = num_sectors;
2c8cdd6e 1046 rbio->real_stripes = real_stripes;
5a6ac9ea 1047 rbio->stripe_npages = stripe_npages;
94efbe19 1048 rbio->stripe_nsectors = stripe_nsectors;
dec95574 1049 refcount_set(&rbio->refs, 1);
b89e1b01 1050 atomic_set(&rbio->stripes_pending, 0);
53b381b3 1051
0b30f719
QW
1052 ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
1053 rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
b2324e08 1054 ASSERT(rbio->nr_data > 0);
53b381b3 1055
53b381b3
DW
1056 return rbio;
1057}
1058
1059/* allocate pages for all the stripes in the bio, including parity */
1060static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1061{
eb357060
QW
1062 int ret;
1063
0fbf6cbd 1064 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
eb357060
QW
1065 if (ret < 0)
1066 return ret;
1067 /* Mapping all sectors */
1068 index_stripe_sectors(rbio);
1069 return 0;
53b381b3
DW
1070}
1071
b7178a5f 1072/* only allocate pages for p/q stripes */
53b381b3
DW
1073static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1074{
f77183dc 1075 const int data_pages = rbio->nr_data * rbio->stripe_npages;
eb357060 1076 int ret;
53b381b3 1077
eb357060 1078 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
0fbf6cbd 1079 rbio->stripe_pages + data_pages, false);
eb357060
QW
1080 if (ret < 0)
1081 return ret;
1082
1083 index_stripe_sectors(rbio);
1084 return 0;
53b381b3
DW
1085}
1086
75b47033 1087/*
67da05b3 1088 * Return the total number of errors found in the vertical stripe of @sector_nr.
75b47033
QW
1089 *
1090 * @faila and @failb will also be updated to the first and second stripe
1091 * number of the errors.
1092 */
1093static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1094 int *faila, int *failb)
1095{
1096 int stripe_nr;
1097 int found_errors = 0;
1098
ad3daf1c
QW
1099 if (faila || failb) {
1100 /*
1101 * Both @faila and @failb should be valid pointers if any of
1102 * them is specified.
1103 */
1104 ASSERT(faila && failb);
1105 *faila = -1;
1106 *failb = -1;
1107 }
75b47033
QW
1108
1109 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1110 int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1111
1112 if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1113 found_errors++;
ad3daf1c
QW
1114 if (faila) {
1115 /* Update faila and failb. */
1116 if (*faila < 0)
1117 *faila = stripe_nr;
1118 else if (*failb < 0)
1119 *failb = stripe_nr;
1120 }
75b47033
QW
1121 }
1122 }
1123 return found_errors;
1124}
1125
53b381b3 1126/*
3e77605d
QW
1127 * Add a single sector @sector into our list of bios for IO.
1128 *
1129 * Return 0 if everything went well.
1130 * Return <0 for error.
53b381b3 1131 */
3e77605d
QW
1132static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1133 struct bio_list *bio_list,
1134 struct sector_ptr *sector,
1135 unsigned int stripe_nr,
1136 unsigned int sector_nr,
bf9486d6 1137 enum req_op op)
53b381b3 1138{
3e77605d 1139 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
53b381b3 1140 struct bio *last = bio_list->tail;
53b381b3
DW
1141 int ret;
1142 struct bio *bio;
4c664611 1143 struct btrfs_io_stripe *stripe;
53b381b3
DW
1144 u64 disk_start;
1145
3e77605d
QW
1146 /*
1147 * Note: here stripe_nr has taken device replace into consideration,
1148 * thus it can be larger than rbio->real_stripe.
1149 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1150 */
bbbee460
QW
1151 ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
1152 rbio, stripe_nr);
1153 ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
1154 rbio, sector_nr);
cd678925 1155 ASSERT(sector->has_paddr);
3e77605d 1156
4c664611 1157 stripe = &rbio->bioc->stripes[stripe_nr];
3e77605d 1158 disk_start = stripe->physical + sector_nr * sectorsize;
53b381b3
DW
1159
1160 /* if the device is missing, just fail this stripe */
2942a50d 1161 if (!stripe->dev->bdev) {
ad3daf1c
QW
1162 int found_errors;
1163
2942a50d
QW
1164 set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1165 rbio->error_bitmap);
ad3daf1c
QW
1166
1167 /* Check if we have reached tolerance early. */
1168 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1169 NULL, NULL);
1170 if (found_errors > rbio->bioc->max_errors)
1171 return -EIO;
1172 return 0;
2942a50d 1173 }
53b381b3
DW
1174
1175 /* see if we can add this page onto our existing bio */
1176 if (last) {
adbe7e38 1177 u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
4f024f37 1178 last_end += last->bi_iter.bi_size;
53b381b3
DW
1179
1180 /*
1181 * we can't merge these if they are from different
1182 * devices or if they are not contiguous
1183 */
f90ae76a 1184 if (last_end == disk_start && !last->bi_status &&
309dca30 1185 last->bi_bdev == stripe->dev->bdev) {
cd678925
QW
1186 ret = bio_add_page(last, phys_to_page(sector->paddr),
1187 sectorsize, offset_in_page(sector->paddr));
3e77605d 1188 if (ret == sectorsize)
53b381b3
DW
1189 return 0;
1190 }
1191 }
1192
1193 /* put a new bio on the list */
ff18a4af
CH
1194 bio = bio_alloc(stripe->dev->bdev,
1195 max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
bf9486d6 1196 op, GFP_NOFS);
29e70be2 1197 bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
e01bf588 1198 bio->bi_private = rbio;
53b381b3 1199
cd678925
QW
1200 __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize,
1201 offset_in_page(sector->paddr));
53b381b3
DW
1202 bio_list_add(bio_list, bio);
1203 return 0;
1204}
1205
00425dd9
QW
1206static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1207{
1208 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
6f3f722d
CH
1209 const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits;
1210 struct bvec_iter iter = bio->bi_iter;
00425dd9 1211 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
18d758a2 1212 rbio->bioc->full_stripe_logical;
00425dd9 1213
6f3f722d
CH
1214 while (iter.bi_size) {
1215 unsigned int index = (offset >> sectorsize_bits);
1216 struct sector_ptr *sector = &rbio->bio_sectors[index];
1217 struct bio_vec bv = bio_iter_iovec(bio, iter);
00425dd9 1218
cd678925
QW
1219 sector->has_paddr = true;
1220 sector->paddr = bvec_phys(&bv);
6f3f722d
CH
1221 bio_advance_iter_single(bio, &iter, sectorsize);
1222 offset += sectorsize;
00425dd9
QW
1223 }
1224}
1225
53b381b3
DW
1226/*
1227 * helper function to walk our bio list and populate the bio_pages array with
1228 * the result. This seems expensive, but it is faster than constantly
1229 * searching through the bio list as we setup the IO in finish_rmw or stripe
1230 * reconstruction.
1231 *
1232 * This must be called before you trust the answers from page_in_rbio
1233 */
1234static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1235{
1236 struct bio *bio;
53b381b3 1237
74cc3600 1238 spin_lock(&rbio->bio_list_lock);
00425dd9
QW
1239 bio_list_for_each(bio, &rbio->bio_list)
1240 index_one_bio(rbio, bio);
1241
74cc3600 1242 spin_unlock(&rbio->bio_list_lock);
53b381b3
DW
1243}
1244
b8bea09a
QW
1245static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1246 struct raid56_bio_trace_info *trace_info)
1247{
1248 const struct btrfs_io_context *bioc = rbio->bioc;
1249 int i;
1250
1251 ASSERT(bioc);
1252
1253 /* We rely on bio->bi_bdev to find the stripe number. */
1254 if (!bio->bi_bdev)
1255 goto not_found;
1256
1257 for (i = 0; i < bioc->num_stripes; i++) {
1258 if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1259 continue;
1260 trace_info->stripe_nr = i;
1261 trace_info->devid = bioc->stripes[i].dev->devid;
1262 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1263 bioc->stripes[i].physical;
1264 return;
1265 }
1266
1267not_found:
1268 trace_info->devid = -1;
1269 trace_info->offset = -1;
1270 trace_info->stripe_nr = -1;
1271}
1272
801fcfc5
CH
/* Pop every bio off @bio_list and drop a reference on each of them. */
static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	for (bio = bio_list_pop(bio_list); bio; bio = bio_list_pop(bio_list))
		bio_put(bio);
}
1280
b2324e08
QW
1281static void assert_rbio(struct btrfs_raid_bio *rbio)
1282{
c186345a 1283 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
b2324e08
QW
1284 return;
1285
1286 /*
1287 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1288 * we won't go beyond 256 disks anyway.
1289 */
bbbee460
QW
1290 ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
1291 ASSERT_RBIO(rbio->nr_data > 0, rbio);
b2324e08
QW
1292
1293 /*
1294 * This is another check to make sure nr data stripes is smaller
1295 * than total stripes.
1296 */
bbbee460 1297 ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
b2324e08
QW
1298}
1299
cd678925
QW
1300static inline void *kmap_local_sector(const struct sector_ptr *sector)
1301{
1302 /* The sector pointer must have a page mapped to it. */
1303 ASSERT(sector->has_paddr);
1304
1305 return kmap_local_page(phys_to_page(sector->paddr)) +
1306 offset_in_page(sector->paddr);
1307}
1308
67da05b3 1309/* Generate PQ for one vertical stripe. */
30e3c897
QW
1310static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1311{
1312 void **pointers = rbio->finish_pointers;
1313 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1314 struct sector_ptr *sector;
1315 int stripe;
1316 const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1317
1318 /* First collect one sector from each data stripe */
1319 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1320 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
cd678925 1321 pointers[stripe] = kmap_local_sector(sector);
30e3c897
QW
1322 }
1323
1324 /* Then add the parity stripe */
1325 sector = rbio_pstripe_sector(rbio, sectornr);
1326 sector->uptodate = 1;
cd678925 1327 pointers[stripe++] = kmap_local_sector(sector);
30e3c897
QW
1328
1329 if (has_qstripe) {
1330 /*
1331 * RAID6, add the qstripe and call the library function
1332 * to fill in our p/q
1333 */
1334 sector = rbio_qstripe_sector(rbio, sectornr);
1335 sector->uptodate = 1;
cd678925 1336 pointers[stripe++] = kmap_local_sector(sector);
30e3c897 1337
b2324e08 1338 assert_rbio(rbio);
30e3c897
QW
1339 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1340 pointers);
1341 } else {
1342 /* raid5 */
1343 memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1344 run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1345 }
1346 for (stripe = stripe - 1; stripe >= 0; stripe--)
1347 kunmap_local(pointers[stripe]);
1348}
1349
6486d21c
QW
1350static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1351 struct bio_list *bio_list)
53b381b3 1352{
36920044
QW
1353 /* The total sector number inside the full stripe. */
1354 int total_sector_nr;
3e77605d 1355 int sectornr;
6486d21c 1356 int stripe;
53b381b3
DW
1357 int ret;
1358
6486d21c 1359 ASSERT(bio_list_size(bio_list) == 0);
53b381b3 1360
bd8f7e62
QW
1361 /* We should have at least one data sector. */
1362 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1363
5eb30ee2
QW
1364 /*
1365 * Reset errors, as we may have errors inherited from from degraded
1366 * write.
1367 */
2942a50d 1368 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
5eb30ee2 1369
53b381b3 1370 /*
6486d21c 1371 * Start assembly. Make bios for everything from the higher layers (the
36920044 1372 * bio_list in our rbio) and our P/Q. Ignore everything else.
53b381b3 1373 */
36920044
QW
1374 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1375 total_sector_nr++) {
1376 struct sector_ptr *sector;
3e77605d 1377
36920044
QW
1378 stripe = total_sector_nr / rbio->stripe_nsectors;
1379 sectornr = total_sector_nr % rbio->stripe_nsectors;
53b381b3 1380
36920044
QW
1381 /* This vertical stripe has no data, skip it. */
1382 if (!test_bit(sectornr, &rbio->dbitmap))
1383 continue;
53b381b3 1384
36920044
QW
1385 if (stripe < rbio->nr_data) {
1386 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1387 if (!sector)
1388 continue;
1389 } else {
1390 sector = rbio_stripe_sector(rbio, stripe, sectornr);
53b381b3 1391 }
36920044 1392
6486d21c 1393 ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
ff18a4af 1394 sectornr, REQ_OP_WRITE);
36920044 1395 if (ret)
6486d21c 1396 goto error;
53b381b3
DW
1397 }
1398
1faf3885 1399 if (likely(!rbio->bioc->replace_nr_stripes))
6486d21c 1400 return 0;
2c8cdd6e 1401
1faf3885
QW
1402 /*
1403 * Make a copy for the replace target device.
1404 *
1405 * Thus the source stripe number (in replace_stripe_src) should be valid.
1406 */
1407 ASSERT(rbio->bioc->replace_stripe_src >= 0);
1408
36920044
QW
1409 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1410 total_sector_nr++) {
1411 struct sector_ptr *sector;
2c8cdd6e 1412
36920044
QW
1413 stripe = total_sector_nr / rbio->stripe_nsectors;
1414 sectornr = total_sector_nr % rbio->stripe_nsectors;
3e77605d 1415
1faf3885
QW
1416 /*
1417 * For RAID56, there is only one device that can be replaced,
1418 * and replace_stripe_src[0] indicates the stripe number we
1419 * need to copy from.
1420 */
1421 if (stripe != rbio->bioc->replace_stripe_src) {
36920044
QW
1422 /*
1423 * We can skip the whole stripe completely, note
1424 * total_sector_nr will be increased by one anyway.
1425 */
1426 ASSERT(sectornr == 0);
1427 total_sector_nr += rbio->stripe_nsectors - 1;
1428 continue;
1429 }
2c8cdd6e 1430
36920044
QW
1431 /* This vertical stripe has no data, skip it. */
1432 if (!test_bit(sectornr, &rbio->dbitmap))
1433 continue;
2c8cdd6e 1434
36920044
QW
1435 if (stripe < rbio->nr_data) {
1436 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1437 if (!sector)
1438 continue;
1439 } else {
1440 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2c8cdd6e 1441 }
36920044 1442
6486d21c 1443 ret = rbio_add_io_sector(rbio, bio_list, sector,
1faf3885 1444 rbio->real_stripes,
ff18a4af 1445 sectornr, REQ_OP_WRITE);
36920044 1446 if (ret)
6486d21c 1447 goto error;
2c8cdd6e
MX
1448 }
1449
6486d21c
QW
1450 return 0;
1451error:
801fcfc5 1452 bio_list_put(bio_list);
6486d21c
QW
1453 return -EIO;
1454}
1455
2942a50d
QW
1456static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1457{
1458 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1459 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
18d758a2 1460 rbio->bioc->full_stripe_logical;
2942a50d
QW
1461 int total_nr_sector = offset >> fs_info->sectorsize_bits;
1462
1463 ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1464
1465 bitmap_set(rbio->error_bitmap, total_nr_sector,
1466 bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1467
1468 /*
1469 * Special handling for raid56_alloc_missing_rbio() used by
1470 * scrub/replace. Unlike call path in raid56_parity_recover(), they
1471 * pass an empty bio here. Thus we have to find out the missing device
1472 * and mark the stripe error instead.
1473 */
1474 if (bio->bi_iter.bi_size == 0) {
1475 bool found_missing = false;
1476 int stripe_nr;
1477
1478 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1479 if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1480 found_missing = true;
1481 bitmap_set(rbio->error_bitmap,
1482 stripe_nr * rbio->stripe_nsectors,
1483 rbio->stripe_nsectors);
1484 }
1485 }
1486 ASSERT(found_missing);
1487 }
1488}
1489
5fdb7afc 1490/*
67da05b3 1491 * For subpage case, we can no longer set page Up-to-date directly for
5fdb7afc
QW
1492 * stripe_pages[], thus we need to locate the sector.
1493 */
1494static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
cd678925 1495 phys_addr_t paddr)
5fdb7afc
QW
1496{
1497 int i;
1498
1499 for (i = 0; i < rbio->nr_sectors; i++) {
1500 struct sector_ptr *sector = &rbio->stripe_sectors[i];
1501
cd678925 1502 if (sector->has_paddr && sector->paddr == paddr)
5fdb7afc
QW
1503 return sector;
1504 }
1505 return NULL;
1506}
1507
53b381b3
DW
1508/*
1509 * this sets each page in the bio uptodate. It should only be used on private
1510 * rbio pages, nothing that comes in from the higher layers
1511 */
5fdb7afc 1512static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
53b381b3 1513{
5fdb7afc 1514 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
0198e5b7 1515 struct bio_vec *bvec;
6dc4f100 1516 struct bvec_iter_all iter_all;
6592e58c 1517
0198e5b7 1518 ASSERT(!bio_flagged(bio, BIO_CLONED));
53b381b3 1519
5fdb7afc
QW
1520 bio_for_each_segment_all(bvec, bio, iter_all) {
1521 struct sector_ptr *sector;
cd678925 1522 phys_addr_t paddr = bvec_phys(bvec);
5fdb7afc 1523
cd678925
QW
1524 for (u32 off = 0; off < bvec->bv_len; off += sectorsize) {
1525 sector = find_stripe_sector(rbio, paddr + off);
5fdb7afc
QW
1526 ASSERT(sector);
1527 if (sector)
1528 sector->uptodate = 1;
1529 }
1530 }
53b381b3
DW
1531}
1532
2942a50d
QW
1533static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1534{
cd678925 1535 phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
2942a50d
QW
1536 int i;
1537
1538 for (i = 0; i < rbio->nr_sectors; i++) {
cd678925 1539 if (rbio->stripe_sectors[i].paddr == bvec_paddr)
2942a50d 1540 break;
cd678925
QW
1541 if (rbio->bio_sectors[i].has_paddr &&
1542 rbio->bio_sectors[i].paddr == bvec_paddr)
2942a50d
QW
1543 break;
1544 }
1545 ASSERT(i < rbio->nr_sectors);
1546 return i;
1547}
1548
1549static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1550{
1551 int total_sector_nr = get_bio_sector_nr(rbio, bio);
1552 u32 bio_size = 0;
1553 struct bio_vec *bvec;
a9ad4d87 1554 int i;
2942a50d 1555
c9a43aaf 1556 bio_for_each_bvec_all(bvec, bio, i)
2942a50d
QW
1557 bio_size += bvec->bv_len;
1558
a9ad4d87
QW
1559 /*
1560 * Since we can have multiple bios touching the error_bitmap, we cannot
1561 * call bitmap_set() without protection.
1562 *
1563 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1564 */
1565 for (i = total_sector_nr; i < total_sector_nr +
1566 (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1567 set_bit(i, rbio->error_bitmap);
2942a50d
QW
1568}
1569
7a315072
QW
1570/* Verify the data sectors at read time. */
1571static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1572 struct bio *bio)
1573{
1574 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1575 int total_sector_nr = get_bio_sector_nr(rbio, bio);
1576 struct bio_vec *bvec;
1577 struct bvec_iter_all iter_all;
1578
1579 /* No data csum for the whole stripe, no need to verify. */
1580 if (!rbio->csum_bitmap || !rbio->csum_buf)
1581 return;
1582
1583 /* P/Q stripes, they have no data csum to verify against. */
1584 if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1585 return;
1586
1587 bio_for_each_segment_all(bvec, bio, iter_all) {
959ddf28 1588 void *kaddr;
7a315072 1589
959ddf28
CH
1590 kaddr = bvec_kmap_local(bvec);
1591 for (u32 off = 0; off < bvec->bv_len;
1592 off += fs_info->sectorsize, total_sector_nr++) {
7a315072
QW
1593 u8 csum_buf[BTRFS_CSUM_SIZE];
1594 u8 *expected_csum = rbio->csum_buf +
1595 total_sector_nr * fs_info->csum_size;
1596 int ret;
1597
1598 /* No csum for this sector, skip to the next sector. */
1599 if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1600 continue;
1601
959ddf28
CH
1602 ret = btrfs_check_sector_csum(fs_info, kaddr + off,
1603 csum_buf, expected_csum);
7a315072
QW
1604 if (ret < 0)
1605 set_bit(total_sector_nr, rbio->error_bitmap);
1606 }
959ddf28 1607 kunmap_local(kaddr);
7a315072
QW
1608 }
1609}
1610
d817ce35
QW
1611static void raid_wait_read_end_io(struct bio *bio)
1612{
1613 struct btrfs_raid_bio *rbio = bio->bi_private;
1614
7a315072 1615 if (bio->bi_status) {
2942a50d 1616 rbio_update_error_bitmap(rbio, bio);
7a315072 1617 } else {
d817ce35 1618 set_bio_pages_uptodate(rbio, bio);
7a315072
QW
1619 verify_bio_data_sectors(rbio, bio);
1620 }
d817ce35
QW
1621
1622 bio_put(bio);
1623 if (atomic_dec_and_test(&rbio->stripes_pending))
1624 wake_up(&rbio->io_wait);
1625}
1626
1c76fb7b 1627static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
d817ce35
QW
1628 struct bio_list *bio_list)
1629{
1630 struct bio *bio;
1631
1632 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1633 while ((bio = bio_list_pop(bio_list))) {
1634 bio->bi_end_io = raid_wait_read_end_io;
1635
dbb6ecb3 1636 if (trace_raid56_read_enabled()) {
d817ce35
QW
1637 struct raid56_bio_trace_info trace_info = { 0 };
1638
1639 bio_get_trace_info(rbio, bio, &trace_info);
dbb6ecb3 1640 trace_raid56_read(rbio, bio, &trace_info);
d817ce35
QW
1641 }
1642 submit_bio(bio);
1643 }
1c76fb7b
CH
1644
1645 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
d817ce35
QW
1646}
1647
5eb30ee2
QW
1648static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1649{
1650 const int data_pages = rbio->nr_data * rbio->stripe_npages;
1651 int ret;
1652
0fbf6cbd 1653 ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
5eb30ee2
QW
1654 if (ret < 0)
1655 return ret;
1656
1657 index_stripe_sectors(rbio);
1658 return 0;
1659}
1660
6ac0f488
CM
1661/*
1662 * We use plugging call backs to collect full stripes.
1663 * Any time we get a partial stripe write while plugged
1664 * we collect it into a list. When the unplug comes down,
1665 * we sort the list by logical block number and merge
1666 * everything we can into the same rbios
1667 */
1668struct btrfs_plug_cb {
1669 struct blk_plug_cb cb;
1670 struct btrfs_fs_info *info;
1671 struct list_head rbio_list;
6ac0f488
CM
1672};
1673
1674/*
1675 * rbios on the plug list are sorted for easier merging.
1676 */
4f0f586b
ST
1677static int plug_cmp(void *priv, const struct list_head *a,
1678 const struct list_head *b)
6ac0f488 1679{
214cc184
DS
1680 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1681 plug_list);
1682 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1683 plug_list);
4f024f37
KO
1684 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1685 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
6ac0f488
CM
1686
1687 if (a_sector < b_sector)
1688 return -1;
1689 if (a_sector > b_sector)
1690 return 1;
1691 return 0;
1692}
1693
93723095 1694static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
6ac0f488 1695{
93723095 1696 struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
6ac0f488
CM
1697 struct btrfs_raid_bio *cur;
1698 struct btrfs_raid_bio *last = NULL;
1699
6ac0f488 1700 list_sort(NULL, &plug->rbio_list, plug_cmp);
93723095 1701
6ac0f488 1702 while (!list_empty(&plug->rbio_list)) {
2d44a15a
DS
1703 cur = list_first_entry(&plug->rbio_list,
1704 struct btrfs_raid_bio, plug_list);
6ac0f488
CM
1705 list_del_init(&cur->plug_list);
1706
1707 if (rbio_is_full(cur)) {
93723095
QW
1708 /* We have a full stripe, queue it down. */
1709 start_async_work(cur, rmw_rbio_work);
6ac0f488
CM
1710 continue;
1711 }
1712 if (last) {
1713 if (rbio_can_merge(last, cur)) {
1714 merge_rbio(last, cur);
ff2b64a2 1715 free_raid_bio(cur);
6ac0f488 1716 continue;
6ac0f488 1717 }
93723095 1718 start_async_work(last, rmw_rbio_work);
6ac0f488
CM
1719 }
1720 last = cur;
1721 }
93723095
QW
1722 if (last)
1723 start_async_work(last, rmw_rbio_work);
6ac0f488
CM
1724 kfree(plug);
1725}
1726
bd8f7e62
QW
1727/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1728static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1729{
1730 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1731 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
18d758a2 1732 const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
bd8f7e62
QW
1733 const u32 orig_len = orig_bio->bi_iter.bi_size;
1734 const u32 sectorsize = fs_info->sectorsize;
1735 u64 cur_logical;
1736
bbbee460
QW
1737 ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
1738 orig_logical + orig_len <= full_stripe_start +
1739 rbio->nr_data * BTRFS_STRIPE_LEN,
1740 rbio, orig_logical);
bd8f7e62
QW
1741
1742 bio_list_add(&rbio->bio_list, orig_bio);
1743 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1744
1745 /* Update the dbitmap. */
1746 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1747 cur_logical += sectorsize) {
1748 int bit = ((u32)(cur_logical - full_stripe_start) >>
1749 fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1750
1751 set_bit(bit, &rbio->dbitmap);
1752 }
1753}
1754
53b381b3
DW
1755/*
1756 * our main entry point for writes from the rest of the FS.
1757 */
31683f4a 1758void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
53b381b3 1759{
6a258d72 1760 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 1761 struct btrfs_raid_bio *rbio;
6ac0f488
CM
1762 struct btrfs_plug_cb *plug = NULL;
1763 struct blk_plug_cb *cb;
53b381b3 1764
ff18a4af 1765 rbio = alloc_rbio(fs_info, bioc);
af8e2d1d 1766 if (IS_ERR(rbio)) {
abb49e87
CH
1767 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1768 bio_endio(bio);
1769 return;
af8e2d1d 1770 }
1b94b556 1771 rbio->operation = BTRFS_RBIO_WRITE;
bd8f7e62 1772 rbio_add_bio(rbio, bio);
6ac0f488
CM
1773
1774 /*
93723095 1775 * Don't plug on full rbios, just get them out the door
6ac0f488
CM
1776 * as quickly as we can
1777 */
abb49e87
CH
1778 if (!rbio_is_full(rbio)) {
1779 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1780 if (cb) {
1781 plug = container_of(cb, struct btrfs_plug_cb, cb);
1782 if (!plug->info) {
1783 plug->info = fs_info;
1784 INIT_LIST_HEAD(&plug->rbio_list);
1785 }
1786 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1787 return;
6ac0f488 1788 }
6ac0f488 1789 }
abb49e87 1790
93723095
QW
1791 /*
1792 * Either we don't have any existing plug, or we're doing a full stripe,
abb49e87 1793 * queue the rmw work now.
93723095
QW
1794 */
1795 start_async_work(rbio, rmw_rbio_work);
53b381b3
DW
1796}
1797
7a315072
QW
1798static int verify_one_sector(struct btrfs_raid_bio *rbio,
1799 int stripe_nr, int sector_nr)
1800{
1801 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1802 struct sector_ptr *sector;
1803 u8 csum_buf[BTRFS_CSUM_SIZE];
1804 u8 *csum_expected;
959ddf28 1805 void *kaddr;
7a315072
QW
1806 int ret;
1807
1808 if (!rbio->csum_bitmap || !rbio->csum_buf)
1809 return 0;
1810
1811 /* No way to verify P/Q as they are not covered by data csum. */
1812 if (stripe_nr >= rbio->nr_data)
1813 return 0;
1814 /*
1815 * If we're rebuilding a read, we have to use pages from the
1816 * bio list if possible.
1817 */
3a3c7a7f 1818 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
7a315072
QW
1819 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1820 } else {
1821 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1822 }
1823
7a315072
QW
1824 csum_expected = rbio->csum_buf +
1825 (stripe_nr * rbio->stripe_nsectors + sector_nr) *
1826 fs_info->csum_size;
cd678925 1827 kaddr = kmap_local_sector(sector);
959ddf28
CH
1828 ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected);
1829 kunmap_local(kaddr);
7a315072
QW
1830 return ret;
1831}
1832
9c5ff9b4
QW
1833/*
1834 * Recover a vertical stripe specified by @sector_nr.
1835 * @*pointers are the pre-allocated pointers by the caller, so we don't
1836 * need to allocate/free the pointers again and again.
1837 */
75b47033
QW
1838static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1839 void **pointers, void **unmap_array)
9c5ff9b4
QW
1840{
1841 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1842 struct sector_ptr *sector;
1843 const u32 sectorsize = fs_info->sectorsize;
75b47033
QW
1844 int found_errors;
1845 int faila;
1846 int failb;
9c5ff9b4 1847 int stripe_nr;
7a315072 1848 int ret = 0;
9c5ff9b4
QW
1849
1850 /*
1851 * Now we just use bitmap to mark the horizontal stripes in
1852 * which we have data when doing parity scrub.
1853 */
1854 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1855 !test_bit(sector_nr, &rbio->dbitmap))
75b47033
QW
1856 return 0;
1857
1858 found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1859 &failb);
1860 /*
67da05b3 1861 * No errors in the vertical stripe, skip it. Can happen for recovery
75b47033
QW
1862 * which only part of a stripe failed csum check.
1863 */
1864 if (!found_errors)
1865 return 0;
1866
1867 if (found_errors > rbio->bioc->max_errors)
1868 return -EIO;
9c5ff9b4
QW
1869
1870 /*
1871 * Setup our array of pointers with sectors from each stripe
1872 *
1873 * NOTE: store a duplicate array of pointers to preserve the
1874 * pointer order.
1875 */
1876 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1877 /*
75b47033
QW
1878 * If we're rebuilding a read, we have to use pages from the
1879 * bio list if possible.
9c5ff9b4 1880 */
3a3c7a7f 1881 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
9c5ff9b4
QW
1882 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1883 } else {
1884 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1885 }
cd678925 1886 pointers[stripe_nr] = kmap_local_sector(sector);
9c5ff9b4
QW
1887 unmap_array[stripe_nr] = pointers[stripe_nr];
1888 }
1889
1890 /* All raid6 handling here */
1891 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1892 /* Single failure, rebuild from parity raid5 style */
1893 if (failb < 0) {
1894 if (faila == rbio->nr_data)
1895 /*
1896 * Just the P stripe has failed, without
1897 * a bad data or Q stripe.
1898 * We have nothing to do, just skip the
1899 * recovery for this stripe.
1900 */
1901 goto cleanup;
1902 /*
1903 * a single failure in raid6 is rebuilt
1904 * in the pstripe code below
1905 */
1906 goto pstripe;
1907 }
1908
1909 /*
1910 * If the q stripe is failed, do a pstripe reconstruction from
1911 * the xors.
1912 * If both the q stripe and the P stripe are failed, we're
1913 * here due to a crc mismatch and we can't give them the
1914 * data they want.
1915 */
18d758a2
QW
1916 if (failb == rbio->real_stripes - 1) {
1917 if (faila == rbio->real_stripes - 2)
9c5ff9b4
QW
1918 /*
1919 * Only P and Q are corrupted.
1920 * We only care about data stripes recovery,
1921 * can skip this vertical stripe.
1922 */
1923 goto cleanup;
1924 /*
1925 * Otherwise we have one bad data stripe and
1926 * a good P stripe. raid5!
1927 */
1928 goto pstripe;
1929 }
1930
18d758a2 1931 if (failb == rbio->real_stripes - 2) {
9c5ff9b4
QW
1932 raid6_datap_recov(rbio->real_stripes, sectorsize,
1933 faila, pointers);
1934 } else {
1935 raid6_2data_recov(rbio->real_stripes, sectorsize,
1936 faila, failb, pointers);
1937 }
1938 } else {
1939 void *p;
1940
1941 /* Rebuild from P stripe here (raid5 or raid6). */
1942 ASSERT(failb == -1);
1943pstripe:
1944 /* Copy parity block into failed block to start with */
1945 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1946
1947 /* Rearrange the pointer array */
1948 p = pointers[faila];
1949 for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1950 stripe_nr++)
1951 pointers[stripe_nr] = pointers[stripe_nr + 1];
1952 pointers[rbio->nr_data - 1] = p;
1953
1954 /* Xor in the rest */
1955 run_xor(pointers, rbio->nr_data - 1, sectorsize);
1956
1957 }
1958
1959 /*
1960 * No matter if this is a RMW or recovery, we should have all
1961 * failed sectors repaired in the vertical stripe, thus they are now
1962 * uptodate.
1963 * Especially if we determine to cache the rbio, we need to
1964 * have at least all data sectors uptodate.
7a315072
QW
1965 *
1966 * If possible, also check if the repaired sector matches its data
1967 * checksum.
9c5ff9b4 1968 */
75b47033 1969 if (faila >= 0) {
7a315072
QW
1970 ret = verify_one_sector(rbio, faila, sector_nr);
1971 if (ret < 0)
1972 goto cleanup;
1973
75b47033 1974 sector = rbio_stripe_sector(rbio, faila, sector_nr);
9c5ff9b4
QW
1975 sector->uptodate = 1;
1976 }
75b47033 1977 if (failb >= 0) {
f7c11aff 1978 ret = verify_one_sector(rbio, failb, sector_nr);
7a315072
QW
1979 if (ret < 0)
1980 goto cleanup;
1981
75b47033 1982 sector = rbio_stripe_sector(rbio, failb, sector_nr);
9c5ff9b4
QW
1983 sector->uptodate = 1;
1984 }
1985
1986cleanup:
1987 for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1988 kunmap_local(unmap_array[stripe_nr]);
7a315072 1989 return ret;
9c5ff9b4
QW
1990}
1991
ec936b03 1992static int recover_sectors(struct btrfs_raid_bio *rbio)
53b381b3 1993{
9c5ff9b4
QW
1994 void **pointers = NULL;
1995 void **unmap_array = NULL;
ec936b03
QW
1996 int sectornr;
1997 int ret = 0;
53b381b3 1998
07e4d380 1999 /*
ec936b03
QW
2000 * @pointers array stores the pointer for each sector.
2001 *
2002 * @unmap_array stores copy of pointers that does not get reordered
2003 * during reconstruction so that kunmap_local works.
07e4d380 2004 */
31e818fe 2005 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
94a0b58d 2006 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
ec936b03
QW
2007 if (!pointers || !unmap_array) {
2008 ret = -ENOMEM;
2009 goto out;
94a0b58d
IW
2010 }
2011
3a3c7a7f 2012 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
74cc3600 2013 spin_lock(&rbio->bio_list_lock);
53b381b3 2014 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
74cc3600 2015 spin_unlock(&rbio->bio_list_lock);
53b381b3
DW
2016 }
2017
2018 index_rbio_pages(rbio);
2019
75b47033
QW
2020 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2021 ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
2022 if (ret < 0)
2023 break;
2024 }
53b381b3 2025
ec936b03 2026out:
53b381b3 2027 kfree(pointers);
ec936b03
QW
2028 kfree(unmap_array);
2029 return ret;
2030}
2031
40f87ddb 2032static void recover_rbio(struct btrfs_raid_bio *rbio)
53b381b3 2033{
d838d05e 2034 struct bio_list bio_list = BIO_EMPTY_LIST;
d31968d9
QW
2035 int total_sector_nr;
2036 int ret = 0;
53b381b3 2037
d838d05e
CH
2038 /*
2039 * Either we're doing recover for a read failure or degraded write,
2040 * caller should have set error bitmap correctly.
2041 */
2042 ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2043
2044 /* For recovery, we need to read all sectors including P/Q. */
2045 ret = alloc_rbio_pages(rbio);
2046 if (ret < 0)
40f87ddb 2047 goto out;
d838d05e
CH
2048
2049 index_rbio_pages(rbio);
2050
53b381b3 2051 /*
f6065f8e
QW
2052 * Read everything that hasn't failed. However this time we will
2053 * not trust any cached sector.
2054 * As we may read out some stale data but higher layer is not reading
2055 * that stale part.
2056 *
2057 * So here we always re-read everything in recovery path.
53b381b3 2058 */
ef340fcc
QW
2059 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2060 total_sector_nr++) {
2061 int stripe = total_sector_nr / rbio->stripe_nsectors;
2062 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2063 struct sector_ptr *sector;
2064
75b47033
QW
2065 /*
2066 * Skip the range which has error. It can be a range which is
2067 * marked error (for csum mismatch), or it can be a missing
2068 * device.
2069 */
2070 if (!rbio->bioc->stripes[stripe].dev->bdev ||
2071 test_bit(total_sector_nr, rbio->error_bitmap)) {
2072 /*
2073 * Also set the error bit for missing device, which
2074 * may not yet have its error bit set.
2075 */
2076 set_bit(total_sector_nr, rbio->error_bitmap);
53b381b3 2077 continue;
5588383e 2078 }
75b47033 2079
ef340fcc 2080 sector = rbio_stripe_sector(rbio, stripe, sectornr);
d838d05e 2081 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
ff18a4af 2082 sectornr, REQ_OP_READ);
d838d05e
CH
2083 if (ret < 0) {
2084 bio_list_put(&bio_list);
40f87ddb 2085 goto out;
d838d05e 2086 }
53b381b3 2087 }
d817ce35 2088
1c76fb7b 2089 submit_read_wait_bio_list(rbio, &bio_list);
40f87ddb
CH
2090 ret = recover_sectors(rbio);
2091out:
2092 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
d817ce35
QW
2093}
2094
2095static void recover_rbio_work(struct work_struct *work)
2096{
2097 struct btrfs_raid_bio *rbio;
d817ce35
QW
2098
2099 rbio = container_of(work, struct btrfs_raid_bio, work);
40f87ddb
CH
2100 if (!lock_stripe_add(rbio))
2101 recover_rbio(rbio);
d817ce35
QW
2102}
2103
2104static void recover_rbio_work_locked(struct work_struct *work)
2105{
40f87ddb 2106 recover_rbio(container_of(work, struct btrfs_raid_bio, work));
d817ce35
QW
2107}
2108
75b47033
QW
2109static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2110{
2111 bool found = false;
2112 int sector_nr;
2113
2114 /*
2115 * This is for RAID6 extra recovery tries, thus mirror number should
2116 * be large than 2.
2117 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2118 * RAID5 methods.
2119 */
2120 ASSERT(mirror_num > 2);
2121 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2122 int found_errors;
2123 int faila;
2124 int failb;
2125
2126 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2127 &faila, &failb);
2128 /* This vertical stripe doesn't have errors. */
2129 if (!found_errors)
2130 continue;
2131
2132 /*
2133 * If we found errors, there should be only one error marked
2134 * by previous set_rbio_range_error().
2135 */
2136 ASSERT(found_errors == 1);
2137 found = true;
2138
2139 /* Now select another stripe to mark as error. */
2140 failb = rbio->real_stripes - (mirror_num - 1);
2141 if (failb <= faila)
2142 failb--;
2143
2144 /* Set the extra bit in error bitmap. */
2145 if (failb >= 0)
2146 set_bit(failb * rbio->stripe_nsectors + sector_nr,
2147 rbio->error_bitmap);
2148 }
2149
2150 /* We should found at least one vertical stripe with error.*/
2151 ASSERT(found);
2152}
2153
53b381b3
DW
2154/*
2155 * the main entry point for reads from the higher layers. This
2156 * is really only called when the normal read path had a failure,
2157 * so we assume the bio they send down corresponds to a failed part
2158 * of the drive.
2159 */
6065fd95 2160void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
f1c29379 2161 int mirror_num)
53b381b3 2162{
6a258d72 2163 struct btrfs_fs_info *fs_info = bioc->fs_info;
53b381b3 2164 struct btrfs_raid_bio *rbio;
53b381b3 2165
ff18a4af 2166 rbio = alloc_rbio(fs_info, bioc);
af8e2d1d 2167 if (IS_ERR(rbio)) {
6065fd95 2168 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
d817ce35
QW
2169 bio_endio(bio);
2170 return;
af8e2d1d 2171 }
53b381b3 2172
1b94b556 2173 rbio->operation = BTRFS_RBIO_READ_REBUILD;
bd8f7e62 2174 rbio_add_bio(rbio, bio);
53b381b3 2175
2942a50d
QW
2176 set_rbio_range_error(rbio, bio);
2177
53b381b3 2178 /*
8810f751
LB
2179 * Loop retry:
2180 * for 'mirror == 2', reconstruct from all other stripes.
2181 * for 'mirror_num > 2', select a stripe to fail on every retry.
53b381b3 2182 */
ad3daf1c 2183 if (mirror_num > 2)
75b47033 2184 set_rbio_raid6_extra_error(rbio, mirror_num);
53b381b3 2185
d817ce35 2186 start_async_work(rbio, recover_rbio_work);
53b381b3
DW
2187}
2188
c5a41562
QW
2189static void fill_data_csums(struct btrfs_raid_bio *rbio)
2190{
2191 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2192 struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
18d758a2
QW
2193 rbio->bioc->full_stripe_logical);
2194 const u64 start = rbio->bioc->full_stripe_logical;
c5a41562
QW
2195 const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2196 fs_info->sectorsize_bits;
2197 int ret;
2198
2199 /* The rbio should not have its csum buffer initialized. */
2200 ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2201
2202 /*
2203 * Skip the csum search if:
2204 *
2205 * - The rbio doesn't belong to data block groups
2206 * Then we are doing IO for tree blocks, no need to search csums.
2207 *
2208 * - The rbio belongs to mixed block groups
2209 * This is to avoid deadlock, as we're already holding the full
2210 * stripe lock, if we trigger a metadata read, and it needs to do
2211 * raid56 recovery, we will deadlock.
2212 */
2213 if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2214 rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2215 return;
2216
2217 rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2218 fs_info->csum_size, GFP_NOFS);
2219 rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2220 GFP_NOFS);
2221 if (!rbio->csum_buf || !rbio->csum_bitmap) {
2222 ret = -ENOMEM;
2223 goto error;
2224 }
2225
3c771c19
QW
2226 ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2227 rbio->csum_buf, rbio->csum_bitmap);
c5a41562
QW
2228 if (ret < 0)
2229 goto error;
2230 if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2231 goto no_csum;
2232 return;
2233
2234error:
2235 /*
2236 * We failed to allocate memory or grab the csum, but it's not fatal,
2237 * we can still continue. But better to warn users that RMW is no
2238 * longer safe for this particular sub-stripe write.
2239 */
2240 btrfs_warn_rl(fs_info,
2241"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
18d758a2 2242 rbio->bioc->full_stripe_logical, ret);
c5a41562
QW
2243no_csum:
2244 kfree(rbio->csum_buf);
2245 bitmap_free(rbio->csum_bitmap);
2246 rbio->csum_buf = NULL;
2247 rbio->csum_bitmap = NULL;
2248}
2249
7a315072 2250static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
5eb30ee2 2251{
02efa3a6
CH
2252 struct bio_list bio_list = BIO_EMPTY_LIST;
2253 int total_sector_nr;
2254 int ret = 0;
5eb30ee2 2255
c5a41562
QW
2256 /*
2257 * Fill the data csums we need for data verification. We need to fill
2258 * the csum_bitmap/csum_buf first, as our endio function will try to
2259 * verify the data sectors.
2260 */
2261 fill_data_csums(rbio);
2262
02efa3a6
CH
2263 /*
2264 * Build a list of bios to read all sectors (including data and P/Q).
2265 *
2266 * This behavior is to compensate the later csum verification and recovery.
2267 */
2268 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2269 total_sector_nr++) {
2270 struct sector_ptr *sector;
2271 int stripe = total_sector_nr / rbio->stripe_nsectors;
2272 int sectornr = total_sector_nr % rbio->stripe_nsectors;
5eb30ee2 2273
02efa3a6
CH
2274 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2275 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2276 stripe, sectornr, REQ_OP_READ);
2277 if (ret) {
2278 bio_list_put(&bio_list);
2279 return ret;
2280 }
2281 }
7a315072
QW
2282
2283 /*
2284 * We may or may not have any corrupted sectors (including missing dev
2285 * and csum mismatch), just let recover_sectors() to handle them all.
2286 */
02efa3a6
CH
2287 submit_read_wait_bio_list(rbio, &bio_list);
2288 return recover_sectors(rbio);
5eb30ee2
QW
2289}
2290
2291static void raid_wait_write_end_io(struct bio *bio)
2292{
2293 struct btrfs_raid_bio *rbio = bio->bi_private;
5eb30ee2 2294
ae8ce871 2295 if (bio->bi_status)
2942a50d 2296 rbio_update_error_bitmap(rbio, bio);
5eb30ee2
QW
2297 bio_put(bio);
2298 if (atomic_dec_and_test(&rbio->stripes_pending))
2299 wake_up(&rbio->io_wait);
2300}
2301
2302static void submit_write_bios(struct btrfs_raid_bio *rbio,
2303 struct bio_list *bio_list)
2304{
2305 struct bio *bio;
2306
2307 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2308 while ((bio = bio_list_pop(bio_list))) {
2309 bio->bi_end_io = raid_wait_write_end_io;
2310
dbb6ecb3 2311 if (trace_raid56_write_enabled()) {
5eb30ee2
QW
2312 struct raid56_bio_trace_info trace_info = { 0 };
2313
2314 bio_get_trace_info(rbio, bio, &trace_info);
dbb6ecb3 2315 trace_raid56_write(rbio, bio, &trace_info);
5eb30ee2
QW
2316 }
2317 submit_bio(bio);
2318 }
2319}
2320
7a315072
QW
2321/*
2322 * To determine if we need to read any sector from the disk.
2323 * Should only be utilized in RMW path, to skip cached rbio.
2324 */
2325static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2326{
2327 int i;
2328
2329 for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2330 struct sector_ptr *sector = &rbio->stripe_sectors[i];
2331
2332 /*
2333 * We have a sector which doesn't have page nor uptodate,
2334 * thus this rbio can not be cached one, as cached one must
2335 * have all its data sectors present and uptodate.
2336 */
cd678925 2337 if (!sector->has_paddr || !sector->uptodate)
7a315072
QW
2338 return true;
2339 }
2340 return false;
2341}
2342
1d0ef1ca 2343static void rmw_rbio(struct btrfs_raid_bio *rbio)
5eb30ee2
QW
2344{
2345 struct bio_list bio_list;
2346 int sectornr;
2347 int ret = 0;
2348
2349 /*
2350 * Allocate the pages for parity first, as P/Q pages will always be
2351 * needed for both full-stripe and sub-stripe writes.
2352 */
2353 ret = alloc_rbio_parity_pages(rbio);
2354 if (ret < 0)
1d0ef1ca 2355 goto out;
5eb30ee2 2356
7a315072
QW
2357 /*
2358 * Either full stripe write, or we have every data sector already
2359 * cached, can go to write path immediately.
2360 */
4d762701
CH
2361 if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2362 /*
2363 * Now we're doing sub-stripe write, also need all data stripes
2364 * to do the full RMW.
2365 */
2366 ret = alloc_rbio_data_pages(rbio);
2367 if (ret < 0)
1d0ef1ca 2368 goto out;
5eb30ee2 2369
4d762701 2370 index_rbio_pages(rbio);
5eb30ee2 2371
4d762701
CH
2372 ret = rmw_read_wait_recover(rbio);
2373 if (ret < 0)
1d0ef1ca 2374 goto out;
4d762701 2375 }
5eb30ee2 2376
5eb30ee2
QW
2377 /*
2378 * At this stage we're not allowed to add any new bios to the
2379 * bio list any more, anyone else that wants to change this stripe
2380 * needs to do their own rmw.
2381 */
74cc3600 2382 spin_lock(&rbio->bio_list_lock);
5eb30ee2 2383 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
74cc3600 2384 spin_unlock(&rbio->bio_list_lock);
5eb30ee2 2385
2942a50d 2386 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
5eb30ee2
QW
2387
2388 index_rbio_pages(rbio);
2389
2390 /*
2391 * We don't cache full rbios because we're assuming
2392 * the higher layers are unlikely to use this area of
2393 * the disk again soon. If they do use it again,
2394 * hopefully they will send another full bio.
2395 */
2396 if (!rbio_is_full(rbio))
2397 cache_rbio_pages(rbio);
2398 else
2399 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2400
2401 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2402 generate_pq_vertical(rbio, sectornr);
2403
2404 bio_list_init(&bio_list);
2405 ret = rmw_assemble_write_bios(rbio, &bio_list);
2406 if (ret < 0)
1d0ef1ca 2407 goto out;
5eb30ee2
QW
2408
2409 /* We should have at least one bio assembled. */
2410 ASSERT(bio_list_size(&bio_list));
2411 submit_write_bios(rbio, &bio_list);
2412 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2413
ad3daf1c
QW
2414 /* We may have more errors than our tolerance during the read. */
2415 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2416 int found_errors;
2417
2418 found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2419 if (found_errors > rbio->bioc->max_errors) {
2420 ret = -EIO;
2421 break;
2422 }
2423 }
1d0ef1ca
CH
2424out:
2425 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
5eb30ee2
QW
2426}
2427
93723095
QW
2428static void rmw_rbio_work(struct work_struct *work)
2429{
2430 struct btrfs_raid_bio *rbio;
93723095
QW
2431
2432 rbio = container_of(work, struct btrfs_raid_bio, work);
1d0ef1ca
CH
2433 if (lock_stripe_add(rbio) == 0)
2434 rmw_rbio(rbio);
93723095
QW
2435}
2436
2437static void rmw_rbio_work_locked(struct work_struct *work)
53b381b3 2438{
1d0ef1ca 2439 rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
53b381b3
DW
2440}
2441
5a6ac9ea
MX
2442/*
2443 * The following code is used to scrub/replace the parity stripe
2444 *
4c664611 2445 * Caller must have already increased bio_counter for getting @bioc.
ae6529c3 2446 *
5a6ac9ea
MX
2447 * Note: We need make sure all the pages that add into the scrub/replace
2448 * raid bio are correct and not be changed during the scrub/replace. That
2449 * is those pages just hold metadata or file data with checksum.
2450 */
2451
6a258d72
QW
2452struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2453 struct btrfs_io_context *bioc,
ff18a4af 2454 struct btrfs_device *scrub_dev,
6a258d72 2455 unsigned long *dbitmap, int stripe_nsectors)
5a6ac9ea 2456{
6a258d72 2457 struct btrfs_fs_info *fs_info = bioc->fs_info;
5a6ac9ea
MX
2458 struct btrfs_raid_bio *rbio;
2459 int i;
2460
ff18a4af 2461 rbio = alloc_rbio(fs_info, bioc);
5a6ac9ea
MX
2462 if (IS_ERR(rbio))
2463 return NULL;
2464 bio_list_add(&rbio->bio_list, bio);
2465 /*
2466 * This is a special bio which is used to hold the completion handler
2467 * and make the scrub rbio is similar to the other types
2468 */
2469 ASSERT(!bio->bi_iter.bi_size);
2470 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2471
9cd3a7eb 2472 /*
4c664611 2473 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
9cd3a7eb
LB
2474 * to the end position, so this search can start from the first parity
2475 * stripe.
2476 */
2477 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
4c664611 2478 if (bioc->stripes[i].dev == scrub_dev) {
5a6ac9ea
MX
2479 rbio->scrubp = i;
2480 break;
2481 }
2482 }
bbbee460 2483 ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
5a6ac9ea 2484
c67c68eb 2485 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
5a6ac9ea
MX
2486 return rbio;
2487}
2488
5a6ac9ea
MX
2489/*
2490 * We just scrub the parity that we have correct data on the same horizontal,
2491 * so we needn't allocate all pages for all the stripes.
2492 */
2493static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2494{
3907ce29 2495 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
aee35e4b 2496 int total_sector_nr;
5a6ac9ea 2497
aee35e4b
QW
2498 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2499 total_sector_nr++) {
2500 struct page *page;
2501 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2502 int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
5a6ac9ea 2503
aee35e4b
QW
2504 if (!test_bit(sectornr, &rbio->dbitmap))
2505 continue;
2506 if (rbio->stripe_pages[index])
2507 continue;
2508 page = alloc_page(GFP_NOFS);
2509 if (!page)
2510 return -ENOMEM;
2511 rbio->stripe_pages[index] = page;
5a6ac9ea 2512 }
eb357060 2513 index_stripe_sectors(rbio);
5a6ac9ea
MX
2514 return 0;
2515}
2516
486c737f 2517static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
5a6ac9ea 2518{
4c664611 2519 struct btrfs_io_context *bioc = rbio->bioc;
46900662 2520 const u32 sectorsize = bioc->fs_info->sectorsize;
1389053e 2521 void **pointers = rbio->finish_pointers;
c67c68eb 2522 unsigned long *pbitmap = &rbio->finish_pbitmap;
5a6ac9ea
MX
2523 int nr_data = rbio->nr_data;
2524 int stripe;
3e77605d 2525 int sectornr;
c17af965 2526 bool has_qstripe;
cd678925 2527 struct page *page;
46900662
QW
2528 struct sector_ptr p_sector = { 0 };
2529 struct sector_ptr q_sector = { 0 };
5a6ac9ea 2530 struct bio_list bio_list;
76035976 2531 int is_replace = 0;
5a6ac9ea
MX
2532 int ret;
2533
2534 bio_list_init(&bio_list);
2535
c17af965
DS
2536 if (rbio->real_stripes - rbio->nr_data == 1)
2537 has_qstripe = false;
2538 else if (rbio->real_stripes - rbio->nr_data == 2)
2539 has_qstripe = true;
2540 else
5a6ac9ea 2541 BUG();
5a6ac9ea 2542
1faf3885
QW
2543 /*
2544 * Replace is running and our P/Q stripe is being replaced, then we
2545 * need to duplicate the final write to replace target.
2546 */
2547 if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
76035976 2548 is_replace = 1;
c67c68eb 2549 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
76035976
MX
2550 }
2551
5a6ac9ea
MX
2552 /*
2553 * Because the higher layers(scrubber) are unlikely to
2554 * use this area of the disk again soon, so don't cache
2555 * it.
2556 */
2557 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2558
cd678925
QW
2559 page = alloc_page(GFP_NOFS);
2560 if (!page)
6bfd0133 2561 return -ENOMEM;
cd678925
QW
2562 p_sector.has_paddr = true;
2563 p_sector.paddr = page_to_phys(page);
46900662 2564 p_sector.uptodate = 1;
cd678925 2565 page = NULL;
5a6ac9ea 2566
c17af965 2567 if (has_qstripe) {
d70cef0d 2568 /* RAID6, allocate and map temp space for the Q stripe */
cd678925
QW
2569 page = alloc_page(GFP_NOFS);
2570 if (!page) {
2571 __free_page(phys_to_page(p_sector.paddr));
2572 p_sector.has_paddr = false;
6bfd0133 2573 return -ENOMEM;
5a6ac9ea 2574 }
cd678925
QW
2575 q_sector.has_paddr = true;
2576 q_sector.paddr = page_to_phys(page);
46900662 2577 q_sector.uptodate = 1;
cd678925
QW
2578 page = NULL;
2579 pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector);
5a6ac9ea
MX
2580 }
2581
2942a50d 2582 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
5a6ac9ea 2583
d70cef0d 2584 /* Map the parity stripe just once */
cd678925 2585 pointers[nr_data] = kmap_local_sector(&p_sector);
d70cef0d 2586
c67c68eb 2587 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
46900662 2588 struct sector_ptr *sector;
5a6ac9ea 2589 void *parity;
46900662 2590
5a6ac9ea
MX
2591 /* first collect one page from each data stripe */
2592 for (stripe = 0; stripe < nr_data; stripe++) {
46900662 2593 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
cd678925 2594 pointers[stripe] = kmap_local_sector(sector);
5a6ac9ea
MX
2595 }
2596
c17af965 2597 if (has_qstripe) {
b2324e08 2598 assert_rbio(rbio);
d70cef0d 2599 /* RAID6, call the library function to fill in our P/Q */
46900662 2600 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
5a6ac9ea
MX
2601 pointers);
2602 } else {
2603 /* raid5 */
46900662
QW
2604 memcpy(pointers[nr_data], pointers[0], sectorsize);
2605 run_xor(pointers + 1, nr_data - 1, sectorsize);
5a6ac9ea
MX
2606 }
2607
01327610 2608 /* Check scrubbing parity and repair it */
46900662 2609 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
cd678925 2610 parity = kmap_local_sector(sector);
46900662
QW
2611 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2612 memcpy(parity, pointers[rbio->scrubp], sectorsize);
5a6ac9ea
MX
2613 else
2614 /* Parity is right, needn't writeback */
c67c68eb 2615 bitmap_clear(&rbio->dbitmap, sectornr, 1);
58c1a35c 2616 kunmap_local(parity);
5a6ac9ea 2617
94a0b58d
IW
2618 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2619 kunmap_local(pointers[stripe]);
5a6ac9ea
MX
2620 }
2621
94a0b58d 2622 kunmap_local(pointers[nr_data]);
cd678925
QW
2623 __free_page(phys_to_page(p_sector.paddr));
2624 p_sector.has_paddr = false;
2625 if (q_sector.has_paddr) {
2626 __free_page(phys_to_page(q_sector.paddr));
2627 q_sector.has_paddr = false;
d70cef0d 2628 }
5a6ac9ea 2629
5a6ac9ea
MX
2630 /*
2631 * time to start writing. Make bios for everything from the
2632 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2633 * everything else.
2634 */
c67c68eb 2635 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
3e77605d 2636 struct sector_ptr *sector;
5a6ac9ea 2637
3e77605d
QW
2638 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2639 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
ff18a4af 2640 sectornr, REQ_OP_WRITE);
5a6ac9ea
MX
2641 if (ret)
2642 goto cleanup;
2643 }
2644
76035976
MX
2645 if (!is_replace)
2646 goto submit_write;
2647
1faf3885
QW
2648 /*
2649 * Replace is running and our parity stripe needs to be duplicated to
2650 * the target device. Check we have a valid source stripe number.
2651 */
bbbee460 2652 ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
3e77605d
QW
2653 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2654 struct sector_ptr *sector;
76035976 2655
3e77605d
QW
2656 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2657 ret = rbio_add_io_sector(rbio, &bio_list, sector,
1faf3885
QW
2658 rbio->real_stripes,
2659 sectornr, REQ_OP_WRITE);
76035976
MX
2660 if (ret)
2661 goto cleanup;
2662 }
2663
2664submit_write:
6bfd0133
QW
2665 submit_write_bios(rbio, &bio_list);
2666 return 0;
5a6ac9ea
MX
2667
2668cleanup:
801fcfc5 2669 bio_list_put(&bio_list);
6bfd0133 2670 return ret;
5a6ac9ea
MX
2671}
2672
2673static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2674{
2675 if (stripe >= 0 && stripe < rbio->nr_data)
2676 return 1;
2677 return 0;
2678}
2679
6bfd0133 2680static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
5a6ac9ea 2681{
75b47033
QW
2682 void **pointers = NULL;
2683 void **unmap_array = NULL;
2684 int sector_nr;
e7fc357e 2685 int ret = 0;
5a6ac9ea 2686
75b47033
QW
2687 /*
2688 * @pointers array stores the pointer for each sector.
2689 *
2690 * @unmap_array stores copy of pointers that does not get reordered
2691 * during reconstruction so that kunmap_local works.
2692 */
2693 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2694 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2695 if (!pointers || !unmap_array) {
2696 ret = -ENOMEM;
2697 goto out;
2698 }
5a6ac9ea 2699
75b47033
QW
2700 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2701 int dfail = 0, failp = -1;
2702 int faila;
2703 int failb;
2704 int found_errors;
5a6ac9ea 2705
75b47033
QW
2706 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2707 &faila, &failb);
2708 if (found_errors > rbio->bioc->max_errors) {
2709 ret = -EIO;
2710 goto out;
2711 }
2712 if (found_errors == 0)
2713 continue;
5a6ac9ea 2714
75b47033
QW
2715 /* We should have at least one error here. */
2716 ASSERT(faila >= 0 || failb >= 0);
5a6ac9ea 2717
75b47033
QW
2718 if (is_data_stripe(rbio, faila))
2719 dfail++;
2720 else if (is_parity_stripe(faila))
2721 failp = faila;
5a6ac9ea 2722
75b47033
QW
2723 if (is_data_stripe(rbio, failb))
2724 dfail++;
2725 else if (is_parity_stripe(failb))
2726 failp = failb;
2727 /*
2728 * Because we can not use a scrubbing parity to repair the
2729 * data, so the capability of the repair is declined. (In the
2730 * case of RAID5, we can not repair anything.)
2731 */
2732 if (dfail > rbio->bioc->max_errors - 1) {
2733 ret = -EIO;
2734 goto out;
2735 }
2736 /*
2737 * If all data is good, only parity is correctly, just repair
2738 * the parity, no need to recover data stripes.
2739 */
2740 if (dfail == 0)
2741 continue;
6bfd0133 2742
75b47033
QW
2743 /*
2744 * Here means we got one corrupted data stripe and one
2745 * corrupted parity on RAID6, if the corrupted parity is
2746 * scrubbing parity, luckily, use the other one to repair the
2747 * data, or we can not repair the data stripe.
2748 */
2749 if (failp != rbio->scrubp) {
2750 ret = -EIO;
2751 goto out;
2752 }
2753
2754 ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2755 if (ret < 0)
2756 goto out;
2757 }
2758out:
2759 kfree(pointers);
2760 kfree(unmap_array);
6bfd0133 2761 return ret;
5a6ac9ea
MX
2762}
2763
52f0c198 2764static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
5a6ac9ea 2765{
52f0c198 2766 struct bio_list bio_list = BIO_EMPTY_LIST;
cb3450b7
QW
2767 int total_sector_nr;
2768 int ret = 0;
5a6ac9ea 2769
1c10702e
QW
2770 /* Build a list of bios to read all the missing parts. */
2771 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2772 total_sector_nr++) {
2773 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2774 int stripe = total_sector_nr / rbio->stripe_nsectors;
2775 struct sector_ptr *sector;
5a6ac9ea 2776
1c10702e
QW
2777 /* No data in the vertical stripe, no need to read. */
2778 if (!test_bit(sectornr, &rbio->dbitmap))
2779 continue;
5a6ac9ea 2780
1c10702e
QW
2781 /*
2782 * We want to find all the sectors missing from the rbio and
2783 * read them from the disk. If sector_in_rbio() finds a sector
2784 * in the bio list we don't need to read it off the stripe.
2785 */
2786 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2787 if (sector)
2788 continue;
2789
2790 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2791 /*
2792 * The bio cache may have handed us an uptodate sector. If so,
2793 * use it.
2794 */
2795 if (sector->uptodate)
2796 continue;
2797
52f0c198 2798 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
ff18a4af 2799 sectornr, REQ_OP_READ);
52f0c198
CH
2800 if (ret) {
2801 bio_list_put(&bio_list);
2802 return ret;
2803 }
5a6ac9ea 2804 }
52f0c198
CH
2805
2806 submit_read_wait_bio_list(rbio, &bio_list);
cb3450b7 2807 return 0;
cb3450b7
QW
2808}
2809
08241d3c 2810static void scrub_rbio(struct btrfs_raid_bio *rbio)
cb3450b7 2811{
ad3daf1c 2812 int sector_nr;
cb3450b7 2813 int ret;
cb3450b7 2814
cb3450b7
QW
2815 ret = alloc_rbio_essential_pages(rbio);
2816 if (ret)
08241d3c 2817 goto out;
cb3450b7 2818
2942a50d
QW
2819 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2820
52f0c198 2821 ret = scrub_assemble_read_bios(rbio);
cb3450b7 2822 if (ret < 0)
08241d3c 2823 goto out;
5a6ac9ea 2824
75b47033 2825 /* We may have some failures, recover the failed sectors first. */
6bfd0133
QW
2826 ret = recover_scrub_rbio(rbio);
2827 if (ret < 0)
08241d3c 2828 goto out;
5a6ac9ea 2829
6bfd0133
QW
2830 /*
2831 * We have every sector properly prepared. Can finish the scrub
2832 * and writeback the good content.
2833 */
486c737f 2834 ret = finish_parity_scrub(rbio);
6bfd0133 2835 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
ad3daf1c
QW
2836 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2837 int found_errors;
2838
2839 found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2840 if (found_errors > rbio->bioc->max_errors) {
2841 ret = -EIO;
2842 break;
2843 }
2844 }
08241d3c
CH
2845out:
2846 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
5a6ac9ea
MX
2847}
2848
6bfd0133 2849static void scrub_rbio_work_locked(struct work_struct *work)
5a6ac9ea 2850{
08241d3c 2851 scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
5a6ac9ea
MX
2852}
2853
5a6ac9ea
MX
2854void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2855{
2856 if (!lock_stripe_add(rbio))
6bfd0133 2857 start_async_work(rbio, scrub_rbio_work_locked);
5a6ac9ea 2858}
94ead93e
QW
2859
2860/*
2861 * This is for scrub call sites where we already have correct data contents.
2862 * This allows us to avoid reading data stripes again.
2863 *
2864 * Unfortunately here we have to do page copy, other than reusing the pages.
2865 * This is due to the fact rbio has its own page management for its cache.
2866 */
2867void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
2868 struct page **data_pages, u64 data_logical)
2869{
2870 const u64 offset_in_full_stripe = data_logical -
2871 rbio->bioc->full_stripe_logical;
2872 const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
2873 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2874 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
2875 int ret;
2876
2877 /*
2878 * If we hit ENOMEM temporarily, but later at
2879 * raid56_parity_submit_scrub_rbio() time it succeeded, we just do
2880 * the extra read, not a big deal.
2881 *
2882 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
2883 * the bio would got proper error number set.
2884 */
2885 ret = alloc_rbio_data_pages(rbio);
2886 if (ret < 0)
2887 return;
2888
2889 /* data_logical must be at stripe boundary and inside the full stripe. */
2890 ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2891 ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
2892
2893 for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
2894 struct page *dst = rbio->stripe_pages[page_nr + page_index];
2895 struct page *src = data_pages[page_nr];
2896
2897 memcpy_page(dst, 0, src, 0, PAGE_SIZE);
2898 for (int sector_nr = sectors_per_page * page_index;
2899 sector_nr < sectors_per_page * (page_index + 1);
2900 sector_nr++)
2901 rbio->stripe_sectors[sector_nr].uptodate = true;
2902 }
2903}