Commit | Line | Data |
---|---|---|
103c1972 CH |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright (C) 2007 Oracle. All rights reserved. | |
4 | * Copyright (C) 2022 Christoph Hellwig. | |
5 | */ | |
6 | ||
7 | #include <linux/bio.h> | |
8 | #include "bio.h" | |
9 | #include "ctree.h" | |
10 | #include "volumes.h" | |
11 | #include "raid56.h" | |
12 | #include "async-thread.h" | |
13 | #include "check-integrity.h" | |
14 | #include "dev-replace.h" | |
15 | #include "rcu-string.h" | |
16 | #include "zoned.h" | |
1c2b3ee3 | 17 | #include "file-item.h" |
103c1972 CH |
18 | |
19 | static struct bio_set btrfs_bioset; | |
852eee62 | 20 | static struct bio_set btrfs_clone_bioset; |
7609afac CH |
21 | static struct bio_set btrfs_repair_bioset; |
22 | static mempool_t btrfs_failed_bio_pool; | |
23 | ||
24 | struct btrfs_failed_bio { | |
25 | struct btrfs_bio *bbio; | |
26 | int num_copies; | |
27 | atomic_t repair_count; | |
28 | }; | |
103c1972 CH |
29 | |
30 | /* | |
31 | * Initialize a btrfs_bio structure. This skips the embedded bio itself as it | |
32 | * is already initialized by the block layer. | |
33 | */ | |
67d66982 CH |
34 | void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, |
35 | btrfs_bio_end_io_t end_io, void *private) | |
103c1972 CH |
36 | { |
37 | memset(bbio, 0, offsetof(struct btrfs_bio, bio)); | |
d0e5cb2b | 38 | bbio->inode = inode; |
103c1972 CH |
39 | bbio->end_io = end_io; |
40 | bbio->private = private; | |
852eee62 | 41 | atomic_set(&bbio->pending_ios, 1); |
103c1972 CH |
42 | } |
43 | ||
44 | /* | |
45 | * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for | |
46 | * btrfs, and is used for all I/O submitted through btrfs_submit_bio. | |
47 | * | |
48 | * Just like the underlying bio_alloc_bioset it will not fail as it is backed by | |
49 | * a mempool. | |
50 | */ | |
51 | struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, | |
d0e5cb2b | 52 | struct btrfs_inode *inode, |
103c1972 CH |
53 | btrfs_bio_end_io_t end_io, void *private) |
54 | { | |
55 | struct bio *bio; | |
56 | ||
57 | bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); | |
d0e5cb2b | 58 | btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); |
103c1972 CH |
59 | return bio; |
60 | } | |
61 | ||
d5e4377d CH |
62 | static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, |
63 | struct bio *orig, u64 map_length, | |
64 | bool use_append) | |
852eee62 CH |
65 | { |
66 | struct btrfs_bio *orig_bbio = btrfs_bio(orig); | |
67 | struct bio *bio; | |
68 | ||
d5e4377d CH |
69 | if (use_append) { |
70 | unsigned int nr_segs; | |
71 | ||
72 | bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, | |
73 | &btrfs_clone_bioset, map_length); | |
74 | } else { | |
75 | bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, | |
76 | &btrfs_clone_bioset); | |
77 | } | |
852eee62 CH |
78 | btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); |
79 | ||
80 | btrfs_bio(bio)->file_offset = orig_bbio->file_offset; | |
81 | if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) | |
82 | orig_bbio->file_offset += map_length; | |
83 | ||
84 | atomic_inc(&orig_bbio->pending_ios); | |
85 | return bio; | |
86 | } | |
87 | ||
88 | static void btrfs_orig_write_end_io(struct bio *bio); | |
89 | ||
90 | static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, | |
91 | struct btrfs_bio *orig_bbio) | |
92 | { | |
93 | /* | |
94 | * For writes we tolerate nr_mirrors - 1 write failures, so we can't | |
95 | * just blindly propagate a write failure here. Instead increment the | |
96 | * error count in the original I/O context so that it is guaranteed to | |
97 | * be larger than the error tolerance. | |
98 | */ | |
99 | if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { | |
100 | struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; | |
101 | struct btrfs_io_context *orig_bioc = orig_stripe->bioc; | |
102 | ||
103 | atomic_add(orig_bioc->max_errors, &orig_bioc->error); | |
104 | } else { | |
105 | orig_bbio->bio.bi_status = bbio->bio.bi_status; | |
106 | } | |
107 | } | |
108 | ||
109 | static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) | |
110 | { | |
111 | if (bbio->bio.bi_pool == &btrfs_clone_bioset) { | |
112 | struct btrfs_bio *orig_bbio = bbio->private; | |
113 | ||
114 | if (bbio->bio.bi_status) | |
115 | btrfs_bbio_propagate_error(bbio, orig_bbio); | |
116 | bio_put(&bbio->bio); | |
117 | bbio = orig_bbio; | |
118 | } | |
119 | ||
120 | if (atomic_dec_and_test(&bbio->pending_ios)) | |
121 | bbio->end_io(bbio); | |
122 | } | |
123 | ||
7609afac CH |
124 | static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) |
125 | { | |
126 | if (cur_mirror == fbio->num_copies) | |
127 | return cur_mirror + 1 - fbio->num_copies; | |
128 | return cur_mirror + 1; | |
129 | } | |
130 | ||
131 | static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) | |
132 | { | |
133 | if (cur_mirror == 1) | |
134 | return fbio->num_copies; | |
135 | return cur_mirror - 1; | |
136 | } | |
137 | ||
138 | static void btrfs_repair_done(struct btrfs_failed_bio *fbio) | |
139 | { | |
140 | if (atomic_dec_and_test(&fbio->repair_count)) { | |
852eee62 | 141 | btrfs_orig_bbio_end_io(fbio->bbio); |
7609afac CH |
142 | mempool_free(fbio, &btrfs_failed_bio_pool); |
143 | } | |
144 | } | |
145 | ||
146 | static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, | |
147 | struct btrfs_device *dev) | |
148 | { | |
149 | struct btrfs_failed_bio *fbio = repair_bbio->private; | |
150 | struct btrfs_inode *inode = repair_bbio->inode; | |
151 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
152 | struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); | |
153 | int mirror = repair_bbio->mirror_num; | |
154 | ||
155 | if (repair_bbio->bio.bi_status || | |
156 | !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { | |
157 | bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); | |
0d3acb25 | 158 | repair_bbio->bio.bi_iter = repair_bbio->saved_iter; |
7609afac CH |
159 | |
160 | mirror = next_repair_mirror(fbio, mirror); | |
161 | if (mirror == fbio->bbio->mirror_num) { | |
162 | btrfs_debug(fs_info, "no mirror left"); | |
163 | fbio->bbio->bio.bi_status = BLK_STS_IOERR; | |
164 | goto done; | |
165 | } | |
166 | ||
285599b6 | 167 | btrfs_submit_bio(&repair_bbio->bio, mirror); |
7609afac CH |
168 | return; |
169 | } | |
170 | ||
171 | do { | |
172 | mirror = prev_repair_mirror(fbio, mirror); | |
173 | btrfs_repair_io_failure(fs_info, btrfs_ino(inode), | |
174 | repair_bbio->file_offset, fs_info->sectorsize, | |
0d3acb25 | 175 | repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, |
7609afac CH |
176 | bv->bv_page, bv->bv_offset, mirror); |
177 | } while (mirror != fbio->bbio->mirror_num); | |
178 | ||
179 | done: | |
180 | btrfs_repair_done(fbio); | |
181 | bio_put(&repair_bbio->bio); | |
182 | } | |
183 | ||
184 | /* | |
185 | * Try to kick off a repair read to the next available mirror for a bad sector. | |
186 | * | |
187 | * This primarily tries to recover good data to serve the actual read request, | |
188 | * but also tries to write the good data back to the bad mirror(s) when a | |
189 | * read succeeded to restore the redundancy. | |
190 | */ | |
191 | static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, | |
192 | u32 bio_offset, | |
193 | struct bio_vec *bv, | |
194 | struct btrfs_failed_bio *fbio) | |
195 | { | |
196 | struct btrfs_inode *inode = failed_bbio->inode; | |
197 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
198 | const u32 sectorsize = fs_info->sectorsize; | |
0d3acb25 | 199 | const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); |
7609afac CH |
200 | struct btrfs_bio *repair_bbio; |
201 | struct bio *repair_bio; | |
202 | int num_copies; | |
203 | int mirror; | |
204 | ||
205 | btrfs_debug(fs_info, "repair read error: read error at %llu", | |
206 | failed_bbio->file_offset + bio_offset); | |
207 | ||
208 | num_copies = btrfs_num_copies(fs_info, logical, sectorsize); | |
209 | if (num_copies == 1) { | |
210 | btrfs_debug(fs_info, "no copy to repair from"); | |
211 | failed_bbio->bio.bi_status = BLK_STS_IOERR; | |
212 | return fbio; | |
213 | } | |
214 | ||
215 | if (!fbio) { | |
216 | fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); | |
217 | fbio->bbio = failed_bbio; | |
218 | fbio->num_copies = num_copies; | |
219 | atomic_set(&fbio->repair_count, 1); | |
220 | } | |
221 | ||
222 | atomic_inc(&fbio->repair_count); | |
223 | ||
224 | repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, | |
225 | &btrfs_repair_bioset); | |
0d3acb25 | 226 | repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; |
7609afac CH |
227 | bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); |
228 | ||
229 | repair_bbio = btrfs_bio(repair_bio); | |
230 | btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); | |
231 | repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; | |
232 | ||
233 | mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); | |
234 | btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); | |
285599b6 | 235 | btrfs_submit_bio(repair_bio, mirror); |
7609afac CH |
236 | return fbio; |
237 | } | |
238 | ||
239 | static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) | |
240 | { | |
241 | struct btrfs_inode *inode = bbio->inode; | |
242 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
243 | u32 sectorsize = fs_info->sectorsize; | |
0d3acb25 | 244 | struct bvec_iter *iter = &bbio->saved_iter; |
7609afac CH |
245 | blk_status_t status = bbio->bio.bi_status; |
246 | struct btrfs_failed_bio *fbio = NULL; | |
247 | u32 offset = 0; | |
248 | ||
249 | /* | |
250 | * Hand off repair bios to the repair code as there is no upper level | |
251 | * submitter for them. | |
252 | */ | |
253 | if (bbio->bio.bi_pool == &btrfs_repair_bioset) { | |
254 | btrfs_end_repair_bio(bbio, dev); | |
255 | return; | |
256 | } | |
257 | ||
258 | /* Clear the I/O error. A failed repair will reset it. */ | |
259 | bbio->bio.bi_status = BLK_STS_OK; | |
260 | ||
261 | while (iter->bi_size) { | |
262 | struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); | |
263 | ||
264 | bv.bv_len = min(bv.bv_len, sectorsize); | |
265 | if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) | |
266 | fbio = repair_one_sector(bbio, offset, &bv, fbio); | |
267 | ||
268 | bio_advance_iter_single(&bbio->bio, iter, sectorsize); | |
269 | offset += sectorsize; | |
270 | } | |
271 | ||
7ab0fdfc CH |
272 | if (bbio->csum != bbio->csum_inline) |
273 | kfree(bbio->csum); | |
7609afac CH |
274 | |
275 | if (fbio) | |
276 | btrfs_repair_done(fbio); | |
277 | else | |
852eee62 | 278 | btrfs_orig_bbio_end_io(bbio); |
7609afac CH |
279 | } |
280 | ||
103c1972 CH |
281 | static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) |
282 | { | |
283 | if (!dev || !dev->bdev) | |
284 | return; | |
285 | if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) | |
286 | return; | |
287 | ||
288 | if (btrfs_op(bio) == BTRFS_MAP_WRITE) | |
289 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); | |
290 | if (!(bio->bi_opf & REQ_RAHEAD)) | |
291 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); | |
292 | if (bio->bi_opf & REQ_PREFLUSH) | |
293 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); | |
294 | } | |
295 | ||
296 | static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, | |
297 | struct bio *bio) | |
298 | { | |
299 | if (bio->bi_opf & REQ_META) | |
300 | return fs_info->endio_meta_workers; | |
301 | return fs_info->endio_workers; | |
302 | } | |
303 | ||
304 | static void btrfs_end_bio_work(struct work_struct *work) | |
305 | { | |
306 | struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); | |
307 | ||
7609afac CH |
308 | /* Metadata reads are checked and repaired by the submitter. */ |
309 | if (bbio->bio.bi_opf & REQ_META) | |
310 | bbio->end_io(bbio); | |
311 | else | |
860c8c45 | 312 | btrfs_check_read_bio(bbio, bbio->bio.bi_private); |
103c1972 CH |
313 | } |
314 | ||
315 | static void btrfs_simple_end_io(struct bio *bio) | |
316 | { | |
103c1972 | 317 | struct btrfs_bio *bbio = btrfs_bio(bio); |
860c8c45 CH |
318 | struct btrfs_device *dev = bio->bi_private; |
319 | struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; | |
103c1972 CH |
320 | |
321 | btrfs_bio_counter_dec(fs_info); | |
322 | ||
323 | if (bio->bi_status) | |
860c8c45 | 324 | btrfs_log_dev_io_error(bio, dev); |
103c1972 CH |
325 | |
326 | if (bio_op(bio) == REQ_OP_READ) { | |
327 | INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); | |
328 | queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); | |
329 | } else { | |
69ccf3f4 CH |
330 | if (bio_op(bio) == REQ_OP_ZONE_APPEND) |
331 | btrfs_record_physical_zoned(bbio); | |
852eee62 | 332 | btrfs_orig_bbio_end_io(bbio); |
103c1972 CH |
333 | } |
334 | } | |
335 | ||
336 | static void btrfs_raid56_end_io(struct bio *bio) | |
337 | { | |
338 | struct btrfs_io_context *bioc = bio->bi_private; | |
339 | struct btrfs_bio *bbio = btrfs_bio(bio); | |
340 | ||
341 | btrfs_bio_counter_dec(bioc->fs_info); | |
342 | bbio->mirror_num = bioc->mirror_num; | |
7609afac CH |
343 | if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) |
344 | btrfs_check_read_bio(bbio, NULL); | |
345 | else | |
852eee62 | 346 | btrfs_orig_bbio_end_io(bbio); |
103c1972 CH |
347 | |
348 | btrfs_put_bioc(bioc); | |
349 | } | |
350 | ||
351 | static void btrfs_orig_write_end_io(struct bio *bio) | |
352 | { | |
353 | struct btrfs_io_stripe *stripe = bio->bi_private; | |
354 | struct btrfs_io_context *bioc = stripe->bioc; | |
355 | struct btrfs_bio *bbio = btrfs_bio(bio); | |
356 | ||
357 | btrfs_bio_counter_dec(bioc->fs_info); | |
358 | ||
359 | if (bio->bi_status) { | |
360 | atomic_inc(&bioc->error); | |
361 | btrfs_log_dev_io_error(bio, stripe->dev); | |
362 | } | |
363 | ||
364 | /* | |
365 | * Only send an error to the higher layers if it is beyond the tolerance | |
366 | * threshold. | |
367 | */ | |
368 | if (atomic_read(&bioc->error) > bioc->max_errors) | |
369 | bio->bi_status = BLK_STS_IOERR; | |
370 | else | |
371 | bio->bi_status = BLK_STS_OK; | |
372 | ||
852eee62 | 373 | btrfs_orig_bbio_end_io(bbio); |
103c1972 CH |
374 | btrfs_put_bioc(bioc); |
375 | } | |
376 | ||
377 | static void btrfs_clone_write_end_io(struct bio *bio) | |
378 | { | |
379 | struct btrfs_io_stripe *stripe = bio->bi_private; | |
380 | ||
381 | if (bio->bi_status) { | |
382 | atomic_inc(&stripe->bioc->error); | |
383 | btrfs_log_dev_io_error(bio, stripe->dev); | |
384 | } | |
385 | ||
386 | /* Pass on control to the original bio this one was cloned from */ | |
387 | bio_endio(stripe->bioc->orig_bio); | |
388 | bio_put(bio); | |
389 | } | |
390 | ||
391 | static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) | |
392 | { | |
393 | if (!dev || !dev->bdev || | |
394 | test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || | |
395 | (btrfs_op(bio) == BTRFS_MAP_WRITE && | |
396 | !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { | |
397 | bio_io_error(bio); | |
398 | return; | |
399 | } | |
400 | ||
401 | bio_set_dev(bio, dev->bdev); | |
402 | ||
403 | /* | |
404 | * For zone append writing, bi_sector must point the beginning of the | |
405 | * zone | |
406 | */ | |
407 | if (bio_op(bio) == REQ_OP_ZONE_APPEND) { | |
408 | u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; | |
d5e4377d | 409 | u64 zone_start = round_down(physical, dev->fs_info->zone_size); |
103c1972 | 410 | |
d5e4377d CH |
411 | ASSERT(btrfs_dev_is_sequential(dev, physical)); |
412 | bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; | |
103c1972 CH |
413 | } |
414 | btrfs_debug_in_rcu(dev->fs_info, | |
415 | "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", | |
416 | __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, | |
417 | (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), | |
418 | dev->devid, bio->bi_iter.bi_size); | |
419 | ||
420 | btrfsic_check_bio(bio); | |
421 | submit_bio(bio); | |
422 | } | |
423 | ||
424 | static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) | |
425 | { | |
426 | struct bio *orig_bio = bioc->orig_bio, *bio; | |
427 | ||
428 | ASSERT(bio_op(orig_bio) != REQ_OP_READ); | |
429 | ||
430 | /* Reuse the bio embedded into the btrfs_bio for the last mirror */ | |
431 | if (dev_nr == bioc->num_stripes - 1) { | |
432 | bio = orig_bio; | |
433 | bio->bi_end_io = btrfs_orig_write_end_io; | |
434 | } else { | |
435 | bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); | |
436 | bio_inc_remaining(orig_bio); | |
437 | bio->bi_end_io = btrfs_clone_write_end_io; | |
438 | } | |
439 | ||
440 | bio->bi_private = &bioc->stripes[dev_nr]; | |
441 | bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; | |
442 | bioc->stripes[dev_nr].bioc = bioc; | |
443 | btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); | |
444 | } | |
445 | ||
f8a53bb5 CH |
446 | static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, |
447 | struct btrfs_io_stripe *smap, int mirror_num) | |
448 | { | |
449 | /* Do not leak our private flag into the block layer. */ | |
450 | bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; | |
451 | ||
452 | if (!bioc) { | |
453 | /* Single mirror read/write fast path. */ | |
454 | btrfs_bio(bio)->mirror_num = mirror_num; | |
455 | bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; | |
456 | bio->bi_private = smap->dev; | |
457 | bio->bi_end_io = btrfs_simple_end_io; | |
458 | btrfs_submit_dev_bio(smap->dev, bio); | |
459 | } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { | |
460 | /* Parity RAID write or read recovery. */ | |
461 | bio->bi_private = bioc; | |
462 | bio->bi_end_io = btrfs_raid56_end_io; | |
463 | if (bio_op(bio) == REQ_OP_READ) | |
464 | raid56_parity_recover(bio, bioc, mirror_num); | |
465 | else | |
466 | raid56_parity_write(bio, bioc); | |
467 | } else { | |
468 | /* Write to multiple mirrors. */ | |
469 | int total_devs = bioc->num_stripes; | |
470 | ||
471 | bioc->orig_bio = bio; | |
472 | for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) | |
473 | btrfs_submit_mirrored_bio(bioc, dev_nr); | |
474 | } | |
475 | } | |
476 | ||
477 | static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) | |
478 | { | |
479 | if (bbio->bio.bi_opf & REQ_META) | |
542e300e | 480 | return btree_csum_one_bio(bbio); |
f8a53bb5 CH |
481 | return btrfs_csum_one_bio(bbio); |
482 | } | |
483 | ||
484 | /* | |
485 | * Async submit bios are used to offload expensive checksumming onto the worker | |
486 | * threads. | |
487 | */ | |
488 | struct async_submit_bio { | |
489 | struct btrfs_bio *bbio; | |
490 | struct btrfs_io_context *bioc; | |
491 | struct btrfs_io_stripe smap; | |
492 | int mirror_num; | |
493 | struct btrfs_work work; | |
494 | }; | |
495 | ||
496 | /* | |
497 | * In order to insert checksums into the metadata in large chunks, we wait | |
498 | * until bio submission time. All the pages in the bio are checksummed and | |
499 | * sums are attached onto the ordered extent record. | |
500 | * | |
501 | * At IO completion time the csums attached on the ordered extent record are | |
502 | * inserted into the btree. | |
503 | */ | |
504 | static void run_one_async_start(struct btrfs_work *work) | |
505 | { | |
506 | struct async_submit_bio *async = | |
507 | container_of(work, struct async_submit_bio, work); | |
508 | blk_status_t ret; | |
509 | ||
510 | ret = btrfs_bio_csum(async->bbio); | |
511 | if (ret) | |
512 | async->bbio->bio.bi_status = ret; | |
513 | } | |
514 | ||
515 | /* | |
516 | * In order to insert checksums into the metadata in large chunks, we wait | |
517 | * until bio submission time. All the pages in the bio are checksummed and | |
518 | * sums are attached onto the ordered extent record. | |
519 | * | |
520 | * At IO completion time the csums attached on the ordered extent record are | |
521 | * inserted into the tree. | |
522 | */ | |
523 | static void run_one_async_done(struct btrfs_work *work) | |
524 | { | |
525 | struct async_submit_bio *async = | |
526 | container_of(work, struct async_submit_bio, work); | |
527 | struct bio *bio = &async->bbio->bio; | |
528 | ||
529 | /* If an error occurred we just want to clean up the bio and move on. */ | |
530 | if (bio->bi_status) { | |
852eee62 | 531 | btrfs_orig_bbio_end_io(async->bbio); |
f8a53bb5 CH |
532 | return; |
533 | } | |
534 | ||
535 | /* | |
536 | * All of the bios that pass through here are from async helpers. | |
537 | * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. | |
538 | * This changes nothing when cgroups aren't in use. | |
539 | */ | |
540 | bio->bi_opf |= REQ_CGROUP_PUNT; | |
541 | __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); | |
542 | } | |
543 | ||
544 | static void run_one_async_free(struct btrfs_work *work) | |
545 | { | |
546 | kfree(container_of(work, struct async_submit_bio, work)); | |
547 | } | |
548 | ||
549 | static bool should_async_write(struct btrfs_bio *bbio) | |
550 | { | |
551 | /* | |
552 | * If the I/O is not issued by fsync and friends, (->sync_writers != 0), | |
553 | * then try to defer the submission to a workqueue to parallelize the | |
554 | * checksum calculation. | |
555 | */ | |
556 | if (atomic_read(&bbio->inode->sync_writers)) | |
557 | return false; | |
558 | ||
559 | /* | |
560 | * Submit metadata writes synchronously if the checksum implementation | |
561 | * is fast, or we are on a zoned device that wants I/O to be submitted | |
562 | * in order. | |
563 | */ | |
564 | if (bbio->bio.bi_opf & REQ_META) { | |
565 | struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; | |
566 | ||
567 | if (btrfs_is_zoned(fs_info)) | |
568 | return false; | |
569 | if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) | |
570 | return false; | |
571 | } | |
572 | ||
573 | return true; | |
574 | } | |
575 | ||
576 | /* | |
577 | * Submit bio to an async queue. | |
578 | * | |
579 | * Return true if the work has been succesfuly submitted, else false. | |
580 | */ | |
581 | static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, | |
582 | struct btrfs_io_context *bioc, | |
583 | struct btrfs_io_stripe *smap, int mirror_num) | |
584 | { | |
585 | struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; | |
586 | struct async_submit_bio *async; | |
587 | ||
588 | async = kmalloc(sizeof(*async), GFP_NOFS); | |
589 | if (!async) | |
590 | return false; | |
591 | ||
592 | async->bbio = bbio; | |
593 | async->bioc = bioc; | |
594 | async->smap = *smap; | |
595 | async->mirror_num = mirror_num; | |
596 | ||
597 | btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, | |
598 | run_one_async_free); | |
599 | if (op_is_sync(bbio->bio.bi_opf)) | |
600 | btrfs_queue_work(fs_info->hipri_workers, &async->work); | |
601 | else | |
602 | btrfs_queue_work(fs_info->workers, &async->work); | |
603 | return true; | |
604 | } | |
605 | ||
285599b6 | 606 | static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) |
103c1972 | 607 | { |
9ba0004b | 608 | struct btrfs_bio *bbio = btrfs_bio(bio); |
d5e4377d CH |
609 | struct btrfs_inode *inode = bbio->inode; |
610 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
852eee62 | 611 | struct btrfs_bio *orig_bbio = bbio; |
103c1972 CH |
612 | u64 logical = bio->bi_iter.bi_sector << 9; |
613 | u64 length = bio->bi_iter.bi_size; | |
614 | u64 map_length = length; | |
921603c7 | 615 | bool use_append = btrfs_use_zone_append(bbio); |
103c1972 CH |
616 | struct btrfs_io_context *bioc = NULL; |
617 | struct btrfs_io_stripe smap; | |
9ba0004b CH |
618 | blk_status_t ret; |
619 | int error; | |
103c1972 CH |
620 | |
621 | btrfs_bio_counter_inc_blocked(fs_info); | |
9ba0004b CH |
622 | error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, |
623 | &bioc, &smap, &mirror_num, 1); | |
624 | if (error) { | |
625 | ret = errno_to_blk_status(error); | |
626 | goto fail; | |
103c1972 CH |
627 | } |
628 | ||
852eee62 | 629 | map_length = min(map_length, length); |
d5e4377d CH |
630 | if (use_append) |
631 | map_length = min(map_length, fs_info->max_zone_append_size); | |
632 | ||
103c1972 | 633 | if (map_length < length) { |
d5e4377d | 634 | bio = btrfs_split_bio(fs_info, bio, map_length, use_append); |
852eee62 | 635 | bbio = btrfs_bio(bio); |
103c1972 CH |
636 | } |
637 | ||
1c2b3ee3 CH |
638 | /* |
639 | * Save the iter for the end_io handler and preload the checksums for | |
640 | * data reads. | |
641 | */ | |
642 | if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { | |
0d3acb25 | 643 | bbio->saved_iter = bio->bi_iter; |
1c2b3ee3 CH |
644 | ret = btrfs_lookup_bio_sums(bbio); |
645 | if (ret) | |
852eee62 | 646 | goto fail_put_bio; |
1c2b3ee3 | 647 | } |
7276aa7d | 648 | |
f8a53bb5 | 649 | if (btrfs_op(bio) == BTRFS_MAP_WRITE) { |
d5e4377d CH |
650 | if (use_append) { |
651 | bio->bi_opf &= ~REQ_OP_WRITE; | |
652 | bio->bi_opf |= REQ_OP_ZONE_APPEND; | |
69ccf3f4 CH |
653 | ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); |
654 | if (ret) | |
852eee62 | 655 | goto fail_put_bio; |
69ccf3f4 CH |
656 | } |
657 | ||
f8a53bb5 CH |
658 | /* |
659 | * Csum items for reloc roots have already been cloned at this | |
660 | * point, so they are handled as part of the no-checksum case. | |
661 | */ | |
d5e4377d | 662 | if (!(inode->flags & BTRFS_INODE_NODATASUM) && |
f8a53bb5 | 663 | !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && |
d5e4377d | 664 | !btrfs_is_data_reloc_root(inode->root)) { |
f8a53bb5 CH |
665 | if (should_async_write(bbio) && |
666 | btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) | |
852eee62 | 667 | goto done; |
f8a53bb5 CH |
668 | |
669 | ret = btrfs_bio_csum(bbio); | |
670 | if (ret) | |
852eee62 | 671 | goto fail_put_bio; |
f8a53bb5 | 672 | } |
103c1972 | 673 | } |
f8a53bb5 CH |
674 | |
675 | __btrfs_submit_bio(bio, bioc, &smap, mirror_num); | |
852eee62 CH |
676 | done: |
677 | return map_length == length; | |
9ba0004b | 678 | |
852eee62 CH |
679 | fail_put_bio: |
680 | if (map_length < length) | |
681 | bio_put(bio); | |
9ba0004b CH |
682 | fail: |
683 | btrfs_bio_counter_dec(fs_info); | |
852eee62 CH |
684 | btrfs_bio_end_io(orig_bbio, ret); |
685 | /* Do not submit another chunk */ | |
686 | return true; | |
687 | } | |
688 | ||
/*
 * Submit @bio to the given mirror, one mappable chunk at a time, until
 * btrfs_submit_chunk() reports that nothing is left to submit.
 */
void btrfs_submit_bio(struct bio *bio, int mirror_num)
{
	for (;;) {
		if (btrfs_submit_chunk(bio, mirror_num))
			break;
	}
}
694 | ||
bacf60e5 CH |
695 | /* |
696 | * Submit a repair write. | |
697 | * | |
698 | * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a | |
699 | * RAID setup. Here we only want to write the one bad copy, so we do the | |
700 | * mapping ourselves and submit the bio directly. | |
701 | * | |
67da05b3 | 702 | * The I/O is issued synchronously to block the repair read completion from |
bacf60e5 CH |
703 | * freeing the bio. |
704 | */ | |
705 | int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, | |
706 | u64 length, u64 logical, struct page *page, | |
707 | unsigned int pg_offset, int mirror_num) | |
708 | { | |
709 | struct btrfs_device *dev; | |
710 | struct bio_vec bvec; | |
711 | struct bio bio; | |
712 | u64 map_length = 0; | |
713 | u64 sector; | |
714 | struct btrfs_io_context *bioc = NULL; | |
715 | int ret = 0; | |
716 | ||
717 | ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); | |
718 | BUG_ON(!mirror_num); | |
719 | ||
720 | if (btrfs_repair_one_zone(fs_info, logical)) | |
721 | return 0; | |
722 | ||
723 | map_length = length; | |
724 | ||
725 | /* | |
726 | * Avoid races with device replace and make sure our bioc has devices | |
727 | * associated to its stripes that don't go away while we are doing the | |
728 | * read repair operation. | |
729 | */ | |
730 | btrfs_bio_counter_inc_blocked(fs_info); | |
731 | if (btrfs_is_parity_mirror(fs_info, logical, length)) { | |
732 | /* | |
733 | * Note that we don't use BTRFS_MAP_WRITE because it's supposed | |
734 | * to update all raid stripes, but here we just want to correct | |
735 | * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad | |
736 | * stripe's dev and sector. | |
737 | */ | |
738 | ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, | |
739 | &map_length, &bioc, 0); | |
740 | if (ret) | |
741 | goto out_counter_dec; | |
742 | ASSERT(bioc->mirror_num == 1); | |
743 | } else { | |
744 | ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, | |
745 | &map_length, &bioc, mirror_num); | |
746 | if (ret) | |
747 | goto out_counter_dec; | |
d73a27b8 QW |
748 | /* |
749 | * This happens when dev-replace is also running, and the | |
750 | * mirror_num indicates the dev-replace target. | |
751 | * | |
752 | * In this case, we don't need to do anything, as the read | |
753 | * error just means the replace progress hasn't reached our | |
754 | * read range, and later replace routine would handle it well. | |
755 | */ | |
756 | if (mirror_num != bioc->mirror_num) | |
757 | goto out_counter_dec; | |
bacf60e5 CH |
758 | } |
759 | ||
760 | sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; | |
761 | dev = bioc->stripes[bioc->mirror_num - 1].dev; | |
762 | btrfs_put_bioc(bioc); | |
763 | ||
764 | if (!dev || !dev->bdev || | |
765 | !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { | |
766 | ret = -EIO; | |
767 | goto out_counter_dec; | |
768 | } | |
769 | ||
770 | bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); | |
771 | bio.bi_iter.bi_sector = sector; | |
772 | __bio_add_page(&bio, page, length, pg_offset); | |
773 | ||
774 | btrfsic_check_bio(&bio); | |
775 | ret = submit_bio_wait(&bio); | |
776 | if (ret) { | |
777 | /* try to remap that extent elsewhere? */ | |
778 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); | |
779 | goto out_bio_uninit; | |
780 | } | |
781 | ||
782 | btrfs_info_rl_in_rcu(fs_info, | |
783 | "read error corrected: ino %llu off %llu (dev %s sector %llu)", | |
784 | ino, start, btrfs_dev_name(dev), sector); | |
785 | ret = 0; | |
786 | ||
787 | out_bio_uninit: | |
788 | bio_uninit(&bio); | |
789 | out_counter_dec: | |
790 | btrfs_bio_counter_dec(fs_info); | |
791 | return ret; | |
792 | } | |
793 | ||
103c1972 CH |
794 | int __init btrfs_bioset_init(void) |
795 | { | |
796 | if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, | |
797 | offsetof(struct btrfs_bio, bio), | |
798 | BIOSET_NEED_BVECS)) | |
799 | return -ENOMEM; | |
852eee62 CH |
800 | if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, |
801 | offsetof(struct btrfs_bio, bio), 0)) | |
802 | goto out_free_bioset; | |
7609afac CH |
803 | if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, |
804 | offsetof(struct btrfs_bio, bio), | |
805 | BIOSET_NEED_BVECS)) | |
852eee62 | 806 | goto out_free_clone_bioset; |
7609afac CH |
807 | if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, |
808 | sizeof(struct btrfs_failed_bio))) | |
809 | goto out_free_repair_bioset; | |
103c1972 | 810 | return 0; |
7609afac CH |
811 | |
812 | out_free_repair_bioset: | |
813 | bioset_exit(&btrfs_repair_bioset); | |
852eee62 CH |
814 | out_free_clone_bioset: |
815 | bioset_exit(&btrfs_clone_bioset); | |
7609afac CH |
816 | out_free_bioset: |
817 | bioset_exit(&btrfs_bioset); | |
818 | return -ENOMEM; | |
103c1972 CH |
819 | } |
820 | ||
821 | void __cold btrfs_bioset_exit(void) | |
822 | { | |
7609afac CH |
823 | mempool_exit(&btrfs_failed_bio_pool); |
824 | bioset_exit(&btrfs_repair_bioset); | |
852eee62 | 825 | bioset_exit(&btrfs_clone_bioset); |
103c1972 CH |
826 | bioset_exit(&btrfs_bioset); |
827 | } |