// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

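/*
 * Tracks the read repair of one failed read bio: one reference is held per
 * outstanding repair read, and the last completion ends the original bio
 * and returns this structure to btrfs_failed_bio_pool.
 */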
struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
}

static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
		    btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->fs_info = fs_info;
	bbio->end_io = end_io;
	bbio->private = private;
	atomic_set(&bbio->pending_ios, 1);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_fs_info *fs_info,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, end_io, private);
	return bbio;
}

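/*
 * Illustrative only (not part of the original file): a typical caller fills
 * the embedded bio and hands it to btrfs_submit_bio(). "my_end_io" and
 * "ctx" are hypothetical names:
 *
 *	bbio = btrfs_bio_alloc(1, REQ_OP_READ, fs_info, my_end_io, ctx);
 *	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(&bbio->bio, page, PAGE_SIZE, 0);
 *	btrfs_submit_bio(bbio, 0);
 */

/*
 * Split off the first @map_length bytes of @orig_bbio into a new clone so
 * that no single submission crosses a stripe (or, for zone append, zone)
 * boundary. The clone inherits the inode and file offset, shares the
 * ordered extent of a data write, and counts as one more pending I/O on
 * the original bio.
 */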
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length, bool use_append)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	if (use_append) {
		unsigned int nr_segs;

		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
				   &btrfs_clone_bioset, map_length);
	} else {
		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
				GFP_NOFS, &btrfs_clone_bioset);
	}
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
	bbio->inode = orig_bbio->inode;
	bbio->file_offset = orig_bbio->file_offset;
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
	}
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

/* Free a bio that was never submitted to the underlying device. */
static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio))
		btrfs_put_ordered_extent(bbio->ordered);
	bio_put(&bbio->bio);
}

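/*
 * Call the original end_io handler. ->end_io may free the bbio, so for an
 * ordered extent write the ordered extent pointer is saved first and its
 * reference only dropped afterwards.
 */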
static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio)) {
		struct btrfs_ordered_extent *ordered = bbio->ordered;

		bbio->end_io(bbio);
		btrfs_put_ordered_extent(ordered);
	} else {
		bbio->end_io(bbio);
	}
}

void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	bbio->bio.bi_status = status;
	__btrfs_bio_end_io(bbio);
}

static void btrfs_orig_write_end_io(struct bio *bio);

static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
				       struct btrfs_bio *orig_bbio)
{
	/*
	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
	 * just blindly propagate a write failure here. Instead increment the
	 * error count in the original I/O context so that it is guaranteed to
	 * be larger than the error tolerance.
	 */
	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;

		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
	} else {
		orig_bbio->bio.bi_status = bbio->bio.bi_status;
	}
}

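/*
 * Complete a bbio while taking split clones into account: an error in a
 * clone is propagated to the original bio, which is only ended once its
 * last outstanding part has finished.
 */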
static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		if (bbio->bio.bi_status)
			btrfs_bbio_propagate_error(bbio, orig_bbio);
		btrfs_cleanup_bio(bbio);
		bbio = orig_bbio;
	}

	if (atomic_dec_and_test(&bbio->pending_ios))
		__btrfs_bio_end_io(bbio);
}

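/*
 * Mirror numbers are 1-based. Advance to the next copy, wrapping around
 * from num_copies back to mirror 1 (e.g. with three copies and starting at
 * mirror 2 the order is 2, 3, 1).
 */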
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

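/* Walk the mirrors in the opposite direction, wrapping from 1 to num_copies. */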
static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

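/* Drop one repair reference; the last one finishes the original read. */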
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_orig_bbio_end_io(fbio->bbio);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

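/*
 * Handle the completion of a repair read. If this mirror is bad as well,
 * keep reading from the next mirror; once good data was found, write it
 * back to all previously failed mirrors to restore the redundancy.
 */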
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
	int mirror = repair_bbio->mirror_num;

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				  repair_bbio->file_offset, fs_info->sectorsize,
				  repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				  bv->bv_page, bv->bv_offset, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but when the read succeeds it also writes the good data back to the bad
 * mirror(s) to restore redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  struct bio_vec *bv,
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
	repair_bbio->inode = failed_bbio->inode;
	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bio(repair_bbio, mirror);
	return fbio;
}

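/*
 * Verify the checksum of each sector of a completed data read and kick off
 * a repair read for every bad sector. Repair bios have no upper level
 * submitter and are instead finished in btrfs_end_repair_bio().
 */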
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u32 sectorsize = fs_info->sectorsize;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	while (iter->bi_size) {
		struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);

		bv.bv_len = min(bv.bv_len, sectorsize);
		if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
			fbio = repair_one_sector(bbio, offset, &bv, fbio);

		bio_advance_iter_single(&bbio->bio, iter, sectorsize);
		offset += sectorsize;
	}

	if (bbio->csum != bbio->csum_inline)
		kfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_orig_bbio_end_io(bbio);
}

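/* Account an I/O failure in the per-device error statistics. */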
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
						struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	/* Metadata reads are checked and repaired by the submitter. */
	if (is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
	else
		btrfs_orig_bbio_end_io(bbio);
}

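/*
 * End I/O handler for the single device fast path. Reads are punted to a
 * workqueue as checksum verification and read repair may block; zone append
 * writes record the actual physical location before completion.
 */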
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
			btrfs_record_physical_zoned(bbio);
		btrfs_orig_bbio_end_io(bbio);
	}
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_orig_bbio_end_io(bbio);

	btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	btrfs_orig_bbio_end_io(bbio);
	btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of
	 * the zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
	}
	btrfs_debug_in_rcu(dev->fs_info,
		"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	btrfsic_check_bio(bio);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

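/*
 * Dispatch a mapped bio to the devices: the single mirror case goes straight
 * to the device, parity RAID is handled by the raid56 code, and writes to
 * multiple mirrors are cloned once per stripe.
 */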
static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			       struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

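/* Compute the checksums for a write, metadata or data. */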
static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
	return btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	blk_status_t ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = ret;
}

/*
 * Runs after run_one_async_start() has finished the checksum work: either
 * end the bio right away if checksumming failed, or submit it to the
 * devices from the owning cgroup's context.
 */
static void run_one_async_done(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_orig_bbio_end_io(async->bbio);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static void run_one_async_free(struct btrfs_work *work)
{
	kfree(container_of(work, struct async_submit_bio, work));
}

static bool should_async_write(struct btrfs_bio *bbio)
{
	/* Submit synchronously if the checksum implementation is fast. */
	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
			run_one_async_free);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

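/*
 * Map and submit one chunk-sized piece of @bbio: at most the part that fits
 * into a single stripe (or the zone append limit) is submitted here, the
 * rest is split off first. Returns true once the whole bio has been
 * consumed (also on failure), false if the caller must submit the
 * remaining chunks.
 */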
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_bio *orig_bbio = bbio;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	bool use_append = btrfs_use_zone_append(bbio);
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t ret;
	int error;

	btrfs_bio_counter_inc_blocked(fs_info);
	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
				&bioc, &smap, &mirror_num, 1);
	if (error) {
		ret = errno_to_blk_status(error);
		goto fail;
	}

	map_length = min(map_length, length);
	if (use_append)
		map_length = min(map_length, fs_info->max_zone_append_size);

	if (map_length < length) {
		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		if (ret)
			goto fail_put_bio;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (use_append) {
			bio->bi_opf &= ~REQ_OP_WRITE;
			bio->bi_opf |= REQ_OP_ZONE_APPEND;
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root)) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			if (ret)
				goto fail_put_bio;
		} else if (use_append) {
			ret = btrfs_alloc_dummy_sum(bbio);
			if (ret)
				goto fail_put_bio;
		}
	}

	__btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail_put_bio:
	if (map_length < length)
		btrfs_cleanup_bio(bbio);
fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(orig_bbio, ret);
	/* Do not submit another chunk */
	return true;
}

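/* Loop until btrfs_submit_chunk() has consumed the whole bio. */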
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct page *page,
			    unsigned int pg_offset, int mirror_num)
{
	struct btrfs_io_stripe smap = { 0 };
	struct bio_vec bvec;
	struct bio bio;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (!smap.dev->bdev ||
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	__bio_add_page(&bio, page, length, pg_offset);

	btrfsic_check_bio(&bio);
	ret = submit_bio_wait(&bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_bio_uninit;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
			     ino, start, btrfs_dev_name(smap.dev),
			     smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_bio_uninit:
	bio_uninit(&bio);
out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write is submitted to the dev-replace target.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	ASSERT(fs_info);
	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!bbio->inode);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out_free_bioset;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_clone_bioset;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out_free_repair_bioset;
	return 0;

out_free_repair_bioset:
	bioset_exit(&btrfs_repair_bioset);
out_free_clone_bioset:
	bioset_exit(&btrfs_clone_bioset);
out_free_bioset:
	bioset_exit(&btrfs_bioset);
	return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}