Commit | Line | Data |
---|---|---|
103c1972 CH |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright (C) 2007 Oracle. All rights reserved. | |
4 | * Copyright (C) 2022 Christoph Hellwig. | |
5 | */ | |
6 | ||
7 | #include <linux/bio.h> | |
8 | #include "bio.h" | |
9 | #include "ctree.h" | |
10 | #include "volumes.h" | |
11 | #include "raid56.h" | |
12 | #include "async-thread.h" | |
13 | #include "check-integrity.h" | |
14 | #include "dev-replace.h" | |
15 | #include "rcu-string.h" | |
16 | #include "zoned.h" | |
1c2b3ee3 | 17 | #include "file-item.h" |
103c1972 CH |
18 | |
/* Backs btrfs_bios handed out by btrfs_bio_alloc(). */
static struct bio_set btrfs_bioset;
/* Backs the split bios created by btrfs_split_bio(). */
static struct bio_set btrfs_clone_bioset;
/* Backs the single-sector read-repair bios built by repair_one_sector(). */
static struct bio_set btrfs_repair_bioset;
/* Backs struct btrfs_failed_bio allocations. */
static mempool_t btrfs_failed_bio_pool;

/*
 * Read-repair bookkeeping for one btrfs_bio that had bad sectors.
 */
struct btrfs_failed_bio {
	/* The bio that failed; completed once repair_count drops to zero. */
	struct btrfs_bio *bbio;
	/* Number of copies of the data, used to wrap the mirror number. */
	int num_copies;
	/*
	 * Starts at 1 and is incremented for every repair bio in flight;
	 * btrfs_repair_done() drops one reference.
	 */
	atomic_t repair_count;
};
103c1972 CH |
29 | |
30 | /* | |
31 | * Initialize a btrfs_bio structure. This skips the embedded bio itself as it | |
32 | * is already initialized by the block layer. | |
33 | */ | |
4317ff00 | 34 | void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, |
67d66982 | 35 | btrfs_bio_end_io_t end_io, void *private) |
103c1972 CH |
36 | { |
37 | memset(bbio, 0, offsetof(struct btrfs_bio, bio)); | |
4317ff00 | 38 | bbio->fs_info = fs_info; |
103c1972 CH |
39 | bbio->end_io = end_io; |
40 | bbio->private = private; | |
852eee62 | 41 | atomic_set(&bbio->pending_ios, 1); |
103c1972 CH |
42 | } |
43 | ||
44 | /* | |
45 | * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for | |
46 | * btrfs, and is used for all I/O submitted through btrfs_submit_bio. | |
47 | * | |
48 | * Just like the underlying bio_alloc_bioset it will not fail as it is backed by | |
49 | * a mempool. | |
50 | */ | |
b41bbd29 | 51 | struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, |
4317ff00 | 52 | struct btrfs_fs_info *fs_info, |
b41bbd29 | 53 | btrfs_bio_end_io_t end_io, void *private) |
103c1972 | 54 | { |
b41bbd29 | 55 | struct btrfs_bio *bbio; |
103c1972 CH |
56 | struct bio *bio; |
57 | ||
58 | bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); | |
b41bbd29 | 59 | bbio = btrfs_bio(bio); |
4317ff00 | 60 | btrfs_bio_init(bbio, fs_info, end_io, private); |
b41bbd29 | 61 | return bbio; |
103c1972 CH |
62 | } |
63 | ||
7edd339c CH |
64 | static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) |
65 | { | |
66 | struct btrfs_ordered_extent *ordered; | |
67 | int ret; | |
68 | ||
69 | ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); | |
70 | if (WARN_ON_ONCE(!ordered)) | |
71 | return BLK_STS_IOERR; | |
72 | ret = btrfs_extract_ordered_extent(bbio, ordered); | |
73 | btrfs_put_ordered_extent(ordered); | |
74 | ||
75 | return errno_to_blk_status(ret); | |
76 | } | |
77 | ||
2cef0c79 CH |
78 | static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, |
79 | struct btrfs_bio *orig_bbio, | |
80 | u64 map_length, bool use_append) | |
852eee62 | 81 | { |
2cef0c79 | 82 | struct btrfs_bio *bbio; |
852eee62 CH |
83 | struct bio *bio; |
84 | ||
d5e4377d CH |
85 | if (use_append) { |
86 | unsigned int nr_segs; | |
87 | ||
2cef0c79 | 88 | bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, |
d5e4377d CH |
89 | &btrfs_clone_bioset, map_length); |
90 | } else { | |
2cef0c79 CH |
91 | bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, |
92 | GFP_NOFS, &btrfs_clone_bioset); | |
d5e4377d | 93 | } |
2cef0c79 | 94 | bbio = btrfs_bio(bio); |
4317ff00 QW |
95 | btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); |
96 | bbio->inode = orig_bbio->inode; | |
2cef0c79 CH |
97 | bbio->file_offset = orig_bbio->file_offset; |
98 | if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) | |
852eee62 CH |
99 | orig_bbio->file_offset += map_length; |
100 | ||
101 | atomic_inc(&orig_bbio->pending_ios); | |
2cef0c79 | 102 | return bbio; |
852eee62 CH |
103 | } |
104 | ||
105 | static void btrfs_orig_write_end_io(struct bio *bio); | |
106 | ||
107 | static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, | |
108 | struct btrfs_bio *orig_bbio) | |
109 | { | |
110 | /* | |
111 | * For writes we tolerate nr_mirrors - 1 write failures, so we can't | |
112 | * just blindly propagate a write failure here. Instead increment the | |
113 | * error count in the original I/O context so that it is guaranteed to | |
114 | * be larger than the error tolerance. | |
115 | */ | |
116 | if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { | |
117 | struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; | |
118 | struct btrfs_io_context *orig_bioc = orig_stripe->bioc; | |
119 | ||
120 | atomic_add(orig_bioc->max_errors, &orig_bioc->error); | |
121 | } else { | |
122 | orig_bbio->bio.bi_status = bbio->bio.bi_status; | |
123 | } | |
124 | } | |
125 | ||
126 | static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) | |
127 | { | |
128 | if (bbio->bio.bi_pool == &btrfs_clone_bioset) { | |
129 | struct btrfs_bio *orig_bbio = bbio->private; | |
130 | ||
131 | if (bbio->bio.bi_status) | |
132 | btrfs_bbio_propagate_error(bbio, orig_bbio); | |
133 | bio_put(&bbio->bio); | |
134 | bbio = orig_bbio; | |
135 | } | |
136 | ||
137 | if (atomic_dec_and_test(&bbio->pending_ios)) | |
138 | bbio->end_io(bbio); | |
139 | } | |
140 | ||
7609afac CH |
141 | static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) |
142 | { | |
143 | if (cur_mirror == fbio->num_copies) | |
144 | return cur_mirror + 1 - fbio->num_copies; | |
145 | return cur_mirror + 1; | |
146 | } | |
147 | ||
148 | static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) | |
149 | { | |
150 | if (cur_mirror == 1) | |
151 | return fbio->num_copies; | |
152 | return cur_mirror - 1; | |
153 | } | |
154 | ||
155 | static void btrfs_repair_done(struct btrfs_failed_bio *fbio) | |
156 | { | |
157 | if (atomic_dec_and_test(&fbio->repair_count)) { | |
852eee62 | 158 | btrfs_orig_bbio_end_io(fbio->bbio); |
7609afac CH |
159 | mempool_free(fbio, &btrfs_failed_bio_pool); |
160 | } | |
161 | } | |
162 | ||
163 | static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, | |
164 | struct btrfs_device *dev) | |
165 | { | |
166 | struct btrfs_failed_bio *fbio = repair_bbio->private; | |
167 | struct btrfs_inode *inode = repair_bbio->inode; | |
168 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
169 | struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); | |
170 | int mirror = repair_bbio->mirror_num; | |
171 | ||
172 | if (repair_bbio->bio.bi_status || | |
173 | !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { | |
174 | bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); | |
0d3acb25 | 175 | repair_bbio->bio.bi_iter = repair_bbio->saved_iter; |
7609afac CH |
176 | |
177 | mirror = next_repair_mirror(fbio, mirror); | |
178 | if (mirror == fbio->bbio->mirror_num) { | |
179 | btrfs_debug(fs_info, "no mirror left"); | |
180 | fbio->bbio->bio.bi_status = BLK_STS_IOERR; | |
181 | goto done; | |
182 | } | |
183 | ||
ae42a154 | 184 | btrfs_submit_bio(repair_bbio, mirror); |
7609afac CH |
185 | return; |
186 | } | |
187 | ||
188 | do { | |
189 | mirror = prev_repair_mirror(fbio, mirror); | |
190 | btrfs_repair_io_failure(fs_info, btrfs_ino(inode), | |
191 | repair_bbio->file_offset, fs_info->sectorsize, | |
0d3acb25 | 192 | repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, |
7609afac CH |
193 | bv->bv_page, bv->bv_offset, mirror); |
194 | } while (mirror != fbio->bbio->mirror_num); | |
195 | ||
196 | done: | |
197 | btrfs_repair_done(fbio); | |
198 | bio_put(&repair_bbio->bio); | |
199 | } | |
200 | ||
201 | /* | |
202 | * Try to kick off a repair read to the next available mirror for a bad sector. | |
203 | * | |
204 | * This primarily tries to recover good data to serve the actual read request, | |
205 | * but also tries to write the good data back to the bad mirror(s) when a | |
206 | * read succeeded to restore the redundancy. | |
207 | */ | |
208 | static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, | |
209 | u32 bio_offset, | |
210 | struct bio_vec *bv, | |
211 | struct btrfs_failed_bio *fbio) | |
212 | { | |
213 | struct btrfs_inode *inode = failed_bbio->inode; | |
214 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
215 | const u32 sectorsize = fs_info->sectorsize; | |
0d3acb25 | 216 | const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); |
7609afac CH |
217 | struct btrfs_bio *repair_bbio; |
218 | struct bio *repair_bio; | |
219 | int num_copies; | |
220 | int mirror; | |
221 | ||
222 | btrfs_debug(fs_info, "repair read error: read error at %llu", | |
223 | failed_bbio->file_offset + bio_offset); | |
224 | ||
225 | num_copies = btrfs_num_copies(fs_info, logical, sectorsize); | |
226 | if (num_copies == 1) { | |
227 | btrfs_debug(fs_info, "no copy to repair from"); | |
228 | failed_bbio->bio.bi_status = BLK_STS_IOERR; | |
229 | return fbio; | |
230 | } | |
231 | ||
232 | if (!fbio) { | |
233 | fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); | |
234 | fbio->bbio = failed_bbio; | |
235 | fbio->num_copies = num_copies; | |
236 | atomic_set(&fbio->repair_count, 1); | |
237 | } | |
238 | ||
239 | atomic_inc(&fbio->repair_count); | |
240 | ||
241 | repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, | |
242 | &btrfs_repair_bioset); | |
0d3acb25 | 243 | repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; |
078e4cf5 | 244 | __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); |
7609afac CH |
245 | |
246 | repair_bbio = btrfs_bio(repair_bio); | |
4317ff00 QW |
247 | btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); |
248 | repair_bbio->inode = failed_bbio->inode; | |
7609afac CH |
249 | repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; |
250 | ||
251 | mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); | |
252 | btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); | |
ae42a154 | 253 | btrfs_submit_bio(repair_bbio, mirror); |
7609afac CH |
254 | return fbio; |
255 | } | |
256 | ||
257 | static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) | |
258 | { | |
259 | struct btrfs_inode *inode = bbio->inode; | |
260 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
261 | u32 sectorsize = fs_info->sectorsize; | |
0d3acb25 | 262 | struct bvec_iter *iter = &bbio->saved_iter; |
7609afac CH |
263 | blk_status_t status = bbio->bio.bi_status; |
264 | struct btrfs_failed_bio *fbio = NULL; | |
265 | u32 offset = 0; | |
266 | ||
4317ff00 QW |
267 | /* Read-repair requires the inode field to be set by the submitter. */ |
268 | ASSERT(inode); | |
269 | ||
7609afac CH |
270 | /* |
271 | * Hand off repair bios to the repair code as there is no upper level | |
272 | * submitter for them. | |
273 | */ | |
274 | if (bbio->bio.bi_pool == &btrfs_repair_bioset) { | |
275 | btrfs_end_repair_bio(bbio, dev); | |
276 | return; | |
277 | } | |
278 | ||
279 | /* Clear the I/O error. A failed repair will reset it. */ | |
280 | bbio->bio.bi_status = BLK_STS_OK; | |
281 | ||
282 | while (iter->bi_size) { | |
283 | struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); | |
284 | ||
285 | bv.bv_len = min(bv.bv_len, sectorsize); | |
286 | if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) | |
287 | fbio = repair_one_sector(bbio, offset, &bv, fbio); | |
288 | ||
289 | bio_advance_iter_single(&bbio->bio, iter, sectorsize); | |
290 | offset += sectorsize; | |
291 | } | |
292 | ||
7ab0fdfc CH |
293 | if (bbio->csum != bbio->csum_inline) |
294 | kfree(bbio->csum); | |
7609afac CH |
295 | |
296 | if (fbio) | |
297 | btrfs_repair_done(fbio); | |
298 | else | |
852eee62 | 299 | btrfs_orig_bbio_end_io(bbio); |
7609afac CH |
300 | } |
301 | ||
103c1972 CH |
302 | static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) |
303 | { | |
304 | if (!dev || !dev->bdev) | |
305 | return; | |
306 | if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) | |
307 | return; | |
308 | ||
309 | if (btrfs_op(bio) == BTRFS_MAP_WRITE) | |
310 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); | |
98e8d36a | 311 | else if (!(bio->bi_opf & REQ_RAHEAD)) |
103c1972 CH |
312 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); |
313 | if (bio->bi_opf & REQ_PREFLUSH) | |
314 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); | |
315 | } | |
316 | ||
317 | static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, | |
318 | struct bio *bio) | |
319 | { | |
320 | if (bio->bi_opf & REQ_META) | |
321 | return fs_info->endio_meta_workers; | |
322 | return fs_info->endio_workers; | |
323 | } | |
324 | ||
325 | static void btrfs_end_bio_work(struct work_struct *work) | |
326 | { | |
327 | struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); | |
328 | ||
7609afac | 329 | /* Metadata reads are checked and repaired by the submitter. */ |
4317ff00 | 330 | if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) |
860c8c45 | 331 | btrfs_check_read_bio(bbio, bbio->bio.bi_private); |
4317ff00 QW |
332 | else |
333 | bbio->end_io(bbio); | |
103c1972 CH |
334 | } |
335 | ||
336 | static void btrfs_simple_end_io(struct bio *bio) | |
337 | { | |
103c1972 | 338 | struct btrfs_bio *bbio = btrfs_bio(bio); |
860c8c45 | 339 | struct btrfs_device *dev = bio->bi_private; |
4317ff00 | 340 | struct btrfs_fs_info *fs_info = bbio->fs_info; |
103c1972 CH |
341 | |
342 | btrfs_bio_counter_dec(fs_info); | |
343 | ||
344 | if (bio->bi_status) | |
860c8c45 | 345 | btrfs_log_dev_io_error(bio, dev); |
103c1972 CH |
346 | |
347 | if (bio_op(bio) == REQ_OP_READ) { | |
348 | INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); | |
349 | queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); | |
350 | } else { | |
69ccf3f4 CH |
351 | if (bio_op(bio) == REQ_OP_ZONE_APPEND) |
352 | btrfs_record_physical_zoned(bbio); | |
852eee62 | 353 | btrfs_orig_bbio_end_io(bbio); |
103c1972 CH |
354 | } |
355 | } | |
356 | ||
357 | static void btrfs_raid56_end_io(struct bio *bio) | |
358 | { | |
359 | struct btrfs_io_context *bioc = bio->bi_private; | |
360 | struct btrfs_bio *bbio = btrfs_bio(bio); | |
361 | ||
362 | btrfs_bio_counter_dec(bioc->fs_info); | |
363 | bbio->mirror_num = bioc->mirror_num; | |
4317ff00 QW |
364 | if (bio_op(bio) == REQ_OP_READ && bbio->inode && |
365 | !(bbio->bio.bi_opf & REQ_META)) | |
7609afac CH |
366 | btrfs_check_read_bio(bbio, NULL); |
367 | else | |
852eee62 | 368 | btrfs_orig_bbio_end_io(bbio); |
103c1972 CH |
369 | |
370 | btrfs_put_bioc(bioc); | |
371 | } | |
372 | ||
373 | static void btrfs_orig_write_end_io(struct bio *bio) | |
374 | { | |
375 | struct btrfs_io_stripe *stripe = bio->bi_private; | |
376 | struct btrfs_io_context *bioc = stripe->bioc; | |
377 | struct btrfs_bio *bbio = btrfs_bio(bio); | |
378 | ||
379 | btrfs_bio_counter_dec(bioc->fs_info); | |
380 | ||
381 | if (bio->bi_status) { | |
382 | atomic_inc(&bioc->error); | |
383 | btrfs_log_dev_io_error(bio, stripe->dev); | |
384 | } | |
385 | ||
386 | /* | |
387 | * Only send an error to the higher layers if it is beyond the tolerance | |
388 | * threshold. | |
389 | */ | |
390 | if (atomic_read(&bioc->error) > bioc->max_errors) | |
391 | bio->bi_status = BLK_STS_IOERR; | |
392 | else | |
393 | bio->bi_status = BLK_STS_OK; | |
394 | ||
852eee62 | 395 | btrfs_orig_bbio_end_io(bbio); |
103c1972 CH |
396 | btrfs_put_bioc(bioc); |
397 | } | |
398 | ||
399 | static void btrfs_clone_write_end_io(struct bio *bio) | |
400 | { | |
401 | struct btrfs_io_stripe *stripe = bio->bi_private; | |
402 | ||
403 | if (bio->bi_status) { | |
404 | atomic_inc(&stripe->bioc->error); | |
405 | btrfs_log_dev_io_error(bio, stripe->dev); | |
406 | } | |
407 | ||
408 | /* Pass on control to the original bio this one was cloned from */ | |
409 | bio_endio(stripe->bioc->orig_bio); | |
410 | bio_put(bio); | |
411 | } | |
412 | ||
413 | static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) | |
414 | { | |
415 | if (!dev || !dev->bdev || | |
416 | test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || | |
417 | (btrfs_op(bio) == BTRFS_MAP_WRITE && | |
418 | !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { | |
419 | bio_io_error(bio); | |
420 | return; | |
421 | } | |
422 | ||
423 | bio_set_dev(bio, dev->bdev); | |
424 | ||
425 | /* | |
426 | * For zone append writing, bi_sector must point the beginning of the | |
427 | * zone | |
428 | */ | |
429 | if (bio_op(bio) == REQ_OP_ZONE_APPEND) { | |
430 | u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; | |
d5e4377d | 431 | u64 zone_start = round_down(physical, dev->fs_info->zone_size); |
103c1972 | 432 | |
d5e4377d CH |
433 | ASSERT(btrfs_dev_is_sequential(dev, physical)); |
434 | bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; | |
103c1972 CH |
435 | } |
436 | btrfs_debug_in_rcu(dev->fs_info, | |
437 | "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", | |
438 | __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, | |
439 | (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), | |
440 | dev->devid, bio->bi_iter.bi_size); | |
441 | ||
442 | btrfsic_check_bio(bio); | |
3480373e CH |
443 | |
444 | if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) | |
445 | blkcg_punt_bio_submit(bio); | |
446 | else | |
447 | submit_bio(bio); | |
103c1972 CH |
448 | } |
449 | ||
450 | static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) | |
451 | { | |
452 | struct bio *orig_bio = bioc->orig_bio, *bio; | |
453 | ||
454 | ASSERT(bio_op(orig_bio) != REQ_OP_READ); | |
455 | ||
456 | /* Reuse the bio embedded into the btrfs_bio for the last mirror */ | |
457 | if (dev_nr == bioc->num_stripes - 1) { | |
458 | bio = orig_bio; | |
459 | bio->bi_end_io = btrfs_orig_write_end_io; | |
460 | } else { | |
461 | bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); | |
462 | bio_inc_remaining(orig_bio); | |
463 | bio->bi_end_io = btrfs_clone_write_end_io; | |
464 | } | |
465 | ||
466 | bio->bi_private = &bioc->stripes[dev_nr]; | |
467 | bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; | |
468 | bioc->stripes[dev_nr].bioc = bioc; | |
469 | btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); | |
470 | } | |
471 | ||
f8a53bb5 CH |
472 | static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, |
473 | struct btrfs_io_stripe *smap, int mirror_num) | |
474 | { | |
475 | /* Do not leak our private flag into the block layer. */ | |
476 | bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; | |
477 | ||
478 | if (!bioc) { | |
479 | /* Single mirror read/write fast path. */ | |
480 | btrfs_bio(bio)->mirror_num = mirror_num; | |
481 | bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; | |
482 | bio->bi_private = smap->dev; | |
483 | bio->bi_end_io = btrfs_simple_end_io; | |
484 | btrfs_submit_dev_bio(smap->dev, bio); | |
485 | } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { | |
486 | /* Parity RAID write or read recovery. */ | |
487 | bio->bi_private = bioc; | |
488 | bio->bi_end_io = btrfs_raid56_end_io; | |
489 | if (bio_op(bio) == REQ_OP_READ) | |
490 | raid56_parity_recover(bio, bioc, mirror_num); | |
491 | else | |
492 | raid56_parity_write(bio, bioc); | |
493 | } else { | |
494 | /* Write to multiple mirrors. */ | |
495 | int total_devs = bioc->num_stripes; | |
496 | ||
497 | bioc->orig_bio = bio; | |
498 | for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) | |
499 | btrfs_submit_mirrored_bio(bioc, dev_nr); | |
500 | } | |
501 | } | |
502 | ||
503 | static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) | |
504 | { | |
505 | if (bbio->bio.bi_opf & REQ_META) | |
542e300e | 506 | return btree_csum_one_bio(bbio); |
f8a53bb5 CH |
507 | return btrfs_csum_one_bio(bbio); |
508 | } | |
509 | ||
/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	/* The bio to checksum and then submit. */
	struct btrfs_bio *bbio;
	/* Mapping result handed to __btrfs_submit_bio() after checksumming. */
	struct btrfs_io_context *bioc;
	/* Copied by value from the submitter's stripe mapping. */
	struct btrfs_io_stripe smap;
	int mirror_num;
	/* Work item queued on one of the btrfs worker queues. */
	struct btrfs_work work;
};
521 | ||
522 | /* | |
523 | * In order to insert checksums into the metadata in large chunks, we wait | |
524 | * until bio submission time. All the pages in the bio are checksummed and | |
525 | * sums are attached onto the ordered extent record. | |
526 | * | |
527 | * At IO completion time the csums attached on the ordered extent record are | |
528 | * inserted into the btree. | |
529 | */ | |
530 | static void run_one_async_start(struct btrfs_work *work) | |
531 | { | |
532 | struct async_submit_bio *async = | |
533 | container_of(work, struct async_submit_bio, work); | |
534 | blk_status_t ret; | |
535 | ||
536 | ret = btrfs_bio_csum(async->bbio); | |
537 | if (ret) | |
538 | async->bbio->bio.bi_status = ret; | |
539 | } | |
540 | ||
541 | /* | |
542 | * In order to insert checksums into the metadata in large chunks, we wait | |
543 | * until bio submission time. All the pages in the bio are checksummed and | |
544 | * sums are attached onto the ordered extent record. | |
545 | * | |
546 | * At IO completion time the csums attached on the ordered extent record are | |
547 | * inserted into the tree. | |
548 | */ | |
549 | static void run_one_async_done(struct btrfs_work *work) | |
550 | { | |
551 | struct async_submit_bio *async = | |
552 | container_of(work, struct async_submit_bio, work); | |
553 | struct bio *bio = &async->bbio->bio; | |
554 | ||
555 | /* If an error occurred we just want to clean up the bio and move on. */ | |
556 | if (bio->bi_status) { | |
852eee62 | 557 | btrfs_orig_bbio_end_io(async->bbio); |
f8a53bb5 CH |
558 | return; |
559 | } | |
560 | ||
561 | /* | |
562 | * All of the bios that pass through here are from async helpers. | |
3480373e CH |
563 | * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's |
564 | * context. This changes nothing when cgroups aren't in use. | |
f8a53bb5 | 565 | */ |
3480373e | 566 | bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; |
f8a53bb5 CH |
567 | __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); |
568 | } | |
569 | ||
570 | static void run_one_async_free(struct btrfs_work *work) | |
571 | { | |
572 | kfree(container_of(work, struct async_submit_bio, work)); | |
573 | } | |
574 | ||
575 | static bool should_async_write(struct btrfs_bio *bbio) | |
576 | { | |
577 | /* | |
578 | * If the I/O is not issued by fsync and friends, (->sync_writers != 0), | |
579 | * then try to defer the submission to a workqueue to parallelize the | |
580 | * checksum calculation. | |
581 | */ | |
582 | if (atomic_read(&bbio->inode->sync_writers)) | |
583 | return false; | |
584 | ||
585 | /* | |
586 | * Submit metadata writes synchronously if the checksum implementation | |
587 | * is fast, or we are on a zoned device that wants I/O to be submitted | |
588 | * in order. | |
589 | */ | |
590 | if (bbio->bio.bi_opf & REQ_META) { | |
4317ff00 | 591 | struct btrfs_fs_info *fs_info = bbio->fs_info; |
f8a53bb5 CH |
592 | |
593 | if (btrfs_is_zoned(fs_info)) | |
594 | return false; | |
595 | if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) | |
596 | return false; | |
597 | } | |
598 | ||
599 | return true; | |
600 | } | |
601 | ||
602 | /* | |
603 | * Submit bio to an async queue. | |
604 | * | |
605 | * Return true if the work has been succesfuly submitted, else false. | |
606 | */ | |
607 | static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, | |
608 | struct btrfs_io_context *bioc, | |
609 | struct btrfs_io_stripe *smap, int mirror_num) | |
610 | { | |
4317ff00 | 611 | struct btrfs_fs_info *fs_info = bbio->fs_info; |
f8a53bb5 CH |
612 | struct async_submit_bio *async; |
613 | ||
614 | async = kmalloc(sizeof(*async), GFP_NOFS); | |
615 | if (!async) | |
616 | return false; | |
617 | ||
618 | async->bbio = bbio; | |
619 | async->bioc = bioc; | |
620 | async->smap = *smap; | |
621 | async->mirror_num = mirror_num; | |
622 | ||
623 | btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, | |
624 | run_one_async_free); | |
625 | if (op_is_sync(bbio->bio.bi_opf)) | |
626 | btrfs_queue_work(fs_info->hipri_workers, &async->work); | |
627 | else | |
628 | btrfs_queue_work(fs_info->workers, &async->work); | |
629 | return true; | |
630 | } | |
631 | ||
ae42a154 | 632 | static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) |
103c1972 | 633 | { |
d5e4377d | 634 | struct btrfs_inode *inode = bbio->inode; |
4317ff00 | 635 | struct btrfs_fs_info *fs_info = bbio->fs_info; |
852eee62 | 636 | struct btrfs_bio *orig_bbio = bbio; |
ae42a154 | 637 | struct bio *bio = &bbio->bio; |
103c1972 CH |
638 | u64 logical = bio->bi_iter.bi_sector << 9; |
639 | u64 length = bio->bi_iter.bi_size; | |
640 | u64 map_length = length; | |
921603c7 | 641 | bool use_append = btrfs_use_zone_append(bbio); |
103c1972 CH |
642 | struct btrfs_io_context *bioc = NULL; |
643 | struct btrfs_io_stripe smap; | |
9ba0004b CH |
644 | blk_status_t ret; |
645 | int error; | |
103c1972 CH |
646 | |
647 | btrfs_bio_counter_inc_blocked(fs_info); | |
9ba0004b CH |
648 | error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, |
649 | &bioc, &smap, &mirror_num, 1); | |
650 | if (error) { | |
651 | ret = errno_to_blk_status(error); | |
652 | goto fail; | |
103c1972 CH |
653 | } |
654 | ||
852eee62 | 655 | map_length = min(map_length, length); |
d5e4377d CH |
656 | if (use_append) |
657 | map_length = min(map_length, fs_info->max_zone_append_size); | |
658 | ||
103c1972 | 659 | if (map_length < length) { |
2cef0c79 CH |
660 | bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); |
661 | bio = &bbio->bio; | |
103c1972 CH |
662 | } |
663 | ||
1c2b3ee3 CH |
664 | /* |
665 | * Save the iter for the end_io handler and preload the checksums for | |
666 | * data reads. | |
667 | */ | |
4317ff00 | 668 | if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { |
0d3acb25 | 669 | bbio->saved_iter = bio->bi_iter; |
1c2b3ee3 CH |
670 | ret = btrfs_lookup_bio_sums(bbio); |
671 | if (ret) | |
852eee62 | 672 | goto fail_put_bio; |
1c2b3ee3 | 673 | } |
7276aa7d | 674 | |
f8a53bb5 | 675 | if (btrfs_op(bio) == BTRFS_MAP_WRITE) { |
d5e4377d CH |
676 | if (use_append) { |
677 | bio->bi_opf &= ~REQ_OP_WRITE; | |
678 | bio->bi_opf |= REQ_OP_ZONE_APPEND; | |
7edd339c | 679 | ret = btrfs_bio_extract_ordered_extent(bbio); |
69ccf3f4 | 680 | if (ret) |
852eee62 | 681 | goto fail_put_bio; |
69ccf3f4 CH |
682 | } |
683 | ||
f8a53bb5 CH |
684 | /* |
685 | * Csum items for reloc roots have already been cloned at this | |
686 | * point, so they are handled as part of the no-checksum case. | |
687 | */ | |
4317ff00 | 688 | if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && |
f8a53bb5 | 689 | !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && |
d5e4377d | 690 | !btrfs_is_data_reloc_root(inode->root)) { |
f8a53bb5 CH |
691 | if (should_async_write(bbio) && |
692 | btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) | |
852eee62 | 693 | goto done; |
f8a53bb5 CH |
694 | |
695 | ret = btrfs_bio_csum(bbio); | |
696 | if (ret) | |
852eee62 | 697 | goto fail_put_bio; |
f8a53bb5 | 698 | } |
103c1972 | 699 | } |
f8a53bb5 CH |
700 | |
701 | __btrfs_submit_bio(bio, bioc, &smap, mirror_num); | |
852eee62 CH |
702 | done: |
703 | return map_length == length; | |
9ba0004b | 704 | |
852eee62 CH |
705 | fail_put_bio: |
706 | if (map_length < length) | |
707 | bio_put(bio); | |
9ba0004b CH |
708 | fail: |
709 | btrfs_bio_counter_dec(fs_info); | |
852eee62 CH |
710 | btrfs_bio_end_io(orig_bbio, ret); |
711 | /* Do not submit another chunk */ | |
712 | return true; | |
713 | } | |
714 | ||
ae42a154 | 715 | void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) |
852eee62 | 716 | { |
4317ff00 QW |
717 | /* If bbio->inode is not populated, its file_offset must be 0. */ |
718 | ASSERT(bbio->inode || bbio->file_offset == 0); | |
719 | ||
ae42a154 | 720 | while (!btrfs_submit_chunk(bbio, mirror_num)) |
852eee62 | 721 | ; |
103c1972 CH |
722 | } |
723 | ||
bacf60e5 CH |
724 | /* |
725 | * Submit a repair write. | |
726 | * | |
727 | * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a | |
728 | * RAID setup. Here we only want to write the one bad copy, so we do the | |
729 | * mapping ourselves and submit the bio directly. | |
730 | * | |
67da05b3 | 731 | * The I/O is issued synchronously to block the repair read completion from |
bacf60e5 CH |
732 | * freeing the bio. |
733 | */ | |
734 | int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, | |
735 | u64 length, u64 logical, struct page *page, | |
736 | unsigned int pg_offset, int mirror_num) | |
737 | { | |
4886ff7b | 738 | struct btrfs_io_stripe smap = { 0 }; |
bacf60e5 CH |
739 | struct bio_vec bvec; |
740 | struct bio bio; | |
bacf60e5 CH |
741 | int ret = 0; |
742 | ||
743 | ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); | |
744 | BUG_ON(!mirror_num); | |
745 | ||
746 | if (btrfs_repair_one_zone(fs_info, logical)) | |
747 | return 0; | |
748 | ||
bacf60e5 CH |
749 | /* |
750 | * Avoid races with device replace and make sure our bioc has devices | |
751 | * associated to its stripes that don't go away while we are doing the | |
752 | * read repair operation. | |
753 | */ | |
754 | btrfs_bio_counter_inc_blocked(fs_info); | |
4886ff7b QW |
755 | ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); |
756 | if (ret < 0) | |
757 | goto out_counter_dec; | |
bacf60e5 | 758 | |
4886ff7b QW |
759 | if (!smap.dev->bdev || |
760 | !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { | |
bacf60e5 CH |
761 | ret = -EIO; |
762 | goto out_counter_dec; | |
763 | } | |
764 | ||
4886ff7b QW |
765 | bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); |
766 | bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; | |
bacf60e5 CH |
767 | __bio_add_page(&bio, page, length, pg_offset); |
768 | ||
769 | btrfsic_check_bio(&bio); | |
770 | ret = submit_bio_wait(&bio); | |
771 | if (ret) { | |
772 | /* try to remap that extent elsewhere? */ | |
4886ff7b | 773 | btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); |
bacf60e5 CH |
774 | goto out_bio_uninit; |
775 | } | |
776 | ||
777 | btrfs_info_rl_in_rcu(fs_info, | |
778 | "read error corrected: ino %llu off %llu (dev %s sector %llu)", | |
4886ff7b QW |
779 | ino, start, btrfs_dev_name(smap.dev), |
780 | smap.physical >> SECTOR_SHIFT); | |
bacf60e5 CH |
781 | ret = 0; |
782 | ||
783 | out_bio_uninit: | |
784 | bio_uninit(&bio); | |
785 | out_counter_dec: | |
786 | btrfs_bio_counter_dec(fs_info); | |
787 | return ret; | |
788 | } | |
789 | ||
4886ff7b QW |
790 | /* |
791 | * Submit a btrfs_bio based repair write. | |
792 | * | |
793 | * If @dev_replace is true, the write would be submitted to dev-replace target. | |
794 | */ | |
795 | void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) | |
796 | { | |
797 | struct btrfs_fs_info *fs_info = bbio->fs_info; | |
798 | u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; | |
799 | u64 length = bbio->bio.bi_iter.bi_size; | |
800 | struct btrfs_io_stripe smap = { 0 }; | |
801 | int ret; | |
802 | ||
803 | ASSERT(fs_info); | |
804 | ASSERT(mirror_num > 0); | |
805 | ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); | |
806 | ASSERT(!bbio->inode); | |
807 | ||
808 | btrfs_bio_counter_inc_blocked(fs_info); | |
809 | ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); | |
810 | if (ret < 0) | |
811 | goto fail; | |
812 | ||
813 | if (dev_replace) { | |
814 | if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) { | |
815 | bbio->bio.bi_opf &= ~REQ_OP_WRITE; | |
816 | bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND; | |
817 | } | |
818 | ASSERT(smap.dev == fs_info->dev_replace.srcdev); | |
819 | smap.dev = fs_info->dev_replace.tgtdev; | |
820 | } | |
821 | __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); | |
822 | return; | |
823 | ||
824 | fail: | |
825 | btrfs_bio_counter_dec(fs_info); | |
826 | btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); | |
827 | } | |
828 | ||
103c1972 CH |
829 | int __init btrfs_bioset_init(void) |
830 | { | |
831 | if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, | |
832 | offsetof(struct btrfs_bio, bio), | |
833 | BIOSET_NEED_BVECS)) | |
834 | return -ENOMEM; | |
852eee62 CH |
835 | if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, |
836 | offsetof(struct btrfs_bio, bio), 0)) | |
837 | goto out_free_bioset; | |
7609afac CH |
838 | if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, |
839 | offsetof(struct btrfs_bio, bio), | |
840 | BIOSET_NEED_BVECS)) | |
852eee62 | 841 | goto out_free_clone_bioset; |
7609afac CH |
842 | if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, |
843 | sizeof(struct btrfs_failed_bio))) | |
844 | goto out_free_repair_bioset; | |
103c1972 | 845 | return 0; |
7609afac CH |
846 | |
847 | out_free_repair_bioset: | |
848 | bioset_exit(&btrfs_repair_bioset); | |
852eee62 CH |
849 | out_free_clone_bioset: |
850 | bioset_exit(&btrfs_clone_bioset); | |
7609afac CH |
851 | out_free_bioset: |
852 | bioset_exit(&btrfs_bioset); | |
853 | return -ENOMEM; | |
103c1972 CH |
854 | } |
855 | ||
/*
 * Release the mempool and bio sets set up by btrfs_bioset_init(), in the
 * reverse order of their setup.
 */
void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}