// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2025 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
#include "trace.h"

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
#define IOMAP_DIO_CALLER_COMP	(1U << 26)
#define IOMAP_DIO_INLINE_COMP	(1U << 27)
#define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
#define IOMAP_DIO_NEED_SYNC	(1U << 29)
#define IOMAP_DIO_WRITE		(1U << 30)
#define IOMAP_DIO_DIRTY		(1U << 31)

/*
 * Used for sub block zeroing in iomap_dio_zero()
 */
#define IOMAP_ZERO_PAGE_SIZE (SZ_64K)
#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE))
static struct page *zero_page;

struct iomap_dio {
	struct kiocb		*iocb;
	const struct iomap_dio_ops *dops;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;
	size_t			done_before;
	bool			wait_for_completion;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf)
{
	if (dio->dops && dio->dops->bio_set)
		return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
					GFP_KERNEL, dio->dops->bio_set);
	return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
}

static void iomap_dio_submit_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
	struct kiocb *iocb = dio->iocb;

	atomic_inc(&dio->ref);

	/* Sync dio can't be polled reliably */
	if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
		bio_set_polled(bio, iocb);
		WRITE_ONCE(iocb->private, bio);
	}

	if (dio->dops && dio->dops->submit_io) {
		dio->dops->submit_io(iter, bio, pos);
	} else {
		WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
		submit_bio(bio);
	}
}

ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
	struct kiocb *iocb = dio->iocb;
	loff_t offset = iocb->ki_pos;
	ssize_t ret = dio->error;

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after ->end_io(), as some
	 * filesystems convert unwritten extents to real allocations in
	 * ->end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
	    !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
		kiocb_invalidate_post_direct_write(iocb, dio->size);

	inode_dio_end(file_inode(iocb->ki_filp));

	if (ret > 0) {
		iocb->ki_pos += ret;

		/*
		 * If this is a DSYNC write, make sure we push it to stable
		 * storage now that we've written data.
		 */
		if (dio->flags & IOMAP_DIO_NEED_SYNC)
			ret = generic_write_sync(iocb, ret);
		if (ret > 0)
			ret += dio->done_before;
	}
	trace_iomap_dio_complete(iocb, dio->error, ret);
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
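
/*
 * Hedged usage sketch (not part of this file): iomap_dio_complete() is
 * exported so that a caller can split submission from completion, for
 * example to drop filesystem locks or do fs-specific work in between.
 * The names prefixed with example_ below are hypothetical stand-ins:
 *
 *	dio = __iomap_dio_rw(iocb, iter, &example_iomap_ops,
 *			     &example_dio_ops, 0, NULL, 0);
 *	if (IS_ERR_OR_NULL(dio))
 *		return PTR_ERR_OR_ZERO(dio);
 *	example_fs_unlock(inode);
 *	return iomap_dio_complete(dio);
 */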

static ssize_t iomap_dio_deferred_complete(void *data)
{
	return iomap_dio_complete(data);
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio));
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

/*
 * Called when dio->ref reaches zero from an I/O completion.
 */
static void iomap_dio_done(struct iomap_dio *dio)
{
	struct kiocb *iocb = dio->iocb;

	if (dio->wait_for_completion) {
		/*
		 * Synchronous I/O, task itself will handle any completion work
		 * that is needed after IO.  All we need to do is wake the task.
		 */
		struct task_struct *waiter = dio->submit.waiter;

		WRITE_ONCE(dio->submit.waiter, NULL);
		blk_wake_io_task(waiter);
	} else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
		WRITE_ONCE(iocb->private, NULL);
		iomap_dio_complete_work(&dio->aio.work);
	} else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
		/*
		 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
		 * schedule our completion that way to avoid an async punt to a
		 * workqueue.
		 */
		/* only polled IO cares about private cleared */
		iocb->private = dio;
		iocb->dio_complete = iomap_dio_deferred_complete;

		/*
		 * Invoke ->ki_complete() directly.  We've assigned our
		 * dio_complete callback handler, and since the issuer set
		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
		 * notice ->dio_complete being set and will defer calling that
		 * handler until it can be done from a safe task context.
		 *
		 * Note that the 'res' being passed in here is not important
		 * for this case.  The actual completion value of the request
		 * will be gotten from dio_complete when that is run by the
		 * issuer.
		 */
		iocb->ki_complete(iocb, 0);
	} else {
		struct inode *inode = file_inode(iocb->ki_filp);

		/*
		 * Async DIO completion that requires filesystem level
		 * completion work gets punted to a work queue to complete as
		 * the operation may require more IO to be issued to finalise
		 * filesystem metadata changes or guarantee data integrity.
		 */
		INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
		queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
	}
}

void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref))
		iomap_dio_done(dio);

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
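
/*
 * Hedged note: iomap_dio_bio_end_io() appears to be exported so that a
 * filesystem which takes over bio submission via iomap_dio_ops->submit_io
 * can, once its own per-bio work is done, hand the bio back to the normal
 * iomap DIO completion path.  A minimal sketch of such an end_io hook
 * (example_fs_dio_end_io and the fs-specific step are hypothetical):
 *
 *	static void example_fs_dio_end_io(struct bio *bio)
 *	{
 *		example_fs_account_io(bio);	// fs-specific completion work
 *		iomap_dio_bio_end_io(bio);	// fall through to iomap
 *	}
 */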

u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
{
	struct iomap_dio *dio = ioend->io_bio.bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
	u32 vec_count = ioend->io_bio.bi_vcnt;

	if (ioend->io_error)
		iomap_dio_set_error(dio, ioend->io_error);

	if (atomic_dec_and_test(&dio->ref)) {
		/*
		 * Try to avoid another context switch for the completion given
		 * that we are already called from the ioend completion
		 * workqueue, but never invalidate pages from this thread to
		 * avoid deadlocks with buffered I/O completions.  Tough luck if
		 * you hit the tiny race with someone dirtying the range now
		 * between this check and the actual completion.
		 */
		if (!dio->iocb->ki_filp->f_mapping->nrpages) {
			dio->flags |= IOMAP_DIO_INLINE_COMP;
			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
		}
		dio->flags &= ~IOMAP_DIO_CALLER_COMP;
		iomap_dio_done(dio);
	}

	if (should_dirty) {
		bio_check_pages_dirty(&ioend->io_bio);
	} else {
		bio_release_pages(&ioend->io_bio, false);
		bio_put(&ioend->io_bio);
	}

	/*
	 * Return the number of bvecs completed as even direct I/O completions
	 * do significant per-folio work and we'll still want to give up the
	 * CPU after a lot of completions.
	 */
	return vec_count;
}

static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
		loff_t pos, unsigned len)
{
	struct inode *inode = file_inode(dio->iocb->ki_filp);
	struct bio *bio;

	if (!len)
		return 0;
	/*
	 * Max block size supported is 64k
	 */
	if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
		return -EINVAL;

	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
				  GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	__bio_add_page(bio, zero_page, len, 0);
	iomap_dio_submit_bio(iter, dio, bio, pos);
	return 0;
}

/*
 * Use a FUA write if we need datasync semantics and this is a pure data I/O
 * that doesn't require any metadata updates (including after I/O completion
 * such as unwritten extent conversion) and the underlying device either
 * doesn't have a volatile write cache or supports FUA.
 * This allows us to avoid cache flushes on I/O completion.
 */
static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
		struct iomap_dio *dio)
{
	if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
		return false;
	if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
		return false;
	return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
}

static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
	const struct iomap *iomap = &iter->iomap;
	struct inode *inode = iter->inode;
	unsigned int fs_block_size = i_blocksize(inode), pad;
	const loff_t length = iomap_length(iter);
	loff_t pos = iter->pos;
	blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
	struct bio *bio;
	bool need_zeroout = false;
	int nr_pages, ret = 0;
	u64 copied = 0;
	size_t orig_count;

	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
		return -EINVAL;

	if (dio->flags & IOMAP_DIO_WRITE) {
		bio_opf |= REQ_OP_WRITE;

		if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
			/*
			 * Ensure that the mapping covers the full write
			 * length, otherwise it won't be submitted as a single
			 * bio, which is required to use hardware atomics.
			 */
			if (length != iter->len)
				return -EINVAL;
			bio_opf |= REQ_ATOMIC;
		}

		if (iomap->type == IOMAP_UNWRITTEN) {
			dio->flags |= IOMAP_DIO_UNWRITTEN;
			need_zeroout = true;
		}

		if (iomap->flags & IOMAP_F_SHARED)
			dio->flags |= IOMAP_DIO_COW;

		if (iomap->flags & IOMAP_F_NEW) {
			need_zeroout = true;
		} else if (iomap->type == IOMAP_MAPPED) {
			if (iomap_dio_can_use_fua(iomap, dio))
				bio_opf |= REQ_FUA;
			else
				dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
		}

		/*
		 * We can only do deferred completion for pure overwrites that
		 * don't require additional I/O at completion time.
		 *
		 * This rules out writes that need zeroing or extent conversion,
		 * extend the file size, or issue metadata I/O or cache flushes
		 * during completion processing.
		 */
		if (need_zeroout || (pos >= i_size_read(inode)) ||
		    ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
		     !(bio_opf & REQ_FUA)))
			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
	} else {
		bio_opf |= REQ_OP_READ;
	}

	/*
	 * Save the original count and trim the iter to just the extent we
	 * are operating on right now.  The iter will be re-expanded once
	 * we are done.
	 */
	orig_count = iov_iter_count(dio->submit.iter);
	iov_iter_truncate(dio->submit.iter, length);

	if (!iov_iter_count(dio->submit.iter))
		goto out;

	/*
	 * The rules for polled IO completions follow the same guidelines as
	 * the ones we set for inline and deferred completions. If none of
	 * those are available for this IO, clear the polled flag.
	 */
	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
		dio->iocb->ki_flags &= ~IOCB_HIPRI;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);

		ret = iomap_dio_zero(iter, dio, pos - pad, pad);
		if (ret)
			goto out;
	}

	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
	do {
		size_t n;
		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
			goto out;
		}

		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
					  GFP_KERNEL);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = inode->i_write_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
			/*
			 * An atomic write bio must cover the complete length,
			 * which it doesn't, so error. We may need to zero out
			 * the tail (complete FS block), similar to when
			 * bio_iov_iter_get_pages() returns an error, above.
			 */
			ret = -EINVAL;
			bio_put(bio);
			goto zero_tail;
		}
		if (dio->flags & IOMAP_DIO_WRITE)
			task_io_account_write(n);
		else if (dio->flags & IOMAP_DIO_DIRTY)
			bio_set_pages_dirty(bio);

		dio->size += n;
		copied += n;

		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
						 BIO_MAX_VECS);
		/*
		 * We can only poll for single bio I/Os.
		 */
		if (nr_pages)
			dio->iocb->ki_flags &= ~IOCB_HIPRI;
		iomap_dio_submit_bio(iter, dio, bio, pos);
		pos += n;
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			ret = iomap_dio_zero(iter, dio, pos,
					     fs_block_size - pad);
	}
out:
	/* Undo iter limitation to current extent */
	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
	if (copied)
		return iomap_iter_advance(iter, &copied);
	return ret;
}

static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
	loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);

	dio->size += length;
	if (!length)
		return -EFAULT;
	return iomap_iter_advance(iter, &length);
}

static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
{
	const struct iomap *iomap = &iomi->iomap;
	struct iov_iter *iter = dio->submit.iter;
	void *inline_data = iomap_inline_data(iomap, iomi->pos);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	u64 copied;

	if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
		return -EIO;

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = iomi->inode->i_size;

		if (pos > size)
			memset(iomap_inline_data(iomap, size), 0, pos - size);
		copied = copy_from_iter(inline_data, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(iomi->inode, pos + copied);
			mark_inode_dirty(iomi->inode);
		}
	} else {
		copied = copy_to_iter(inline_data, length, iter);
	}
	dio->size += copied;
	if (!copied)
		return -EFAULT;
	return iomap_iter_advance(iomi, &copied);
}

static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
	switch (iter->iomap.type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_iter(iter, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_iter(iter, dio);
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_MAPPED:
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_INLINE:
		return iomap_dio_inline_iter(iter, dio);
	case IOMAP_DELALLOC:
		/*
		 * DIO is not serialised against mmap() access at all, and so
		 * if the page_mkwrite occurs between the writeback and the
		 * iomap_iter() call in the DIO path, then it will see the
		 * DELALLOC block that the page-mkwrite allocated.
		 */
		pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
				    dio->iocb->ki_filp, current->comm);
		return -EIO;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 *
 * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
 * __iomap_dio_rw can return a partial result if it encounters a non-resident
 * page in @iter after preparing a transfer.  In that case, the non-resident
 * pages can be faulted in and the request resumed with @done_before set to the
 * number of bytes previously transferred.  The request will then complete with
 * the correct total number of bytes transferred; this is essential for
 * completing partial requests asynchronously.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.  The
 * caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct iomap_iter iomi = {
		.inode		= inode,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DIRECT,
		.private	= private,
	};
	bool wait_for_completion =
		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
	struct blk_plug plug;
	struct iomap_dio *dio;
	loff_t ret = 0;

	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);

	if (!iomi.len)
		return NULL;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return ERR_PTR(-ENOMEM);

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;
	dio->done_before = done_before;

	dio->submit.iter = iter;
	dio->submit.waiter = current;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	if (iov_iter_rw(iter) == READ) {
		/* reads can always complete inline */
		dio->flags |= IOMAP_DIO_INLINE_COMP;

		if (iomi.pos >= dio->i_size)
			goto out_free_dio;

		if (user_backed_iter(iter))
			dio->flags |= IOMAP_DIO_DIRTY;

		ret = kiocb_write_and_wait(iocb, iomi.len);
		if (ret)
			goto out_free_dio;
	} else {
		iomi.flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/*
		 * Flag as supporting deferred completions, if the issuer
		 * groks it. This can avoid a workqueue punt for writes.
		 * We may later clear this flag if we need to do other IO
		 * as part of this IO completion.
		 */
		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
			dio->flags |= IOMAP_DIO_CALLER_COMP;

		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
			ret = -EAGAIN;
			if (iomi.pos >= dio->i_size ||
			    iomi.pos + iomi.len > dio->i_size)
				goto out_free_dio;
			iomi.flags |= IOMAP_OVERWRITE_ONLY;
		}

		if (iocb->ki_flags & IOCB_ATOMIC)
			iomi.flags |= IOMAP_ATOMIC;

		/* for data sync or sync, we need sync completion processing */
		if (iocb_is_dsync(iocb)) {
			dio->flags |= IOMAP_DIO_NEED_SYNC;

			/*
			 * For datasync only writes, we optimistically try using
			 * WRITE_THROUGH for this IO. This flag requires either
			 * FUA writes through the device's write cache, or a
			 * normal write to a device without a volatile write
			 * cache. For the former, any non-FUA write that occurs
			 * will clear this flag, hence we know before completion
			 * whether a cache flush is necessary.
			 */
			if (!(iocb->ki_flags & IOCB_SYNC))
				dio->flags |= IOMAP_DIO_WRITE_THROUGH;
		}

		/*
		 * Try to invalidate cache pages for the range we are writing.
		 * If this invalidation fails, let the caller fall back to
		 * buffered I/O.
		 */
		ret = kiocb_invalidate_pages(iocb, iomi.len);
		if (ret) {
			if (ret != -EAGAIN) {
				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
								iomi.len);
				if (iocb->ki_flags & IOCB_ATOMIC) {
					/*
					 * folio invalidation failed, maybe
					 * this is transient, unlock and see if
					 * the caller tries again.
					 */
					ret = -EAGAIN;
				} else {
					/* fall back to buffered write */
					ret = -ENOTBLK;
				}
			}
			goto out_free_dio;
		}

		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
			ret = sb_init_dio_done_wq(inode->i_sb);
			if (ret < 0)
				goto out_free_dio;
		}
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	while ((ret = iomap_iter(&iomi, ops)) > 0) {
		iomi.status = iomap_dio_iter(&iomi, dio);

		/*
		 * We can only poll for single bio I/Os.
		 */
		iocb->ki_flags &= ~IOCB_HIPRI;
	}

	blk_finish_plug(&plug);

	/*
	 * We only report that we've read data up to i_size.
	 * Revert iter to a state corresponding to that as some callers (such
	 * as the splice code) rely on it.
	 */
	if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
		iov_iter_revert(iter, iomi.pos - dio->i_size);

	if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
		if (!(iocb->ki_flags & IOCB_NOWAIT))
			wait_for_completion = true;
		ret = 0;
	}

	/* magic error code to fall back to buffered I/O */
	if (ret == -ENOTBLK) {
		wait_for_completion = true;
		ret = 0;
	}
	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were already written through to the
	 * media, we don't need to flush the cache on IO completion. Clear the
	 * sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion) {
			trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len);
			return ERR_PTR(-EIOCBQUEUED);
		}

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			blk_io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return dio;

out_free_dio:
	kfree(dio);
	if (ret)
		return ERR_PTR(ret);
	return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);

ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before)
{
	struct iomap_dio *dio;

	dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
			     done_before);
	if (IS_ERR_OR_NULL(dio))
		return PTR_ERR_OR_ZERO(dio);
	return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
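
/*
 * Hedged usage sketch (not part of the iomap API itself): a filesystem's
 * ->read_iter() typically wraps iomap_dio_rw() roughly as below.  The names
 * example_dio_read and example_iomap_ops are hypothetical stand-ins for the
 * fs-specific pieces; locking requirements vary per filesystem.
 *
 *	static ssize_t example_dio_read(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = iomap_dio_rw(iocb, to, &example_iomap_ops, NULL,
 *				   0, NULL, 0);
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 */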

static int __init iomap_dio_init(void)
{
	zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
				IOMAP_ZERO_PAGE_ORDER);

	if (!zero_page)
		return -ENOMEM;

	return 0;
}
fs_initcall(iomap_dio_init);