Commit | Line | Data |
---|---|---|
db074436 DW |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright (C) 2010 Red Hat, Inc. | |
a6d3d495 | 4 | * Copyright (c) 2016-2021 Christoph Hellwig. |
db074436 DW |
5 | */ |
6 | #include <linux/module.h> | |
7 | #include <linux/compiler.h> | |
8 | #include <linux/fs.h> | |
489734ef | 9 | #include <linux/fscrypt.h> |
4bdcd1dd | 10 | #include <linux/pagemap.h> |
db074436 DW |
11 | #include <linux/iomap.h> |
12 | #include <linux/backing-dev.h> | |
13 | #include <linux/uio.h> | |
14 | #include <linux/task_io_accounting_ops.h> | |
60263d58 | 15 | #include "trace.h" |
db074436 DW |
16 | |
17 | #include "../internal.h" | |
18 | ||
19 | /* | |
20 | * Private flags for iomap_dio, must not overlap with the public ones in | |
21 | * iomap.h: | |
22 | */ | |
8c052fb3 | 23 | #define IOMAP_DIO_CALLER_COMP (1U << 26) |
7b3c14d1 | 24 | #define IOMAP_DIO_INLINE_COMP (1U << 27) |
3a0be38c | 25 | #define IOMAP_DIO_WRITE_THROUGH (1U << 28) |
44842f64 JA |
26 | #define IOMAP_DIO_NEED_SYNC (1U << 29) |
27 | #define IOMAP_DIO_WRITE (1U << 30) | |
28 | #define IOMAP_DIO_DIRTY (1U << 31) | |
db074436 DW |
29 | |
/*
 * Per-request state for an iomap direct I/O operation.  Allocated in
 * __iomap_dio_rw() and freed by iomap_dio_complete().
 */
struct iomap_dio {
	struct kiocb *iocb;		/* iocb driving this dio */
	const struct iomap_dio_ops *dops; /* optional fs hooks (end_io, submit_io, bio_set) */
	loff_t i_size;			/* inode size sampled at submission */
	loff_t size;			/* bytes submitted/copied so far */
	atomic_t ref;			/* 1 submission ref + one per in-flight bio */
	unsigned flags;			/* public IOMAP_DIO_* plus private completion flags */
	int error;			/* first error recorded (see iomap_dio_set_error) */
	size_t done_before;		/* bytes completed by an earlier partial attempt */
	bool wait_for_completion;	/* synchronous: submitter waits for last bio */

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter *iter;
			struct task_struct *waiter;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct work;
		} aio;
	};
};
54 | ||
908c5490 | 55 | static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, |
dbd4eb81 | 56 | struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf) |
908c5490 CH |
57 | { |
58 | if (dio->dops && dio->dops->bio_set) | |
59 | return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf, | |
60 | GFP_KERNEL, dio->dops->bio_set); | |
61 | return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL); | |
62 | } | |
63 | ||
/*
 * Take a dio reference for the bio and hand it to the block layer (or to
 * the filesystem's submit_io hook, if one is provided).
 */
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
	struct kiocb *iocb = dio->iocb;

	/* each in-flight bio pins the dio; dropped in iomap_dio_bio_end_io() */
	atomic_inc(&dio->ref);

	/* Sync dio can't be polled reliably */
	if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
		bio_set_polled(bio, iocb);
		/* stash the bio so the poll path can find it */
		WRITE_ONCE(iocb->private, bio);
	}

	if (dio->dops && dio->dops->submit_io)
		dio->dops->submit_io(iter, bio, pos);
	else
		submit_bio(bio);
}
82 | ||
/*
 * Finish a direct I/O request: run the filesystem's ->end_io hook, clamp
 * short reads at i_size, invalidate the page cache range for writes,
 * advance the iocb position, and perform any required sync.  Frees the
 * dio and returns the byte count transferred or a negative error.
 */
ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
	struct kiocb *iocb = dio->iocb;
	loff_t offset = iocb->ki_pos;
	ssize_t ret = dio->error;

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after ->end_io(), as some
	 * filesystems convert unwritten extents to real allocations in
	 * ->end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
		kiocb_invalidate_post_direct_write(iocb, dio->size);

	inode_dio_end(file_inode(iocb->ki_filp));

	if (ret > 0) {
		iocb->ki_pos += ret;

		/*
		 * If this is a DSYNC write, make sure we push it to stable
		 * storage now that we've written data.
		 */
		if (dio->flags & IOMAP_DIO_NEED_SYNC)
			ret = generic_write_sync(iocb, ret);
		/* account bytes completed by an earlier partial attempt */
		if (ret > 0)
			ret += dio->done_before;
	}
	trace_iomap_dio_complete(iocb, dio->error, ret);
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
db074436 | 135 | |
8c052fb3 JA |
136 | static ssize_t iomap_dio_deferred_complete(void *data) |
137 | { | |
138 | return iomap_dio_complete(data); | |
139 | } | |
140 | ||
/*
 * Workqueue handler: finish the dio and report the result through the
 * iocb's completion callback.
 */
static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio));
}
148 | ||
/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.  Only the first error is kept; later errors are
 * intentionally dropped.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}
158 | ||
/*
 * Bio completion handler.  Records any bio error, drops the bio's dio
 * reference, and on the final reference picks one of four completion
 * strategies: wake a synchronous waiter, complete inline, defer to the
 * issuer via ->dio_complete, or punt to the superblock's dio workqueue.
 */
void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
	struct kiocb *iocb = dio->iocb;

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
	/* not the last reference: just release this bio's pages */
	if (!atomic_dec_and_test(&dio->ref))
		goto release_bio;

	/*
	 * Synchronous dio, task itself will handle any completion work
	 * that needs after IO. All we need to do is wake the task.
	 */
	if (dio->wait_for_completion) {
		struct task_struct *waiter = dio->submit.waiter;

		WRITE_ONCE(dio->submit.waiter, NULL);
		blk_wake_io_task(waiter);
		goto release_bio;
	}

	/*
	 * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
	 */
	if (dio->flags & IOMAP_DIO_INLINE_COMP) {
		WRITE_ONCE(iocb->private, NULL);
		iomap_dio_complete_work(&dio->aio.work);
		goto release_bio;
	}

	/*
	 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
	 * our completion that way to avoid an async punt to a workqueue.
	 */
	if (dio->flags & IOMAP_DIO_CALLER_COMP) {
		/* only polled IO cares about private cleared */
		iocb->private = dio;
		iocb->dio_complete = iomap_dio_deferred_complete;

		/*
		 * Invoke ->ki_complete() directly. We've assigned our
		 * dio_complete callback handler, and since the issuer set
		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
		 * notice ->dio_complete being set and will defer calling that
		 * handler until it can be done from a safe task context.
		 *
		 * Note that the 'res' being passed in here is not important
		 * for this case. The actual completion value of the request
		 * will be gotten from dio_complete when that is run by the
		 * issuer.
		 */
		iocb->ki_complete(iocb, 0);
		goto release_bio;
	}

	/*
	 * Async DIO completion that requires filesystem level completion work
	 * gets punted to a work queue to complete as the operation may require
	 * more IO to be issued to finalise filesystem metadata changes or
	 * guarantee data integrity.
	 */
	INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
	queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
			&dio->aio.work);
release_bio:
	if (should_dirty) {
		/* dirties the pages and puts the bio when they're clean */
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
db074436 | 234 | |
/*
 * Submit a write of @len zero bytes at @pos, backed by the shared zero
 * page.  Used to fill the sub-block head and tail of unaligned writes
 * over new or unwritten extents so stale data is never exposed.
 */
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
		loff_t pos, unsigned len)
{
	struct inode *inode = file_inode(dio->iocb->ki_filp);
	struct page *page = ZERO_PAGE(0);
	struct bio *bio;

	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
				  GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	__bio_add_page(bio, page, len, 0);
	iomap_dio_submit_bio(iter, dio, bio, pos);
}
252 | ||
c3b0e880 NA |
253 | /* |
254 | * Figure out the bio's operation flags from the dio request, the | |
255 | * mapping, and whether or not we want FUA. Note that we can end up | |
3a0be38c | 256 | * clearing the WRITE_THROUGH flag in the dio request. |
c3b0e880 | 257 | */ |
dbd4eb81 | 258 | static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, |
a6d3d495 | 259 | const struct iomap *iomap, bool use_fua) |
c3b0e880 | 260 | { |
dbd4eb81 | 261 | blk_opf_t opflags = REQ_SYNC | REQ_IDLE; |
c3b0e880 | 262 | |
8e81aa16 | 263 | if (!(dio->flags & IOMAP_DIO_WRITE)) |
c3b0e880 | 264 | return REQ_OP_READ; |
c3b0e880 | 265 | |
8e81aa16 | 266 | opflags |= REQ_OP_WRITE; |
c3b0e880 NA |
267 | if (use_fua) |
268 | opflags |= REQ_FUA; | |
269 | else | |
3a0be38c | 270 | dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; |
c3b0e880 NA |
271 | |
272 | return opflags; | |
273 | } | |
274 | ||
/*
 * Issue the bio(s) for one mapped (or unwritten-for-write) extent of the
 * dio.  Handles sub-block zeroing at the head and tail, FUA and polling
 * eligibility, and page pinning via the iov_iter.  Returns the number of
 * bytes processed, or a negative error if nothing was submitted.
 */
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iter->iomap;
	struct inode *inode = iter->inode;
	unsigned int fs_block_size = i_blocksize(inode), pad;
	loff_t length = iomap_length(iter);
	loff_t pos = iter->pos;
	blk_opf_t bio_opf;
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;
	size_t orig_count;

	/* the I/O must be aligned both on disk and in the user buffer */
	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics, this is a pure
		 * data IO that doesn't require any metadata updates (including
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device either supports FUA or doesn't have
		 * a volatile write cache. This allows us to avoid cache flushes
		 * on IO completion. If we can't use writethrough and need to
		 * sync, disable in-task completions as dio completion will
		 * need to call generic_write_sync() which will do a blocking
		 * fsync / cache flush call.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
			use_fua = true;
		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
	}

	/*
	 * Save the original count and trim the iter to just the extent we
	 * are operating on right now.  The iter will be re-expanded once
	 * we are done.
	 */
	orig_count = iov_iter_count(dio->submit.iter);
	iov_iter_truncate(dio->submit.iter, length);

	if (!iov_iter_count(dio->submit.iter))
		goto out;

	/*
	 * We can only do deferred completion for pure overwrites that
	 * don't require additional IO at completion. This rules out
	 * writes that need zeroing or extent conversion, extend
	 * the file size, or issue journal IO or cache flushes
	 * during completion processing.
	 */
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
		dio->flags &= ~IOMAP_DIO_CALLER_COMP;

	/*
	 * The rules for polled IO completions follow the guidelines as the
	 * ones we set for inline and deferred completions. If none of those
	 * are available for this IO, clear the polled flag.
	 */
	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
		dio->iocb->ki_flags &= ~IOCB_HIPRI;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(iter, dio, pos - pad, pad);
	}

	/*
	 * Set the operation flags early so that bio_iov_iter_get_pages
	 * can set up the page vector appropriately for a ZONE_APPEND
	 * operation.
	 */
	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);

	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
	do {
		size_t n;
		/* an earlier bio failed: stop submitting and undo the iter */
		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
			goto out;
		}

		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
					  GFP_KERNEL);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = inode->i_write_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			task_io_account_write(n);
		} else {
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += n;
		copied += n;

		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
						 BIO_MAX_VECS);
		/*
		 * We can only poll for single bio I/Os.
		 */
		if (nr_pages)
			dio->iocb->ki_flags &= ~IOCB_HIPRI;
		iomap_dio_submit_bio(iter, dio, bio, pos);
		pos += n;
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
	}
out:
	/* Undo iter limitation to current extent */
	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
	if (copied)
		return copied;
	return ret;
}
443 | ||
a6d3d495 CH |
444 | static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, |
445 | struct iomap_dio *dio) | |
db074436 | 446 | { |
a6d3d495 CH |
447 | loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); |
448 | ||
db074436 | 449 | dio->size += length; |
42c498c1 AG |
450 | if (!length) |
451 | return -EFAULT; | |
db074436 DW |
452 | return length; |
453 | } | |
454 | ||
/*
 * Transfer data directly to/from inline (in-inode) data.  No bios are
 * issued; the whole operation is a plain memory copy.
 */
static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iomi->iomap;
	struct iov_iter *iter = dio->submit.iter;
	void *inline_data = iomap_inline_data(iomap, iomi->pos);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	size_t copied;

	if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
		return -EIO;

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = iomi->inode->i_size;

		/* zero the gap between current EOF and the write offset */
		if (pos > size)
			memset(iomap_inline_data(iomap, size), 0, pos - size);
		copied = copy_from_iter(inline_data, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(iomi->inode, pos + copied);
			mark_inode_dirty(iomi->inode);
		}
	} else {
		copied = copy_to_iter(inline_data, length, iter);
	}
	dio->size += copied;
	/* zero progress means the user buffer faulted */
	if (!copied)
		return -EFAULT;
	return copied;
}
487 | ||
/*
 * Dispatch one iomap extent to the appropriate dio handler based on the
 * extent type.
 */
static loff_t iomap_dio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	switch (iter->iomap.type) {
	case IOMAP_HOLE:
		/* a write must never be mapped to a hole */
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_iter(iter, dio);
	case IOMAP_UNWRITTEN:
		/* reads of unwritten extents see zeroes, same as a hole */
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_iter(iter, dio);
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_MAPPED:
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_INLINE:
		return iomap_dio_inline_iter(iter, dio);
	case IOMAP_DELALLOC:
		/*
		 * DIO is not serialised against mmap() access at all, and so
		 * if the page_mkwrite occurs between the writeback and the
		 * iomap_iter() call in the DIO path, then it will see the
		 * DELALLOC block that the page-mkwrite allocated.
		 */
		pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
				    dio->iocb->ki_filp, current->comm);
		return -EIO;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}
519 | ||
/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 *
 * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
 * __iomap_dio_rw can return a partial result if it encounters a non-resident
 * page in @iter after preparing a transfer.  In that case, the non-resident
 * pages can be faulted in and the request resumed with @done_before set to the
 * number of bytes previously transferred.  The request will then complete with
 * the correct total number of bytes transferred; this is essential for
 * completing partial requests asynchronously.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.
 * The caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct iomap_iter iomi = {
		.inode		= inode,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DIRECT,
		.private	= private,
	};
	bool wait_for_completion =
		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
	struct blk_plug plug;
	struct iomap_dio *dio;
	loff_t ret = 0;

	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);

	/* a zero-length request is a successful no-op */
	if (!iomi.len)
		return NULL;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return ERR_PTR(-ENOMEM);

	dio->iocb = iocb;
	/* the submission path itself holds the initial reference */
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;
	dio->done_before = done_before;

	dio->submit.iter = iter;
	dio->submit.waiter = current;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	if (iov_iter_rw(iter) == READ) {
		/* reads can always complete inline */
		dio->flags |= IOMAP_DIO_INLINE_COMP;

		/* reading at or past EOF returns 0 bytes */
		if (iomi.pos >= dio->i_size)
			goto out_free_dio;

		if (user_backed_iter(iter))
			dio->flags |= IOMAP_DIO_DIRTY;

		/* flush dirty page cache before reading it directly */
		ret = kiocb_write_and_wait(iocb, iomi.len);
		if (ret)
			goto out_free_dio;
	} else {
		iomi.flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/*
		 * Flag as supporting deferred completions, if the issuer
		 * groks it. This can avoid a workqueue punt for writes.
		 * We may later clear this flag if we need to do other IO
		 * as part of this IO completion.
		 */
		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
			dio->flags |= IOMAP_DIO_CALLER_COMP;

		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
			ret = -EAGAIN;
			if (iomi.pos >= dio->i_size ||
			    iomi.pos + iomi.len > dio->i_size)
				goto out_free_dio;
			iomi.flags |= IOMAP_OVERWRITE_ONLY;
		}

		/* for data sync or sync, we need sync completion processing */
		if (iocb_is_dsync(iocb)) {
			dio->flags |= IOMAP_DIO_NEED_SYNC;

			/*
			 * For datasync only writes, we optimistically try
			 * using WRITE_THROUGH for this IO. This flag requires
			 * either FUA writes through the device's write cache,
			 * or a normal write to a device without a volatile
			 * write cache. For the former, any non-FUA write that
			 * occurs will clear this flag, hence we know before
			 * completion whether a cache flush is necessary.
			 */
			if (!(iocb->ki_flags & IOCB_SYNC))
				dio->flags |= IOMAP_DIO_WRITE_THROUGH;
		}

		/*
		 * Try to invalidate cache pages for the range we are writing.
		 * If this invalidation fails, let the caller fall back to
		 * buffered I/O.
		 */
		ret = kiocb_invalidate_pages(iocb, iomi.len);
		if (ret) {
			if (ret != -EAGAIN) {
				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
								iomi.len);
				ret = -ENOTBLK;
			}
			goto out_free_dio;
		}

		/* async completion needs the per-sb dio workqueue */
		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
			ret = sb_init_dio_done_wq(inode->i_sb);
			if (ret < 0)
				goto out_free_dio;
		}
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	while ((ret = iomap_iter(&iomi, ops)) > 0) {
		iomi.processed = iomap_dio_iter(&iomi, dio);

		/*
		 * We can only poll for single bio I/Os.
		 */
		iocb->ki_flags &= ~IOCB_HIPRI;
	}

	blk_finish_plug(&plug);

	/*
	 * We only report that we've read data up to i_size.
	 * Revert iter to a state corresponding to that as some callers (such
	 * as the splice code) rely on it.
	 */
	if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
		iov_iter_revert(iter, iomi.pos - dio->i_size);

	if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
		if (!(iocb->ki_flags & IOCB_NOWAIT))
			wait_for_completion = true;
		ret = 0;
	}

	/* magic error code to fall back to buffered I/O */
	if (ret == -ENOTBLK) {
		wait_for_completion = true;
		ret = 0;
	}
	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were already written through to the
	 * media, we don't need to flush the cache on IO completion. Clear the
	 * sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion) {
			trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len);
			return ERR_PTR(-EIOCBQUEUED);
		}

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			blk_io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return dio;

out_free_dio:
	kfree(dio);
	if (ret)
		return ERR_PTR(ret);
	return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);
741 | ||
742 | ssize_t | |
743 | iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, | |
744 | const struct iomap_ops *ops, const struct iomap_dio_ops *dops, | |
786f847f | 745 | unsigned int dio_flags, void *private, size_t done_before) |
c3d4ed1a CH |
746 | { |
747 | struct iomap_dio *dio; | |
748 | ||
786f847f CH |
749 | dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private, |
750 | done_before); | |
c3d4ed1a CH |
751 | if (IS_ERR_OR_NULL(dio)) |
752 | return PTR_ERR_OR_ZERO(dio); | |
753 | return iomap_dio_complete(dio); | |
db074436 DW |
754 | } |
755 | EXPORT_SYMBOL_GPL(iomap_dio_rw); |