Commit | Line | Data |
---|---|---|
afc51aaa DW |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright (C) 2010 Red Hat, Inc. | |
598ecfba | 4 | * Copyright (C) 2016-2019 Christoph Hellwig. |
afc51aaa DW |
5 | */ |
6 | #include <linux/module.h> | |
7 | #include <linux/compiler.h> | |
8 | #include <linux/fs.h> | |
9 | #include <linux/iomap.h> | |
10 | #include <linux/pagemap.h> | |
11 | #include <linux/uio.h> | |
12 | #include <linux/buffer_head.h> | |
13 | #include <linux/dax.h> | |
14 | #include <linux/writeback.h> | |
598ecfba | 15 | #include <linux/list_sort.h> |
afc51aaa DW |
16 | #include <linux/swap.h> |
17 | #include <linux/bio.h> | |
18 | #include <linux/sched/signal.h> | |
19 | #include <linux/migrate.h> | |
9e91c572 | 20 | #include "trace.h" |
afc51aaa DW |
21 | |
22 | #include "../internal.h" | |
23 | ||
ebb7fb15 DC |
24 | #define IOEND_BATCH_SIZE 4096 |
25 | ||
ab08b01e | 26 | /* |
95c4cd05 MWO |
27 | * Structure allocated for each folio when block size < folio size |
28 | * to track sub-folio uptodate status and I/O completions. | |
ab08b01e CH |
29 | */ |
30 | struct iomap_page { | |
7d636676 | 31 | atomic_t read_bytes_pending; |
0fb2d720 | 32 | atomic_t write_bytes_pending; |
1cea335d | 33 | spinlock_t uptodate_lock; |
0a195b91 | 34 | unsigned long uptodate[]; |
ab08b01e CH |
35 | }; |
36 | ||
95c4cd05 | 37 | static inline struct iomap_page *to_iomap_page(struct folio *folio) |
ab08b01e | 38 | { |
95c4cd05 MWO |
39 | if (folio_test_private(folio)) |
40 | return folio_get_private(folio); | |
ab08b01e CH |
41 | return NULL; |
42 | } | |
43 | ||
598ecfba CH |
44 | static struct bio_set iomap_ioend_bioset; |
45 | ||
afc51aaa | 46 | static struct iomap_page * |
9753b868 | 47 | iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) |
afc51aaa | 48 | { |
95c4cd05 | 49 | struct iomap_page *iop = to_iomap_page(folio); |
435d44b3 | 50 | unsigned int nr_blocks = i_blocks_per_folio(inode, folio); |
9753b868 | 51 | gfp_t gfp; |
afc51aaa | 52 | |
0a195b91 | 53 | if (iop || nr_blocks <= 1) |
afc51aaa DW |
54 | return iop; |
55 | ||
9753b868 SR |
56 | if (flags & IOMAP_NOWAIT) |
57 | gfp = GFP_NOWAIT; | |
58 | else | |
59 | gfp = GFP_NOFS | __GFP_NOFAIL; | |
60 | ||
0a195b91 | 61 | iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), |
9753b868 SR |
62 | gfp); |
63 | if (iop) { | |
64 | spin_lock_init(&iop->uptodate_lock); | |
65 | if (folio_test_uptodate(folio)) | |
66 | bitmap_fill(iop->uptodate, nr_blocks); | |
67 | folio_attach_private(folio, iop); | |
68 | } | |
afc51aaa DW |
69 | return iop; |
70 | } | |
71 | ||
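As a concrete illustration of the allocation above, here is a minimal userspace model (not kernel code: struct_size() and BITS_TO_LONGS() are expanded by hand, and the atomic/spinlock members are replaced with plain ints) showing how large the iomap_page allocation gets for a given folio and block size:

    #include <stdio.h>

    /* userspace stand-in for struct iomap_page */
    struct iomap_page_model {
            int read_bytes_pending;         /* stands in for atomic_t */
            int write_bytes_pending;        /* stands in for atomic_t */
            unsigned long uptodate[];       /* one bit per block in the folio */
    };

    #define BITS_PER_LONG    (8 * sizeof(unsigned long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    int main(void)
    {
            size_t folio_size = 16384;      /* 16KiB folio */
            size_t block_size = 1024;       /* 1KiB filesystem blocks */
            size_t nr_blocks = folio_size / block_size;     /* 16 */
            size_t alloc = sizeof(struct iomap_page_model) +
                           BITS_TO_LONGS(nr_blocks) * sizeof(unsigned long);

            printf("%zu blocks -> %zu uptodate long(s) -> %zu-byte allocation\n",
                   nr_blocks, BITS_TO_LONGS(nr_blocks), alloc);
            return 0;
    }

With the block size equal to the folio size (nr_blocks <= 1) no iomap_page is attached at all, which is why iomap_page_create() returns early in that case.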
c46e8324 | 72 | static void iomap_page_release(struct folio *folio) |
afc51aaa | 73 | { |
c46e8324 MWO |
74 | struct iomap_page *iop = folio_detach_private(folio); |
75 | struct inode *inode = folio->mapping->host; | |
76 | unsigned int nr_blocks = i_blocks_per_folio(inode, folio); | |
afc51aaa DW |
77 | |
78 | if (!iop) | |
79 | return; | |
7d636676 | 80 | WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); |
0fb2d720 | 81 | WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); |
0a195b91 | 82 | WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != |
c46e8324 | 83 | folio_test_uptodate(folio)); |
afc51aaa DW |
84 | kfree(iop); |
85 | } | |
86 | ||
87 | /* | |
431c0566 | 88 | * Calculate the range inside the folio that we actually need to read. |
afc51aaa | 89 | */ |
431c0566 MWO |
90 | static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, |
91 | loff_t *pos, loff_t length, size_t *offp, size_t *lenp) | |
afc51aaa | 92 | { |
431c0566 | 93 | struct iomap_page *iop = to_iomap_page(folio); |
afc51aaa DW |
94 | loff_t orig_pos = *pos; |
95 | loff_t isize = i_size_read(inode); | |
96 | unsigned block_bits = inode->i_blkbits; | |
97 | unsigned block_size = (1 << block_bits); | |
431c0566 MWO |
98 | size_t poff = offset_in_folio(folio, *pos); |
99 | size_t plen = min_t(loff_t, folio_size(folio) - poff, length); | |
afc51aaa DW |
100 | unsigned first = poff >> block_bits; |
101 | unsigned last = (poff + plen - 1) >> block_bits; | |
102 | ||
103 | /* | |
f1f264b4 | 104 | * If the block size is smaller than the page size, we need to check the |
afc51aaa DW |
105 | * per-block uptodate status and adjust the offset and length if needed |
106 | * to avoid reading in already uptodate ranges. | |
107 | */ | |
108 | if (iop) { | |
109 | unsigned int i; | |
110 | ||
111 | /* move forward for each leading block marked uptodate */ | |
112 | for (i = first; i <= last; i++) { | |
113 | if (!test_bit(i, iop->uptodate)) | |
114 | break; | |
115 | *pos += block_size; | |
116 | poff += block_size; | |
117 | plen -= block_size; | |
118 | first++; | |
119 | } | |
120 | ||
121 | /* truncate len if we find any trailing uptodate block(s) */ | |
122 | for ( ; i <= last; i++) { | |
123 | if (test_bit(i, iop->uptodate)) { | |
124 | plen -= (last - i + 1) * block_size; | |
125 | last = i - 1; | |
126 | break; | |
127 | } | |
128 | } | |
129 | } | |
130 | ||
131 | /* | |
f1f264b4 | 132 | * If the extent spans the block that contains the i_size, we need to |
afc51aaa DW |
133 | * handle both halves separately so that we properly zero data in the |
134 | * page cache for blocks that are entirely outside of i_size. | |
135 | */ | |
136 | if (orig_pos <= isize && orig_pos + length > isize) { | |
431c0566 | 137 | unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; |
afc51aaa DW |
138 | |
139 | if (first <= end && last > end) | |
140 | plen -= (last - end) * block_size; | |
141 | } | |
142 | ||
143 | *offp = poff; | |
144 | *lenp = plen; | |
145 | } | |
146 | ||
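The two trimming loops are easier to follow with concrete numbers. Below is a toy userspace model of the same logic (made-up layout, not kernel code; in the kernel the per-block flags live in iop->uptodate):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            /* blocks 0-1 and 4-7 are already uptodate; 2-3 need I/O */
            bool uptodate[8] = { true, true, false, false, true, true, true, true };
            unsigned block_size = 1024, first = 0, last = 7;
            unsigned pos = 0, plen = 8 * block_size;
            unsigned i;

            /* move forward for each leading block marked uptodate */
            for (i = first; i <= last; i++) {
                    if (!uptodate[i])
                            break;
                    pos += block_size;
                    plen -= block_size;
                    first++;
            }
            /* truncate len if we find any trailing uptodate block(s) */
            for (; i <= last; i++) {
                    if (uptodate[i]) {
                            plen -= (last - i + 1) * block_size;
                            last = i - 1;
                            break;
                    }
            }
            printf("read [%u, %u): %u bytes\n", pos, pos + plen, plen);
            return 0;
    }

This prints "read [2048, 4096): 2048 bytes". iomap_readpage_iter() below then returns pos - orig_pos + plen, i.e. the skipped leading bytes plus the range actually submitted, so the caller advances past both.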
431c0566 MWO |
147 | static void iomap_iop_set_range_uptodate(struct folio *folio, |
148 | struct iomap_page *iop, size_t off, size_t len) | |
afc51aaa | 149 | { |
431c0566 | 150 | struct inode *inode = folio->mapping->host; |
afc51aaa DW |
151 | unsigned first = off >> inode->i_blkbits; |
152 | unsigned last = (off + len - 1) >> inode->i_blkbits; | |
1cea335d | 153 | unsigned long flags; |
afc51aaa | 154 | |
1cea335d | 155 | spin_lock_irqsave(&iop->uptodate_lock, flags); |
b21866f5 | 156 | bitmap_set(iop->uptodate, first, last - first + 1); |
431c0566 MWO |
157 | if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio))) |
158 | folio_mark_uptodate(folio); | |
1cea335d CH |
159 | spin_unlock_irqrestore(&iop->uptodate_lock, flags); |
160 | } | |
161 | ||
431c0566 MWO |
162 | static void iomap_set_range_uptodate(struct folio *folio, |
163 | struct iomap_page *iop, size_t off, size_t len) | |
1cea335d | 164 | { |
cd1e5afe | 165 | if (iop) |
431c0566 | 166 | iomap_iop_set_range_uptodate(folio, iop, off, len); |
1cea335d | 167 | else |
431c0566 | 168 | folio_mark_uptodate(folio); |
afc51aaa DW |
169 | } |
170 | ||
8ffd74e9 MWO |
171 | static void iomap_finish_folio_read(struct folio *folio, size_t offset, |
172 | size_t len, int error) | |
afc51aaa | 173 | { |
95c4cd05 | 174 | struct iomap_page *iop = to_iomap_page(folio); |
afc51aaa DW |
175 | |
176 | if (unlikely(error)) { | |
8ffd74e9 MWO |
177 | folio_clear_uptodate(folio); |
178 | folio_set_error(folio); | |
afc51aaa | 179 | } else { |
431c0566 | 180 | iomap_set_range_uptodate(folio, iop, offset, len); |
afc51aaa DW |
181 | } |
182 | ||
8ffd74e9 MWO |
183 | if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending)) |
184 | folio_unlock(folio); | |
afc51aaa DW |
185 | } |
186 | ||
8ffd74e9 | 187 | static void iomap_read_end_io(struct bio *bio) |
afc51aaa DW |
188 | { |
189 | int error = blk_status_to_errno(bio->bi_status); | |
8ffd74e9 | 190 | struct folio_iter fi; |
afc51aaa | 191 | |
8ffd74e9 MWO |
192 | bio_for_each_folio_all(fi, bio) |
193 | iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); | |
afc51aaa DW |
194 | bio_put(bio); |
195 | } | |
196 | ||
197 | struct iomap_readpage_ctx { | |
3aa9c659 MWO |
198 | struct folio *cur_folio; |
199 | bool cur_folio_in_bio; | |
afc51aaa | 200 | struct bio *bio; |
9d24a13a | 201 | struct readahead_control *rac; |
afc51aaa DW |
202 | }; |
203 | ||
5ad448ce AG |
204 | /** |
205 | * iomap_read_inline_data - copy inline data into the page cache | |
206 | * @iter: iteration structure | |
874628a2 | 207 | * @folio: folio to copy to |
5ad448ce | 208 | * |
874628a2 | 209 | * Copy the inline data in @iter into @folio and zero out the rest of the folio. |
5ad448ce AG |
210 | * Only a single IOMAP_INLINE extent is allowed at the end of each file. |
211 | * Returns zero for success to complete the read, or the usual negative errno. | |
212 | */ | |
213 | static int iomap_read_inline_data(const struct iomap_iter *iter, | |
874628a2 | 214 | struct folio *folio) |
afc51aaa | 215 | { |
cd1e5afe | 216 | struct iomap_page *iop; |
fad0a1ab | 217 | const struct iomap *iomap = iomap_iter_srcmap(iter); |
1b5c1e36 | 218 | size_t size = i_size_read(iter->inode) - iomap->offset; |
b405435b | 219 | size_t poff = offset_in_page(iomap->offset); |
431c0566 | 220 | size_t offset = offset_in_folio(folio, iomap->offset); |
afc51aaa DW |
221 | void *addr; |
222 | ||
874628a2 | 223 | if (folio_test_uptodate(folio)) |
5ad448ce | 224 | return 0; |
afc51aaa | 225 | |
ae44f9c2 MWO |
226 | if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) |
227 | return -EIO; | |
69f4a26c GX |
228 | if (WARN_ON_ONCE(size > PAGE_SIZE - |
229 | offset_in_page(iomap->inline_data))) | |
230 | return -EIO; | |
231 | if (WARN_ON_ONCE(size > iomap->length)) | |
232 | return -EIO; | |
431c0566 | 233 | if (offset > 0) |
9753b868 | 234 | iop = iomap_page_create(iter->inode, folio, iter->flags); |
cd1e5afe MWO |
235 | else |
236 | iop = to_iomap_page(folio); | |
afc51aaa | 237 | |
874628a2 | 238 | addr = kmap_local_folio(folio, offset); |
afc51aaa | 239 | memcpy(addr, iomap->inline_data, size); |
b405435b | 240 | memset(addr + size, 0, PAGE_SIZE - poff - size); |
ab069d5f | 241 | kunmap_local(addr); |
431c0566 | 242 | iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff); |
5ad448ce | 243 | return 0; |
afc51aaa DW |
244 | } |
245 | ||
fad0a1ab | 246 | static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, |
1b5c1e36 | 247 | loff_t pos) |
009d8d84 | 248 | { |
fad0a1ab | 249 | const struct iomap *srcmap = iomap_iter_srcmap(iter); |
1b5c1e36 CH |
250 | |
251 | return srcmap->type != IOMAP_MAPPED || | |
252 | (srcmap->flags & IOMAP_F_NEW) || | |
253 | pos >= i_size_read(iter->inode); | |
009d8d84 CH |
254 | } |
255 | ||
fad0a1ab | 256 | static loff_t iomap_readpage_iter(const struct iomap_iter *iter, |
f6d48000 | 257 | struct iomap_readpage_ctx *ctx, loff_t offset) |
afc51aaa | 258 | { |
fad0a1ab | 259 | const struct iomap *iomap = &iter->iomap; |
f6d48000 CH |
260 | loff_t pos = iter->pos + offset; |
261 | loff_t length = iomap_length(iter) - offset; | |
3aa9c659 | 262 | struct folio *folio = ctx->cur_folio; |
637d3375 | 263 | struct iomap_page *iop; |
afc51aaa | 264 | loff_t orig_pos = pos; |
431c0566 | 265 | size_t poff, plen; |
afc51aaa DW |
266 | sector_t sector; |
267 | ||
5ad448ce | 268 | if (iomap->type == IOMAP_INLINE) |
874628a2 | 269 | return iomap_read_inline_data(iter, folio); |
afc51aaa DW |
270 | |
271 | /* zero post-eof blocks as the page may be mapped */ | |
9753b868 | 272 | iop = iomap_page_create(iter->inode, folio, iter->flags); |
431c0566 | 273 | iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); |
afc51aaa DW |
274 | if (plen == 0) |
275 | goto done; | |
276 | ||
1b5c1e36 | 277 | if (iomap_block_needs_zeroing(iter, pos)) { |
431c0566 MWO |
278 | folio_zero_range(folio, poff, plen); |
279 | iomap_set_range_uptodate(folio, iop, poff, plen); | |
afc51aaa DW |
280 | goto done; |
281 | } | |
282 | ||
3aa9c659 | 283 | ctx->cur_folio_in_bio = true; |
7d636676 MWO |
284 | if (iop) |
285 | atomic_add(plen, &iop->read_bytes_pending); | |
afc51aaa | 286 | |
afc51aaa | 287 | sector = iomap_sector(iomap, pos); |
d0364f94 CH |
288 | if (!ctx->bio || |
289 | bio_end_sector(ctx->bio) != sector || | |
431c0566 | 290 | !bio_add_folio(ctx->bio, folio, plen, poff)) { |
3aa9c659 | 291 | gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); |
457df33e | 292 | gfp_t orig_gfp = gfp; |
5f7136db | 293 | unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); |
afc51aaa DW |
294 | |
295 | if (ctx->bio) | |
296 | submit_bio(ctx->bio); | |
297 | ||
9d24a13a | 298 | if (ctx->rac) /* same as readahead_gfp_mask */ |
afc51aaa | 299 | gfp |= __GFP_NORETRY | __GFP_NOWARN; |
07888c66 CH |
300 | ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), |
301 | REQ_OP_READ, gfp); | |
457df33e MWO |
302 | /* |
303 | * If the bio_alloc fails, try it again for a single page to | |
304 | * avoid having to deal with partial page reads. This emulates | |
f132ab7d | 305 | * what do_mpage_read_folio does. |
457df33e | 306 | */ |
07888c66 CH |
307 | if (!ctx->bio) { |
308 | ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, | |
309 | orig_gfp); | |
310 | } | |
9d24a13a | 311 | if (ctx->rac) |
afc51aaa DW |
312 | ctx->bio->bi_opf |= REQ_RAHEAD; |
313 | ctx->bio->bi_iter.bi_sector = sector; | |
afc51aaa | 314 | ctx->bio->bi_end_io = iomap_read_end_io; |
c2478469 | 315 | bio_add_folio_nofail(ctx->bio, folio, plen, poff); |
afc51aaa | 316 | } |
431c0566 | 317 | |
afc51aaa DW |
318 | done: |
319 | /* | |
320 | * Move the caller beyond our range so that it keeps making progress. | |
f1f264b4 | 321 | * For that, we have to include any leading non-uptodate ranges, but |
afc51aaa DW |
322 | * we can skip trailing ones as they will be handled in the next |
323 | * iteration. | |
324 | */ | |
325 | return pos - orig_pos + plen; | |
326 | } | |
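For the example layout used in the model above (leading uptodate blocks 0-1, submitted blocks 2-3), the return value would be 2048 + 2048 = 4096: the next iteration resumes at the first trailing uptodate block, which the adjust logic will then skip again. This is a worked restatement of the comment above, not additional behavior.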
327 | ||
7479c505 | 328 | int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) |
afc51aaa | 329 | { |
f6d48000 | 330 | struct iomap_iter iter = { |
3aa9c659 MWO |
331 | .inode = folio->mapping->host, |
332 | .pos = folio_pos(folio), | |
333 | .len = folio_size(folio), | |
f6d48000 CH |
334 | }; |
335 | struct iomap_readpage_ctx ctx = { | |
3aa9c659 | 336 | .cur_folio = folio, |
f6d48000 CH |
337 | }; |
338 | int ret; | |
afc51aaa | 339 | |
3aa9c659 | 340 | trace_iomap_readpage(iter.inode, 1); |
9e91c572 | 341 | |
f6d48000 CH |
342 | while ((ret = iomap_iter(&iter, ops)) > 0) |
343 | iter.processed = iomap_readpage_iter(&iter, &ctx, 0); | |
344 | ||
345 | if (ret < 0) | |
3aa9c659 | 346 | folio_set_error(folio); |
afc51aaa DW |
347 | |
348 | if (ctx.bio) { | |
349 | submit_bio(ctx.bio); | |
3aa9c659 | 350 | WARN_ON_ONCE(!ctx.cur_folio_in_bio); |
afc51aaa | 351 | } else { |
3aa9c659 MWO |
352 | WARN_ON_ONCE(ctx.cur_folio_in_bio); |
353 | folio_unlock(folio); | |
afc51aaa DW |
354 | } |
355 | ||
356 | /* | |
2c69e205 | 357 | * Just like mpage_readahead and block_read_full_folio, we always |
7479c505 | 358 | * return 0 and just set the folio error flag on errors. This |
f1f264b4 | 359 | * should be cleaned up throughout the stack eventually. |
afc51aaa DW |
360 | */ |
361 | return 0; | |
362 | } | |
7479c505 | 363 | EXPORT_SYMBOL_GPL(iomap_read_folio); |
afc51aaa | 364 | |
fad0a1ab | 365 | static loff_t iomap_readahead_iter(const struct iomap_iter *iter, |
f6d48000 | 366 | struct iomap_readpage_ctx *ctx) |
afc51aaa | 367 | { |
f6d48000 | 368 | loff_t length = iomap_length(iter); |
afc51aaa DW |
369 | loff_t done, ret; |
370 | ||
371 | for (done = 0; done < length; done += ret) { | |
3aa9c659 MWO |
372 | if (ctx->cur_folio && |
373 | offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { | |
374 | if (!ctx->cur_folio_in_bio) | |
375 | folio_unlock(ctx->cur_folio); | |
376 | ctx->cur_folio = NULL; | |
afc51aaa | 377 | } |
3aa9c659 MWO |
378 | if (!ctx->cur_folio) { |
379 | ctx->cur_folio = readahead_folio(ctx->rac); | |
380 | ctx->cur_folio_in_bio = false; | |
afc51aaa | 381 | } |
f6d48000 | 382 | ret = iomap_readpage_iter(iter, ctx, done); |
d8af404f AG |
383 | if (ret <= 0) |
384 | return ret; | |
afc51aaa DW |
385 | } |
386 | ||
387 | return done; | |
388 | } | |
389 | ||
9d24a13a MWO |
390 | /** |
391 | * iomap_readahead - Attempt to read pages from a file. | |
392 | * @rac: Describes the pages to be read. | |
393 | * @ops: The operations vector for the filesystem. | |
394 | * | |
395 | * This function is for filesystems to call to implement their readahead | |
396 | * address_space operation. | |
397 | * | |
398 | * Context: The @ops callbacks may submit I/O (e.g. to read the addresses of
399 | * blocks from disc), and may wait for it. The caller may be trying to | |
400 | * access a different page, and so sleeping excessively should be avoided. | |
401 | * It may allocate memory, but should avoid costly allocations. This | |
402 | * function is called with memalloc_nofs set, so allocations will not cause | |
403 | * the filesystem to be reentered. | |
404 | */ | |
405 | void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) | |
afc51aaa | 406 | { |
f6d48000 CH |
407 | struct iomap_iter iter = { |
408 | .inode = rac->mapping->host, | |
409 | .pos = readahead_pos(rac), | |
410 | .len = readahead_length(rac), | |
411 | }; | |
afc51aaa | 412 | struct iomap_readpage_ctx ctx = { |
9d24a13a | 413 | .rac = rac, |
afc51aaa | 414 | }; |
afc51aaa | 415 | |
f6d48000 | 416 | trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); |
9e91c572 | 417 | |
f6d48000 CH |
418 | while (iomap_iter(&iter, ops) > 0) |
419 | iter.processed = iomap_readahead_iter(&iter, &ctx); | |
9d24a13a | 420 | |
afc51aaa DW |
421 | if (ctx.bio) |
422 | submit_bio(ctx.bio); | |
3aa9c659 MWO |
423 | if (ctx.cur_folio) { |
424 | if (!ctx.cur_folio_in_bio) | |
425 | folio_unlock(ctx.cur_folio); | |
afc51aaa | 426 | } |
afc51aaa | 427 | } |
9d24a13a | 428 | EXPORT_SYMBOL_GPL(iomap_readahead); |
afc51aaa DW |
429 | |
430 | /* | |
2e7e80f7 | 431 | * iomap_is_partially_uptodate checks whether blocks within a folio are |
afc51aaa DW |
432 | * uptodate or not. |
433 | * | |
2e7e80f7 MWO |
434 | * Returns true if all blocks which correspond to the specified part |
435 | * of the folio are uptodate. | |
afc51aaa | 436 | */ |
2e7e80f7 | 437 | bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) |
afc51aaa | 438 | { |
95c4cd05 | 439 | struct iomap_page *iop = to_iomap_page(folio); |
2e7e80f7 | 440 | struct inode *inode = folio->mapping->host; |
2e7e80f7 | 441 | unsigned first, last, i; |
afc51aaa | 442 | |
2e7e80f7 MWO |
443 | if (!iop) |
444 | return false; | |
afc51aaa | 445 | |
2756c818 MWO |
446 | /* Caller's range may extend past the end of this folio */ |
447 | count = min(folio_size(folio) - from, count); | |
afc51aaa | 448 | |
2756c818 | 449 | /* First and last blocks in range within folio */ |
afc51aaa | 450 | first = from >> inode->i_blkbits; |
2756c818 | 451 | last = (from + count - 1) >> inode->i_blkbits; |
afc51aaa | 452 | |
2e7e80f7 MWO |
453 | for (i = first; i <= last; i++) |
454 | if (!test_bit(i, iop->uptodate)) | |
455 | return false; | |
456 | return true; | |
afc51aaa DW |
457 | } |
458 | EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); | |
459 | ||
98321b51 AG |
460 | /** |
461 | * iomap_get_folio - get a folio reference for writing | |
462 | * @iter: iteration structure | |
463 | * @pos: start offset of write | |
464 | * | |
465 | * Returns a locked reference to the folio at @pos, or an error pointer if the | |
466 | * folio could not be obtained. | |
467 | */ | |
468 | struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) | |
469 | { | |
e999a5c5 | 470 | unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; |
98321b51 AG |
471 | |
472 | if (iter->flags & IOMAP_NOWAIT) | |
473 | fgp |= FGP_NOWAIT; | |
474 | ||
66dabbb6 | 475 | return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, |
98321b51 | 476 | fgp, mapping_gfp_mask(iter->inode->i_mapping)); |
98321b51 AG |
477 | } |
478 | EXPORT_SYMBOL_GPL(iomap_get_folio); | |
479 | ||
8597447d | 480 | bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) |
afc51aaa | 481 | { |
8597447d | 482 | trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), |
39f16c83 | 483 | folio_size(folio)); |
9e91c572 | 484 | |
afc51aaa | 485 | /* |
8597447d MWO |
486 | * mm accommodates an old ext3 case where clean folios might |
487 | * not have had the dirty bit cleared. Thus, it can send actual | |
488 | * dirty folios to ->release_folio() via shrink_active_list(); | |
489 | * skip those here. | |
afc51aaa | 490 | */ |
39f16c83 | 491 | if (folio_test_dirty(folio) || folio_test_writeback(folio)) |
8597447d | 492 | return false; |
c46e8324 | 493 | iomap_page_release(folio); |
8597447d | 494 | return true; |
afc51aaa | 495 | } |
8597447d | 496 | EXPORT_SYMBOL_GPL(iomap_release_folio); |
afc51aaa | 497 | |
8306a5f5 | 498 | void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) |
afc51aaa | 499 | { |
d82354f6 | 500 | trace_iomap_invalidate_folio(folio->mapping->host, |
1241ebec | 501 | folio_pos(folio) + offset, len); |
9e91c572 | 502 | |
afc51aaa | 503 | /* |
60d82310 MWO |
504 | * If we're invalidating the entire folio, clear the dirty state |
505 | * from it and release it to avoid unnecessary buildup of the LRU. | |
afc51aaa | 506 | */ |
8306a5f5 MWO |
507 | if (offset == 0 && len == folio_size(folio)) { |
508 | WARN_ON_ONCE(folio_test_writeback(folio)); | |
509 | folio_cancel_dirty(folio); | |
c46e8324 | 510 | iomap_page_release(folio); |
60d82310 MWO |
511 | } else if (folio_test_large(folio)) { |
512 | /* Must release the iop so the page can be split */ | |
513 | WARN_ON_ONCE(!folio_test_uptodate(folio) && | |
514 | folio_test_dirty(folio)); | |
515 | iomap_page_release(folio); | |
afc51aaa DW |
516 | } |
517 | } | |
8306a5f5 MWO |
518 | EXPORT_SYMBOL_GPL(iomap_invalidate_folio); |
519 | ||
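Filesystems wire the read and cache-management helpers above into their address_space_operations. A minimal sketch of the usual glue, loosely following fs/xfs/xfs_aops.c (the myfs_* names and the myfs_iomap_ops instance are assumptions, standing in for the filesystem's own definitions):

    /* myfs_iomap_ops is hypothetical; it supplies ->iomap_begin for this file */
    static int myfs_read_folio(struct file *unused, struct folio *folio)
    {
            return iomap_read_folio(folio, &myfs_iomap_ops);
    }

    static void myfs_readahead(struct readahead_control *rac)
    {
            iomap_readahead(rac, &myfs_iomap_ops);
    }

    const struct address_space_operations myfs_aops = {
            .read_folio             = myfs_read_folio,
            .readahead              = myfs_readahead,
            .release_folio          = iomap_release_folio,
            .invalidate_folio       = iomap_invalidate_folio,
            .is_partially_uptodate  = iomap_is_partially_uptodate,
    };

Note that release_folio, invalidate_folio and is_partially_uptodate can be used directly as methods, since their signatures match the aops callbacks.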
afc51aaa DW |
520 | static void |
521 | iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) | |
522 | { | |
523 | loff_t i_size = i_size_read(inode); | |
524 | ||
525 | /* | |
526 | * Only truncate newly allocated pages beyond EOF, even if the
527 | * write started inside the existing inode size. | |
528 | */ | |
529 | if (pos + len > i_size) | |
b71450e2 AG |
530 | truncate_pagecache_range(inode, max(pos, i_size), |
531 | pos + len - 1); | |
afc51aaa DW |
532 | } |
533 | ||
431c0566 MWO |
534 | static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, |
535 | size_t poff, size_t plen, const struct iomap *iomap) | |
afc51aaa DW |
536 | { |
537 | struct bio_vec bvec; | |
538 | struct bio bio; | |
539 | ||
49add496 | 540 | bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); |
afc51aaa | 541 | bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); |
c2478469 | 542 | bio_add_folio_nofail(&bio, folio, plen, poff); |
afc51aaa DW |
543 | return submit_bio_wait(&bio); |
544 | } | |
545 | ||
fad0a1ab | 546 | static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, |
bc6123a8 | 547 | size_t len, struct folio *folio) |
afc51aaa | 548 | { |
fad0a1ab | 549 | const struct iomap *srcmap = iomap_iter_srcmap(iter); |
9753b868 | 550 | struct iomap_page *iop; |
1b5c1e36 | 551 | loff_t block_size = i_blocksize(iter->inode); |
6cc19c5f NB |
552 | loff_t block_start = round_down(pos, block_size); |
553 | loff_t block_end = round_up(pos + len, block_size); | |
cae2de69 | 554 | unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); |
431c0566 MWO |
555 | size_t from = offset_in_folio(folio, pos), to = from + len; |
556 | size_t poff, plen; | |
afc51aaa | 557 | |
431c0566 | 558 | if (folio_test_uptodate(folio)) |
afc51aaa | 559 | return 0; |
431c0566 | 560 | folio_clear_error(folio); |
afc51aaa | 561 | |
9753b868 | 562 | iop = iomap_page_create(iter->inode, folio, iter->flags); |
cae2de69 SR |
563 | if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) |
564 | return -EAGAIN; | |
9753b868 | 565 | |
afc51aaa | 566 | do { |
431c0566 | 567 | iomap_adjust_read_range(iter->inode, folio, &block_start, |
afc51aaa DW |
568 | block_end - block_start, &poff, &plen); |
569 | if (plen == 0) | |
570 | break; | |
571 | ||
b74b1293 | 572 | if (!(iter->flags & IOMAP_UNSHARE) && |
32a38a49 | 573 | (from <= poff || from >= poff + plen) && |
d3b40439 CH |
574 | (to <= poff || to >= poff + plen)) |
575 | continue; | |
576 | ||
1b5c1e36 | 577 | if (iomap_block_needs_zeroing(iter, block_start)) { |
b74b1293 | 578 | if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) |
32a38a49 | 579 | return -EIO; |
431c0566 | 580 | folio_zero_segments(folio, poff, from, to, poff + plen); |
14284fed | 581 | } else { |
cae2de69 SR |
582 | int status; |
583 | ||
584 | if (iter->flags & IOMAP_NOWAIT) | |
585 | return -EAGAIN; | |
586 | ||
587 | status = iomap_read_folio_sync(block_start, folio, | |
14284fed MWO |
588 | poff, plen, srcmap); |
589 | if (status) | |
590 | return status; | |
afc51aaa | 591 | } |
431c0566 | 592 | iomap_set_range_uptodate(folio, iop, poff, plen); |
afc51aaa DW |
593 | } while ((block_start += plen) < block_end); |
594 | ||
d3b40439 | 595 | return 0; |
afc51aaa DW |
596 | } |
597 | ||
07c22b56 AG |
598 | static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, |
599 | size_t len) | |
600 | { | |
471859f5 | 601 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
07c22b56 | 602 | |
471859f5 AG |
603 | if (folio_ops && folio_ops->get_folio) |
604 | return folio_ops->get_folio(iter, pos, len); | |
07c22b56 AG |
605 | else |
606 | return iomap_get_folio(iter, pos); | |
607 | } | |
608 | ||
7a70a508 AG |
609 | static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, |
610 | struct folio *folio) | |
611 | { | |
471859f5 | 612 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
7a70a508 | 613 | |
471859f5 AG |
614 | if (folio_ops && folio_ops->put_folio) { |
615 | folio_ops->put_folio(iter->inode, pos, ret, folio); | |
9060bc4d | 616 | } else { |
7a70a508 | 617 | folio_unlock(folio); |
7a70a508 | 618 | folio_put(folio); |
80baab88 | 619 | } |
7a70a508 AG |
620 | } |
621 | ||
fad0a1ab | 622 | static int iomap_write_begin_inline(const struct iomap_iter *iter, |
bc6123a8 | 623 | struct folio *folio) |
69f4a26c GX |
624 | { |
625 | /* needs more work for the tailpacking case; disable for now */ | |
1b5c1e36 | 626 | if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) |
69f4a26c | 627 | return -EIO; |
874628a2 | 628 | return iomap_read_inline_data(iter, folio); |
69f4a26c GX |
629 | } |
630 | ||
d7b64041 | 631 | static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, |
bc6123a8 | 632 | size_t len, struct folio **foliop) |
afc51aaa | 633 | { |
471859f5 | 634 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
fad0a1ab | 635 | const struct iomap *srcmap = iomap_iter_srcmap(iter); |
d1bd0b4e | 636 | struct folio *folio; |
afc51aaa DW |
637 | int status = 0; |
638 | ||
1b5c1e36 CH |
639 | BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); |
640 | if (srcmap != &iter->iomap) | |
c039b997 | 641 | BUG_ON(pos + len > srcmap->offset + srcmap->length); |
afc51aaa DW |
642 | |
643 | if (fatal_signal_pending(current)) | |
644 | return -EINTR; | |
645 | ||
d454ab82 MWO |
646 | if (!mapping_large_folio_support(iter->inode->i_mapping)) |
647 | len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); | |
648 | ||
07c22b56 | 649 | folio = __iomap_get_folio(iter, pos, len); |
9060bc4d | 650 | if (IS_ERR(folio)) |
98321b51 | 651 | return PTR_ERR(folio); |
d7b64041 DC |
652 | |
653 | /* | |
654 | * Now we have a locked folio, before we do anything with it we need to | |
655 | * check that the iomap we have cached is not stale. The inode extent | |
656 | * mapping can change due to concurrent IO in flight (e.g. | |
657 | * IOMAP_UNWRITTEN state can change and memory reclaim could have | |
658 | * reclaimed a previously partially written page at this index after IO | |
659 | * completion before this write reaches this file offset) and hence we | |
660 | * could do the wrong thing here (zero a page range incorrectly or fail | |
661 | * to zero) and corrupt data. | |
662 | */ | |
471859f5 AG |
663 | if (folio_ops && folio_ops->iomap_valid) { |
664 | bool iomap_valid = folio_ops->iomap_valid(iter->inode, | |
665 | &iter->iomap); | |
d7b64041 DC |
666 | if (!iomap_valid) { |
667 | iter->iomap.flags |= IOMAP_F_STALE; | |
668 | status = 0; | |
669 | goto out_unlock; | |
670 | } | |
671 | } | |
672 | ||
d454ab82 MWO |
673 | if (pos + len > folio_pos(folio) + folio_size(folio)) |
674 | len = folio_pos(folio) + folio_size(folio) - pos; | |
afc51aaa | 675 | |
c039b997 | 676 | if (srcmap->type == IOMAP_INLINE) |
bc6123a8 | 677 | status = iomap_write_begin_inline(iter, folio); |
1b5c1e36 | 678 | else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) |
d1bd0b4e | 679 | status = __block_write_begin_int(folio, pos, len, NULL, srcmap); |
afc51aaa | 680 | else |
bc6123a8 | 681 | status = __iomap_write_begin(iter, pos, len, folio); |
afc51aaa DW |
682 | |
683 | if (unlikely(status)) | |
684 | goto out_unlock; | |
685 | ||
bc6123a8 | 686 | *foliop = folio; |
afc51aaa DW |
687 | return 0; |
688 | ||
689 | out_unlock: | |
7a70a508 | 690 | __iomap_put_folio(iter, pos, 0, folio); |
1b5c1e36 | 691 | iomap_write_failed(iter->inode, pos, len); |
afc51aaa | 692 | |
afc51aaa DW |
693 | return status; |
694 | } | |
695 | ||
e25ba8cb | 696 | static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, |
bc6123a8 | 697 | size_t copied, struct folio *folio) |
afc51aaa | 698 | { |
cd1e5afe | 699 | struct iomap_page *iop = to_iomap_page(folio); |
bc6123a8 | 700 | flush_dcache_folio(folio); |
afc51aaa DW |
701 | |
702 | /* | |
703 | * The blocks that were entirely written will now be uptodate, so we | |
7479c505 | 704 | * don't have to worry about a read_folio reading them and overwriting a |
f1f264b4 | 705 | * partial write. However, if we've encountered a short write and only |
afc51aaa | 706 | * partially written into a block, it will not be marked uptodate, so a |
7479c505 | 707 | * read_folio might come in and destroy our partial write. |
afc51aaa | 708 | * |
f1f264b4 AG |
709 | * Do the simplest thing and just treat any short write to a |
710 | * non-uptodate page as a zero-length write, and force the caller to | |
711 | * redo the whole thing. | |
afc51aaa | 712 | */ |
bc6123a8 | 713 | if (unlikely(copied < len && !folio_test_uptodate(folio))) |
afc51aaa | 714 | return 0; |
431c0566 | 715 | iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len); |
bc6123a8 | 716 | filemap_dirty_folio(inode->i_mapping, folio); |
afc51aaa DW |
717 | return copied; |
718 | } | |
719 | ||
fad0a1ab | 720 | static size_t iomap_write_end_inline(const struct iomap_iter *iter, |
9c4ce08d | 721 | struct folio *folio, loff_t pos, size_t copied) |
afc51aaa | 722 | { |
fad0a1ab | 723 | const struct iomap *iomap = &iter->iomap; |
afc51aaa DW |
724 | void *addr; |
725 | ||
9c4ce08d | 726 | WARN_ON_ONCE(!folio_test_uptodate(folio)); |
69f4a26c | 727 | BUG_ON(!iomap_inline_data_valid(iomap)); |
afc51aaa | 728 | |
9c4ce08d MWO |
729 | flush_dcache_folio(folio); |
730 | addr = kmap_local_folio(folio, pos); | |
ab069d5f MWO |
731 | memcpy(iomap_inline_data(iomap, pos), addr, copied); |
732 | kunmap_local(addr); | |
afc51aaa | 733 | |
1b5c1e36 | 734 | mark_inode_dirty(iter->inode); |
afc51aaa DW |
735 | return copied; |
736 | } | |
737 | ||
e25ba8cb | 738 | /* Returns the number of bytes copied. May be 0. Cannot be an errno. */ |
1b5c1e36 | 739 | static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, |
bc6123a8 | 740 | size_t copied, struct folio *folio) |
afc51aaa | 741 | { |
fad0a1ab | 742 | const struct iomap *srcmap = iomap_iter_srcmap(iter); |
1b5c1e36 | 743 | loff_t old_size = iter->inode->i_size; |
e25ba8cb | 744 | size_t ret; |
afc51aaa | 745 | |
c039b997 | 746 | if (srcmap->type == IOMAP_INLINE) { |
9c4ce08d | 747 | ret = iomap_write_end_inline(iter, folio, pos, copied); |
c039b997 | 748 | } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { |
1b5c1e36 | 749 | ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, |
bc6123a8 | 750 | copied, &folio->page, NULL); |
afc51aaa | 751 | } else { |
bc6123a8 | 752 | ret = __iomap_write_end(iter->inode, pos, len, copied, folio); |
afc51aaa DW |
753 | } |
754 | ||
755 | /* | |
756 | * Update the in-memory inode size after copying the data into the page | |
757 | * cache. It's up to the file system to write the updated size to disk, | |
758 | * preferably after I/O completion so that no stale data is exposed. | |
759 | */ | |
760 | if (pos + ret > old_size) { | |
1b5c1e36 CH |
761 | i_size_write(iter->inode, pos + ret); |
762 | iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; | |
afc51aaa | 763 | } |
7a70a508 | 764 | __iomap_put_folio(iter, pos, ret, folio); |
afc51aaa DW |
765 | |
766 | if (old_size < pos) | |
1b5c1e36 | 767 | pagecache_isize_extended(iter->inode, old_size, pos); |
afc51aaa | 768 | if (ret < len) |
d74999c8 | 769 | iomap_write_failed(iter->inode, pos + ret, len - ret); |
afc51aaa DW |
770 | return ret; |
771 | } | |
772 | ||
ce83a025 | 773 | static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) |
afc51aaa | 774 | { |
ce83a025 CH |
775 | loff_t length = iomap_length(iter); |
776 | loff_t pos = iter->pos; | |
afc51aaa | 777 | ssize_t written = 0; |
ce83a025 | 778 | long status = 0; |
cae2de69 SR |
779 | struct address_space *mapping = iter->inode->i_mapping; |
780 | unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; | |
afc51aaa DW |
781 | |
782 | do { | |
bc6123a8 | 783 | struct folio *folio; |
afc51aaa DW |
784 | struct page *page; |
785 | unsigned long offset; /* Offset into pagecache page */ | |
786 | unsigned long bytes; /* Bytes to write to page */ | |
787 | size_t copied; /* Bytes copied from user */ | |
788 | ||
789 | offset = offset_in_page(pos); | |
790 | bytes = min_t(unsigned long, PAGE_SIZE - offset, | |
791 | iov_iter_count(i)); | |
792 | again: | |
cae2de69 SR |
793 | status = balance_dirty_pages_ratelimited_flags(mapping, |
794 | bdp_flags); | |
795 | if (unlikely(status)) | |
796 | break; | |
797 | ||
afc51aaa DW |
798 | if (bytes > length) |
799 | bytes = length; | |
800 | ||
801 | /* | |
f1f264b4 | 802 | * Bring in the user page that we'll copy from _first_. |
afc51aaa DW |
803 | * Otherwise there's a nasty deadlock on copying from the |
804 | * same page as we're writing to, without it being marked | |
805 | * up-to-date. | |
cae2de69 SR |
806 | * |
807 | * For async buffered writes the assumption is that the user | |
808 | * page has already been faulted in. This can be optimized by | |
809 | * faulting the user page. | |
afc51aaa | 810 | */ |
631f871f | 811 | if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { |
afc51aaa DW |
812 | status = -EFAULT; |
813 | break; | |
814 | } | |
815 | ||
bc6123a8 | 816 | status = iomap_write_begin(iter, pos, bytes, &folio); |
afc51aaa DW |
817 | if (unlikely(status)) |
818 | break; | |
d7b64041 DC |
819 | if (iter->iomap.flags & IOMAP_F_STALE) |
820 | break; | |
afc51aaa | 821 | |
bc6123a8 | 822 | page = folio_file_page(folio, pos >> PAGE_SHIFT); |
cae2de69 | 823 | if (mapping_writably_mapped(mapping)) |
afc51aaa DW |
824 | flush_dcache_page(page); |
825 | ||
f0b65f39 | 826 | copied = copy_page_from_iter_atomic(page, offset, bytes, i); |
afc51aaa | 827 | |
bc6123a8 | 828 | status = iomap_write_end(iter, pos, bytes, copied, folio); |
afc51aaa | 829 | |
f0b65f39 AV |
830 | if (unlikely(copied != status)) |
831 | iov_iter_revert(i, copied - status); | |
afc51aaa | 832 | |
f0b65f39 | 833 | cond_resched(); |
bc1bb416 | 834 | if (unlikely(status == 0)) { |
afc51aaa | 835 | /* |
bc1bb416 AV |
836 | * A short copy made iomap_write_end() reject the |
837 | * thing entirely. Might be memory poisoning | |
838 | * halfway through, might be a race with munmap, | |
839 | * might be severe memory pressure. | |
afc51aaa | 840 | */ |
bc1bb416 AV |
841 | if (copied) |
842 | bytes = copied; | |
afc51aaa DW |
843 | goto again; |
844 | } | |
f0b65f39 AV |
845 | pos += status; |
846 | written += status; | |
847 | length -= status; | |
afc51aaa DW |
848 | } while (iov_iter_count(i) && length); |
849 | ||
18e419f6 SR |
850 | if (status == -EAGAIN) { |
851 | iov_iter_revert(i, written); | |
852 | return -EAGAIN; | |
853 | } | |
afc51aaa DW |
854 | return written ? written : status; |
855 | } | |
856 | ||
857 | ssize_t | |
ce83a025 | 858 | iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, |
afc51aaa DW |
859 | const struct iomap_ops *ops) |
860 | { | |
ce83a025 CH |
861 | struct iomap_iter iter = { |
862 | .inode = iocb->ki_filp->f_mapping->host, | |
863 | .pos = iocb->ki_pos, | |
864 | .len = iov_iter_count(i), | |
865 | .flags = IOMAP_WRITE, | |
866 | }; | |
219580ee | 867 | ssize_t ret; |
afc51aaa | 868 | |
cae2de69 SR |
869 | if (iocb->ki_flags & IOCB_NOWAIT) |
870 | iter.flags |= IOMAP_NOWAIT; | |
871 | ||
ce83a025 CH |
872 | while ((ret = iomap_iter(&iter, ops)) > 0) |
873 | iter.processed = iomap_write_iter(&iter, i); | |
219580ee CH |
874 | |
875 | if (unlikely(ret < 0)) | |
ce83a025 | 876 | return ret; |
219580ee CH |
877 | ret = iter.pos - iocb->ki_pos; |
878 | iocb->ki_pos += ret; | |
879 | return ret; | |
afc51aaa DW |
880 | } |
881 | EXPORT_SYMBOL_GPL(iomap_file_buffered_write); | |
882 | ||
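The typical caller is a filesystem's ->write_iter method. A hedged sketch of the usual glue, loosely following xfs_file_buffered_write() (the locking policy and the myfs_* names are assumptions; real callers also handle ENOSPC retries and direct-I/O fallback):

    /* sketch only; myfs_buffered_write_iomap_ops is hypothetical */
    static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    {
            struct inode *inode = file_inode(iocb->ki_filp);
            ssize_t ret;

            inode_lock(inode);
            ret = generic_write_checks(iocb, from);
            if (ret > 0)
                    ret = iomap_file_buffered_write(iocb, from,
                                    &myfs_buffered_write_iomap_ops);
            inode_unlock(inode);

            if (ret > 0)
                    ret = generic_write_sync(iocb, ret);
            return ret;
    }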
f43dc4dc DC |
883 | /* |
884 | * Scan the data range passed to us for dirty page cache folios. If we find a | |
885 | * dirty folio, punch out the preceding range and update the offset from
886 | * which the next punch will start.
887 | * | |
888 | * We can punch out storage reservations under clean pages because they either | |
889 | * contain data that has been written back - in which case the delalloc punch | |
890 | * over that range is a no-op - or they were instantiated by read faults, in
891 | * which case they contain zeroes and we can remove the delalloc backing range;
892 | * any new writes to those pages will do the normal hole filling operation...
893 | * | |
894 | * This makes the logic simple: we only need to keep delalloc extents
895 | * over the dirty ranges of the page cache.
896 | * | |
897 | * This function uses [start_byte, end_byte) intervals (i.e. open ended) to | |
898 | * simplify range iterations. | |
899 | */ | |
900 | static int iomap_write_delalloc_scan(struct inode *inode, | |
901 | loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, | |
902 | int (*punch)(struct inode *inode, loff_t offset, loff_t length)) | |
903 | { | |
904 | while (start_byte < end_byte) { | |
905 | struct folio *folio; | |
906 | ||
907 | /* grab locked page */ | |
908 | folio = filemap_lock_folio(inode->i_mapping, | |
909 | start_byte >> PAGE_SHIFT); | |
66dabbb6 | 910 | if (IS_ERR(folio)) { |
f43dc4dc DC |
911 | start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + |
912 | PAGE_SIZE; | |
913 | continue; | |
914 | } | |
915 | ||
916 | /* if dirty, punch up to offset */ | |
917 | if (folio_test_dirty(folio)) { | |
918 | if (start_byte > *punch_start_byte) { | |
919 | int error; | |
920 | ||
921 | error = punch(inode, *punch_start_byte, | |
922 | start_byte - *punch_start_byte); | |
923 | if (error) { | |
924 | folio_unlock(folio); | |
925 | folio_put(folio); | |
926 | return error; | |
927 | } | |
928 | } | |
929 | ||
930 | /* | |
931 | * Make sure the next punch start is correctly bound to | |
932 | * the end of this data range, not the end of the folio. | |
933 | */ | |
934 | *punch_start_byte = min_t(loff_t, end_byte, | |
935 | folio_next_index(folio) << PAGE_SHIFT); | |
936 | } | |
937 | ||
938 | /* move offset to start of next folio in range */ | |
939 | start_byte = folio_next_index(folio) << PAGE_SHIFT; | |
940 | folio_unlock(folio); | |
941 | folio_put(folio); | |
942 | } | |
943 | return 0; | |
944 | } | |
945 | ||
946 | /* | |
947 | * Punch out all the delalloc blocks in the range given except for those that | |
948 | * have dirty data still pending in the page cache - those are going to be | |
949 | * written and so must still retain the delalloc backing for writeback. | |
950 | * | |
951 | * As we are scanning the page cache for data, we don't need to reimplement the | |
952 | * wheel - mapping_seek_hole_data() does exactly what we need to identify the | |
953 | * start and end of data ranges correctly even for sub-folio block sizes. This | |
954 | * byte range based iteration is especially convenient because it means we | |
955 | * don't have to care about variable size folios, nor where the start or end of | |
956 | * the data range lies within a folio, if they lie within the same folio or even | |
957 | * if there are multiple discontiguous data ranges within the folio. | |
958 | * | |
959 | * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so | |
960 | * can return data ranges that exist in the cache beyond EOF. e.g. a page fault | |
961 | * spanning EOF will initialise the post-EOF data to zeroes and mark it up to | |
962 | * date. A write page fault can then mark it dirty. If we then fail a write() | |
963 | * beyond EOF into that up to date cached range, we allocate a delalloc block | |
964 | * beyond EOF and then have to punch it out. Because the range is up to date, | |
965 | * mapping_seek_hole_data() will return it, and we will skip the punch because | |
966 | * the folio is dirty. This is incorrect - we always need to punch out delalloc
967 | * beyond EOF in this case as writeback will never write back and convert that
968 | * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, | |
969 | * resulting in always punching out the range from the EOF to the end of the | |
970 | * range the iomap spans. | |
971 | * | |
972 | * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it | |
973 | * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA | |
974 | * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) | |
975 | * returns the end of the data range (data_end). Using closed intervals would | |
976 | * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose | |
977 | * the code to subtle off-by-one bugs.... | |
978 | */ | |
979 | static int iomap_write_delalloc_release(struct inode *inode, | |
980 | loff_t start_byte, loff_t end_byte, | |
981 | int (*punch)(struct inode *inode, loff_t pos, loff_t length)) | |
982 | { | |
983 | loff_t punch_start_byte = start_byte; | |
984 | loff_t scan_end_byte = min(i_size_read(inode), end_byte); | |
985 | int error = 0; | |
986 | ||
987 | /* | |
988 | * Lock the mapping to avoid races with page faults re-instantiating | |
989 | * folios and dirtying them via ->page_mkwrite whilst we walk the | |
990 | * cache and perform delalloc extent removal. Failing to do this can | |
991 | * leave dirty pages with no space reservation in the cache. | |
992 | */ | |
993 | filemap_invalidate_lock(inode->i_mapping); | |
994 | while (start_byte < scan_end_byte) { | |
995 | loff_t data_end; | |
996 | ||
997 | start_byte = mapping_seek_hole_data(inode->i_mapping, | |
998 | start_byte, scan_end_byte, SEEK_DATA); | |
999 | /* | |
1000 | * If there is no more data to scan, all that is left is to | |
1001 | * punch out the remaining range. | |
1002 | */ | |
1003 | if (start_byte == -ENXIO || start_byte == scan_end_byte) | |
1004 | break; | |
1005 | if (start_byte < 0) { | |
1006 | error = start_byte; | |
1007 | goto out_unlock; | |
1008 | } | |
1009 | WARN_ON_ONCE(start_byte < punch_start_byte); | |
1010 | WARN_ON_ONCE(start_byte > scan_end_byte); | |
1011 | ||
1012 | /* | |
1013 | * We find the end of this contiguous cached data range by | |
1014 | * seeking from start_byte to the beginning of the next hole. | |
1015 | */ | |
1016 | data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, | |
1017 | scan_end_byte, SEEK_HOLE); | |
1018 | if (data_end < 0) { | |
1019 | error = data_end; | |
1020 | goto out_unlock; | |
1021 | } | |
1022 | WARN_ON_ONCE(data_end <= start_byte); | |
1023 | WARN_ON_ONCE(data_end > scan_end_byte); | |
1024 | ||
1025 | error = iomap_write_delalloc_scan(inode, &punch_start_byte, | |
1026 | start_byte, data_end, punch); | |
1027 | if (error) | |
1028 | goto out_unlock; | |
1029 | ||
1030 | /* The next data search starts at the end of this one. */ | |
1031 | start_byte = data_end; | |
1032 | } | |
1033 | ||
1034 | if (punch_start_byte < end_byte) | |
1035 | error = punch(inode, punch_start_byte, | |
1036 | end_byte - punch_start_byte); | |
1037 | out_unlock: | |
1038 | filemap_invalidate_unlock(inode->i_mapping); | |
1039 | return error; | |
1040 | } | |
1041 | ||
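A toy userspace model of the scan-and-punch algorithm may help: clean or absent pages extend the pending punch range, while each dirty page flushes the punch up to itself and restarts it after itself. (Made-up layout, not kernel code; the real code walks data ranges with mapping_seek_hole_data() and locks each folio rather than iterating a flat array.)

    #include <stdbool.h>
    #include <stdio.h>

    #define NPAGES  8
    #define PAGE_SZ 4096

    /* toy page cache: which pages hold data, and which of those are dirty */
    static const bool cached[NPAGES] = { 1, 1, 0, 0, 1, 1, 0, 1 };
    static const bool dirty[NPAGES]  = { 0, 1, 0, 0, 0, 0, 0, 0 };

    static void punch(long start, long end)     /* [start, end) */
    {
            if (start < end)
                    printf("punch delalloc [%ld, %ld)\n", start, end);
    }

    int main(void)
    {
            long punch_start = 0, end = NPAGES * PAGE_SZ;

            for (int i = 0; i < NPAGES; i++) {
                    if (cached[i] && dirty[i]) {
                            punch(punch_start, (long)i * PAGE_SZ);
                            punch_start = (long)(i + 1) * PAGE_SZ;
                    }
            }
            punch(punch_start, end);    /* trailing range */
            return 0;
    }

With the layout above only page 1 is dirty, so the model punches [0, 4096) and [8192, 32768), leaving the delalloc reservation only under the dirty page.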
9c7babf9 DC |
1042 | /* |
1043 | * When a short write occurs, the filesystem may need to remove reserved space | |
1044 | * that was allocated in ->iomap_begin from its ->iomap_end method. For
1045 | * filesystems that use delayed allocation, we need to punch out delalloc | |
1046 | * extents from the range that are not dirty in the page cache. As the write can | |
1047 | * race with page faults, there can be dirty pages over the delalloc extent | |
1048 | * outside the range of a short write but still within the delalloc extent | |
1049 | * allocated for this iomap. | |
1050 | * | |
1051 | * This function uses [start_byte, end_byte) intervals (i.e. open ended) to | |
f43dc4dc DC |
1052 | * simplify range iterations. |
1053 | * | |
1054 | * The punch() callback *must* only punch delalloc extents in the range passed | |
1055 | * to it. It must skip over all other types of extents in the range and leave | |
1056 | * them completely unchanged. It must do this punch atomically with respect to | |
1057 | * other extent modifications. | |
1058 | * | |
1059 | * The punch() callback may be called with a folio locked to prevent writeback | |
1060 | * extent allocation racing at the edge of the range we are currently punching. | |
1061 | * The locked folio may or may not cover the range being punched, so it is not | |
1062 | * safe for the punch() callback to lock folios itself. | |
1063 | * | |
1064 | * Lock order is: | |
1065 | * | |
1066 | * inode->i_rwsem (shared or exclusive) | |
1067 | * inode->i_mapping->invalidate_lock (exclusive) | |
1068 | * folio_lock() | |
1069 | * ->punch | |
1070 | * internal filesystem allocation lock | |
9c7babf9 DC |
1071 | */ |
1072 | int iomap_file_buffered_write_punch_delalloc(struct inode *inode, | |
1073 | struct iomap *iomap, loff_t pos, loff_t length, | |
1074 | ssize_t written, | |
1075 | int (*punch)(struct inode *inode, loff_t pos, loff_t length)) | |
1076 | { | |
1077 | loff_t start_byte; | |
1078 | loff_t end_byte; | |
302efbef | 1079 | unsigned int blocksize = i_blocksize(inode); |
9c7babf9 DC |
1080 | |
1081 | if (iomap->type != IOMAP_DELALLOC) | |
1082 | return 0; | |
1083 | ||
1084 | /* If we didn't reserve the blocks, we're not allowed to punch them. */ | |
1085 | if (!(iomap->flags & IOMAP_F_NEW)) | |
1086 | return 0; | |
1087 | ||
1088 | /* | |
1089 | * start_byte refers to the first unused block after a short write. If | |
1090 | * nothing was written, round offset down to point at the first block in | |
1091 | * the range. | |
1092 | */ | |
1093 | if (unlikely(!written)) | |
1094 | start_byte = round_down(pos, blocksize); | |
1095 | else | |
1096 | start_byte = round_up(pos + written, blocksize); | |
1097 | end_byte = round_up(pos + length, blocksize); | |
1098 | ||
1099 | /* Nothing to do if we've written the entire delalloc extent */ | |
1100 | if (start_byte >= end_byte) | |
1101 | return 0; | |
1102 | ||
f43dc4dc DC |
1103 | return iomap_write_delalloc_release(inode, start_byte, end_byte, |
1104 | punch); | |
9c7babf9 DC |
1105 | } |
1106 | EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); | |
1107 | ||
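For example, with 4096-byte blocks, a failed write at pos 10000 of length 3000 that wrote nothing rounds start_byte down to 8192 and end_byte up to 16384. The usual caller is a delayed-allocation filesystem's ->iomap_end method; a hedged sketch following the pattern XFS uses (myfs_remove_delalloc_extents() is hypothetical, standing in for something like xfs_bmap_punch_delalloc_range()):

    /* must only punch delalloc extents in [offset, offset + length) */
    static int myfs_punch_delalloc(struct inode *inode, loff_t offset,
                    loff_t length)
    {
            return myfs_remove_delalloc_extents(inode, offset, length);
    }

    static int myfs_buffered_write_iomap_end(struct inode *inode, loff_t offset,
                    loff_t length, ssize_t written, unsigned flags,
                    struct iomap *iomap)
    {
            return iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
                            length, written, myfs_punch_delalloc);
    }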
8fc274d1 | 1108 | static loff_t iomap_unshare_iter(struct iomap_iter *iter) |
afc51aaa | 1109 | { |
8fc274d1 | 1110 | struct iomap *iomap = &iter->iomap; |
fad0a1ab | 1111 | const struct iomap *srcmap = iomap_iter_srcmap(iter); |
8fc274d1 CH |
1112 | loff_t pos = iter->pos; |
1113 | loff_t length = iomap_length(iter); | |
afc51aaa | 1114 | long status = 0; |
d4ff3b2e | 1115 | loff_t written = 0; |
afc51aaa | 1116 | |
3590c4d8 CH |
1117 | /* don't bother with blocks that are not shared to start with */ |
1118 | if (!(iomap->flags & IOMAP_F_SHARED)) | |
1119 | return length; | |
1120 | /* don't bother with holes or unwritten extents */ | |
c039b997 | 1121 | if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) |
3590c4d8 CH |
1122 | return length; |
1123 | ||
afc51aaa | 1124 | do { |
32a38a49 CH |
1125 | unsigned long offset = offset_in_page(pos); |
1126 | unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); | |
bc6123a8 | 1127 | struct folio *folio; |
afc51aaa | 1128 | |
bc6123a8 | 1129 | status = iomap_write_begin(iter, pos, bytes, &folio); |
afc51aaa DW |
1130 | if (unlikely(status)) |
1131 | return status; | |
d7b64041 DC |
1132 | if (iter->iomap.flags & IOMAP_F_STALE) |
1133 | break; | |
afc51aaa | 1134 | |
bc6123a8 | 1135 | status = iomap_write_end(iter, pos, bytes, bytes, folio); |
e25ba8cb MWO |
1136 | if (WARN_ON_ONCE(status == 0)) |
1137 | return -EIO; | |
afc51aaa DW |
1138 | |
1139 | cond_resched(); | |
1140 | ||
1141 | pos += status; | |
1142 | written += status; | |
1143 | length -= status; | |
1144 | ||
8fc274d1 | 1145 | balance_dirty_pages_ratelimited(iter->inode->i_mapping); |
afc51aaa DW |
1146 | } while (length); |
1147 | ||
1148 | return written; | |
1149 | } | |
1150 | ||
1151 | int | |
3590c4d8 | 1152 | iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, |
afc51aaa DW |
1153 | const struct iomap_ops *ops) |
1154 | { | |
8fc274d1 CH |
1155 | struct iomap_iter iter = { |
1156 | .inode = inode, | |
1157 | .pos = pos, | |
1158 | .len = len, | |
b74b1293 | 1159 | .flags = IOMAP_WRITE | IOMAP_UNSHARE, |
8fc274d1 CH |
1160 | }; |
1161 | int ret; | |
afc51aaa | 1162 | |
8fc274d1 CH |
1163 | while ((ret = iomap_iter(&iter, ops)) > 0) |
1164 | iter.processed = iomap_unshare_iter(&iter); | |
1165 | return ret; | |
afc51aaa | 1166 | } |
3590c4d8 | 1167 | EXPORT_SYMBOL_GPL(iomap_file_unshare); |
afc51aaa | 1168 | |
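iomap_file_unshare() is typically driven from fallocate(FALLOC_FL_UNSHARE_RANGE) on reflink-capable filesystems. A hedged sketch of the surrounding steps, modeled loosely on xfs_reflink_unshare() (myfs_iomap_ops is an assumption and locking is omitted):

    static int myfs_unshare_range(struct inode *inode, loff_t offset, loff_t len)
    {
            int error;

            inode_dio_wait(inode);
            error = iomap_file_unshare(inode, offset, len, &myfs_iomap_ops);
            if (error)
                    return error;
            /* push the newly dirtied, now-unshared folios to disk */
            return filemap_write_and_wait_range(inode->i_mapping, offset,
                            offset + len - 1);
    }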
2aa3048e | 1169 | static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) |
afc51aaa | 1170 | { |
fad0a1ab | 1171 | const struct iomap *srcmap = iomap_iter_srcmap(iter); |
2aa3048e CH |
1172 | loff_t pos = iter->pos; |
1173 | loff_t length = iomap_length(iter); | |
afc51aaa | 1174 | loff_t written = 0; |
afc51aaa DW |
1175 | |
1176 | /* already zeroed? we're done. */ | |
c039b997 | 1177 | if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) |
81ee8e52 | 1178 | return length; |
afc51aaa DW |
1179 | |
1180 | do { | |
4d7bd0eb MWO |
1181 | struct folio *folio; |
1182 | int status; | |
1183 | size_t offset; | |
1184 | size_t bytes = min_t(u64, SIZE_MAX, length); | |
1185 | ||
4d7bd0eb MWO |
1186 | status = iomap_write_begin(iter, pos, bytes, &folio); |
1187 | if (status) | |
1188 | return status; | |
d7b64041 DC |
1189 | if (iter->iomap.flags & IOMAP_F_STALE) |
1190 | break; | |
4d7bd0eb MWO |
1191 | |
1192 | offset = offset_in_folio(folio, pos); | |
1193 | if (bytes > folio_size(folio) - offset) | |
1194 | bytes = folio_size(folio) - offset; | |
1195 | ||
1196 | folio_zero_range(folio, offset, bytes); | |
1197 | folio_mark_accessed(folio); | |
1198 | ||
1199 | bytes = iomap_write_end(iter, pos, bytes, bytes, folio); | |
4d7bd0eb MWO |
1200 | if (WARN_ON_ONCE(bytes == 0)) |
1201 | return -EIO; | |
afc51aaa DW |
1202 | |
1203 | pos += bytes; | |
81ee8e52 | 1204 | length -= bytes; |
afc51aaa | 1205 | written += bytes; |
81ee8e52 | 1206 | } while (length > 0); |
afc51aaa | 1207 | |
98eb8d95 KX |
1208 | if (did_zero) |
1209 | *did_zero = true; | |
afc51aaa DW |
1210 | return written; |
1211 | } | |
1212 | ||
1213 | int | |
1214 | iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, | |
1215 | const struct iomap_ops *ops) | |
1216 | { | |
2aa3048e CH |
1217 | struct iomap_iter iter = { |
1218 | .inode = inode, | |
1219 | .pos = pos, | |
1220 | .len = len, | |
1221 | .flags = IOMAP_ZERO, | |
1222 | }; | |
1223 | int ret; | |
afc51aaa | 1224 | |
2aa3048e CH |
1225 | while ((ret = iomap_iter(&iter, ops)) > 0) |
1226 | iter.processed = iomap_zero_iter(&iter, did_zero); | |
1227 | return ret; | |
afc51aaa DW |
1228 | } |
1229 | EXPORT_SYMBOL_GPL(iomap_zero_range); | |
1230 | ||
1231 | int | |
1232 | iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, | |
1233 | const struct iomap_ops *ops) | |
1234 | { | |
1235 | unsigned int blocksize = i_blocksize(inode); | |
1236 | unsigned int off = pos & (blocksize - 1); | |
1237 | ||
1238 | /* Block boundary? Nothing to do */ | |
1239 | if (!off) | |
1240 | return 0; | |
1241 | return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); | |
1242 | } | |
1243 | EXPORT_SYMBOL_GPL(iomap_truncate_page); | |
1244 | ||
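For example, with 4096-byte blocks a pos of 6144 gives off = 2048, so the 2048 bytes from 6144 up to the 8192 block boundary are zeroed. The helper is typically called from a filesystem's setattr path when shrinking a file, before the new size is committed; a minimal sketch (myfs_iomap_ops is hypothetical, and real callers do this under the proper locks with fuller error handling):

    /* sketch of a shrinking truncate; error handling trimmed to the essentials */
    static int myfs_truncate(struct inode *inode, loff_t newsize)
    {
            bool did_zero = false;
            int error;

            error = iomap_truncate_page(inode, newsize, &did_zero,
                            &myfs_iomap_ops);
            if (error)
                    return error;

            truncate_setsize(inode, newsize);
            return 0;
    }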
ea0f843a MWO |
1245 | static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, |
1246 | struct folio *folio) | |
afc51aaa | 1247 | { |
253564ba | 1248 | loff_t length = iomap_length(iter); |
afc51aaa DW |
1249 | int ret; |
1250 | ||
253564ba | 1251 | if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { |
d1bd0b4e | 1252 | ret = __block_write_begin_int(folio, iter->pos, length, NULL, |
253564ba | 1253 | &iter->iomap); |
afc51aaa DW |
1254 | if (ret) |
1255 | return ret; | |
ea0f843a | 1256 | block_commit_write(&folio->page, 0, length); |
afc51aaa | 1257 | } else { |
ea0f843a MWO |
1258 | WARN_ON_ONCE(!folio_test_uptodate(folio)); |
1259 | folio_mark_dirty(folio); | |
afc51aaa DW |
1260 | } |
1261 | ||
1262 | return length; | |
1263 | } | |
1264 | ||
1265 | vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) | |
1266 | { | |
253564ba CH |
1267 | struct iomap_iter iter = { |
1268 | .inode = file_inode(vmf->vma->vm_file), | |
1269 | .flags = IOMAP_WRITE | IOMAP_FAULT, | |
1270 | }; | |
ea0f843a | 1271 | struct folio *folio = page_folio(vmf->page); |
afc51aaa DW |
1272 | ssize_t ret; |
1273 | ||
ea0f843a MWO |
1274 | folio_lock(folio); |
1275 | ret = folio_mkwrite_check_truncate(folio, iter.inode); | |
243145bc | 1276 | if (ret < 0) |
afc51aaa | 1277 | goto out_unlock; |
ea0f843a | 1278 | iter.pos = folio_pos(folio); |
253564ba CH |
1279 | iter.len = ret; |
1280 | while ((ret = iomap_iter(&iter, ops)) > 0) | |
ea0f843a | 1281 | iter.processed = iomap_folio_mkwrite_iter(&iter, folio); |
afc51aaa | 1282 | |
253564ba CH |
1283 | if (ret < 0) |
1284 | goto out_unlock; | |
ea0f843a | 1285 | folio_wait_stable(folio); |
afc51aaa DW |
1286 | return VM_FAULT_LOCKED; |
1287 | out_unlock: | |
ea0f843a | 1288 | folio_unlock(folio); |
2ba39cc4 | 1289 | return vmf_fs_error(ret); |
afc51aaa DW |
1290 | } |
1291 | EXPORT_SYMBOL_GPL(iomap_page_mkwrite); | |
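A filesystem typically exposes this through its vm_operations_struct; a sketch of the common wrapper (myfs_iomap_ops is an assumption), bracketing the fault with sb_start_pagefault() and updating the file time as ->page_mkwrite implementations conventionally do:

    static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
    {
            struct inode *inode = file_inode(vmf->vma->vm_file);
            vm_fault_t ret;

            sb_start_pagefault(inode->i_sb);
            file_update_time(vmf->vma->vm_file);
            ret = iomap_page_mkwrite(vmf, &myfs_iomap_ops);
            sb_end_pagefault(inode->i_sb);
            return ret;
    }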
598ecfba | 1292 | |
8ffd74e9 MWO |
1293 | static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, |
1294 | size_t len, int error) | |
598ecfba | 1295 | { |
95c4cd05 | 1296 | struct iomap_page *iop = to_iomap_page(folio); |
598ecfba CH |
1297 | |
1298 | if (error) { | |
8ffd74e9 | 1299 | folio_set_error(folio); |
b69eea82 | 1300 | mapping_set_error(inode->i_mapping, error); |
598ecfba CH |
1301 | } |
1302 | ||
8ffd74e9 | 1303 | WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop); |
0fb2d720 | 1304 | WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); |
598ecfba | 1305 | |
0fb2d720 | 1306 | if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) |
8ffd74e9 | 1307 | folio_end_writeback(folio); |
598ecfba CH |
1308 | } |
1309 | ||
1310 | /* | |
1311 | * We're now finished for good with this ioend structure. Update the page | |
1312 | * state, release holds on bios, and finally free up memory. Do not use the | |
1313 | * ioend after this. | |
1314 | */ | |
ebb7fb15 | 1315 | static u32 |
598ecfba CH |
1316 | iomap_finish_ioend(struct iomap_ioend *ioend, int error) |
1317 | { | |
1318 | struct inode *inode = ioend->io_inode; | |
1319 | struct bio *bio = &ioend->io_inline_bio; | |
1320 | struct bio *last = ioend->io_bio, *next; | |
1321 | u64 start = bio->bi_iter.bi_sector; | |
c275779f | 1322 | loff_t offset = ioend->io_offset; |
598ecfba | 1323 | bool quiet = bio_flagged(bio, BIO_QUIET); |
ebb7fb15 | 1324 | u32 folio_count = 0; |
598ecfba CH |
1325 | |
1326 | for (bio = &ioend->io_inline_bio; bio; bio = next) { | |
8ffd74e9 | 1327 | struct folio_iter fi; |
598ecfba CH |
1328 | |
1329 | /* | |
1330 | * For the last bio, bi_private points to the ioend, so we | |
1331 | * need to explicitly end the iteration here. | |
1332 | */ | |
1333 | if (bio == last) | |
1334 | next = NULL; | |
1335 | else | |
1336 | next = bio->bi_private; | |
1337 | ||
8ffd74e9 | 1338 | /* walk all folios in bio, ending page IO on them */ |
ebb7fb15 | 1339 | bio_for_each_folio_all(fi, bio) { |
8ffd74e9 MWO |
1340 | iomap_finish_folio_write(inode, fi.folio, fi.length, |
1341 | error); | |
ebb7fb15 DC |
1342 | folio_count++; |
1343 | } | |
598ecfba CH |
1344 | bio_put(bio); |
1345 | } | |
c275779f | 1346 | /* The ioend has been freed by bio_put() */ |
598ecfba CH |
1347 | |
1348 | if (unlikely(error && !quiet)) { | |
1349 | printk_ratelimited(KERN_ERR | |
9cd0ed63 | 1350 | "%s: writeback error on inode %lu, offset %lld, sector %llu", |
c275779f | 1351 | inode->i_sb->s_id, inode->i_ino, offset, start); |
598ecfba | 1352 | } |
ebb7fb15 | 1353 | return folio_count; |
598ecfba CH |
1354 | } |
1355 | ||
ebb7fb15 DC |
1356 | /* |
1357 | * Ioend completion routine for merged bios. This can only be called from task | |
1358 | * contexts as merged ioends can be of unbounded length. Hence we have to break up |
1359 | * the writeback completions into manageable chunks to avoid long scheduler | |
1360 | * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get | |
1361 | * good batch processing throughput without creating adverse scheduler latency | |
1362 | * conditions. | |
1363 | */ | |
598ecfba CH |
1364 | void |
1365 | iomap_finish_ioends(struct iomap_ioend *ioend, int error) | |
1366 | { | |
1367 | struct list_head tmp; | |
ebb7fb15 DC |
1368 | u32 completions; |
1369 | ||
1370 | might_sleep(); | |
598ecfba CH |
1371 | |
1372 | list_replace_init(&ioend->io_list, &tmp); | |
ebb7fb15 | 1373 | completions = iomap_finish_ioend(ioend, error); |
598ecfba CH |
1374 | |
1375 | while (!list_empty(&tmp)) { | |
ebb7fb15 DC |
1376 | if (completions > IOEND_BATCH_SIZE * 8) { |
1377 | cond_resched(); | |
1378 | completions = 0; | |
1379 | } | |
598ecfba CH |
1380 | ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); |
1381 | list_del_init(&ioend->io_list); | |
ebb7fb15 | 1382 | completions += iomap_finish_ioend(ioend, error); |
598ecfba CH |
1383 | } |
1384 | } | |
1385 | EXPORT_SYMBOL_GPL(iomap_finish_ioends); | |
1386 | ||
1387 | /* | |
1388 | * We can merge two adjacent ioends if they have the same set of work to do. | |
1389 | */ | |
1390 | static bool | |
1391 | iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) | |
1392 | { | |
1393 | if (ioend->io_bio->bi_status != next->io_bio->bi_status) | |
1394 | return false; | |
1395 | if ((ioend->io_flags & IOMAP_F_SHARED) ^ | |
1396 | (next->io_flags & IOMAP_F_SHARED)) | |
1397 | return false; | |
1398 | if ((ioend->io_type == IOMAP_UNWRITTEN) ^ | |
1399 | (next->io_type == IOMAP_UNWRITTEN)) | |
1400 | return false; | |
1401 | if (ioend->io_offset + ioend->io_size != next->io_offset) | |
1402 | return false; | |
ebb7fb15 DC |
1403 | /* |
1404 | * Do not merge physically discontiguous ioends. The filesystem | |
1405 | * completion functions will have to iterate the physical | |
1406 | * discontiguities even if we merge the ioends at a logical level, so | |
1407 | * we don't gain anything by merging physical discontiguities here. | |
1408 | * | |
1409 | * We cannot use bio->bi_iter.bi_sector here as it is modified during | |
1410 | * submission so does not point to the start sector of the bio at | |
1411 | * completion. | |
1412 | */ | |
1413 | if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) | |
1414 | return false; | |
598ecfba CH |
1415 | return true; |
1416 | } | |
1417 | ||
1418 | void | |
6e552494 | 1419 | iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) |
598ecfba CH |
1420 | { |
1421 | struct iomap_ioend *next; | |
1422 | ||
1423 | INIT_LIST_HEAD(&ioend->io_list); | |
1424 | ||
1425 | while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, | |
1426 | io_list))) { | |
1427 | if (!iomap_ioend_can_merge(ioend, next)) | |
1428 | break; | |
1429 | list_move_tail(&next->io_list, &ioend->io_list); | |
1430 | ioend->io_size += next->io_size; | |
598ecfba CH |
1431 | } |
1432 | } | |
1433 | EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); | |
1434 | ||
1435 | static int | |
4f0f586b ST |
1436 | iomap_ioend_compare(void *priv, const struct list_head *a, |
1437 | const struct list_head *b) | |
598ecfba | 1438 | { |
b3d423ec CH |
1439 | struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); |
1440 | struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); | |
598ecfba | 1441 | |
598ecfba CH |
1442 | if (ia->io_offset < ib->io_offset) |
1443 | return -1; | |
b3d423ec | 1444 | if (ia->io_offset > ib->io_offset) |
598ecfba CH |
1445 | return 1; |
1446 | return 0; | |
1447 | } | |
1448 | ||
1449 | void | |
1450 | iomap_sort_ioends(struct list_head *ioend_list) | |
1451 | { | |
1452 | list_sort(NULL, ioend_list, iomap_ioend_compare); | |
1453 | } | |
1454 | EXPORT_SYMBOL_GPL(iomap_sort_ioends); | |
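Taken together, iomap_sort_ioends(), iomap_ioend_try_merge() and iomap_finish_ioends() support deferred completion from task context. A hedged sketch of that pattern, loosely modelled on how XFS drives these helpers; the worker function and its pending list are hypothetical:

static void myfs_finish_pending_ioends(struct list_head *pending)
{
	struct iomap_ioend *ioend;

	iomap_sort_ioends(pending);
	while ((ioend = list_first_entry_or_null(pending,
			struct iomap_ioend, io_list))) {
		list_del_init(&ioend->io_list);
		/* Pull adjacent, compatible ioends into this one... */
		iomap_ioend_try_merge(ioend, pending);
		/* ...then run completion for the whole merged batch. */
		iomap_finish_ioends(ioend,
			blk_status_to_errno(ioend->io_bio->bi_status));
	}
}

Note that iomap_ioend_can_merge() only merges ioends with equal bi_status, so taking the error from the head of the batch is safe.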
1455 | ||
1456 | static void iomap_writepage_end_bio(struct bio *bio) | |
1457 | { | |
1458 | struct iomap_ioend *ioend = bio->bi_private; | |
1459 | ||
1460 | iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); | |
1461 | } | |
1462 | ||
1463 | /* | |
1464 | * Submit the final bio for an ioend. | |
1465 | * | |
1466 | * If @error is non-zero, it means that some part of |
f1f264b4 | 1467 | * the submission process has failed after we've marked pages for writeback |
598ecfba CH |
1468 | * and unlocked them. In this situation, we need to fail the bio instead of |
1469 | * submitting it. This typically only happens on a filesystem shutdown. | |
1470 | */ | |
1471 | static int | |
1472 | iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, | |
1473 | int error) | |
1474 | { | |
1475 | ioend->io_bio->bi_private = ioend; | |
1476 | ioend->io_bio->bi_end_io = iomap_writepage_end_bio; | |
1477 | ||
1478 | if (wpc->ops->prepare_ioend) | |
1479 | error = wpc->ops->prepare_ioend(ioend, error); | |
1480 | if (error) { | |
1481 | /* | |
f1f264b4 | 1482 | * If we're failing the IO now, just mark the ioend with an |
598ecfba CH |
1483 | * error and finish it. This will run IO completion immediately |
1484 | * as there is only one reference to the ioend at this point in | |
1485 | * time. | |
1486 | */ | |
1487 | ioend->io_bio->bi_status = errno_to_blk_status(error); | |
1488 | bio_endio(ioend->io_bio); | |
1489 | return error; | |
1490 | } | |
1491 | ||
1492 | submit_bio(ioend->io_bio); | |
1493 | return 0; | |
1494 | } | |
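A hedged sketch of what a ->prepare_ioend implementation might look like: redirecting completion of ioends that need transactional follow-up (unwritten extent conversion, for example) to a filesystem workqueue, patterned after the way XFS does this. "myfs_end_bio" is hypothetical and not defined here.

static int myfs_prepare_ioend(struct iomap_ioend *ioend, int status)
{
	/* Defer completions that need metadata updates to task context. */
	if (!status && (ioend->io_type == IOMAP_UNWRITTEN ||
			(ioend->io_flags & IOMAP_F_SHARED)))
		ioend->io_bio->bi_end_io = myfs_end_bio;
	return status;
}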
1495 | ||
1496 | static struct iomap_ioend * | |
1497 | iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, | |
1498 | loff_t offset, sector_t sector, struct writeback_control *wbc) | |
1499 | { | |
1500 | struct iomap_ioend *ioend; | |
1501 | struct bio *bio; | |
1502 | ||
609be106 CH |
1503 | bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, |
1504 | REQ_OP_WRITE | wbc_to_write_flags(wbc), | |
1505 | GFP_NOFS, &iomap_ioend_bioset); | |
598ecfba | 1506 | bio->bi_iter.bi_sector = sector; |
598ecfba CH |
1507 | wbc_init_bio(wbc, bio); |
1508 | ||
1509 | ioend = container_of(bio, struct iomap_ioend, io_inline_bio); | |
1510 | INIT_LIST_HEAD(&ioend->io_list); | |
1511 | ioend->io_type = wpc->iomap.type; | |
1512 | ioend->io_flags = wpc->iomap.flags; | |
1513 | ioend->io_inode = inode; | |
1514 | ioend->io_size = 0; | |
ebb7fb15 | 1515 | ioend->io_folios = 0; |
598ecfba | 1516 | ioend->io_offset = offset; |
598ecfba | 1517 | ioend->io_bio = bio; |
ebb7fb15 | 1518 | ioend->io_sector = sector; |
598ecfba CH |
1519 | return ioend; |
1520 | } | |
1521 | ||
1522 | /* | |
1523 | * Allocate a new bio, and chain the old bio to the new one. | |
1524 | * | |
f1f264b4 | 1525 | * Note that we have to perform the chaining in this unintuitive order |
598ecfba CH |
1526 | * so that the bi_private linkage is set up in the right direction for the |
1527 | * traversal in iomap_finish_ioend(). | |
1528 | */ | |
1529 | static struct bio * | |
1530 | iomap_chain_bio(struct bio *prev) | |
1531 | { | |
1532 | struct bio *new; | |
1533 | ||
07888c66 CH |
1534 | new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); |
1535 | bio_clone_blkg_association(new, prev); | |
598ecfba | 1536 | new->bi_iter.bi_sector = bio_end_sector(prev); |
598ecfba CH |
1537 | |
1538 | bio_chain(prev, new); | |
1539 | bio_get(prev); /* for iomap_finish_ioend */ | |
1540 | submit_bio(prev); | |
1541 | return new; | |
1542 | } | |
1543 | ||
1544 | static bool | |
1545 | iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, | |
1546 | sector_t sector) | |
1547 | { | |
1548 | if ((wpc->iomap.flags & IOMAP_F_SHARED) != | |
1549 | (wpc->ioend->io_flags & IOMAP_F_SHARED)) | |
1550 | return false; | |
1551 | if (wpc->iomap.type != wpc->ioend->io_type) | |
1552 | return false; | |
1553 | if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) | |
1554 | return false; | |
1555 | if (sector != bio_end_sector(wpc->ioend->io_bio)) | |
1556 | return false; | |
ebb7fb15 DC |
1557 | /* |
1558 | * Limit ioend bio chain lengths to minimise IO completion latency. This | |
1559 | * also prevents long tight loops ending page writeback on all the | |
1560 | * folios in the ioend. | |
1561 | */ | |
1562 | if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) | |
1563 | return false; | |
598ecfba CH |
1564 | return true; |
1565 | } | |
1566 | ||
1567 | /* | |
1568 | * Test to see if we have an existing ioend structure that we could append to | |
f1f264b4 | 1569 | * first; otherwise finish off the current ioend and start another. |
598ecfba CH |
1570 | */ |
1571 | static void | |
e735c007 | 1572 | iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, |
598ecfba CH |
1573 | struct iomap_page *iop, struct iomap_writepage_ctx *wpc, |
1574 | struct writeback_control *wbc, struct list_head *iolist) | |
1575 | { | |
e735c007 | 1576 | sector_t sector = iomap_sector(&wpc->iomap, pos); |
598ecfba | 1577 | unsigned len = i_blocksize(inode); |
e735c007 | 1578 | size_t poff = offset_in_folio(folio, pos); |
598ecfba | 1579 | |
e735c007 | 1580 | if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { |
598ecfba CH |
1581 | if (wpc->ioend) |
1582 | list_add(&wpc->ioend->io_list, iolist); | |
e735c007 | 1583 | wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); |
598ecfba CH |
1584 | } |
1585 | ||
e735c007 | 1586 | if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { |
c1b79f11 | 1587 | wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); |
c2478469 | 1588 | bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); |
598ecfba CH |
1589 | } |
1590 | ||
c1b79f11 CH |
1591 | if (iop) |
1592 | atomic_add(len, &iop->write_bytes_pending); | |
598ecfba | 1593 | wpc->ioend->io_size += len; |
e735c007 | 1594 | wbc_account_cgroup_owner(wbc, &folio->page, len); |
598ecfba CH |
1595 | } |
1596 | ||
1597 | /* | |
1598 | * We implement an immediate ioend submission policy here to avoid needing to | |
1599 | * chain multiple ioends and hence nest mempool allocations which can violate | |
f1f264b4 AG |
1600 | * the forward progress guarantees we need to provide. The current ioend we're |
1601 | * adding blocks to is cached in the writepage context, and if the new block | |
1602 | * doesn't append to the cached ioend, it will create a new ioend and cache that | |
598ecfba CH |
1603 | * instead. |
1604 | * | |
1605 | * If a new ioend is created and cached, the old ioend is returned and queued | |
1606 | * locally for submission once the entire page is processed or an error has been | |
1607 | * detected. Although each ioend is submitted as soon as it is fully built, |
1608 | * batching optimisations are provided by higher level block plugging. | |
1609 | * | |
1610 | * At the end of a writeback pass, there will be a cached ioend remaining on the | |
1611 | * writepage context that the caller will need to submit. | |
1612 | */ | |
1613 | static int | |
1614 | iomap_writepage_map(struct iomap_writepage_ctx *wpc, | |
1615 | struct writeback_control *wbc, struct inode *inode, | |
e735c007 | 1616 | struct folio *folio, u64 end_pos) |
598ecfba | 1617 | { |
9753b868 | 1618 | struct iomap_page *iop = iomap_page_create(inode, folio, 0); |
598ecfba CH |
1619 | struct iomap_ioend *ioend, *next; |
1620 | unsigned len = i_blocksize(inode); | |
92655036 MWO |
1621 | unsigned nblocks = i_blocks_per_folio(inode, folio); |
1622 | u64 pos = folio_pos(folio); | |
598ecfba CH |
1623 | int error = 0, count = 0, i; |
1624 | LIST_HEAD(submit_list); | |
1625 | ||
0fb2d720 | 1626 | WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); |
598ecfba CH |
1627 | |
1628 | /* | |
92655036 MWO |
1629 | * Walk through the folio to find areas to write back. If we |
1630 | * run off the end of the current map or find the current map | |
1631 | * invalid, grab a new one. | |
598ecfba | 1632 | */ |
92655036 | 1633 | for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { |
598ecfba CH |
1634 | if (iop && !test_bit(i, iop->uptodate)) |
1635 | continue; | |
1636 | ||
92655036 | 1637 | error = wpc->ops->map_blocks(wpc, inode, pos); |
598ecfba CH |
1638 | if (error) |
1639 | break; | |
adc9c2e5 | 1640 | trace_iomap_writepage_map(inode, &wpc->iomap); |
3e19e6f3 CH |
1641 | if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) |
1642 | continue; | |
598ecfba CH |
1643 | if (wpc->iomap.type == IOMAP_HOLE) |
1644 | continue; | |
e735c007 | 1645 | iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc, |
598ecfba CH |
1646 | &submit_list); |
1647 | count++; | |
1648 | } | |
ebb7fb15 DC |
1649 | if (count) |
1650 | wpc->ioend->io_folios++; | |
598ecfba CH |
1651 | |
1652 | WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); | |
e735c007 MWO |
1653 | WARN_ON_ONCE(!folio_test_locked(folio)); |
1654 | WARN_ON_ONCE(folio_test_writeback(folio)); | |
1655 | WARN_ON_ONCE(folio_test_dirty(folio)); | |
598ecfba CH |
1656 | |
1657 | /* | |
1658 | * We cannot cancel the ioend directly here on error. We may have | |
1659 | * already set other pages under writeback and hence we have to run I/O | |
1660 | * completion to mark the error state of the pages under writeback | |
1661 | * appropriately. | |
1662 | */ | |
1663 | if (unlikely(error)) { | |
763e4cdc BF |
1664 | /* |
1665 | * Let the filesystem know what portion of the current page | |
f1f264b4 | 1666 | * failed to map. If the page hasn't been added to an ioend, it |
763e4cdc BF |
1667 | * won't be affected by I/O completion and we must unlock it |
1668 | * now. | |
1669 | */ | |
6e478521 | 1670 | if (wpc->ops->discard_folio) |
92655036 | 1671 | wpc->ops->discard_folio(folio, pos); |
598ecfba | 1672 | if (!count) { |
e735c007 | 1673 | folio_unlock(folio); |
598ecfba CH |
1674 | goto done; |
1675 | } | |
598ecfba CH |
1676 | } |
1677 | ||
e735c007 MWO |
1678 | folio_start_writeback(folio); |
1679 | folio_unlock(folio); | |
598ecfba CH |
1680 | |
1681 | /* | |
f1f264b4 | 1682 | * Preserve the original error if there was one; catch |
598ecfba CH |
1683 | * submission errors here and propagate into subsequent ioend |
1684 | * submissions. | |
1685 | */ | |
1686 | list_for_each_entry_safe(ioend, next, &submit_list, io_list) { | |
1687 | int error2; | |
1688 | ||
1689 | list_del_init(&ioend->io_list); | |
1690 | error2 = iomap_submit_ioend(wpc, ioend, error); | |
1691 | if (error2 && !error) | |
1692 | error = error2; | |
1693 | } | |
1694 | ||
1695 | /* | |
1696 | * We can end up here with no error and nothing to write only if we race | |
1697 | * with a partial page truncate on a sub-page block sized filesystem. | |
1698 | */ | |
1699 | if (!count) | |
e735c007 | 1700 | folio_end_writeback(folio); |
598ecfba | 1701 | done: |
3d5f3ba1 | 1702 | mapping_set_error(inode->i_mapping, error); |
598ecfba CH |
1703 | return error; |
1704 | } | |
1705 | ||
1706 | /* | |
1707 | * Write out a dirty page. | |
1708 | * | |
f1f264b4 AG |
1709 | * For delalloc space on the page, we need to allocate space and flush it. |
1710 | * For unwritten space on the page, we need to start the conversion to | |
598ecfba CH |
1711 | * regular allocated space. |
1712 | */ | |
d585bdbe MWO |
1713 | static int iomap_do_writepage(struct folio *folio, |
1714 | struct writeback_control *wbc, void *data) | |
598ecfba CH |
1715 | { |
1716 | struct iomap_writepage_ctx *wpc = data; | |
e735c007 | 1717 | struct inode *inode = folio->mapping->host; |
81d4782a | 1718 | u64 end_pos, isize; |
598ecfba | 1719 | |
e735c007 | 1720 | trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); |
598ecfba CH |
1721 | |
1722 | /* | |
e735c007 | 1723 | * Refuse to write the folio out if we're called from reclaim context. |
598ecfba CH |
1724 | * |
1725 | * This avoids stack overflows when called with deeply nested stacks |
1726 | * from random direct reclaim or memcg reclaim callers. We explicitly |
1727 | * allow reclaim from kswapd as the stack usage there is relatively low. | |
1728 | * | |
1729 | * This should never happen except in the case of a VM regression so | |
1730 | * warn about it. | |
1731 | */ | |
1732 | if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == | |
1733 | PF_MEMALLOC)) | |
1734 | goto redirty; | |
1735 | ||
598ecfba | 1736 | /* |
e735c007 | 1737 | * Is this folio beyond the end of the file? |
598ecfba | 1738 | * |
e735c007 MWO |
1739 | * If the folio index is less than the end_index, adjust the end_pos |
1740 | * to the highest offset that this folio should represent. | |
598ecfba CH |
1741 | * ----------------------------------------------------- |
1742 | * | file mapping | <EOF> | | |
1743 | * ----------------------------------------------------- | |
1744 | * | Page ... | Page N-2 | Page N-1 | Page N | | | |
1745 | * ^--------------------------------^----------|-------- | |
1746 | * | desired writeback range | see else | | |
1747 | * ---------------------------------^------------------| | |
1748 | */ | |
81d4782a | 1749 | isize = i_size_read(inode); |
e735c007 | 1750 | end_pos = folio_pos(folio) + folio_size(folio); |
81d4782a | 1751 | if (end_pos > isize) { |
598ecfba CH |
1752 | /* |
1753 | * Check whether the page to write out lies fully beyond |
1754 | * i_size or merely straddles it. |
1755 | * ------------------------------------------------------- | |
1756 | * | file mapping | <EOF> | | |
1757 | * ------------------------------------------------------- | |
1758 | * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | | |
1759 | * ^--------------------------------^-----------|--------- | |
1760 | * | | Straddles | | |
1761 | * ---------------------------------^-----------|--------| | |
1762 | */ | |
e735c007 | 1763 | size_t poff = offset_in_folio(folio, isize); |
81d4782a | 1764 | pgoff_t end_index = isize >> PAGE_SHIFT; |
598ecfba CH |
1765 | |
1766 | /* | |
d58562ca CM |
1767 | * Skip the page if it's fully outside i_size, e.g. |
1768 | * due to a truncate operation that's in progress. We've | |
1769 | * cleaned this page and truncate will finish things off for | |
1770 | * us. | |
598ecfba | 1771 | * |
f1f264b4 AG |
1772 | * Note that the end_index is unsigned long. If the given |
1773 | * offset is greater than 16TB on a 32-bit system and we |
1774 | * checked if the page is fully outside i_size with | |
1775 | * "if (page->index >= end_index + 1)", "end_index + 1" would | |
1776 | * overflow and evaluate to 0. Hence this page would be | |
1777 | * redirtied and written out repeatedly, which would result in | |
1778 | * an infinite loop; the user program performing this operation | |
1779 | * would hang. Instead, we can detect this situation by | |
1780 | * checking if the page is totally beyond i_size or if its | |
598ecfba CH |
1781 | * offset is just equal to the EOF. |
1782 | */ | |
e735c007 MWO |
1783 | if (folio->index > end_index || |
1784 | (folio->index == end_index && poff == 0)) | |
d58562ca | 1785 | goto unlock; |
598ecfba CH |
1786 | |
1787 | /* | |
1788 | * The page straddles i_size. It must be zeroed out on each | |
1789 | * and every writepage invocation because it may be mmapped. | |
1790 | * "A file is mapped in multiples of the page size. For a file | |
1791 | * that is not a multiple of the page size, the remaining | |
1792 | * memory is zeroed when mapped, and writes to that region are | |
1793 | * not written out to the file." | |
1794 | */ | |
e735c007 | 1795 | folio_zero_segment(folio, poff, folio_size(folio)); |
81d4782a | 1796 | end_pos = isize; |
598ecfba CH |
1797 | } |
1798 | ||
e735c007 | 1799 | return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); |
598ecfba CH |
1800 | |
1801 | redirty: | |
e735c007 | 1802 | folio_redirty_for_writepage(wbc, folio); |
d58562ca | 1803 | unlock: |
e735c007 | 1804 | folio_unlock(folio); |
598ecfba CH |
1805 | return 0; |
1806 | } | |
1807 | ||
598ecfba CH |
1808 | int |
1809 | iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, | |
1810 | struct iomap_writepage_ctx *wpc, | |
1811 | const struct iomap_writeback_ops *ops) | |
1812 | { | |
1813 | int ret; | |
1814 | ||
1815 | wpc->ops = ops; | |
1816 | ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); | |
1817 | if (!wpc->ioend) | |
1818 | return ret; | |
1819 | return iomap_submit_ioend(wpc, wpc->ioend, ret); | |
1820 | } | |
1821 | EXPORT_SYMBOL_GPL(iomap_writepages); | |
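Correspondingly, a filesystem's ->writepages address_space operation can be a thin wrapper around this export. A sketch assuming a hypothetical "myfs_writeback_ops" that implements at least ->map_blocks; filesystems usually embed the wpc in a larger private context rather than using it bare:

static int myfs_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	/* Zero-initialised context; iomap_writepages() fills it in. */
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &myfs_writeback_ops);
}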
1822 | ||
1823 | static int __init iomap_init(void) | |
1824 | { | |
1825 | return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), | |
1826 | offsetof(struct iomap_ioend, io_inline_bio), | |
1827 | BIOSET_NEED_BVECS); | |
1828 | } | |
1829 | fs_initcall(iomap_init); |