// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Unlock the folios in a read operation.  We need to set PG_writeback on any
 * folios we're going to write back before we unlock them.
 *
 * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
 * PG_private_2 and do a direct write to the cache from here instead.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct netfs_folio *finfo;
	struct folio *folio;
	pgoff_t start_page = rreq->start / PAGE_SIZE;
	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
	size_t account = 0;
	bool subreq_failed = false;

	XA_STATE(xas, &rreq->mapping->i_pages, start_page);

	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
		}
	}

	/* Walk through the pagecache and the I/O request lists simultaneously.
	 * We may have a mixture of cached and uncached sections and we only
	 * really want to write out the uncached sections.  This is slightly
	 * complicated by the possibility that we might have huge pages with a
	 * mixture inside.
	 */
	subreq = list_first_entry(&rreq->subrequests,
				  struct netfs_io_subrequest, rreq_link);
	subreq_failed = (subreq->error < 0);

	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_page) {
		loff_t pg_end;
		bool pg_failed = false;
		bool wback_to_cache = false;
		bool folio_started = false;

		if (xas_retry(&xas, folio))
			continue;

		pg_end = folio_pos(folio) + folio_size(folio) - 1;

		for (;;) {
			loff_t sreq_end;

			if (!subreq) {
				pg_failed = true;
				break;
			}
			if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
				if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
							       &subreq->flags)) {
					trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
					folio_start_private_2(folio);
					folio_started = true;
				}
			} else {
				wback_to_cache |=
					test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
			}
			pg_failed |= subreq_failed;
			sreq_end = subreq->start + subreq->len - 1;
			if (pg_end < sreq_end)
				break;

			account += subreq->transferred;
			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
				subreq = list_next_entry(subreq, rreq_link);
				subreq_failed = (subreq->error < 0);
			} else {
				subreq = NULL;
				subreq_failed = false;
			}

			if (pg_end == sreq_end)
				break;
		}

		if (!pg_failed) {
			flush_dcache_folio(folio);
			finfo = netfs_folio_info(folio);
			if (finfo) {
				trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				kfree(finfo);
			}
			folio_mark_uptodate(folio);
			if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
				filemap_dirty_folio(folio->mapping, folio);
			}
		}

		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
			if (folio->index == rreq->no_unlock_folio &&
			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
				_debug("no unlock");
			else
				folio_unlock(folio);
		}
	}
	rcu_read_unlock();

	task_io_account_read(account);
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
}

static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
					 unsigned long long *_start,
					 unsigned long long *_len,
					 unsigned long long i_size)
{
	struct netfs_cache_resources *cres = &rreq->cache_resources;

	if (cres->ops && cres->ops->expand_readahead)
		cres->ops->expand_readahead(cres, _start, _len, i_size);
}

static void netfs_rreq_expand(struct netfs_io_request *rreq,
			      struct readahead_control *ractl)
{
	/* Give the cache a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

	/* Give the netfs a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	if (rreq->netfs_ops->expand_readahead)
		rreq->netfs_ops->expand_readahead(rreq);

	/* Expand the request if the cache wants it to start earlier.  Note
	 * that the expansion may get further extended if the VM wishes to
	 * insert THPs and the preferred start and/or end wind up in the middle
	 * of THPs.
	 *
	 * If this is the case, however, the THP size should be an integer
	 * multiple of the cache granule size, so we get a whole number of
	 * granules to deal with.
	 */
	if (rreq->start != readahead_pos(ractl) ||
	    rreq->len != readahead_length(ractl)) {
		readahead_expand(ractl, rreq->start, rreq->len);
		rreq->start = readahead_pos(ractl);
		rreq->len = readahead_length(ractl);

		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
				 netfs_read_trace_expanded);
	}
}

/*
 * Begin an operation, and fetch the stored zero point value from the cookie if
 * available.
 */
static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
	return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}

/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
 * requests from different sources will get munged together.  If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
	int ret;

	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

	if (readahead_count(ractl) == 0)
		return;

	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
				   readahead_pos(ractl),
				   readahead_length(ractl),
				   NETFS_READAHEAD);
	if (IS_ERR(rreq))
		return;

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto cleanup_free;

	netfs_stat(&netfs_n_rh_readahead);
	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
			 netfs_read_trace_readahead);

	netfs_rreq_expand(rreq, ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
			rreq->start, rreq->len);

	/* Drop the refs on the folios here rather than in the cache or
	 * filesystem.  The locks will be dropped in netfs_rreq_unlock_folios().
	 */
	while (readahead_folio(ractl))
		;

	netfs_begin_read(rreq, false);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return;

cleanup_free:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
	return;
}
EXPORT_SYMBOL(netfs_readahead);
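
/*
 * Illustrative sketch: a filesystem that uses these helpers typically points
 * its address_space_operations straight at them and lets netfs drive both the
 * cache and the server reads.  The "myfs" name below is hypothetical; afs, 9p
 * and ceph are real examples of this wiring (their tables also hook up the
 * netfs writeback and invalidation helpers, omitted here).
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio	= netfs_read_folio,
 *		.readahead	= netfs_readahead,
 *	};
 */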

/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
	struct address_space *mapping = folio->mapping;
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	struct folio *sink = NULL;
	int ret;

	_enter("%lx", folio->index);

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READPAGE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto alloc_error;
	}

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto discard;

	netfs_stat(&netfs_n_rh_read_folio);
	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);

	/* Set up the output buffer */
	if (folio_test_dirty(folio)) {
		/* Handle someone trying to read from an unflushed streaming
		 * write.  We fiddle the buffer so that a gap at the beginning
		 * and/or a gap at the end get copied to, but the middle is
		 * discarded.
		 */
		struct netfs_folio *finfo = netfs_folio_info(folio);
		struct bio_vec *bvec;
		unsigned int from = finfo->dirty_offset;
		unsigned int to = from + finfo->dirty_len;
		unsigned int off = 0, i = 0;
		size_t flen = folio_size(folio);
		size_t nr_bvec = flen / PAGE_SIZE + 2;
		size_t part;

		ret = -ENOMEM;
		bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
		if (!bvec)
			goto discard;

		sink = folio_alloc(GFP_KERNEL, 0);
		if (!sink)
			goto discard;

		trace_netfs_folio(folio, netfs_folio_trace_read_gaps);

		rreq->direct_bv = bvec;
		rreq->direct_bv_count = nr_bvec;
		if (from > 0) {
			bvec_set_folio(&bvec[i++], folio, from, 0);
			off = from;
		}
		while (off < to) {
			part = min_t(size_t, to - off, PAGE_SIZE);
			bvec_set_folio(&bvec[i++], sink, part, 0);
			off += part;
		}
		if (to < flen)
			bvec_set_folio(&bvec[i++], folio, flen - to, to);
		iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
	} else {
		iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
				rreq->start, rreq->len);
	}

	ret = netfs_begin_read(rreq, true);
	if (sink)
		folio_put(sink);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret < 0 ? ret : 0;

discard:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
	folio_unlock(folio);
	return ret;
}
EXPORT_SYMBOL(netfs_read_folio);
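
/*
 * Illustrative sketch: ->read_folio can be pointed directly at
 * netfs_read_folio() as above, or wrapped when the filesystem must do some
 * work of its own first.  Note that the folio arrives locked and must be
 * unlocked on every path; netfs_read_folio() handles that for the paths it
 * owns.  "myfs_read_folio" and myfs_revalidate_inode() are hypothetical
 * names.
 *
 *	static int myfs_read_folio(struct file *file, struct folio *folio)
 *	{
 *		int ret = myfs_revalidate_inode(folio_inode(folio));
 *
 *		if (ret < 0) {
 *			folio_unlock(folio);
 *			return ret;
 *		}
 *		return netfs_read_folio(file, folio);
 *	}
 */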

/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true.  Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
				  bool always_fill)
{
	struct inode *inode = folio_inode(folio);
	loff_t i_size = i_size_read(inode);
	size_t offset = offset_in_folio(folio, pos);
	size_t plen = folio_size(folio);

	if (unlikely(always_fill)) {
		if (pos - offset + len <= i_size)
			return false; /* Page entirely before EOF */
		zero_user_segment(&folio->page, 0, plen);
		folio_mark_uptodate(folio);
		return true;
	}

	/* Full folio write */
	if (offset == 0 && len >= plen)
		return true;

	/* Page entirely beyond the end of the file */
	if (pos - offset >= i_size)
		goto zero_out;

	/* Write that covers from the start of the folio to EOF or beyond */
	if (offset == 0 && (pos + len) >= i_size)
		goto zero_out;

	return false;
zero_out:
	zero_user_segments(&folio->page, 0, offset, offset + len, plen);
	return true;
}
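
/*
 * Worked example for the helper above: take a 4KiB folio covering file
 * positions 8192-12287 on an inode with i_size = 4096, and a 512-byte write
 * at pos 8704 (so offset = 512, plen = 4096).  The folio lies entirely beyond
 * EOF, so no read is issued: bytes [0, 512) and [1024, 4096) of the folio are
 * zeroed and the caller is left to fill the 512 bytes in between.
 */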

/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.  If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_read, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked.  It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end.  It is permitted to sleep.  It should return 0 if the request
 * should go ahead or it may return an error.  It may also unlock and put the
 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
		      struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned int len, struct folio **_folio,
		      void **_fsdata)
{
	struct netfs_io_request *rreq;
	struct folio *folio;
	pgoff_t index = pos >> PAGE_SHIFT;
	int ret;

	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (ctx->ops->check_write_begin) {
		/* Allow the netfs (eg. ceph) to flush conflicts. */
		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
		if (ret < 0) {
			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
			goto error;
		}
		if (!folio)
			goto retry;
	}

	if (folio_test_uptodate(folio))
		goto have_folio;

	/* If the page is beyond the EOF, we want to clear it - unless it's
	 * within the cache granule containing the EOF, in which case we need
	 * to preload the granule.
	 */
	if (!netfs_is_cache_enabled(ctx) &&
	    netfs_skip_folio_read(folio, pos, len, false)) {
		netfs_stat(&netfs_n_rh_write_zskip);
		goto have_folio;
	}

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}
	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

	/* Expand the request to meet caching requirements and download
	 * preferences.
	 */
	ractl._nr_pages = folio_nr_pages(folio);
	netfs_rreq_expand(rreq, &ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	/* We hold the folio locks, so we can drop the references */
	folio_get(folio);
	while (readahead_folio(&ractl))
		;

	ret = netfs_begin_read(rreq, true);
	if (ret < 0)
		goto error;
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);

have_folio:
	*_folio = folio;
	_leave(" = 0");
	return 0;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
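
/*
 * Illustrative sketch: a netfs ->write_begin() is usually a thin wrapper
 * around netfs_write_begin().  This assumes the ->write_begin() prototype of
 * this kernel generation, which still traffics in struct page; "myfs" is a
 * hypothetical name (ceph_write_begin is a real equivalent).
 *
 *	static int myfs_write_begin(struct file *file, struct address_space *mapping,
 *				    loff_t pos, unsigned int len,
 *				    struct page **pagep, void **fsdata)
 *	{
 *		struct folio *folio;
 *		int ret;
 *
 *		ret = netfs_write_begin(netfs_inode(mapping->host), file, mapping,
 *					pos, len, &folio, fsdata);
 *		if (ret == 0)
 *			*pagep = &folio->page;
 *		return ret;
 *	}
 */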

/*
 * Preload the data into a page we're proposing to write into.
 */
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
			     size_t offset, size_t len)
{
	struct netfs_io_request *rreq;
	struct address_space *mapping = folio->mapping;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long start = folio_pos(folio);
	size_t flen = folio_size(folio);
	int ret;

	_enter("%zx @%llx", flen, start);

	ret = -ENOMEM;

	rreq = netfs_alloc_request(mapping, file, start, flen,
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}

	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	ret = netfs_begin_read(rreq, true);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
	_leave(" = %d", ret);
	return ret;
}

/**
 * netfs_buffered_read_iter - Filesystem buffered I/O read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead.  When no data can be read,
 * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
			 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
		return -EINVAL;

	ret = netfs_start_io_read(inode);
	if (ret == 0) {
		ret = filemap_read(iocb, iter, 0);
		netfs_end_io_read(inode);
	}
	return ret;
}
EXPORT_SYMBOL(netfs_buffered_read_iter);

/**
 * netfs_file_read_iter - Generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead.  When no data can be read,
 * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_read_iter(iocb, iter);

	return netfs_buffered_read_iter(iocb, iter);
}
EXPORT_SYMBOL(netfs_file_read_iter);
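
/*
 * Illustrative sketch: most netfs filesystems point ->read_iter at
 * netfs_file_read_iter() (directly, or via a wrapper that revalidates or
 * takes locks first and then calls it) and ->write_iter at the matching netfs
 * write helper.  "myfs" is a hypothetical name; other file_operations members
 * are omitted.
 *
 *	static const struct file_operations myfs_file_fops = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= netfs_file_read_iter,
 *		.write_iter	= netfs_file_write_iter,
 *	};
 */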