netfs: Implement a write-through caching option
[linux-block.git] / fs / netfs / buffered_write.c
1// SPDX-License-Identifier: GPL-2.0-only
2/* Network filesystem high-level write support.
3 *
4 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 */
7
8#include <linux/export.h>
9#include <linux/fs.h>
10#include <linux/mm.h>
11#include <linux/pagemap.h>
12#include <linux/slab.h>
13#include <linux/pagevec.h>
14#include "internal.h"
15
16/*
17 * Determined write method. Adjust netfs_folio_traces if this is changed.
18 */
19enum netfs_how_to_modify {
20 NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
21 NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
22 NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
23 NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
24 NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
25 NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
26 NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
27};
28
29static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
30
31static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
32{
33 if (netfs_group && !folio_get_private(folio))
34 folio_attach_private(folio, netfs_get_group(netfs_group));
35}
36
37#if IS_ENABLED(CONFIG_FSCACHE)
38static void netfs_folio_start_fscache(bool caching, struct folio *folio)
39{
40 if (caching)
41 folio_start_fscache(folio);
42}
43#else
44static void netfs_folio_start_fscache(bool caching, struct folio *folio)
45{
46}
47#endif
48
49/*
50 * Decide how we should modify a folio. We might be attempting to do
51 * write-streaming, in which case we don't want to do a local RMW cycle if we can
52 * avoid it. If we're doing local caching or content crypto, we award that
53 * priority over avoiding RMW. If the file is open readably, then we also
54 * assume that we may want to read what we wrote.
55 */
56static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
57 struct file *file,
58 struct folio *folio,
59 void *netfs_group,
60 size_t flen,
61 size_t offset,
62 size_t len,
63 bool maybe_trouble)
64{
65 struct netfs_folio *finfo = netfs_folio_info(folio);
66 loff_t pos = folio_file_pos(folio);
67
68 _enter("");
69
70 if (netfs_folio_group(folio) != netfs_group)
71 return NETFS_FLUSH_CONTENT;
72
73 if (folio_test_uptodate(folio))
74 return NETFS_FOLIO_IS_UPTODATE;
75
76 if (pos >= ctx->remote_i_size)
77 return NETFS_MODIFY_AND_CLEAR;
78
79 if (!maybe_trouble && offset == 0 && len >= flen)
80 return NETFS_WHOLE_FOLIO_MODIFY;
81
82 if (file->f_mode & FMODE_READ)
83 return NETFS_JUST_PREFETCH;
84
85 if (netfs_is_cache_enabled(ctx))
86 return NETFS_JUST_PREFETCH;
87
88 if (!finfo)
89 return NETFS_STREAMING_WRITE;
90
91	/* We can continue a streaming write only if it continues on from the
92	 * previous one.  If it overlaps or leaves a gap, we must flush lest we
93	 * suffer a partial copy and disjoint dirty regions.
94 */
95 if (offset == finfo->dirty_offset + finfo->dirty_len)
96 return NETFS_STREAMING_WRITE_CONT;
97 return NETFS_FLUSH_CONTENT;
98}
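/*
 * Worked examples (illustrative only, not part of the original source): a
 * write into a folio lying wholly at or beyond the server's file size needs
 * nothing downloaded and gets NETFS_MODIFY_AND_CLEAR; an unproblematic
 * whole-folio overwrite gets NETFS_WHOLE_FOLIO_MODIFY; a partial write when
 * the file is readable or the cache is enabled gets NETFS_JUST_PREFETCH
 * (read-modify-write); a small write into a clean, non-uptodate folio on a
 * write-only, uncached file gets NETFS_STREAMING_WRITE, and a follow-up write
 * that butts up against that dirty region gets NETFS_STREAMING_WRITE_CONT;
 * anything else (wrong group, overlap or gap) forces NETFS_FLUSH_CONTENT.
 */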
99
100/*
101 * Grab a folio for writing and lock it. Attempt to allocate as large a folio
102 * as possible to hold as much of the remaining length as possible in one go.
103 */
104static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
105 loff_t pos, size_t part)
106{
107 pgoff_t index = pos / PAGE_SIZE;
108	fgf_t fgp_flags = FGP_WRITEBEGIN;
109
110 if (mapping_large_folio_support(mapping))
111 fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);
112
113 return __filemap_get_folio(mapping, index, fgp_flags,
114 mapping_gfp_mask(mapping));
115}
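/*
 * For example (illustrative figures, not from the original source): with
 * 4KiB pages, a 28KiB write beginning 0x100 bytes into a page passes about
 * 28.25KiB to fgf_set_order(), which rounds down to a power of two and so
 * requests an order-2 (16KiB) folio; __filemap_get_folio() may still fall
 * back to a smaller folio if an allocation of that order fails.
 */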
116
117/**
118 * netfs_perform_write - Copy data into the pagecache.
119 * @iocb: The operation parameters
120 * @iter: The source buffer
121 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
122 *
123 * Copy data into pagecache pages attached to the inode specified by @iocb.
124 * The caller must hold appropriate inode locks.
125 *
126 * If a dirty page is not up to date, it is tagged with a netfs_folio struct
127 * to indicate the range modified.  Dirty pages may also be tagged with a
128 * netfs-specific grouping so that data from an old group gets flushed before
129 * a new one is started.
130 */
131ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
132 struct netfs_group *netfs_group)
133{
134 struct file *file = iocb->ki_filp;
135 struct inode *inode = file_inode(file);
136 struct address_space *mapping = inode->i_mapping;
137 struct netfs_inode *ctx = netfs_inode(inode);
138 struct writeback_control wbc = {
139 .sync_mode = WB_SYNC_NONE,
140 .for_sync = true,
141 .nr_to_write = LONG_MAX,
142 .range_start = iocb->ki_pos,
143 .range_end = iocb->ki_pos + iter->count,
144 };
145 struct netfs_io_request *wreq = NULL;
146 struct netfs_folio *finfo;
147 struct folio *folio;
148 enum netfs_how_to_modify howto;
149 enum netfs_folio_trace trace;
150 unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
151 ssize_t written = 0, ret;
152 loff_t i_size, pos = iocb->ki_pos, from, to;
153 size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
154 bool maybe_trouble = false;
155
156 if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
157 iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
158 ) {
159 if (pos < i_size_read(inode)) {
160 ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
161 if (ret < 0) {
162 goto out;
163 }
164 }
165
166 wbc_attach_fdatawrite_inode(&wbc, mapping->host);
167
168 wreq = netfs_begin_writethrough(iocb, iter->count);
169 if (IS_ERR(wreq)) {
170 wbc_detach_inode(&wbc);
171 ret = PTR_ERR(wreq);
172 wreq = NULL;
173 goto out;
174 }
175 if (!is_sync_kiocb(iocb))
176 wreq->iocb = iocb;
177 wreq->cleanup = netfs_cleanup_buffered_write;
178 }
179
180 do {
181 size_t flen;
182 size_t offset; /* Offset into pagecache folio */
183 size_t part; /* Bytes to write to folio */
184 size_t copied; /* Bytes copied from user */
185
186 ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
187 if (unlikely(ret < 0))
188 break;
189
190 offset = pos & (max_chunk - 1);
191 part = min(max_chunk - offset, iov_iter_count(iter));
192
193 /* Bring in the user pages that we will copy from _first_ lest
194 * we hit a nasty deadlock on copying from the same page as
195 * we're writing to, without it being marked uptodate.
196 *
197 * Not only is this an optimisation, but it is also required to
198 * check that the address is actually valid, when atomic
199 * usercopies are used below.
200 *
201 * We rely on the page being held onto long enough by the LRU
202 * that we can grab it below if this causes it to be read.
203 */
204 ret = -EFAULT;
205 if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
206 break;
207
208 ret = -ENOMEM;
209 folio = netfs_grab_folio_for_write(mapping, pos, part);
210 if (!folio)
211 break;
212
213 flen = folio_size(folio);
214 offset = pos & (flen - 1);
215 part = min_t(size_t, flen - offset, part);
216
217 if (signal_pending(current)) {
218 ret = written ? -EINTR : -ERESTARTSYS;
219 goto error_folio_unlock;
220 }
221
222 /* See if we need to prefetch the area we're going to modify.
223 * We need to do this before we get a lock on the folio in case
224 * there's more than one writer competing for the same cache
225 * block.
226 */
227 howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
228 flen, offset, part, maybe_trouble);
229 _debug("howto %u", howto);
230 switch (howto) {
231 case NETFS_JUST_PREFETCH:
232 ret = netfs_prefetch_for_write(file, folio, offset, part);
233 if (ret < 0) {
234 _debug("prefetch = %zd", ret);
235 goto error_folio_unlock;
236 }
237 break;
238 case NETFS_FOLIO_IS_UPTODATE:
239 case NETFS_WHOLE_FOLIO_MODIFY:
240 case NETFS_STREAMING_WRITE_CONT:
241 break;
242 case NETFS_MODIFY_AND_CLEAR:
243 zero_user_segment(&folio->page, 0, offset);
244 break;
245 case NETFS_STREAMING_WRITE:
246 ret = -EIO;
247 if (WARN_ON(folio_get_private(folio)))
248 goto error_folio_unlock;
249 break;
250 case NETFS_FLUSH_CONTENT:
251 trace_netfs_folio(folio, netfs_flush_content);
252 from = folio_pos(folio);
253 to = from + folio_size(folio) - 1;
254 folio_unlock(folio);
255 folio_put(folio);
256 ret = filemap_write_and_wait_range(mapping, from, to);
257 if (ret < 0)
258 goto error_folio_unlock;
259 continue;
260 }
261
262 if (mapping_writably_mapped(mapping))
263 flush_dcache_folio(folio);
264
265 copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
266
267 flush_dcache_folio(folio);
268
269 /* Deal with a (partially) failed copy */
270 if (copied == 0) {
271 ret = -EFAULT;
272 goto error_folio_unlock;
273 }
274
275 trace = (enum netfs_folio_trace)howto;
276 switch (howto) {
277 case NETFS_FOLIO_IS_UPTODATE:
278 case NETFS_JUST_PREFETCH:
279 netfs_set_group(folio, netfs_group);
280 break;
281 case NETFS_MODIFY_AND_CLEAR:
282 zero_user_segment(&folio->page, offset + copied, flen);
283 netfs_set_group(folio, netfs_group);
284 folio_mark_uptodate(folio);
285 break;
286 case NETFS_WHOLE_FOLIO_MODIFY:
287 if (unlikely(copied < part)) {
288 maybe_trouble = true;
289 iov_iter_revert(iter, copied);
290 copied = 0;
291 goto retry;
292 }
293 netfs_set_group(folio, netfs_group);
294 folio_mark_uptodate(folio);
295 break;
296 case NETFS_STREAMING_WRITE:
297 if (offset == 0 && copied == flen) {
298 netfs_set_group(folio, netfs_group);
299 folio_mark_uptodate(folio);
300 trace = netfs_streaming_filled_page;
301 break;
302 }
303 finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
304 if (!finfo) {
305 iov_iter_revert(iter, copied);
306 ret = -ENOMEM;
307 goto error_folio_unlock;
308 }
309 finfo->netfs_group = netfs_get_group(netfs_group);
310 finfo->dirty_offset = offset;
311 finfo->dirty_len = copied;
312 folio_attach_private(folio, (void *)((unsigned long)finfo |
313 NETFS_FOLIO_INFO));
314 break;
315 case NETFS_STREAMING_WRITE_CONT:
316 finfo = netfs_folio_info(folio);
317 finfo->dirty_len += copied;
318 if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
319 if (finfo->netfs_group)
320 folio_change_private(folio, finfo->netfs_group);
321 else
322 folio_detach_private(folio);
323 folio_mark_uptodate(folio);
324 kfree(finfo);
325 trace = netfs_streaming_cont_filled_page;
326 }
327 break;
328 default:
329 WARN(true, "Unexpected modify type %u ix=%lx\n",
330 howto, folio_index(folio));
331 ret = -EIO;
332 goto error_folio_unlock;
333 }
334
335 trace_netfs_folio(folio, trace);
336
337 /* Update the inode size if we moved the EOF marker */
338 i_size = i_size_read(inode);
339 pos += copied;
340 if (pos > i_size) {
341 if (ctx->ops->update_i_size) {
342 ctx->ops->update_i_size(inode, pos);
343 } else {
344 i_size_write(inode, pos);
345#if IS_ENABLED(CONFIG_FSCACHE)
346 fscache_update_cookie(ctx->cache, NULL, &pos);
347#endif
348 }
349 }
350 written += copied;
351
352 if (likely(!wreq)) {
353 folio_mark_dirty(folio);
354 } else {
355 if (folio_test_dirty(folio))
356 /* Sigh. mmap. */
357 folio_clear_dirty_for_io(folio);
358 /* We make multiple writes to the folio... */
359 if (!folio_test_writeback(folio)) {
360 folio_wait_fscache(folio);
361 folio_start_writeback(folio);
362 folio_start_fscache(folio);
363 if (wreq->iter.count == 0)
364 trace_netfs_folio(folio, netfs_folio_trace_wthru);
365 else
366 trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
367 }
368 netfs_advance_writethrough(wreq, copied,
369 offset + copied == flen);
370 }
371 retry:
372 folio_unlock(folio);
373 folio_put(folio);
374 folio = NULL;
375
376 cond_resched();
377 } while (iov_iter_count(iter));
378
379out:
380 if (unlikely(wreq)) {
381 ret = netfs_end_writethrough(wreq, iocb);
382 wbc_detach_inode(&wbc);
383 if (ret == -EIOCBQUEUED)
384 return ret;
385 }
386
387	iocb->ki_pos += written;
388 _leave(" = %zd [%zd]", written, ret);
389 return written ? written : ret;
390
391error_folio_unlock:
392 folio_unlock(folio);
393 folio_put(folio);
394 goto out;
395}
396EXPORT_SYMBOL(netfs_perform_write);
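/*
 * Illustrative sketch (not part of the original file): write-through caching
 * is opted into per inode by setting NETFS_ICTX_WRITETHROUGH on the netfs
 * context, typically where the filesystem initialises or opens the inode;
 * O_SYNC/O_DSYNC writes take the same path automatically.  The "myfs"
 * function and its policy check are assumptions made for the example.
 */
static void myfs_enable_writethrough(struct inode *inode)
{
	struct netfs_inode *ictx = netfs_inode(inode);

	if (myfs_wants_writethrough(inode))	/* hypothetical policy check */
		set_bit(NETFS_ICTX_WRITETHROUGH, &ictx->flags);
}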
397
398/**
399 * netfs_buffered_write_iter_locked - write data to a file
400 * @iocb: IO state structure (file, offset, etc.)
401 * @from: iov_iter with data to write
402 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
403 *
404 * This function does all the work needed for actually writing data to a
405 * file. It does all basic checks, removes SUID from the file, updates
406 * modification times and calls the appropriate subroutines depending on whether we
407 * do direct IO or a standard buffered write.
408 *
409 * The caller must hold appropriate locks around this function and have called
410 * generic_write_checks() already. The caller is also responsible for doing
411 * any necessary syncing afterwards.
412 *
413 * This function does *not* take care of syncing data in case of O_SYNC write.
414 * A caller has to handle it. This is mainly due to the fact that we want to
415 * avoid syncing under i_rwsem.
416 *
417 * Return:
418 * * number of bytes written, even for truncated writes
419 * * negative error code if no data has been written at all
420 */
421ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
422 struct netfs_group *netfs_group)
423{
424 struct file *file = iocb->ki_filp;
425 ssize_t ret;
426
427 trace_netfs_write_iter(iocb, from);
428
429 ret = file_remove_privs(file);
430 if (ret)
431 return ret;
432
433 ret = file_update_time(file);
434 if (ret)
435 return ret;
436
437 return netfs_perform_write(iocb, from, netfs_group);
438}
439EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
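/*
 * Illustrative sketch (not part of the original file): a filesystem that
 * attaches its own dirty-data grouping (eg. a snapshot context) would call
 * the locked helper above from its ->write_iter.  The "myfs" names and the
 * way the group is looked up are assumptions made for the example only.
 */
static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct netfs_group *group;
	ssize_t ret;

	ret = netfs_start_io_write(inode);	/* Takes i_rwsem, excludes DIO */
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, from);
	if (ret > 0) {
		group = myfs_current_write_group(inode); /* hypothetical helper */
		ret = netfs_buffered_write_iter_locked(iocb, from, group);
	}
	netfs_end_io_write(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}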
440
441/**
442 * netfs_file_write_iter - write data to a file
443 * @iocb: IO state structure
444 * @from: iov_iter with data to write
445 *
446 * Perform a write to a file, writing into the pagecache if possible and doing
447 * an unbuffered write instead if not.
448 *
449 * Return:
450 * * Negative error code if no data has been written at all or
451 * vfs_fsync_range() failed for a synchronous write
452 * * Number of bytes written, even for truncated writes
453 */
454ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
455{
456 struct file *file = iocb->ki_filp;
457 struct inode *inode = file->f_mapping->host;
458 struct netfs_inode *ictx = netfs_inode(inode);
459 ssize_t ret;
460
461 _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
462
463 if ((iocb->ki_flags & IOCB_DIRECT) ||
464 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
465 return netfs_unbuffered_write_iter(iocb, from);
466
467 ret = netfs_start_io_write(inode);
468 if (ret < 0)
469 return ret;
470
471 ret = generic_write_checks(iocb, from);
472 if (ret > 0)
473 ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
474 netfs_end_io_write(inode);
475 if (ret > 0)
476 ret = generic_write_sync(iocb, ret);
477 return ret;
478}
479EXPORT_SYMBOL(netfs_file_write_iter);
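/*
 * Illustrative sketch (not part of the original file): a filesystem with no
 * per-write grouping can plug the helper above straight into its
 * file_operations.  The "myfs" entries (read path, mmap, fsync) are
 * assumptions for the example; the generic splice/llseek helpers shown are
 * merely typical choices.
 */
static const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= myfs_file_read_iter,	/* hypothetical read path */
	.write_iter	= netfs_file_write_iter,
	.mmap		= myfs_file_mmap,	/* installs a vm_ops using netfs_page_mkwrite() */
	.fsync		= myfs_fsync,		/* see the fsync sketch further down */
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
};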
480
481/*
482 * Notification that a previously read-only page is about to become writable.
483 * Note that the caller indicates a single page of a multipage folio.
484 */
485vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
486{
487 struct folio *folio = page_folio(vmf->page);
488 struct file *file = vmf->vma->vm_file;
489 struct inode *inode = file_inode(file);
490 vm_fault_t ret = VM_FAULT_RETRY;
491 int err;
492
493 _enter("%lx", folio->index);
494
495 sb_start_pagefault(inode->i_sb);
496
497 if (folio_wait_writeback_killable(folio))
498 goto out;
499
500 if (folio_lock_killable(folio) < 0)
501 goto out;
502
503 /* Can we see a streaming write here? */
504 if (WARN_ON(!folio_test_uptodate(folio))) {
505 ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
506 goto out;
507 }
508
509 if (netfs_folio_group(folio) != netfs_group) {
510 folio_unlock(folio);
511 err = filemap_fdatawait_range(inode->i_mapping,
512 folio_pos(folio),
513 folio_pos(folio) + folio_size(folio));
514 switch (err) {
515 case 0:
516 ret = VM_FAULT_RETRY;
517 goto out;
518 case -ENOMEM:
519 ret = VM_FAULT_OOM;
520 goto out;
521 default:
522 ret = VM_FAULT_SIGBUS;
523 goto out;
524 }
525 }
526
527 if (folio_test_dirty(folio))
528 trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
529 else
530 trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
531 netfs_set_group(folio, netfs_group);
532 file_update_time(file);
533 ret = VM_FAULT_LOCKED;
534out:
535 sb_end_pagefault(inode->i_sb);
536 return ret;
537}
538EXPORT_SYMBOL(netfs_page_mkwrite);
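/*
 * Illustrative sketch (not part of the original file): because
 * netfs_page_mkwrite() takes the dirty-data group as a second argument, it is
 * normally reached through a small per-filesystem wrapper installed in the
 * VMA's vm_operations.  The "myfs" names are assumptions for the example.
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	return netfs_page_mkwrite(vmf, NULL);	/* no grouping in this example */
}

static const struct vm_operations_struct myfs_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= myfs_page_mkwrite,
};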
539
540/*
541 * Kill all the pages in the given range
542 */
543static void netfs_kill_pages(struct address_space *mapping,
544 loff_t start, loff_t len)
545{
546 struct folio *folio;
547 pgoff_t index = start / PAGE_SIZE;
548 pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
549
550 _enter("%llx-%llx", start, start + len - 1);
551
552 do {
553 _debug("kill %lx (to %lx)", index, last);
554
555 folio = filemap_get_folio(mapping, index);
556 if (IS_ERR(folio)) {
557 next = index + 1;
558 continue;
559 }
560
561 next = folio_next_index(folio);
562
563 trace_netfs_folio(folio, netfs_folio_trace_kill);
564 folio_clear_uptodate(folio);
565 if (folio_test_fscache(folio))
566 folio_end_fscache(folio);
567 folio_end_writeback(folio);
568 folio_lock(folio);
569 generic_error_remove_page(mapping, &folio->page);
570 folio_unlock(folio);
571 folio_put(folio);
572
573 } while (index = next, index <= last);
574
575 _leave("");
576}
577
578/*
579 * Redirty all the pages in a given range.
580 */
581static void netfs_redirty_pages(struct address_space *mapping,
582 loff_t start, loff_t len)
583{
584 struct folio *folio;
585 pgoff_t index = start / PAGE_SIZE;
586 pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
587
588 _enter("%llx-%llx", start, start + len - 1);
589
590 do {
591 _debug("redirty %llx @%llx", len, start);
592
593 folio = filemap_get_folio(mapping, index);
594 if (IS_ERR(folio)) {
595 next = index + 1;
596 continue;
597 }
598
599 next = folio_next_index(folio);
600 trace_netfs_folio(folio, netfs_folio_trace_redirty);
601 filemap_dirty_folio(mapping, folio);
602 if (folio_test_fscache(folio))
603 folio_end_fscache(folio);
604 folio_end_writeback(folio);
605 folio_put(folio);
606 } while (index = next, index <= last);
607
608 balance_dirty_pages_ratelimited(mapping);
609
610 _leave("");
611}
612
613/*
614 * Completion of write to server
615 */
616static void netfs_pages_written_back(struct netfs_io_request *wreq)
617{
618 struct address_space *mapping = wreq->mapping;
619 struct netfs_folio *finfo;
620 struct netfs_group *group = NULL;
621 struct folio *folio;
622 pgoff_t last;
623 int gcount = 0;
624
625 XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
626
627 _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
628
629 rcu_read_lock();
630
631 last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
632 xas_for_each(&xas, folio, last) {
633 WARN(!folio_test_writeback(folio),
634 "bad %zx @%llx page %lx %lx\n",
635 wreq->len, wreq->start, folio_index(folio), last);
636
637 if ((finfo = netfs_folio_info(folio))) {
638 /* Streaming writes cannot be redirtied whilst under
639 * writeback, so discard the streaming record.
640 */
641 folio_detach_private(folio);
642 group = finfo->netfs_group;
643 gcount++;
644 trace_netfs_folio(folio, netfs_folio_trace_clear_s);
645 kfree(finfo);
646 } else if ((group = netfs_folio_group(folio))) {
647 /* Need to detach the group pointer if the page didn't
648 * get redirtied. If it has been redirtied, then it
649 * must be within the same group.
650 */
651 if (folio_test_dirty(folio)) {
652 trace_netfs_folio(folio, netfs_folio_trace_redirtied);
653 goto end_wb;
654 }
655 if (folio_trylock(folio)) {
656 if (!folio_test_dirty(folio)) {
657 folio_detach_private(folio);
658 gcount++;
659 trace_netfs_folio(folio, netfs_folio_trace_clear_g);
660 } else {
661 trace_netfs_folio(folio, netfs_folio_trace_redirtied);
662 }
663 folio_unlock(folio);
664 goto end_wb;
665 }
666
667 xas_pause(&xas);
668 rcu_read_unlock();
669 folio_lock(folio);
670 if (!folio_test_dirty(folio)) {
671 folio_detach_private(folio);
672 gcount++;
673 trace_netfs_folio(folio, netfs_folio_trace_clear_g);
674 } else {
675 trace_netfs_folio(folio, netfs_folio_trace_redirtied);
676 }
677 folio_unlock(folio);
678 rcu_read_lock();
679 } else {
680 trace_netfs_folio(folio, netfs_folio_trace_clear);
681 }
682 end_wb:
683 if (folio_test_fscache(folio))
684 folio_end_fscache(folio);
685 folio_end_writeback(folio);
686 }
687
688 rcu_read_unlock();
689 netfs_put_group_many(group, gcount);
690 _leave("");
691}
692
693/*
694 * Deal with the disposition of the folios that are under writeback to close
695 * out the operation.
696 */
697static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
698{
699 struct address_space *mapping = wreq->mapping;
700
701 _enter("");
702
703 switch (wreq->error) {
704 case 0:
705 netfs_pages_written_back(wreq);
706 break;
707
708 default:
709 pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
710 fallthrough;
711 case -EACCES:
712 case -EPERM:
713 case -ENOKEY:
714 case -EKEYEXPIRED:
715 case -EKEYREJECTED:
716 case -EKEYREVOKED:
717 case -ENETRESET:
718 case -EDQUOT:
719 case -ENOSPC:
720 netfs_redirty_pages(mapping, wreq->start, wreq->len);
721 break;
722
723 case -EROFS:
724 case -EIO:
725 case -EREMOTEIO:
726 case -EFBIG:
727 case -ENOENT:
728 case -ENOMEDIUM:
729 case -ENXIO:
730 netfs_kill_pages(mapping, wreq->start, wreq->len);
731 break;
732 }
733
734 if (wreq->error)
735 mapping_set_error(mapping, wreq->error);
736 if (wreq->netfs_ops->done)
737 wreq->netfs_ops->done(wreq);
738}
739
740/*
741 * Extend the region to be written back to include subsequent contiguously
742 * dirty pages if possible, but don't sleep while doing so.
743 *
744 * If this page holds new content, then we can include filler zeros in the
745 * writeback.
746 */
747static void netfs_extend_writeback(struct address_space *mapping,
748 struct netfs_group *group,
749 struct xa_state *xas,
750 long *_count,
751 loff_t start,
752 loff_t max_len,
753 bool caching,
754 size_t *_len,
755 size_t *_top)
756{
757 struct netfs_folio *finfo;
758 struct folio_batch fbatch;
759 struct folio *folio;
760 unsigned int i;
761 pgoff_t index = (start + *_len) / PAGE_SIZE;
762 size_t len;
763 void *priv;
764 bool stop = true;
765
766 folio_batch_init(&fbatch);
767
768 do {
769 /* Firstly, we gather up a batch of contiguous dirty pages
770 * under the RCU read lock - but we can't clear the dirty flags
771 * there if any of those pages are mapped.
772 */
773 rcu_read_lock();
774
775 xas_for_each(xas, folio, ULONG_MAX) {
776 stop = true;
777 if (xas_retry(xas, folio))
778 continue;
779 if (xa_is_value(folio))
780 break;
781 if (folio_index(folio) != index) {
782 xas_reset(xas);
783 break;
784 }
785
786 if (!folio_try_get_rcu(folio)) {
787 xas_reset(xas);
788 continue;
789 }
790
791 /* Has the folio moved or been split? */
792 if (unlikely(folio != xas_reload(xas))) {
793 folio_put(folio);
794 xas_reset(xas);
795 break;
796 }
797
798 if (!folio_trylock(folio)) {
799 folio_put(folio);
800 xas_reset(xas);
801 break;
802 }
803 if (!folio_test_dirty(folio) ||
804 folio_test_writeback(folio) ||
805 folio_test_fscache(folio)) {
806 folio_unlock(folio);
807 folio_put(folio);
808 xas_reset(xas);
809 break;
810 }
811
812 stop = false;
813 len = folio_size(folio);
814 priv = folio_get_private(folio);
815 if ((const struct netfs_group *)priv != group) {
816 stop = true;
817 finfo = netfs_folio_info(folio);
818 if (finfo->netfs_group != group ||
819 finfo->dirty_offset > 0) {
820 folio_unlock(folio);
821 folio_put(folio);
822 xas_reset(xas);
823 break;
824 }
825 len = finfo->dirty_len;
826 }
827
828 *_top += folio_size(folio);
829 index += folio_nr_pages(folio);
830 *_count -= folio_nr_pages(folio);
831 *_len += len;
832 if (*_len >= max_len || *_count <= 0)
833 stop = true;
834
835 if (!folio_batch_add(&fbatch, folio))
836 break;
837 if (stop)
838 break;
839 }
840
841 xas_pause(xas);
842 rcu_read_unlock();
843
844 /* Now, if we obtained any folios, we can shift them to being
845 * writable and mark them for caching.
846 */
847 if (!folio_batch_count(&fbatch))
848 break;
849
850 for (i = 0; i < folio_batch_count(&fbatch); i++) {
851 folio = fbatch.folios[i];
852 trace_netfs_folio(folio, netfs_folio_trace_store_plus);
853
854 if (!folio_clear_dirty_for_io(folio))
855 BUG();
856 folio_start_writeback(folio);
857 netfs_folio_start_fscache(caching, folio);
858 folio_unlock(folio);
859 }
860
861 folio_batch_release(&fbatch);
862 cond_resched();
863 } while (!stop);
864}
865
866/*
867 * Synchronously write back the locked page and any subsequent non-locked dirty
868 * pages.
869 */
870static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
871 struct writeback_control *wbc,
872 struct netfs_group *group,
873 struct xa_state *xas,
874 struct folio *folio,
875 unsigned long long start,
876 unsigned long long end)
877{
878 struct netfs_io_request *wreq;
879 struct netfs_folio *finfo;
880 struct netfs_inode *ctx = netfs_inode(mapping->host);
881 unsigned long long i_size = i_size_read(&ctx->inode);
882 size_t len, max_len;
883 bool caching = netfs_is_cache_enabled(ctx);
884 long count = wbc->nr_to_write;
885 int ret;
886
887 _enter(",%lx,%llx-%llx,%u", folio_index(folio), start, end, caching);
888
889 wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
890 NETFS_WRITEBACK);
891 if (IS_ERR(wreq)) {
892 folio_unlock(folio);
893 return PTR_ERR(wreq);
894 }
895
896 if (!folio_clear_dirty_for_io(folio))
897 BUG();
898 folio_start_writeback(folio);
899 netfs_folio_start_fscache(caching, folio);
900
901 count -= folio_nr_pages(folio);
902
903 /* Find all consecutive lockable dirty pages that have contiguous
904 * written regions, stopping when we find a page that is not
905 * immediately lockable, is not dirty or is missing, or we reach the
906 * end of the range.
907 */
908 trace_netfs_folio(folio, netfs_folio_trace_store);
909
910 len = wreq->len;
911 finfo = netfs_folio_info(folio);
912 if (finfo) {
913 start += finfo->dirty_offset;
914 if (finfo->dirty_offset + finfo->dirty_len != len) {
915 len = finfo->dirty_len;
916 goto cant_expand;
917 }
918 len = finfo->dirty_len;
919 }
920
921 if (start < i_size) {
922 /* Trim the write to the EOF; the extra data is ignored. Also
923 * put an upper limit on the size of a single storedata op.
924 */
925 max_len = 65536 * 4096;
926 max_len = min_t(unsigned long long, max_len, end - start + 1);
927 max_len = min_t(unsigned long long, max_len, i_size - start);
928
929 if (len < max_len)
930 netfs_extend_writeback(mapping, group, xas, &count, start,
931 max_len, caching, &len, &wreq->upper_len);
932 }
933
934cant_expand:
935 len = min_t(unsigned long long, len, i_size - start);
936
937 /* We now have a contiguous set of dirty pages, each with writeback
938 * set; the first page is still locked at this point, but all the rest
939 * have been unlocked.
940 */
941 folio_unlock(folio);
942 wreq->start = start;
943 wreq->len = len;
944
945 if (start < i_size) {
946 _debug("write back %zx @%llx [%llx]", len, start, i_size);
947
948 /* Speculatively write to the cache. We have to fix this up
949 * later if the store fails.
950 */
951 wreq->cleanup = netfs_cleanup_buffered_write;
952
953 iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
954 wreq->upper_len);
955 __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
956 ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
957 if (ret == 0 || ret == -EIOCBQUEUED)
958 wbc->nr_to_write -= len / PAGE_SIZE;
959 } else {
960 _debug("write discard %zx @%llx [%llx]", len, start, i_size);
961
962 /* The dirty region was entirely beyond the EOF. */
963 fscache_clear_page_bits(mapping, start, len, caching);
964 netfs_pages_written_back(wreq);
965 ret = 0;
966 }
967
968 netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
969 _leave(" = 1");
970 return 1;
971}
972
973/*
974 * Write a region of pages back to the server
975 */
976static ssize_t netfs_writepages_begin(struct address_space *mapping,
977 struct writeback_control *wbc,
978 struct netfs_group *group,
979 struct xa_state *xas,
980 unsigned long long *_start,
981 unsigned long long end)
982{
983 const struct netfs_folio *finfo;
984 struct folio *folio;
985 unsigned long long start = *_start;
986 ssize_t ret;
987 void *priv;
988 int skips = 0;
989
990 _enter("%llx,%llx,", start, end);
991
992search_again:
993 /* Find the first dirty page in the group. */
994 rcu_read_lock();
995
996 for (;;) {
997 folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
998 if (xas_retry(xas, folio) || xa_is_value(folio))
999 continue;
1000 if (!folio)
1001 break;
1002
1003 if (!folio_try_get_rcu(folio)) {
1004 xas_reset(xas);
1005 continue;
1006 }
1007
1008 if (unlikely(folio != xas_reload(xas))) {
1009 folio_put(folio);
1010 xas_reset(xas);
1011 continue;
1012 }
1013
1014 /* Skip any dirty folio that's not in the group of interest. */
1015 priv = folio_get_private(folio);
1016 if ((const struct netfs_group *)priv != group) {
1017 finfo = netfs_folio_info(folio);
1018 if (finfo->netfs_group != group) {
1019 folio_put(folio);
1020 continue;
1021 }
1022 }
1023
1024 xas_pause(xas);
1025 break;
1026 }
1027 rcu_read_unlock();
1028 if (!folio)
1029 return 0;
1030
1031 start = folio_pos(folio); /* May regress with THPs */
1032
1033 _debug("wback %lx", folio_index(folio));
1034
1035 /* At this point we hold neither the i_pages lock nor the page lock:
1036 * the page may be truncated or invalidated (changing page->mapping to
1037 * NULL), or even swizzled back from swapper_space to tmpfs file
1038 * mapping
1039 */
1040lock_again:
1041 if (wbc->sync_mode != WB_SYNC_NONE) {
1042 ret = folio_lock_killable(folio);
1043 if (ret < 0)
1044 return ret;
1045 } else {
1046 if (!folio_trylock(folio))
1047 goto search_again;
1048 }
1049
1050 if (folio->mapping != mapping ||
1051 !folio_test_dirty(folio)) {
1052 start += folio_size(folio);
1053 folio_unlock(folio);
1054 goto search_again;
1055 }
1056
1057 if (folio_test_writeback(folio) ||
1058 folio_test_fscache(folio)) {
1059 folio_unlock(folio);
1060 if (wbc->sync_mode != WB_SYNC_NONE) {
1061 folio_wait_writeback(folio);
1062#ifdef CONFIG_NETFS_FSCACHE
1063 folio_wait_fscache(folio);
1064#endif
1065 goto lock_again;
1066 }
1067
1068 start += folio_size(folio);
1069 if (wbc->sync_mode == WB_SYNC_NONE) {
1070 if (skips >= 5 || need_resched()) {
1071 ret = 0;
1072 goto out;
1073 }
1074 skips++;
1075 }
1076 goto search_again;
1077 }
1078
1079 ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
1080 folio, start, end);
1081out:
1082 if (ret > 0)
1083 *_start = start + ret;
1084 _leave(" = %zd [%llx]", ret, *_start);
1085 return ret;
1086}
1087
1088/*
1089 * Write a region of pages back to the server
1090 */
1091static int netfs_writepages_region(struct address_space *mapping,
1092 struct writeback_control *wbc,
1093 struct netfs_group *group,
1094 unsigned long long *_start,
1095 unsigned long long end)
1096{
1097 ssize_t ret;
1098
1099 XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
1100
1101 do {
1102 ret = netfs_writepages_begin(mapping, wbc, group, &xas,
1103 _start, end);
1104 if (ret > 0 && wbc->nr_to_write > 0)
1105 cond_resched();
1106 } while (ret > 0 && wbc->nr_to_write > 0);
1107
1108 return ret > 0 ? 0 : ret;
1109}
1110
1111/*
1112 * write some of the pending data back to the server
1113 */
1114int netfs_writepages(struct address_space *mapping,
1115 struct writeback_control *wbc)
1116{
1117 struct netfs_group *group = NULL;
1118 loff_t start, end;
1119 int ret;
1120
1121 _enter("");
1122
1123 /* We have to be careful as we can end up racing with setattr()
1124 * truncating the pagecache since the caller doesn't take a lock here
1125 * to prevent it.
1126 */
1127
1128 if (wbc->range_cyclic && mapping->writeback_index) {
1129 start = mapping->writeback_index * PAGE_SIZE;
1130 ret = netfs_writepages_region(mapping, wbc, group,
1131 &start, LLONG_MAX);
1132 if (ret < 0)
1133 goto out;
1134
1135 if (wbc->nr_to_write <= 0) {
1136 mapping->writeback_index = start / PAGE_SIZE;
1137 goto out;
1138 }
1139
1140 start = 0;
1141 end = mapping->writeback_index * PAGE_SIZE;
1142 mapping->writeback_index = 0;
1143 ret = netfs_writepages_region(mapping, wbc, group, &start, end);
1144 if (ret == 0)
1145 mapping->writeback_index = start / PAGE_SIZE;
1146 } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
1147 start = 0;
1148 ret = netfs_writepages_region(mapping, wbc, group,
1149 &start, LLONG_MAX);
1150 if (wbc->nr_to_write > 0 && ret == 0)
1151 mapping->writeback_index = start / PAGE_SIZE;
1152 } else {
1153 start = wbc->range_start;
1154 ret = netfs_writepages_region(mapping, wbc, group,
1155 &start, wbc->range_end);
1156 }
1157
1158out:
1159 _leave(" = %d", ret);
1160 return ret;
1161}
1162EXPORT_SYMBOL(netfs_writepages);
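/*
 * Illustrative sketch (not part of the original file): with ->writepages
 * wired to netfs_writepages(), the normal VFS entry points drive writeback;
 * a minimal ->fsync, for instance, only needs the generic helper.  The
 * "myfs" name is an assumption for the example.
 */
static int myfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	/* Pushes dirty folios through netfs_writepages() and waits for them. */
	return file_write_and_wait_range(file, start, end);
}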
1163
1164/*
1165 * Deal with the disposition of a laundered folio.
1166 */
1167static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
1168{
1169 if (wreq->error) {
1170 pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
1171 mapping_set_error(wreq->mapping, wreq->error);
1172 }
1173}
1174
1175/**
1176 * netfs_launder_folio - Clean up a dirty folio that's being invalidated
1177 * @folio: The folio to clean
1178 *
1179 * This is called to write back a folio that's being invalidated when an inode
1180 * is getting torn down. Ideally, writepages would be used instead.
1181 */
1182int netfs_launder_folio(struct folio *folio)
1183{
1184 struct netfs_io_request *wreq;
1185 struct address_space *mapping = folio->mapping;
1186 struct netfs_folio *finfo = netfs_folio_info(folio);
1187 struct netfs_group *group = netfs_folio_group(folio);
1188 struct bio_vec bvec;
1189 unsigned long long i_size = i_size_read(mapping->host);
1190 unsigned long long start = folio_pos(folio);
1191 size_t offset = 0, len;
1192 int ret = 0;
1193
1194 if (finfo) {
1195 offset = finfo->dirty_offset;
1196 start += offset;
1197 len = finfo->dirty_len;
1198 } else {
1199 len = folio_size(folio);
1200 }
1201 len = min_t(unsigned long long, len, i_size - start);
1202
1203 wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
1204 if (IS_ERR(wreq)) {
1205 ret = PTR_ERR(wreq);
1206 goto out;
1207 }
1208
1209 if (!folio_clear_dirty_for_io(folio))
1210 goto out_put;
1211
1212 trace_netfs_folio(folio, netfs_folio_trace_launder);
1213
1214 _debug("launder %llx-%llx", start, start + len - 1);
1215
1216 /* Speculatively write to the cache. We have to fix this up later if
1217 * the store fails.
1218 */
1219 wreq->cleanup = netfs_cleanup_launder_folio;
1220
1221 bvec_set_folio(&bvec, folio, len, offset);
1222 iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
1223 __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
1224 ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
1225
1226out_put:
1227 folio_detach_private(folio);
1228 netfs_put_group(group);
1229 kfree(finfo);
1230 netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
1231out:
1232 folio_wait_fscache(folio);
1233 _leave(" = %d", ret);
1234 return ret;
1235}
1236EXPORT_SYMBOL(netfs_launder_folio);
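/*
 * Illustrative sketch (not part of the original file): the helpers above slot
 * into a filesystem's address_space_operations alongside the netfs read-side
 * helpers.  Only ->writepages and ->launder_folio are defined in this file;
 * the other entries shown are assumptions about what a typical netfs-backed
 * filesystem might use (a cache-enabled filesystem may need a fscache-aware
 * ->dirty_folio instead of the generic one).
 */
static const struct address_space_operations myfs_address_space_operations = {
	.read_folio		= netfs_read_folio,
	.readahead		= netfs_readahead,
	.dirty_folio		= filemap_dirty_folio,
	.release_folio		= netfs_release_folio,
	.invalidate_folio	= netfs_invalidate_folio,
	.writepages		= netfs_writepages,
	.launder_folio		= netfs_launder_folio,
	.direct_IO		= noop_direct_IO,
	.migrate_folio		= filemap_migrate_folio,
};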