netfs: Rename netfs_read_*request to netfs_io_*request
[linux-block.git] fs/ceph/addr.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/signal.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>

/*
 * Ceph address space ops.
 *
 * There are a few funny things going on here.
 *
 * The page->private field is used to reference a struct
 * ceph_snap_context for _every_ dirty page.  This indicates which
 * snapshot the page was logically dirtied in, and thus which snap
 * context needs to be associated with the osd write during writeback.
 *
 * Similarly, struct ceph_inode_info maintains a set of counters to
 * count dirty pages on the inode.  In the absence of snapshots,
 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
 *
 * When a snapshot is taken (that is, when the client receives
 * notification that a snapshot was taken), each inode with caps and
 * with dirty pages (dirty pages implies there is a cap) gets a new
 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
 * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
 * moved to capsnap->dirty. (Unless a sync write is currently in
 * progress.  In that case, the capsnap is said to be "pending", new
 * writes cannot start, and the capsnap isn't "finalized" until the
 * write completes (or fails) and a final size/mtime for the inode for
 * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
 *
 * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
 * we look for the first capsnap in i_cap_snaps and write out pages in
 * that snap context _only_.  Then we move on to the next capsnap,
 * eventually reaching the "live" or "head" context (i.e., pages that
 * are not yet snapped) and are writing the most recently dirtied
 * pages.
 *
 * Invalidate and so forth must take care to ensure the dirty page
 * accounting is preserved.
 */

#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
#define CONGESTION_OFF_THRESH(congestion_kb)				\
	(CONGESTION_ON_THRESH(congestion_kb) -				\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))

static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
					struct folio *folio, void **_fsdata);

static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
	if (PagePrivate(page))
		return (void *)page->private;
	return NULL;
}

/*
 * Dirty a page.  Optimistically adjust accounting, on the assumption
 * that we won't race with invalidate.  If we do, readjust.
 */
static int ceph_set_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc;

	if (PageDirty(page)) {
		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
		     mapping->host, page, page->index);
		BUG_ON(!PagePrivate(page));
		return 0;
	}

	inode = mapping->host;
	ci = ceph_inode(inode);

	/* dirty the head */
	spin_lock(&ci->i_ceph_lock);
	BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
	if (__ceph_have_pending_cap_snap(ci)) {
		struct ceph_cap_snap *capsnap =
				list_last_entry(&ci->i_cap_snaps,
						struct ceph_cap_snap,
						ci_item);
		snapc = ceph_get_snap_context(capsnap->context);
		capsnap->dirty_pages++;
	} else {
		BUG_ON(!ci->i_head_snapc);
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		++ci->i_wrbuffer_ref_head;
	}
	if (ci->i_wrbuffer_ref == 0)
		ihold(inode);
	++ci->i_wrbuffer_ref;
	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
	     "snapc %p seq %lld (%d snaps)\n",
	     mapping->host, page, page->index,
	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
	     snapc, snapc->seq, snapc->num_snaps);
	spin_unlock(&ci->i_ceph_lock);

	/*
	 * Reference snap context in page->private.  Also set
	 * PagePrivate so that we get invalidatepage callback.
	 */
	BUG_ON(PagePrivate(page));
	attach_page_private(page, snapc);

	return ceph_fscache_set_page_dirty(page);
}

/*
 * If we are truncating the full page (i.e. offset == 0), adjust the
 * dirty page counters appropriately.  Only called if there is private
 * data on the page.
 */
static void ceph_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc;

	inode = page->mapping->host;
	ci = ceph_inode(inode);

	if (offset != 0 || length != thp_size(page)) {
		dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
		     inode, page, page->index, offset, length);
		return;
	}

	WARN_ON(!PageLocked(page));
	if (PagePrivate(page)) {
		dout("%p invalidatepage %p idx %lu full dirty page\n",
		     inode, page, page->index);

		snapc = detach_page_private(page);
		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
		ceph_put_snap_context(snapc);
	}

	wait_on_page_fscache(page);
}

static int ceph_releasepage(struct page *page, gfp_t gfp)
{
	struct inode *inode = page->mapping->host;

	dout("%llx:%llx releasepage %p idx %lu (%sdirty)\n",
	     ceph_vinop(inode), page,
	     page->index, PageDirty(page) ? "" : "not ");

	if (PagePrivate(page))
		return 0;

	if (PageFsCache(page)) {
		if (current_is_kswapd() || !(gfp & __GFP_FS))
			return 0;
		wait_on_page_fscache(page);
	}

	ceph_fscache_note_page_release(inode);
	return 1;
}

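/*
 * Expand the readahead window to cover whole stripe units: round the
 * start down and the length up to the file layout's stripe_unit.
 */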
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_layout *lo = &ci->i_layout;
	u32 blockoff;
	u64 blockno;

	/* Expand the start downward */
	blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
	rreq->start = blockno * lo->stripe_unit;
	rreq->len += blockoff;

	/* Now, round up the length to the next block */
	rreq->len = roundup(rreq->len, lo->stripe_unit);
}

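/*
 * Clamp a subrequest so that it does not cross a RADOS object boundary
 * and does not exceed the rsize mount option.
 */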
static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
	struct inode *inode = subreq->rreq->inode;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 objno, objoff;
	u32 xlen;

	/* Truncate the extent at the end of the current block */
	ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
				      &objno, &objoff, &xlen);
	subreq->len = min(xlen, fsc->mount_options->rsize);
	return true;
}

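/*
 * Completion callback for netfs read requests: record read metrics,
 * treat a missing object (-ENOENT) as a hole (success with no data)
 * and pass the result back to netfs.
 */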
static void finish_netfs_read(struct ceph_osd_request *req)
{
	struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	struct netfs_io_subrequest *subreq = req->r_priv;
	int num_pages;
	int err = req->r_result;

	ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
				 req->r_end_latency, osd_data->length, err);

	dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
	     subreq->len, i_size_read(req->r_inode));

	/* no object means success but no data */
	if (err == -ENOENT)
		err = 0;
	else if (err == -EBLOCKLISTED)
		fsc->blocklisted = true;

	if (err >= 0 && err < subreq->len)
		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);

	netfs_subreq_terminated(subreq, err, true);

	num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
	ceph_put_page_vector(osd_data->pages, num_pages, false);
	iput(req->r_inode);
}

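/*
 * Satisfy a read subrequest from data held inline in the MDS.  Returns
 * false if the data has been uninlined in the meantime, in which case
 * the caller falls back to a normal OSD read.
 */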
static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_mds_reply_info_parsed *rinfo;
	struct ceph_mds_reply_info_in *iinfo;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct iov_iter iter;
	ssize_t err = 0;
	size_t len;

	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
	__clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);

	if (subreq->start >= inode->i_size)
		goto out;

	/* We need to fetch the inline data. */
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_ino1 = ci->i_vino;
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
	req->r_num_caps = 2;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0)
		goto out;

	rinfo = &req->r_reply_info;
	iinfo = &rinfo->targeti;
	if (iinfo->inline_version == CEPH_INLINE_NONE) {
		/* The data got uninlined */
		ceph_mdsc_put_request(req);
		return false;
	}

	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
	if (err == 0)
		err = -EFAULT;

	ceph_mdsc_put_request(req);
out:
	netfs_subreq_terminated(subreq, err, false);
	return true;
}

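/*
 * Issue a read subrequest to the OSDs; completion is handled
 * asynchronously by finish_netfs_read().
 */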
static void ceph_netfs_issue_op(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_request *req;
	struct ceph_vino vino = ceph_vino(inode);
	struct iov_iter iter;
	struct page **pages;
	size_t page_off;
	int err = 0;
	u64 len = subreq->len;

	if (ci->i_inline_version != CEPH_INLINE_NONE &&
	    ceph_netfs_issue_op_inline(subreq))
		return;

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
			0, 1, CEPH_OSD_OP_READ,
			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
			NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		req = NULL;
		goto out;
	}

	dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
	err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
	if (err < 0) {
		dout("%s: iov_iter_get_pages_alloc returned %d\n", __func__, err);
		goto out;
	}

	/* should always give us a page-aligned read */
	WARN_ON_ONCE(page_off);
	len = err;

	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
	req->r_callback = finish_netfs_read;
	req->r_priv = subreq;
	req->r_inode = inode;
	ihold(inode);

	err = ceph_osdc_start_request(req->r_osdc, req, false);
	if (err)
		iput(inode);
out:
	ceph_osdc_put_request(req);
	if (err)
		netfs_subreq_terminated(subreq, err, false);
	dout("%s: result %d\n", __func__, err);
}

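/*
 * Drop the Fcb cap reference that ceph_readahead() may have taken once
 * the readahead completes.
 */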
static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
{
	struct inode *inode = mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int got = (uintptr_t)priv;

	if (got)
		ceph_put_cap_refs(ci, got);
}

static const struct netfs_request_ops ceph_netfs_read_ops = {
	.is_cache_enabled	= ceph_is_cache_enabled,
	.begin_cache_operation	= ceph_begin_cache_operation,
	.issue_op		= ceph_netfs_issue_op,
	.expand_readahead	= ceph_netfs_expand_readahead,
	.clamp_length		= ceph_netfs_clamp_length,
	.check_write_begin	= ceph_netfs_check_write_begin,
	.cleanup		= ceph_readahead_cleanup,
};

/* read a single page, without unlocking it. */
static int ceph_readpage(struct file *file, struct page *subpage)
{
	struct folio *folio = page_folio(subpage);
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_vino vino = ceph_vino(inode);
	size_t len = folio_size(folio);
	u64 off = folio_file_pos(folio);

	dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu inline %d\n",
	     vino.ino, vino.snap, file, off, len, folio, folio_index(folio),
	     ci->i_inline_version != CEPH_INLINE_NONE);

	return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
}

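/*
 * Start netfs readahead, taking a Fcb cap reference first when the
 * caller does not already hold one (e.g. fadvise/madvise paths).
 */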
static void ceph_readahead(struct readahead_control *ractl)
{
	struct inode *inode = file_inode(ractl->file);
	struct ceph_file_info *fi = ractl->file->private_data;
	struct ceph_rw_context *rw_ctx;
	int got = 0;
	int ret = 0;

	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
		return;

	rw_ctx = ceph_find_rw_context(fi);
	if (!rw_ctx) {
		/*
		 * readahead callers do not necessarily hold Fcb caps
		 * (e.g. fadvise, madvise).
		 */
		int want = CEPH_CAP_FILE_CACHE;

		ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
		if (ret < 0)
			dout("start_read %p, error getting cap\n", inode);
		else if (!(got & want))
			dout("start_read %p, no cache cap\n", inode);

		if (ret <= 0)
			return;
	}
	netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
}

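/*
 * Helpers for writing back data to the local fscache; these compile to
 * no-ops when CONFIG_CEPH_FSCACHE is not set.
 */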
#ifdef CONFIG_CEPH_FSCACHE
static void ceph_set_page_fscache(struct page *page)
{
	set_page_fscache(page);
}

static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
	struct inode *inode = priv;

	if (IS_ERR_VALUE(error) && error != -ENOBUFS)
		ceph_fscache_invalidate(inode, false);
}

static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct fscache_cookie *cookie = ceph_fscache_cookie(ci);

	fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
			       ceph_fscache_write_terminated, inode, caching);
}
#else
static inline void ceph_set_page_fscache(struct page *page)
{
}

static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
}
#endif /* CONFIG_CEPH_FSCACHE */

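/* Snapshot of inode size/truncate state used during writeback */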
struct ceph_writeback_ctl
{
	loff_t i_size;
	u64 truncate_size;
	u32 truncate_seq;
	bool size_stable;
	bool head_snapc;
};

/*
 * Get ref for the oldest snapc for an inode with dirty data... that is, the
 * only snap context we are allowed to write back.
 */
static struct ceph_snap_context *
get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
		   struct ceph_snap_context *page_snapc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc = NULL;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&ci->i_ceph_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
		     capsnap->context, capsnap->dirty_pages);
		if (!capsnap->dirty_pages)
			continue;

		/* get i_size, truncate_{seq,size} for page_snapc? */
		if (snapc && capsnap->context != page_snapc)
			continue;

		if (ctl) {
			if (capsnap->writing) {
				ctl->i_size = i_size_read(inode);
				ctl->size_stable = false;
			} else {
				ctl->i_size = capsnap->size;
				ctl->size_stable = true;
			}
			ctl->truncate_size = capsnap->truncate_size;
			ctl->truncate_seq = capsnap->truncate_seq;
			ctl->head_snapc = false;
		}

		if (snapc)
			break;

		snapc = ceph_get_snap_context(capsnap->context);
		if (!page_snapc ||
		    page_snapc == snapc ||
		    page_snapc->seq > snapc->seq)
			break;
	}
	if (!snapc && ci->i_wrbuffer_ref_head) {
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		dout(" head snapc %p has %d dirty pages\n",
		     snapc, ci->i_wrbuffer_ref_head);
		if (ctl) {
			ctl->i_size = i_size_read(inode);
			ctl->truncate_size = ci->i_truncate_size;
			ctl->truncate_seq = ci->i_truncate_seq;
			ctl->size_stable = false;
			ctl->head_snapc = true;
		}
	}
	spin_unlock(&ci->i_ceph_lock);
	return snapc;
}

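/*
 * Work out how much of the data starting at @start may be written back
 * for @page's snap context: bounded by the corresponding (cap)snap size
 * or i_size, and clipped to the end of the page.
 */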
static u64 get_writepages_data_length(struct inode *inode,
				      struct page *page, u64 start)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc = page_snap_context(page);
	struct ceph_cap_snap *capsnap = NULL;
	u64 end = i_size_read(inode);

	if (snapc != ci->i_head_snapc) {
		bool found = false;
		spin_lock(&ci->i_ceph_lock);
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context == snapc) {
				if (!capsnap->writing)
					end = capsnap->size;
				found = true;
				break;
			}
		}
		spin_unlock(&ci->i_ceph_lock);
		WARN_ON(!found);
	}
	if (end > page_offset(page) + thp_size(page))
		end = page_offset(page) + thp_size(page);
	return end > start ? end - start : 0;
}

/*
 * Write a single page, but leave the page locked.
 *
 * If we get a write error, mark the mapping for error, but still adjust the
 * dirty page accounting (i.e., page is no longer dirty).
 */
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_snap_context *snapc, *oldest;
	loff_t page_off = page_offset(page);
	int err;
	loff_t len = thp_size(page);
	struct ceph_writeback_ctl ceph_wbc;
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_osd_request *req;
	bool caching = ceph_is_cache_enabled(inode);

	dout("writepage %p idx %lu\n", page, page->index);

	/* verify this is a writeable snap context */
	snapc = page_snap_context(page);
	if (!snapc) {
		dout("writepage %p page %p not dirty?\n", inode, page);
		return 0;
	}
	oldest = get_oldest_context(inode, &ceph_wbc, snapc);
	if (snapc->seq > oldest->seq) {
		dout("writepage %p page %p snapc %p not writeable - noop\n",
		     inode, page, snapc);
		/* we should only noop if called by kswapd */
		WARN_ON(!(current->flags & PF_MEMALLOC));
		ceph_put_snap_context(oldest);
		redirty_page_for_writepage(wbc, page);
		return 0;
	}
	ceph_put_snap_context(oldest);

	/* is this a partial page at end of file? */
	if (page_off >= ceph_wbc.i_size) {
		dout("%p page eof %llu\n", page, ceph_wbc.i_size);
		page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
		return 0;
	}

	if (ceph_wbc.i_size < page_off + len)
		len = ceph_wbc.i_size - page_off;

	dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
	     inode, page, page->index, page_off, len, snapc, snapc->seq);

	if (atomic_long_inc_return(&fsc->writeback_count) >
	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
		set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);

	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
				    ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
				    true);
	if (IS_ERR(req))
		return PTR_ERR(req);

	set_page_writeback(page);
	if (caching)
		ceph_set_page_fscache(page);
	ceph_fscache_write_to_cache(inode, page_off, len, caching);

	/* it may be a short write due to an object boundary */
	WARN_ON_ONCE(len > thp_size(page));
	osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
	dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);

	req->r_mtime = inode->i_mtime;
	err = ceph_osdc_start_request(osdc, req, true);
	if (!err)
		err = ceph_osdc_wait_request(osdc, req);

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);

	ceph_osdc_put_request(req);
	if (err == 0)
		err = len;

	if (err < 0) {
		struct writeback_control tmp_wbc;
		if (!wbc)
			wbc = &tmp_wbc;
		if (err == -ERESTARTSYS) {
			/* killed by SIGKILL */
			dout("writepage interrupted page %p\n", page);
			redirty_page_for_writepage(wbc, page);
			end_page_writeback(page);
			return err;
		}
		if (err == -EBLOCKLISTED)
			fsc->blocklisted = true;
		dout("writepage setting page/mapping error %d %p\n",
		     err, page);
		mapping_set_error(&inode->i_data, err);
		wbc->pages_skipped++;
	} else {
		dout("writepage cleaned page %p\n", page);
		err = 0;  /* vfs expects us to return 0 */
	}
	oldest = detach_page_private(page);
	WARN_ON_ONCE(oldest != snapc);
	end_page_writeback(page);
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);  /* page's reference */

	if (atomic_long_dec_return(&fsc->writeback_count) <
	    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
		clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);

	return err;
}

static int ceph_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;
	struct inode *inode = page->mapping->host;
	BUG_ON(!inode);
	ihold(inode);

	wait_on_page_fscache(page);

	err = writepage_nounlock(page, wbc);
	if (err == -ERESTARTSYS) {
		/* direct memory reclaimer was killed by SIGKILL. return 0
		 * to prevent caller from setting mapping/page error */
		err = 0;
	}
	unlock_page(page);
	iput(inode);
	return err;
}

/*
 * async writeback completion handler.
 *
 * If we get an error, set the mapping error bit, but not the individual
 * page error bits.
 */
static void writepages_finish(struct ceph_osd_request *req)
{
	struct inode *inode = req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_data *osd_data;
	struct page *page;
	int num_pages, total_pages = 0;
	int i, j;
	int rc = req->r_result;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct address_space *mapping = inode->i_mapping;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	unsigned int len = 0;
	bool remove_page;

	dout("writepages_finish %p rc %d\n", inode, rc);
	if (rc < 0) {
		mapping_set_error(mapping, rc);
		ceph_set_error_write(ci);
		if (rc == -EBLOCKLISTED)
			fsc->blocklisted = true;
	} else {
		ceph_clear_error_write(ci);
	}

	/*
	 * We lost the cache cap, need to truncate the page before
	 * it is unlocked, otherwise we'd truncate it later in the
	 * page truncation thread, possibly losing some data that
	 * raced its way in
	 */
	remove_page = !(ceph_caps_issued(ci) &
			(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));

	/* clean all pages */
	for (i = 0; i < req->r_num_ops; i++) {
		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
			break;

		osd_data = osd_req_op_extent_osd_data(req, i);
		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
		len += osd_data->length;
		num_pages = calc_pages_for((u64)osd_data->alignment,
					   (u64)osd_data->length);
		total_pages += num_pages;
		for (j = 0; j < num_pages; j++) {
			page = osd_data->pages[j];
			BUG_ON(!page);
			WARN_ON(!PageUptodate(page));

			if (atomic_long_dec_return(&fsc->writeback_count) <
			     CONGESTION_OFF_THRESH(
					fsc->mount_options->congestion_kb))
				clear_bdi_congested(inode_to_bdi(inode),
						    BLK_RW_ASYNC);

			ceph_put_snap_context(detach_page_private(page));
			end_page_writeback(page);
			dout("unlocking %p\n", page);

			if (remove_page)
				generic_error_remove_page(inode->i_mapping,
							  page);

			unlock_page(page);
		}
		dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
		     inode, osd_data->length, rc >= 0 ? num_pages : 0);

		release_pages(osd_data->pages, num_pages);
	}

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, rc);

	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);

	osd_data = osd_req_op_extent_osd_data(req, 0);
	if (osd_data->pages_from_pool)
		mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
	else
		kfree(osd_data->pages);
	ceph_osdc_put_request(req);
}

/*
 * initiate async writeback
 */
static int ceph_writepages_start(struct address_space *mapping,
				 struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_vino vino = ceph_vino(inode);
	pgoff_t index, start_index, end = -1;
	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
	struct pagevec pvec;
	int rc = 0;
	unsigned int wsize = i_blocksize(inode);
	struct ceph_osd_request *req = NULL;
	struct ceph_writeback_ctl ceph_wbc;
	bool should_loop, range_whole = false;
	bool done = false;
	bool caching = ceph_is_cache_enabled(inode);

	dout("writepages_start %p (mode=%s)\n", inode,
	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

	if (ceph_inode_is_shutdown(inode)) {
		if (ci->i_wrbuffer_ref > 0) {
			pr_warn_ratelimited(
				"writepage_start %p %lld forced umount\n",
				inode, ceph_ino(inode));
		}
		mapping_set_error(mapping, -EIO);
		return -EIO; /* we're in a forced umount, don't write! */
	}
	if (fsc->mount_options->wsize < wsize)
		wsize = fsc->mount_options->wsize;

	pagevec_init(&pvec);

	start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
	index = start_index;

retry:
	/* find oldest snap context with dirty data */
	snapc = get_oldest_context(inode, &ceph_wbc, NULL);
	if (!snapc) {
		/* hmm, why does writepages get called when there
		   is no dirty data? */
		dout(" no snap context with dirty data?\n");
		goto out;
	}
	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
	     snapc, snapc->seq, snapc->num_snaps);

	should_loop = false;
	if (ceph_wbc.head_snapc && snapc != last_snapc) {
		/* where to start/end? */
		if (wbc->range_cyclic) {
			index = start_index;
			end = -1;
			if (index > 0)
				should_loop = true;
			dout(" cyclic, start at %lu\n", index);
		} else {
			index = wbc->range_start >> PAGE_SHIFT;
			end = wbc->range_end >> PAGE_SHIFT;
			if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
				range_whole = true;
			dout(" not cyclic, %lu to %lu\n", index, end);
		}
	} else if (!ceph_wbc.head_snapc) {
		/* Do not respect wbc->range_{start,end}. Dirty pages
		 * in that range can be associated with newer snapc.
		 * They are not writeable until we write all dirty pages
		 * associated with 'snapc' get written */
		if (index > 0)
			should_loop = true;
		dout(" non-head snapc, range whole\n");
	}

	ceph_put_snap_context(last_snapc);
	last_snapc = snapc;

	while (!done && index <= end) {
		int num_ops = 0, op_idx;
		unsigned i, pvec_pages, max_pages, locked_pages = 0;
		struct page **pages = NULL, **data_pages;
		struct page *page;
		pgoff_t strip_unit_end = 0;
		u64 offset = 0, len = 0;
		bool from_pool = false;

		max_pages = wsize >> PAGE_SHIFT;

get_more_pages:
		pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
						      end, PAGECACHE_TAG_DIRTY);
		dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
		if (!pvec_pages && !locked_pages)
			break;
		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
			page = pvec.pages[i];
			dout("? %p idx %lu\n", page, page->index);
			if (locked_pages == 0)
				lock_page(page);  /* first page */
			else if (!trylock_page(page))
				break;

			/* only dirty pages, or our accounting breaks */
			if (unlikely(!PageDirty(page)) ||
			    unlikely(page->mapping != mapping)) {
				dout("!dirty or !mapping %p\n", page);
				unlock_page(page);
				continue;
			}
			/* only if matching snap context */
			pgsnapc = page_snap_context(page);
			if (pgsnapc != snapc) {
				dout("page snapc %p %lld != oldest %p %lld\n",
				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
				if (!should_loop &&
				    !ceph_wbc.head_snapc &&
				    wbc->sync_mode != WB_SYNC_NONE)
					should_loop = true;
				unlock_page(page);
				continue;
			}
			if (page_offset(page) >= ceph_wbc.i_size) {
				dout("%p page eof %llu\n",
				     page, ceph_wbc.i_size);
				if ((ceph_wbc.size_stable ||
				    page_offset(page) >= i_size_read(inode)) &&
				    clear_page_dirty_for_io(page))
					mapping->a_ops->invalidatepage(page,
								0, thp_size(page));
				unlock_page(page);
				continue;
			}
			if (strip_unit_end && (page->index > strip_unit_end)) {
				dout("end of strip unit %p\n", page);
				unlock_page(page);
				break;
			}
			if (PageWriteback(page) || PageFsCache(page)) {
				if (wbc->sync_mode == WB_SYNC_NONE) {
					dout("%p under writeback\n", page);
					unlock_page(page);
					continue;
				}
				dout("waiting on writeback %p\n", page);
				wait_on_page_writeback(page);
				wait_on_page_fscache(page);
			}

			if (!clear_page_dirty_for_io(page)) {
				dout("%p !clear_page_dirty_for_io\n", page);
				unlock_page(page);
				continue;
			}

			/*
			 * We have something to write.  If this is
			 * the first locked page this time through,
			 * calculate max possible write size and
			 * allocate a page array
			 */
			if (locked_pages == 0) {
				u64 objnum;
				u64 objoff;
				u32 xlen;

				/* prepare async write request */
				offset = (u64)page_offset(page);
				ceph_calc_file_object_mapping(&ci->i_layout,
							      offset, wsize,
							      &objnum, &objoff,
							      &xlen);
				len = xlen;

				num_ops = 1;
				strip_unit_end = page->index +
					((len - 1) >> PAGE_SHIFT);

				BUG_ON(pages);
				max_pages = calc_pages_for(0, (u64)len);
				pages = kmalloc_array(max_pages,
						      sizeof(*pages),
						      GFP_NOFS);
				if (!pages) {
					from_pool = true;
					pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
					BUG_ON(!pages);
				}

				len = 0;
			} else if (page->index !=
				   (offset + len) >> PAGE_SHIFT) {
				if (num_ops >= (from_pool ?  CEPH_OSD_SLAB_OPS :
							     CEPH_OSD_MAX_OPS)) {
					redirty_page_for_writepage(wbc, page);
					unlock_page(page);
					break;
				}

				num_ops++;
				offset = (u64)page_offset(page);
				len = 0;
			}

			/* note position of first page in pvec */
			dout("%p will write page %p idx %lu\n",
			     inode, page, page->index);

			if (atomic_long_inc_return(&fsc->writeback_count) >
			    CONGESTION_ON_THRESH(
				    fsc->mount_options->congestion_kb)) {
				set_bdi_congested(inode_to_bdi(inode),
						  BLK_RW_ASYNC);
			}

			pages[locked_pages++] = page;
			pvec.pages[i] = NULL;

			len += thp_size(page);
		}

		/* did we get anything? */
		if (!locked_pages)
			goto release_pvec_pages;
		if (i) {
			unsigned j, n = 0;
			/* shift unused page to beginning of pvec */
			for (j = 0; j < pvec_pages; j++) {
				if (!pvec.pages[j])
					continue;
				if (n < j)
					pvec.pages[n] = pvec.pages[j];
				n++;
			}
			pvec.nr = n;

			if (pvec_pages && i == pvec_pages &&
			    locked_pages < max_pages) {
				dout("reached end pvec, trying for more\n");
				pagevec_release(&pvec);
				goto get_more_pages;
			}
		}

new_request:
		offset = page_offset(pages[0]);
		len = wsize;

		req = ceph_osdc_new_request(&fsc->client->osdc,
					&ci->i_layout, vino,
					offset, &len, 0, num_ops,
					CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
					snapc, ceph_wbc.truncate_seq,
					ceph_wbc.truncate_size, false);
		if (IS_ERR(req)) {
			req = ceph_osdc_new_request(&fsc->client->osdc,
						&ci->i_layout, vino,
						offset, &len, 0,
						min(num_ops,
						    CEPH_OSD_SLAB_OPS),
						CEPH_OSD_OP_WRITE,
						CEPH_OSD_FLAG_WRITE,
						snapc, ceph_wbc.truncate_seq,
						ceph_wbc.truncate_size, true);
			BUG_ON(IS_ERR(req));
		}
		BUG_ON(len < page_offset(pages[locked_pages - 1]) +
			     thp_size(page) - offset);

		req->r_callback = writepages_finish;
		req->r_inode = inode;

		/* Format the osd request message and submit the write */
		len = 0;
		data_pages = pages;
		op_idx = 0;
		for (i = 0; i < locked_pages; i++) {
			u64 cur_offset = page_offset(pages[i]);
			/*
			 * Discontinuity in page range? Ceph can handle that by just passing
			 * multiple extents in the write op.
			 */
			if (offset + len != cur_offset) {
				/* If it's full, stop here */
				if (op_idx + 1 == req->r_num_ops)
					break;

				/* Kick off an fscache write with what we have so far. */
				ceph_fscache_write_to_cache(inode, offset, len, caching);

				/* Start a new extent */
				osd_req_op_extent_dup_last(req, op_idx,
							   cur_offset - offset);
				dout("writepages got pages at %llu~%llu\n",
				     offset, len);
				osd_req_op_extent_osd_data_pages(req, op_idx,
							data_pages, len, 0,
							from_pool, false);
				osd_req_op_extent_update(req, op_idx, len);

				len = 0;
				offset = cur_offset;
				data_pages = pages + i;
				op_idx++;
			}

			set_page_writeback(pages[i]);
			if (caching)
				ceph_set_page_fscache(pages[i]);
			len += thp_size(page);
		}
		ceph_fscache_write_to_cache(inode, offset, len, caching);

		if (ceph_wbc.size_stable) {
			len = min(len, ceph_wbc.i_size - offset);
		} else if (i == locked_pages) {
			/* writepages_finish() clears writeback pages
			 * according to the data length, so make sure
			 * data length covers all locked pages */
			u64 min_len = len + 1 - thp_size(page);
			len = get_writepages_data_length(inode, pages[i - 1],
							 offset);
			len = max(len, min_len);
		}
		dout("writepages got pages at %llu~%llu\n", offset, len);

		osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
						 0, from_pool, false);
		osd_req_op_extent_update(req, op_idx, len);

		BUG_ON(op_idx + 1 != req->r_num_ops);

		from_pool = false;
		if (i < locked_pages) {
			BUG_ON(num_ops <= req->r_num_ops);
			num_ops -= req->r_num_ops;
			locked_pages -= i;

			/* allocate new pages array for next request */
			data_pages = pages;
			pages = kmalloc_array(locked_pages, sizeof(*pages),
					      GFP_NOFS);
			if (!pages) {
				from_pool = true;
				pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
				BUG_ON(!pages);
			}
			memcpy(pages, data_pages + i,
			       locked_pages * sizeof(*pages));
			memset(data_pages + i, 0,
			       locked_pages * sizeof(*pages));
		} else {
			BUG_ON(num_ops != req->r_num_ops);
			index = pages[i - 1]->index + 1;
			/* request message now owns the pages array */
			pages = NULL;
		}

		req->r_mtime = inode->i_mtime;
		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
		BUG_ON(rc);
		req = NULL;

		wbc->nr_to_write -= i;
		if (pages)
			goto new_request;

		/*
		 * We stop writing back only if we are not doing
		 * integrity sync. In case of integrity sync we have to
		 * keep going until we have written all the pages
		 * we tagged for writeback prior to entering this loop.
		 */
		if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
			done = true;

release_pvec_pages:
		dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
		     pvec.nr ? pvec.pages[0] : NULL);
		pagevec_release(&pvec);
	}

	if (should_loop && !done) {
		/* more to do; loop back to beginning of file */
		dout("writepages looping back to beginning of file\n");
		end = start_index - 1; /* OK even when start_index == 0 */

		/* to write dirty pages associated with next snapc,
		 * we need to wait until current writes complete */
		if (wbc->sync_mode != WB_SYNC_NONE &&
		    start_index == 0 && /* all dirty pages were checked */
		    !ceph_wbc.head_snapc) {
			struct page *page;
			unsigned i, nr;
			index = 0;
			while ((index <= end) &&
			       (nr = pagevec_lookup_tag(&pvec, mapping, &index,
						PAGECACHE_TAG_WRITEBACK))) {
				for (i = 0; i < nr; i++) {
					page = pvec.pages[i];
					if (page_snap_context(page) != snapc)
						continue;
					wait_on_page_writeback(page);
				}
				pagevec_release(&pvec);
				cond_resched();
			}
		}

		start_index = 0;
		index = 0;
		goto retry;
	}

	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;

out:
	ceph_osdc_put_request(req);
	ceph_put_snap_context(last_snapc);
	dout("writepages done, rc = %d\n", rc);
	return rc;
}


/*
 * See if a given @snapc is either writeable, or already written.
 */
static int context_is_writeable_or_written(struct inode *inode,
					   struct ceph_snap_context *snapc)
{
	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
	int ret = !oldest || snapc->seq <= oldest->seq;

	ceph_put_snap_context(oldest);
	return ret;
}

/**
 * ceph_find_incompatible - find an incompatible context and return it
 * @page: page being dirtied
 *
 * We are only allowed to write into/dirty a page if the page is
 * clean, or already dirty within the same snap context. Returns a
 * conflicting context if there is one, NULL if there isn't, or a
 * negative error code on other errors.
 *
 * Must be called with page lock held.
 */
static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);

	if (ceph_inode_is_shutdown(inode)) {
		dout(" page %p %llx:%llx is shutdown\n", page,
		     ceph_vinop(inode));
		return ERR_PTR(-ESTALE);
	}

	for (;;) {
		struct ceph_snap_context *snapc, *oldest;

		wait_on_page_writeback(page);

		snapc = page_snap_context(page);
		if (!snapc || snapc == ci->i_head_snapc)
			break;

		/*
		 * this page is already dirty in another (older) snap
		 * context!  is it writeable now?
		 */
		oldest = get_oldest_context(inode, NULL, NULL);
		if (snapc->seq > oldest->seq) {
			/* not writeable -- return it for the caller to deal with */
			ceph_put_snap_context(oldest);
			dout(" page %p snapc %p not current or oldest\n", page, snapc);
			return ceph_get_snap_context(snapc);
		}
		ceph_put_snap_context(oldest);

		/* yay, writeable, do it now (without dropping page lock) */
		dout(" page %p snapc %p not current, but oldest\n", page, snapc);
		if (clear_page_dirty_for_io(page)) {
			int r = writepage_nounlock(page, NULL);
			if (r < 0)
				return ERR_PTR(r);
		}
	}
	return NULL;
}

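/*
 * netfs check_write_begin hook: if the folio is dirty under an older,
 * incompatible snap context, drop it, flush dirty pages and ask the
 * caller to retry.
 */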
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
					struct folio *folio, void **_fsdata)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;

	snapc = ceph_find_incompatible(folio_page(folio, 0));
	if (snapc) {
		int r;

		folio_unlock(folio);
		folio_put(folio);
		if (IS_ERR(snapc))
			return PTR_ERR(snapc);

		ceph_queue_writeback(inode);
		r = wait_event_killable(ci->i_cap_wq,
					context_is_writeable_or_written(inode, snapc));
		ceph_put_snap_context(snapc);
		return r == 0 ? -EAGAIN : r;
	}
	return 0;
}

/*
 * We are only allowed to write into/dirty the page if the page is
 * clean, or already dirty within the same snap context.
 */
static int ceph_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned aop_flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = file_inode(file);
	struct folio *folio = NULL;
	int r;

	r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
			      &ceph_netfs_read_ops, NULL);
	if (r == 0)
		folio_wait_fscache(folio);
	if (r < 0) {
		if (folio)
			folio_put(folio);
	} else {
		WARN_ON_ONCE(!folio_test_locked(folio));
		*pagep = &folio->page;
	}
	return r;
}

/*
 * we don't do anything in here that simple_write_end doesn't do
 * except adjust dirty page accounting
 */
static int ceph_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *subpage, void *fsdata)
{
	struct folio *folio = page_folio(subpage);
	struct inode *inode = file_inode(file);
	bool check_cap = false;

	dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
	     inode, folio, (int)pos, (int)copied, (int)len);

	if (!folio_test_uptodate(folio)) {
		/* just return that nothing was copied on a short copy */
		if (copied < len) {
			copied = 0;
			goto out;
		}
		folio_mark_uptodate(folio);
	}

	/* did file size increase? */
	if (pos+copied > i_size_read(inode))
		check_cap = ceph_inode_set_size(inode, pos+copied);

	folio_mark_dirty(folio);

out:
	folio_unlock(folio);
	folio_put(folio);

	if (check_cap)
		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);

	return copied;
}

const struct address_space_operations ceph_aops = {
	.readpage = ceph_readpage,
	.readahead = ceph_readahead,
	.writepage = ceph_writepage,
	.writepages = ceph_writepages_start,
	.write_begin = ceph_write_begin,
	.write_end = ceph_write_end,
	.set_page_dirty = ceph_set_page_dirty,
	.invalidatepage = ceph_invalidatepage,
	.releasepage = ceph_releasepage,
	.direct_IO = noop_direct_IO,
};

static void ceph_block_sigs(sigset_t *oldset)
{
	sigset_t mask;
	siginitsetinv(&mask, sigmask(SIGKILL));
	sigprocmask(SIG_BLOCK, &mask, oldset);
}

static void ceph_restore_sigs(sigset_t *oldset)
{
	sigprocmask(SIG_SETMASK, oldset, NULL);
}

/*
 * vm ops
 */
static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = file_inode(vma->vm_file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_info *fi = vma->vm_file->private_data;
	loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
	int want, got, err;
	sigset_t oldset;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	if (ceph_inode_is_shutdown(inode))
		return ret;

	ceph_block_sigs(&oldset);

	dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
	     inode, ceph_vinop(inode), off);
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_CACHE;

	got = 0;
	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
	if (err < 0)
		goto out_restore;

	dout("filemap_fault %p %llu got cap refs on %s\n",
	     inode, off, ceph_cap_string(got));

	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
	    ci->i_inline_version == CEPH_INLINE_NONE) {
		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
		ceph_add_rw_context(fi, &rw_ctx);
		ret = filemap_fault(vmf);
		ceph_del_rw_context(fi, &rw_ctx);
		dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
		     inode, off, ceph_cap_string(got), ret);
	} else
		err = -EAGAIN;

	ceph_put_cap_refs(ci, got);

	if (err != -EAGAIN)
		goto out_restore;

	/* read inline data */
	if (off >= PAGE_SIZE) {
		/* does not support inline data > PAGE_SIZE */
		ret = VM_FAULT_SIGBUS;
	} else {
		struct address_space *mapping = inode->i_mapping;
		struct page *page;

		filemap_invalidate_lock_shared(mapping);
		page = find_or_create_page(mapping, 0,
				mapping_gfp_constraint(mapping, ~__GFP_FS));
		if (!page) {
			ret = VM_FAULT_OOM;
			goto out_inline;
		}
		err = __ceph_do_getattr(inode, page,
					CEPH_STAT_CAP_INLINE_DATA, true);
		if (err < 0 || off >= i_size_read(inode)) {
			unlock_page(page);
			put_page(page);
			ret = vmf_error(err);
			goto out_inline;
		}
		if (err < PAGE_SIZE)
			zero_user_segment(page, err, PAGE_SIZE);
		else
			flush_dcache_page(page);
		SetPageUptodate(page);
		vmf->page = page;
		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
		filemap_invalidate_unlock_shared(mapping);
		dout("filemap_fault %p %llu read inline data ret %x\n",
		     inode, off, ret);
	}
out_restore:
	ceph_restore_sigs(&oldset);
	if (err < 0)
		ret = vmf_error(err);

	return ret;
}

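/*
 * Prepare a page for a coming write fault: take Fb cap references, wait
 * out any incompatible snap context, then dirty the page with the page
 * lock held.
 */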
static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = file_inode(vma->vm_file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_info *fi = vma->vm_file->private_data;
	struct ceph_cap_flush *prealloc_cf;
	struct page *page = vmf->page;
	loff_t off = page_offset(page);
	loff_t size = i_size_read(inode);
	size_t len;
	int want, got, err;
	sigset_t oldset;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	if (ceph_inode_is_shutdown(inode))
		return ret;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return VM_FAULT_OOM;

	sb_start_pagefault(inode->i_sb);
	ceph_block_sigs(&oldset);

	if (off + thp_size(page) <= size)
		len = thp_size(page);
	else
		len = offset_in_thp(page, size);

	dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
	     inode, ceph_vinop(inode), off, len, size);
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_BUFFER;

	got = 0;
	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
	if (err < 0)
		goto out_free;

	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
	     inode, off, len, ceph_cap_string(got));

	/* Update time before taking page lock */
	file_update_time(vma->vm_file);
	inode_inc_iversion_raw(inode);

	do {
		struct ceph_snap_context *snapc;

		lock_page(page);

		if (page_mkwrite_check_truncate(page, inode) < 0) {
			unlock_page(page);
			ret = VM_FAULT_NOPAGE;
			break;
		}

		snapc = ceph_find_incompatible(page);
		if (!snapc) {
			/* success.  we'll keep the page locked. */
			set_page_dirty(page);
			ret = VM_FAULT_LOCKED;
			break;
		}

		unlock_page(page);

		if (IS_ERR(snapc)) {
			ret = VM_FAULT_SIGBUS;
			break;
		}

		ceph_queue_writeback(inode);
		err = wait_event_killable(ci->i_cap_wq,
				context_is_writeable_or_written(inode, snapc));
		ceph_put_snap_context(snapc);
	} while (err == 0);

	if (ret == VM_FAULT_LOCKED) {
		int dirty;
		spin_lock(&ci->i_ceph_lock);
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}

	dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
	     inode, off, len, ceph_cap_string(got), ret);
	ceph_put_cap_refs_async(ci, got);
out_free:
	ceph_restore_sigs(&oldset);
	sb_end_pagefault(inode->i_sb);
	ceph_free_cap_flush(prealloc_cf);
	if (err < 0)
		ret = vmf_error(err);
	return ret;
}

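/*
 * Copy inline data received from the MDS into the page cache (page 0),
 * zero-filling the rest of the page.
 */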
void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
			   char *data, size_t len)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;

	if (locked_page) {
		page = locked_page;
	} else {
		if (i_size_read(inode) == 0)
			return;
		page = find_or_create_page(mapping, 0,
					   mapping_gfp_constraint(mapping,
					   ~__GFP_FS));
		if (!page)
			return;
		if (PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return;
		}
	}

	dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
	     inode, ceph_vinop(inode), len, locked_page);

	if (len > 0) {
		void *kaddr = kmap_atomic(page);
		memcpy(kaddr, data, len);
		kunmap_atomic(kaddr);
	}

	if (page != locked_page) {
		if (len < PAGE_SIZE)
			zero_user_segment(page, len, PAGE_SIZE);
		else
			flush_dcache_page(page);

		SetPageUptodate(page);
		unlock_page(page);
		put_page(page);
	}
}

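/*
 * Move inline data out of the MDS and into a regular RADOS object.  The
 * object's "inline_version" xattr, checked with CMPXATTR, guards
 * against racing uninline attempts.
 */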
083db6fd 1642int ceph_uninline_data(struct file *file)
28127bdd 1643{
083db6fd 1644 struct inode *inode = file_inode(file);
28127bdd
YZ
1645 struct ceph_inode_info *ci = ceph_inode(inode);
1646 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1647 struct ceph_osd_request *req;
083db6fd
DH
1648 struct ceph_cap_flush *prealloc_cf;
1649 struct folio *folio = NULL;
1650 struct page *pages[1];
28127bdd
YZ
1651 u64 len, inline_version;
1652 int err = 0;
083db6fd
DH
1653
1654 prealloc_cf = ceph_alloc_cap_flush();
1655 if (!prealloc_cf)
1656 return -ENOMEM;
1657
1658 folio = read_mapping_folio(inode->i_mapping, 0, file);
1659 if (IS_ERR(folio)) {
1660 err = PTR_ERR(folio);
1661 goto out;
1662 }
1663
1664 folio_lock(folio);
28127bdd
YZ
1665
1666 spin_lock(&ci->i_ceph_lock);
1667 inline_version = ci->i_inline_version;
1668 spin_unlock(&ci->i_ceph_lock);
1669
1670 dout("uninline_data %p %llx.%llx inline_version %llu\n",
1671 inode, ceph_vinop(inode), inline_version);
1672
1673 if (inline_version == 1 || /* initial version, no data */
1674 inline_version == CEPH_INLINE_NONE)
083db6fd 1675 goto out_unlock;
28127bdd 1676
083db6fd
DH
1677 len = i_size_read(inode);
1678 if (len > folio_size(folio))
1679 len = folio_size(folio);
28127bdd
YZ
1680
1681 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1682 ceph_vino(inode), 0, &len, 0, 1,
54ea0046 1683 CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
34b759b4 1684 NULL, 0, 0, false);
28127bdd
YZ
1685 if (IS_ERR(req)) {
1686 err = PTR_ERR(req);
083db6fd 1687 goto out_unlock;
28127bdd
YZ
1688 }
1689
fac02ddf 1690 req->r_mtime = inode->i_mtime;
28127bdd
YZ
1691 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1692 if (!err)
1693 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
1694 ceph_osdc_put_request(req);
1695 if (err < 0)
083db6fd 1696 goto out_unlock;
28127bdd
YZ
1697
1698 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1699 ceph_vino(inode), 0, &len, 1, 3,
54ea0046 1700 CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
34b759b4
ID
1701 NULL, ci->i_truncate_seq,
1702 ci->i_truncate_size, false);
28127bdd
YZ
1703 if (IS_ERR(req)) {
1704 err = PTR_ERR(req);
083db6fd 1705 goto out_unlock;
28127bdd
YZ
1706 }
1707
083db6fd
DH
1708 pages[0] = folio_page(folio, 0);
1709 osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
28127bdd 1710
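	/*
	 * Bracket the data write with xattr ops: op 0 compares our
	 * inline_version against the one stamped on the object (OP_GT),
	 * so only a newer version may overwrite; a lost race reports
	 * -ECANCELED, which is treated as success below.  Op 2 then
	 * records our version for the next contender.
	 */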
ec137c10
YZ
1711 {
1712 __le64 xattr_buf = cpu_to_le64(inline_version);
1713 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
1714 "inline_version", &xattr_buf,
1715 sizeof(xattr_buf),
1716 CEPH_OSD_CMPXATTR_OP_GT,
1717 CEPH_OSD_CMPXATTR_MODE_U64);
1718 if (err)
083db6fd 1719 goto out_put_req;
ec137c10
YZ
1720 }
1721
1722 {
1723 char xattr_buf[32];
1724 int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
1725 "%llu", inline_version);
1726 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
1727 "inline_version",
1728 xattr_buf, xattr_len, 0, 0);
1729 if (err)
083db6fd 1730 goto out_put_req;
ec137c10 1731 }
28127bdd 1732
fac02ddf 1733 req->r_mtime = inode->i_mtime;
28127bdd
YZ
1734 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1735 if (!err)
1736 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
97e27aaa 1737
8ae99ae2 1738 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
903f4fec 1739 req->r_end_latency, len, err);
97e27aaa 1740
083db6fd
DH
1741 if (!err) {
1742 int dirty;
1743
 1744 /* Set to CEPH_INLINE_NONE and dirty the caps */
1745 down_read(&fsc->mdsc->snap_rwsem);
1746 spin_lock(&ci->i_ceph_lock);
1747 ci->i_inline_version = CEPH_INLINE_NONE;
1748 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
1749 spin_unlock(&ci->i_ceph_lock);
1750 up_read(&fsc->mdsc->snap_rwsem);
1751 if (dirty)
1752 __mark_inode_dirty(inode, dirty);
1753 }
1754out_put_req:
28127bdd
YZ
1755 ceph_osdc_put_request(req);
1756 if (err == -ECANCELED)
1757 err = 0;
083db6fd
DH
1758out_unlock:
1759 folio_unlock(folio);
1760 folio_put(folio);
28127bdd 1761out:
083db6fd 1762 ceph_free_cap_flush(prealloc_cf);
28127bdd
YZ
1763 dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
1764 inode, ceph_vinop(inode), inline_version, err);
1765 return err;
1766}
1767
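/*
 * Illustrative only, not part of the original file: a buffered-write
 * path would typically trigger the migration before touching the page
 * cache, roughly like so (locking and error handling elided):
 *
 *	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) {
 *		int err = ceph_uninline_data(file);
 *		if (err < 0)
 *			return err;
 *	}
 */
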
7cbea8dc 1768static const struct vm_operations_struct ceph_vmops = {
61f68816 1769 .fault = ceph_filemap_fault,
1d3576fd
SW
1770 .page_mkwrite = ceph_page_mkwrite,
1771};
1772
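/*
 * Faults are satisfied through the page cache, so mmap only works when
 * the address_space implements ->readpage.
 */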
1773int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1774{
1775 struct address_space *mapping = file->f_mapping;
1776
1777 if (!mapping->a_ops->readpage)
1778 return -ENOEXEC;
1779 file_accessed(file);
1780 vma->vm_ops = &ceph_vmops;
1d3576fd
SW
1781 return 0;
1782}
10183a69
YZ
1783
1784enum {
1785 POOL_READ = 1,
1786 POOL_WRITE = 2,
1787};
1788
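/*
 * Determine (and cache) whether this client may read and/or write the
 * given data pool.  A hit in mdsc->pool_perm_tree answers immediately;
 * on a miss the pool is probed directly with a STAT read and an
 * exclusive CREATE write against the inode's first object, issued in
 * parallel, and the combined verdict is inserted into the tree.
 */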
779fe0fb
YZ
1789static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
1790 s64 pool, struct ceph_string *pool_ns)
10183a69
YZ
1791{
1792 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1793 struct ceph_mds_client *mdsc = fsc->mdsc;
1794 struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
1795 struct rb_node **p, *parent;
1796 struct ceph_pool_perm *perm;
1797 struct page **pages;
779fe0fb 1798 size_t pool_ns_len;
10183a69
YZ
1799 int err = 0, err2 = 0, have = 0;
1800
1801 down_read(&mdsc->pool_perm_rwsem);
1802 p = &mdsc->pool_perm_tree.rb_node;
1803 while (*p) {
1804 perm = rb_entry(*p, struct ceph_pool_perm, node);
1805 if (pool < perm->pool)
1806 p = &(*p)->rb_left;
1807 else if (pool > perm->pool)
1808 p = &(*p)->rb_right;
1809 else {
779fe0fb
YZ
1810 int ret = ceph_compare_string(pool_ns,
1811 perm->pool_ns,
1812 perm->pool_ns_len);
1813 if (ret < 0)
1814 p = &(*p)->rb_left;
1815 else if (ret > 0)
1816 p = &(*p)->rb_right;
1817 else {
1818 have = perm->perm;
1819 break;
1820 }
10183a69
YZ
1821 }
1822 }
1823 up_read(&mdsc->pool_perm_rwsem);
1824 if (*p)
1825 goto out;
1826
779fe0fb
YZ
1827 if (pool_ns)
1828 dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
1829 pool, (int)pool_ns->len, pool_ns->str);
1830 else
1831 dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
10183a69
YZ
1832
1833 down_write(&mdsc->pool_perm_rwsem);
779fe0fb 1834 p = &mdsc->pool_perm_tree.rb_node;
10183a69
YZ
1835 parent = NULL;
1836 while (*p) {
1837 parent = *p;
1838 perm = rb_entry(parent, struct ceph_pool_perm, node);
1839 if (pool < perm->pool)
1840 p = &(*p)->rb_left;
1841 else if (pool > perm->pool)
1842 p = &(*p)->rb_right;
1843 else {
779fe0fb
YZ
1844 int ret = ceph_compare_string(pool_ns,
1845 perm->pool_ns,
1846 perm->pool_ns_len);
1847 if (ret < 0)
1848 p = &(*p)->rb_left;
1849 else if (ret > 0)
1850 p = &(*p)->rb_right;
1851 else {
1852 have = perm->perm;
1853 break;
1854 }
10183a69
YZ
1855 }
1856 }
1857 if (*p) {
1858 up_write(&mdsc->pool_perm_rwsem);
1859 goto out;
1860 }
1861
34b759b4 1862 rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
10183a69
YZ
1863 1, false, GFP_NOFS);
1864 if (!rd_req) {
1865 err = -ENOMEM;
1866 goto out_unlock;
1867 }
1868
1869 rd_req->r_flags = CEPH_OSD_FLAG_READ;
1870 osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
1871 rd_req->r_base_oloc.pool = pool;
779fe0fb
YZ
1872 if (pool_ns)
1873 rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
d30291b9 1874 ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
10183a69 1875
13d1ad16
ID
1876 err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
1877 if (err)
1878 goto out_unlock;
10183a69 1879
34b759b4 1880 wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
10183a69
YZ
1881 1, false, GFP_NOFS);
1882 if (!wr_req) {
1883 err = -ENOMEM;
1884 goto out_unlock;
1885 }
1886
54ea0046 1887 wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
10183a69 1888 osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
63244fa1 1889 ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
d30291b9 1890 ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
10183a69 1891
13d1ad16
ID
1892 err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
1893 if (err)
1894 goto out_unlock;
10183a69
YZ
1895
1896 /* one page should be large enough for STAT data */
1897 pages = ceph_alloc_page_vector(1, GFP_KERNEL);
1898 if (IS_ERR(pages)) {
1899 err = PTR_ERR(pages);
1900 goto out_unlock;
1901 }
1902
1903 osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
1904 0, false, true);
10183a69
YZ
1905 err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
1906
fac02ddf 1907 wr_req->r_mtime = ci->vfs_inode.i_mtime;
10183a69
YZ
1908 err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
1909
1910 if (!err)
1911 err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
1912 if (!err2)
1913 err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
1914
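	/*
	 * Decode the probes: the STAT proves read access whether it
	 * found the object or not (-ENOENT), and the exclusive create
	 * proves write access whether it won (0) or lost (-EEXIST) the
	 * creation race; only -EPERM means the capability is missing.
	 */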
1915 if (err >= 0 || err == -ENOENT)
1916 have |= POOL_READ;
131d7eb4 1917 else if (err != -EPERM) {
0b98acd6
ID
1918 if (err == -EBLOCKLISTED)
1919 fsc->blocklisted = true;
10183a69 1920 goto out_unlock;
131d7eb4 1921 }
10183a69
YZ
1922
1923 if (err2 == 0 || err2 == -EEXIST)
1924 have |= POOL_WRITE;
1925 else if (err2 != -EPERM) {
0b98acd6
ID
1926 if (err2 == -EBLOCKLISTED)
1927 fsc->blocklisted = true;
10183a69
YZ
1928 err = err2;
1929 goto out_unlock;
1930 }
1931
779fe0fb
YZ
1932 pool_ns_len = pool_ns ? pool_ns->len : 0;
1933 perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
10183a69
YZ
1934 if (!perm) {
1935 err = -ENOMEM;
1936 goto out_unlock;
1937 }
1938
1939 perm->pool = pool;
1940 perm->perm = have;
779fe0fb
YZ
1941 perm->pool_ns_len = pool_ns_len;
1942 if (pool_ns_len > 0)
1943 memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
1944 perm->pool_ns[pool_ns_len] = 0;
1945
10183a69
YZ
1946 rb_link_node(&perm->node, parent, p);
1947 rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
1948 err = 0;
1949out_unlock:
1950 up_write(&mdsc->pool_perm_rwsem);
1951
3ed97d63
ID
1952 ceph_osdc_put_request(rd_req);
1953 ceph_osdc_put_request(wr_req);
10183a69
YZ
1954out:
1955 if (!err)
1956 err = have;
779fe0fb
YZ
1957 if (pool_ns)
1958 dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
1959 pool, (int)pool_ns->len, pool_ns->str, err);
1960 else
1961 dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
10183a69
YZ
1962 return err;
1963}
1964
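/*
 * Entry point for pool permission checks on open/IO.  The verdict is
 * cached in i_ceph_flags, but the file layout can change underneath
 * us, so after a fresh probe the flags are only installed if the pool
 * and namespace still match, and the check is retried otherwise.
 */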
5e3ded1b 1965int ceph_pool_perm_check(struct inode *inode, int need)
10183a69 1966{
5e3ded1b 1967 struct ceph_inode_info *ci = ceph_inode(inode);
779fe0fb 1968 struct ceph_string *pool_ns;
5e3ded1b 1969 s64 pool;
10183a69
YZ
1970 int ret, flags;
1971
e9b22501
JL
1972 /* Only need to do this for regular files */
1973 if (!S_ISREG(inode->i_mode))
1974 return 0;
1975
80e80fbb
YZ
1976 if (ci->i_vino.snap != CEPH_NOSNAP) {
1977 /*
1978 * Pool permission check needs to write to the first object.
 1979 * But for snapshots, the head of the first object may have already
 1980 * been deleted. Skip the check to avoid creating an orphan object.
1981 */
1982 return 0;
1983 }
1984
5e3ded1b 1985 if (ceph_test_mount_opt(ceph_inode_to_client(inode),
10183a69
YZ
1986 NOPOOLPERM))
1987 return 0;
1988
1989 spin_lock(&ci->i_ceph_lock);
1990 flags = ci->i_ceph_flags;
7627151e 1991 pool = ci->i_layout.pool_id;
10183a69
YZ
1992 spin_unlock(&ci->i_ceph_lock);
1993check:
1994 if (flags & CEPH_I_POOL_PERM) {
1995 if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
7627151e 1996 dout("ceph_pool_perm_check pool %lld no read perm\n",
10183a69
YZ
1997 pool);
1998 return -EPERM;
1999 }
2000 if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
7627151e 2001 dout("ceph_pool_perm_check pool %lld no write perm\n",
10183a69
YZ
2002 pool);
2003 return -EPERM;
2004 }
2005 return 0;
2006 }
2007
779fe0fb
YZ
2008 pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
2009 ret = __ceph_pool_perm_get(ci, pool, pool_ns);
2010 ceph_put_string(pool_ns);
10183a69
YZ
2011 if (ret < 0)
2012 return ret;
2013
2014 flags = CEPH_I_POOL_PERM;
2015 if (ret & POOL_READ)
2016 flags |= CEPH_I_POOL_RD;
2017 if (ret & POOL_WRITE)
2018 flags |= CEPH_I_POOL_WR;
2019
2020 spin_lock(&ci->i_ceph_lock);
779fe0fb
YZ
2021 if (pool == ci->i_layout.pool_id &&
2022 pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
2023 ci->i_ceph_flags |= flags;
10183a69 2024 } else {
7627151e 2025 pool = ci->i_layout.pool_id;
10183a69
YZ
2026 flags = ci->i_ceph_flags;
2027 }
2028 spin_unlock(&ci->i_ceph_lock);
2029 goto check;
2030}
2031
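/*
 * Illustrative only, not part of the original file: callers gate file
 * IO on the pool check roughly like so (hypothetical call site):
 *
 *	int err = ceph_pool_perm_check(inode, CEPH_CAP_FILE_WR);
 *	if (err < 0)
 *		return err;	\/\* e.g. -EPERM: pool refuses writes \*\/
 */
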
2032void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
2033{
2034 struct ceph_pool_perm *perm;
2035 struct rb_node *n;
2036
2037 while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
2038 n = rb_first(&mdsc->pool_perm_tree);
2039 perm = rb_entry(n, struct ceph_pool_perm, node);
2040 rb_erase(n, &mdsc->pool_perm_tree);
2041 kfree(perm);
2042 }
2043}