ceph: add helpers for converting names for userland presentation
[linux-block.git] / fs / ceph / inode.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
3d14c5d2 2#include <linux/ceph/ceph_debug.h>
355da1eb
SW
3
4#include <linux/module.h>
5#include <linux/fs.h>
355da1eb
SW
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
355da1eb
SW
10#include <linux/writeback.h>
11#include <linux/vmalloc.h>
2cdeb1e4 12#include <linux/xattr.h>
4db658ea 13#include <linux/posix_acl.h>
3e7fbe9c 14#include <linux/random.h>
a407846e 15#include <linux/sort.h>
a35ead31 16#include <linux/iversion.h>
2d332d5b 17#include <linux/fscrypt.h>
355da1eb
SW
18
19#include "super.h"
3d14c5d2 20#include "mds_client.h"
99ccbd22 21#include "cache.h"
2d332d5b 22#include "crypto.h"
3d14c5d2 23#include <linux/ceph/decode.h>
355da1eb
SW
24
25/*
26 * Ceph inode operations
27 *
28 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
29 * setattr, etc.), xattr helpers, and helpers for assimilating
30 * metadata returned by the MDS into our cache.
31 *
32 * Also define helpers for doing asynchronous writeback, invalidation,
33 * and truncation for the benefit of those who can't afford to block
34 * (typically because they are in the message handler path).
35 */
36
37static const struct inode_operations ceph_symlink_iops;
38
1cf89a8d 39static void ceph_inode_work(struct work_struct *work);
355da1eb
SW
40
41/*
42 * find or create an inode, given the ceph ino number
43 */
ad1fee96
YS
44static int ceph_set_ino_cb(struct inode *inode, void *data)
45{
ebce3eb2 46 struct ceph_inode_info *ci = ceph_inode(inode);
1dd8d470 47 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
ebce3eb2
JL
48
49 ci->i_vino = *(struct ceph_vino *)data;
50 inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
a35ead31 51 inode_set_iversion_raw(inode, 0);
1dd8d470
XL
52 percpu_counter_inc(&mdsc->metric.total_inodes);
53
ad1fee96
YS
54 return 0;
55}
56
ec9595c0
JL
57/**
58 * ceph_new_inode - allocate a new inode in advance of an expected create
59 * @dir: parent directory for new inode
60 * @dentry: dentry that may eventually point to new inode
61 * @mode: mode of new inode
62 * @as_ctx: pointer to inherited security context
63 *
64 * Allocate a new inode in advance of an operation to create a new inode.
65 * This allocates the inode and sets up the acl_sec_ctx with appropriate
66 * info for the new inode.
67 *
68 * Returns a pointer to the new inode or an ERR_PTR.
69 */
70struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
71 umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
72{
73 int err;
74 struct inode *inode;
75
76 inode = new_inode(dir->i_sb);
77 if (!inode)
78 return ERR_PTR(-ENOMEM);
79
80 if (!S_ISLNK(*mode)) {
81 err = ceph_pre_init_acls(dir, mode, as_ctx);
82 if (err < 0)
83 goto out_err;
84 }
85
6b5717bd
JL
86 inode->i_state = 0;
87 inode->i_mode = *mode;
88
ec9595c0
JL
89 err = ceph_security_init_secctx(dentry, *mode, as_ctx);
90 if (err < 0)
91 goto out_err;
92
6b5717bd
JL
93 err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
94 if (err)
95 goto out_err;
96
ec9595c0
JL
97 return inode;
98out_err:
99 iput(inode);
100 return ERR_PTR(err);
101}
102
103void ceph_as_ctx_to_req(struct ceph_mds_request *req,
104 struct ceph_acl_sec_ctx *as_ctx)
105{
106 if (as_ctx->pagelist) {
107 req->r_pagelist = as_ctx->pagelist;
108 as_ctx->pagelist = NULL;
109 }
6b5717bd 110 ceph_fscrypt_as_ctx_to_req(req, as_ctx);
ec9595c0
JL
111}
112
113/**
114 * ceph_get_inode - find or create/hash a new inode
115 * @sb: superblock to search and allocate in
116 * @vino: vino to search for
117 * @newino: optional new inode to insert if one isn't found (may be NULL)
118 *
119 * Search for or insert a new inode into the hash for the given vino, and
120 * return a reference to it. If new is non-NULL, its reference is consumed.
121 */
122struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
123 struct inode *newino)
355da1eb
SW
124{
125 struct inode *inode;
355da1eb 126
d4f6b31d
JL
127 if (ceph_vino_is_reserved(vino))
128 return ERR_PTR(-EREMOTEIO);
129
ec9595c0
JL
130 if (newino) {
131 inode = inode_insert5(newino, (unsigned long)vino.ino,
132 ceph_ino_compare, ceph_set_ino_cb, &vino);
133 if (inode != newino)
134 iput(newino);
135 } else {
136 inode = iget5_locked(sb, (unsigned long)vino.ino,
137 ceph_ino_compare, ceph_set_ino_cb, &vino);
138 }
139
140 if (!inode) {
141 dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
355da1eb 142 return ERR_PTR(-ENOMEM);
ec9595c0 143 }
355da1eb 144
ebce3eb2
JL
145 dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
146 ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
355da1eb
SW
147 return inode;
148}
149
150/*
151 * get/constuct snapdir inode for a given directory
152 */
153struct inode *ceph_get_snapdir(struct inode *parent)
154{
155 struct ceph_vino vino = {
156 .ino = ceph_ino(parent),
157 .snap = CEPH_SNAPDIR,
158 };
ec9595c0 159 struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
b377ff13 160 struct ceph_inode_info *ci = ceph_inode(inode);
355da1eb 161
355da1eb 162 if (IS_ERR(inode))
7e34bc52 163 return inode;
3e10a15f
JL
164
165 if (!S_ISDIR(parent->i_mode)) {
166 pr_warn_once("bad snapdir parent type (mode=0%o)\n",
167 parent->i_mode);
322794d3 168 goto err;
3e10a15f
JL
169 }
170
171 if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
172 pr_warn_once("bad snapdir inode type (mode=0%o)\n",
173 inode->i_mode);
322794d3 174 goto err;
3e10a15f
JL
175 }
176
355da1eb
SW
177 inode->i_mode = parent->i_mode;
178 inode->i_uid = parent->i_uid;
179 inode->i_gid = parent->i_gid;
ef915725
LH
180 inode->i_mtime = parent->i_mtime;
181 inode->i_ctime = parent->i_ctime;
182 inode->i_atime = parent->i_atime;
b377ff13 183 ci->i_rbytes = 0;
ef915725 184 ci->i_btime = ceph_inode(parent)->i_btime;
893e456b 185
d3c51ae1
JL
186 if (inode->i_state & I_NEW) {
187 inode->i_op = &ceph_snapdir_iops;
188 inode->i_fop = &ceph_snapdir_fops;
189 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
893e456b 190 unlock_new_inode(inode);
d3c51ae1 191 }
893e456b 192
355da1eb 193 return inode;
322794d3
XL
194err:
195 if ((inode->i_state & I_NEW))
196 discard_new_inode(inode);
197 else
198 iput(inode);
199 return ERR_PTR(-ENOTDIR);
355da1eb
SW
200}
201
/*
 * inode operations for regular files: permission checks, attribute
 * get/set, xattr listing, and POSIX ACL get/set.
 */
const struct inode_operations ceph_file_iops = {
	.permission = ceph_permission,
	.setattr = ceph_setattr,
	.getattr = ceph_getattr,
	.listxattr = ceph_listxattr,
	.get_inode_acl = ceph_get_acl,
	.set_acl = ceph_set_acl,
};
210
211
212/*
213 * We use a 'frag tree' to keep track of the MDS's directory fragments
214 * for a given inode (usually there is just a single fragment). We
215 * need to know when a child frag is delegated to a new MDS, or when
216 * it is flagged as replicated, so we can direct our requests
217 * accordingly.
218 */
219
220/*
221 * find/create a frag in the tree
222 */
223static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
224 u32 f)
225{
226 struct rb_node **p;
227 struct rb_node *parent = NULL;
228 struct ceph_inode_frag *frag;
229 int c;
230
231 p = &ci->i_fragtree.rb_node;
232 while (*p) {
233 parent = *p;
234 frag = rb_entry(parent, struct ceph_inode_frag, node);
235 c = ceph_frag_compare(f, frag->frag);
236 if (c < 0)
237 p = &(*p)->rb_left;
238 else if (c > 0)
239 p = &(*p)->rb_right;
240 else
241 return frag;
242 }
243
244 frag = kmalloc(sizeof(*frag), GFP_NOFS);
51308806 245 if (!frag)
355da1eb 246 return ERR_PTR(-ENOMEM);
51308806 247
355da1eb
SW
248 frag->frag = f;
249 frag->split_by = 0;
250 frag->mds = -1;
251 frag->ndist = 0;
252
253 rb_link_node(&frag->node, parent, p);
254 rb_insert_color(&frag->node, &ci->i_fragtree);
255
256 dout("get_or_create_frag added %llx.%llx frag %x\n",
874c8ca1 257 ceph_vinop(&ci->netfs.inode), f);
355da1eb
SW
258 return frag;
259}
260
261/*
262 * find a specific frag @f
263 */
264struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
265{
266 struct rb_node *n = ci->i_fragtree.rb_node;
267
268 while (n) {
269 struct ceph_inode_frag *frag =
270 rb_entry(n, struct ceph_inode_frag, node);
271 int c = ceph_frag_compare(f, frag->frag);
272 if (c < 0)
273 n = n->rb_left;
274 else if (c > 0)
275 n = n->rb_right;
276 else
277 return frag;
278 }
279 return NULL;
280}
281
/*
 * Choose frag containing the given value @v.  If @pfrag is
 * specified, copy the frag delegation info to the caller if
 * it is present.
 *
 * Walks from the root frag down through split records until a leaf is
 * reached; *@found is set to 1 only when a stored (non-split) frag node
 * was hit.  Caller must hold i_fragtree_mutex (see ceph_choose_frag).
 */
static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
			      struct ceph_inode_frag *pfrag, int *found)
{
	/* start at the root fragment, which contains every value */
	u32 t = ceph_frag_make(0, 0);
	struct ceph_inode_frag *frag;
	unsigned nway, i;
	u32 n;

	if (found)
		*found = 0;

	while (1) {
		WARN_ON(!ceph_frag_contains_value(t, v));
		frag = __ceph_find_frag(ci, t);
		if (!frag)
			break; /* t is a leaf */
		if (frag->split_by == 0) {
			/* stored leaf node: report its delegation info */
			if (pfrag)
				memcpy(pfrag, frag, sizeof(*pfrag));
			if (found)
				*found = 1;
			break;
		}

		/* choose child */
		nway = 1 << frag->split_by;
		dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
		     frag->split_by, nway);
		for (i = 0; i < nway; i++) {
			n = ceph_frag_make_child(t, frag->split_by, i);
			if (ceph_frag_contains_value(n, v)) {
				t = n;
				break;
			}
		}
		/* the children partition t, so one of them must match */
		BUG_ON(i == nway);
	}
	dout("choose_frag(%x) = %x\n", v, t);

	return t;
}
328
3e7fbe9c
YZ
329u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
330 struct ceph_inode_frag *pfrag, int *found)
331{
332 u32 ret;
333 mutex_lock(&ci->i_fragtree_mutex);
334 ret = __ceph_choose_frag(ci, v, pfrag, found);
335 mutex_unlock(&ci->i_fragtree_mutex);
336 return ret;
337}
338
355da1eb
SW
/*
 * Process dirfrag (delegation) info from the mds.  Include leaf
 * fragment in tree ONLY if ndist > 0.  Otherwise, only
 * branches/splits are included in i_fragtree)
 *
 * Returns 0, or -ENOMEM if a needed frag node could not be allocated
 * (the caller can proceed with stale delegation info in that case).
 */
static int ceph_fill_dirfrag(struct inode *inode,
			     struct ceph_mds_reply_dirfrag *dirinfo)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_inode_frag *frag;
	u32 id = le32_to_cpu(dirinfo->frag);
	int mds = le32_to_cpu(dirinfo->auth);
	int ndist = le32_to_cpu(dirinfo->ndist);
	int diri_auth = -1;
	int i;
	int err = 0;

	/* snapshot the directory's auth mds under i_ceph_lock */
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_auth_cap)
		diri_auth = ci->i_auth_cap->mds;
	spin_unlock(&ci->i_ceph_lock);

	if (mds == -1) /* CDIR_AUTH_PARENT */
		mds = diri_auth;

	mutex_lock(&ci->i_fragtree_mutex);
	if (ndist == 0 && mds == diri_auth) {
		/* no delegation info needed. */
		frag = __ceph_find_frag(ci, id);
		if (!frag)
			goto out;
		if (frag->split_by == 0) {
			/* tree leaf, remove */
			dout("fill_dirfrag removed %llx.%llx frag %x"
			     " (no ref)\n", ceph_vinop(inode), id);
			rb_erase(&frag->node, &ci->i_fragtree);
			kfree(frag);
		} else {
			/* tree branch, keep and clear */
			dout("fill_dirfrag cleared %llx.%llx frag %x"
			     " referral\n", ceph_vinop(inode), id);
			frag->mds = -1;
			frag->ndist = 0;
		}
		goto out;
	}


	/* find/add this frag to store mds delegation info */
	frag = __get_or_create_frag(ci, id);
	if (IS_ERR(frag)) {
		/* this is not the end of the world; we can continue
		   with bad/inaccurate delegation info */
		pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
		       ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
		err = -ENOMEM;
		goto out;
	}

	/* record the auth mds and (capped) replica distribution list */
	frag->mds = mds;
	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
	for (i = 0; i < frag->ndist; i++)
		frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
	     ceph_vinop(inode), frag->frag, frag->ndist);

out:
	mutex_unlock(&ci->i_fragtree_mutex);
	return err;
}
409
a407846e
YZ
410static int frag_tree_split_cmp(const void *l, const void *r)
411{
412 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
413 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
fe2ed425
JL
414 return ceph_frag_compare(le32_to_cpu(ls->frag),
415 le32_to_cpu(rs->frag));
a407846e
YZ
416}
417
a4b7431f
YZ
418static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
419{
420 if (!frag)
421 return f == ceph_frag_make(0, 0);
422 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
423 return false;
424 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
425}
426
3e7fbe9c
YZ
/*
 * Merge the fragtree sent by the MDS into ci->i_fragtree, pruning any
 * local split/leaf nodes that no longer appear in the MDS's view.
 * @dirinfo, when non-NULL, is used as an extra consistency probe.
 * Always returns 0.
 */
static int ceph_fill_fragtree(struct inode *inode,
			      struct ceph_frag_tree_head *fragtree,
			      struct ceph_mds_reply_dirfrag *dirinfo)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_inode_frag *frag, *prev_frag = NULL;
	struct rb_node *rb_node;
	unsigned i, split_by, nsplits;
	u32 id;
	bool update = false;

	mutex_lock(&ci->i_fragtree_mutex);
	nsplits = le32_to_cpu(fragtree->nsplits);
	if (nsplits != ci->i_fragtree_nsplits) {
		update = true;
	} else if (nsplits) {
		/* cheap probe: spot-check one randomly chosen split record */
		i = get_random_u32_below(nsplits);
		id = le32_to_cpu(fragtree->splits[i].frag);
		if (!__ceph_find_frag(ci, id))
			update = true;
	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
		/* no splits: tree must be empty or hold just the root frag */
		rb_node = rb_first(&ci->i_fragtree);
		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
		if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
			update = true;
	}
	if (!update && dirinfo) {
		id = le32_to_cpu(dirinfo->frag);
		if (id != __ceph_choose_frag(ci, id, NULL, NULL))
			update = true;
	}
	if (!update)
		goto out_unlock;

	/* sort splits by frag id so we can merge against rb-tree order */
	if (nsplits > 1) {
		sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
		     frag_tree_split_cmp, NULL);
	}

	dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
	rb_node = rb_first(&ci->i_fragtree);
	for (i = 0; i < nsplits; i++) {
		id = le32_to_cpu(fragtree->splits[i].frag);
		split_by = le32_to_cpu(fragtree->splits[i].by);
		if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
			pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
			       "frag %x split by %d\n", ceph_vinop(inode),
			       i, nsplits, id, split_by);
			continue;
		}
		/* advance through existing nodes up to (and matching) id */
		frag = NULL;
		while (rb_node) {
			frag = rb_entry(rb_node, struct ceph_inode_frag, node);
			if (ceph_frag_compare(frag->frag, id) >= 0) {
				if (frag->frag != id)
					frag = NULL;
				else
					rb_node = rb_next(rb_node);
				break;
			}
			rb_node = rb_next(rb_node);
			/* delete stale split/leaf node */
			if (frag->split_by > 0 ||
			    !is_frag_child(frag->frag, prev_frag)) {
				rb_erase(&frag->node, &ci->i_fragtree);
				if (frag->split_by > 0)
					ci->i_fragtree_nsplits--;
				kfree(frag);
			}
			frag = NULL;
		}
		if (!frag) {
			frag = __get_or_create_frag(ci, id);
			if (IS_ERR(frag))
				continue;
		}
		if (frag->split_by == 0)
			ci->i_fragtree_nsplits++;
		frag->split_by = split_by;
		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
		prev_frag = frag;
	}
	/* anything left past the last split record is stale too */
	while (rb_node) {
		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
		rb_node = rb_next(rb_node);
		/* delete stale split/leaf node */
		if (frag->split_by > 0 ||
		    !is_frag_child(frag->frag, prev_frag)) {
			rb_erase(&frag->node, &ci->i_fragtree);
			if (frag->split_by > 0)
				ci->i_fragtree_nsplits--;
			kfree(frag);
		}
	}
out_unlock:
	mutex_unlock(&ci->i_fragtree_mutex);
	return 0;
}
355da1eb
SW
525
/*
 * initialize a newly allocated inode.
 *
 * Allocates a ceph_inode_info from the inode slab and sets every field
 * to its quiescent state; returns the embedded VFS inode, or NULL on
 * allocation failure.
 */
struct inode *ceph_alloc_inode(struct super_block *sb)
{
	struct ceph_inode_info *ci;
	int i;

	ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
	if (!ci)
		return NULL;

	dout("alloc_inode %p\n", &ci->netfs.inode);

	/* Set parameters for the netfs library */
	netfs_inode_init(&ci->netfs, &ceph_netfs_ops);

	spin_lock_init(&ci->i_ceph_lock);

	/* version/flag state */
	ci->i_version = 0;
	ci->i_inline_version = 0;
	ci->i_time_warp_seq = 0;
	ci->i_ceph_flags = 0;
	atomic64_set(&ci->i_ordered_count, 1);
	atomic64_set(&ci->i_release_count, 1);
	atomic64_set(&ci->i_complete_seq[0], 0);
	atomic64_set(&ci->i_complete_seq[1], 0);
	ci->i_symlink = NULL;

	/* quota limits (0 = none) */
	ci->i_max_bytes = 0;
	ci->i_max_files = 0;

	/* file layout */
	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
	memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
	RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);

	/* directory fragment tree */
	ci->i_fragtree = RB_ROOT;
	mutex_init(&ci->i_fragtree_mutex);

	/* xattr cache */
	ci->i_xattrs.blob = NULL;
	ci->i_xattrs.prealloc_blob = NULL;
	ci->i_xattrs.dirty = false;
	ci->i_xattrs.index = RB_ROOT;
	ci->i_xattrs.count = 0;
	ci->i_xattrs.names_size = 0;
	ci->i_xattrs.vals_size = 0;
	ci->i_xattrs.version = 0;
	ci->i_xattrs.index_version = 0;

	/* capability state */
	ci->i_caps = RB_ROOT;
	ci->i_auth_cap = NULL;
	ci->i_dirty_caps = 0;
	ci->i_flushing_caps = 0;
	INIT_LIST_HEAD(&ci->i_dirty_item);
	INIT_LIST_HEAD(&ci->i_flushing_item);
	ci->i_prealloc_cap_flush = NULL;
	INIT_LIST_HEAD(&ci->i_cap_flush_list);
	init_waitqueue_head(&ci->i_cap_wq);
	ci->i_hold_caps_max = 0;
	INIT_LIST_HEAD(&ci->i_cap_delay_list);
	INIT_LIST_HEAD(&ci->i_cap_snaps);
	ci->i_head_snapc = NULL;
	ci->i_snap_caps = 0;

	/* pretend the file was last used an hour ago */
	ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
		ci->i_nr_by_mode[i] = 0;

	/* truncation state */
	mutex_init(&ci->i_truncate_mutex);
	ci->i_truncate_seq = 0;
	ci->i_truncate_size = 0;
	ci->i_truncate_pending = 0;

	ci->i_max_size = 0;
	ci->i_reported_size = 0;
	ci->i_wanted_max_size = 0;
	ci->i_requested_max_size = 0;

	/* reference counts by usage type */
	ci->i_pin_ref = 0;
	ci->i_rd_ref = 0;
	ci->i_rdcache_ref = 0;
	ci->i_wr_ref = 0;
	ci->i_wb_ref = 0;
	ci->i_fx_ref = 0;
	ci->i_wrbuffer_ref = 0;
	ci->i_wrbuffer_ref_head = 0;
	atomic_set(&ci->i_filelock_ref, 0);
	atomic_set(&ci->i_shared_gen, 1);
	ci->i_rdcache_gen = 0;
	ci->i_rdcache_revoking = 0;

	/* in-flight unsafe (uncommitted) MDS operations */
	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
	INIT_LIST_HEAD(&ci->i_unsafe_iops);
	spin_lock_init(&ci->i_unsafe_lock);

	/* snapshot realm linkage */
	ci->i_snap_realm = NULL;
	INIT_LIST_HEAD(&ci->i_snap_realm_item);
	INIT_LIST_HEAD(&ci->i_snap_flush_item);

	INIT_WORK(&ci->i_work, ceph_inode_work);
	ci->i_work_mask = 0;
	memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
#ifdef CONFIG_FS_ENCRYPTION
	ci->fscrypt_auth = NULL;
	ci->fscrypt_auth_len = 0;
#endif
	return &ci->netfs.inode;
}
634
/*
 * Final free of the in-core inode: release the symlink target and the
 * fscrypt auth blob, then return the object to the inode slab.
 */
void ceph_free_inode(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	kfree(ci->i_symlink);
#ifdef CONFIG_FS_ENCRYPTION
	kfree(ci->fscrypt_auth);
#endif
	kmem_cache_free(ceph_inode_cachep, ci);
}
645
/*
 * Tear down an inode being evicted from the icache: drop page cache,
 * fscache cookie, caps, quota/snap realm references, the fragtree and
 * the xattr caches.
 */
void ceph_evict_inode(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_inode_frag *frag;
	struct rb_node *n;

	dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));

	percpu_counter_dec(&mdsc->metric.total_inodes);

	truncate_inode_pages_final(&inode->i_data);
	if (inode->i_state & I_PINNING_FSCACHE_WB)
		ceph_fscache_unuse_cookie(inode, true);
	clear_inode(inode);

	ceph_fscache_unregister_inode_cookie(ci);
	fscrypt_put_encryption_info(inode);

	__ceph_remove_caps(ci);

	if (__ceph_has_quota(ci, QUOTA_GET_ANY))
		ceph_adjust_quota_realms_count(inode, false);

	/*
	 * we may still have a snap_realm reference if there are stray
	 * caps in i_snap_caps.
	 */
	if (ci->i_snap_realm) {
		if (ceph_snap(inode) == CEPH_NOSNAP) {
			dout(" dropping residual ref to snap realm %p\n",
			     ci->i_snap_realm);
			ceph_change_snap_realm(inode, NULL);
		} else {
			/* snapped inodes hold a snapid map, not a realm ref */
			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
			ci->i_snap_realm = NULL;
		}
	}

	/* free the whole fragment tree */
	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
		frag = rb_entry(n, struct ceph_inode_frag, node);
		rb_erase(n, &ci->i_fragtree);
		kfree(frag);
	}
	ci->i_fragtree_nsplits = 0;

	__ceph_destroy_xattrs(ci);
	if (ci->i_xattrs.blob)
		ceph_buffer_put(ci->i_xattrs.blob);
	if (ci->i_xattrs.prealloc_blob)
		ceph_buffer_put(ci->i_xattrs.prealloc_blob);

	ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
	ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
701
224a7542
YZ
/* Convert a byte count to 512-byte blocks (i_blocks units), rounding up. */
static inline blkcnt_t calc_inode_blocks(u64 size)
{
	return (size + (1<<9) - 1) >> 9;
}
706
355da1eb
SW
/*
 * Helpers to fill in size, ctime, mtime, and atime.  We have to be
 * careful because either the client or MDS may have more up to date
 * info, depending on which capabilities are held, and whether
 * time_warp_seq or truncate_seq have increased.  (Ordinarily, mtime
 * and size are monotonically increasing, except when utimes() or
 * truncate() increments the corresponding _seq values.)
 *
 * Returns 1 if an async truncate should be queued, else 0.
 */
int ceph_fill_file_size(struct inode *inode, int issued,
			u32 truncate_seq, u64 truncate_size, u64 size)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int queue_trunc = 0;
	loff_t isize = i_size_read(inode);

	/* accept the MDS size if its truncate_seq is newer, or same seq
	 * but larger size (sizes only grow within one seq) */
	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
	    (truncate_seq == ci->i_truncate_seq && size > isize)) {
		dout("size %lld -> %llu\n", isize, size);
		if (size > 0 && S_ISDIR(inode->i_mode)) {
			pr_err("fill_file_size non-zero size for directory\n");
			size = 0;
		}
		i_size_write(inode, size);
		inode->i_blocks = calc_inode_blocks(size);
		/*
		 * If we're expanding, then we should be able to just update
		 * the existing cookie.
		 */
		if (size > isize)
			ceph_fscache_update(inode);
		ci->i_reported_size = size;
		if (truncate_seq != ci->i_truncate_seq) {
			dout("truncate_seq %u -> %u\n",
			     ci->i_truncate_seq, truncate_seq);
			ci->i_truncate_seq = truncate_seq;

			/* the MDS should have revoked these caps */
			WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
					       CEPH_CAP_FILE_RD |
					       CEPH_CAP_FILE_WR |
					       CEPH_CAP_FILE_LAZYIO));
			/*
			 * If we hold relevant caps, or in the case where we're
			 * not the only client referencing this file and we
			 * don't hold those caps, then we need to check whether
			 * the file is either opened or mmaped
			 */
			if ((issued & (CEPH_CAP_FILE_CACHE|
				       CEPH_CAP_FILE_BUFFER)) ||
			    mapping_mapped(inode->i_mapping) ||
			    __ceph_is_file_opened(ci)) {
				ci->i_truncate_pending++;
				queue_trunc = 1;
			}
		}
	}
	/* track the most recent truncate_size for this (or newer) seq */
	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
	    ci->i_truncate_size != truncate_size) {
		dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
		     truncate_size);
		ci->i_truncate_size = truncate_size;
	}
	return queue_trunc;
}
771
/*
 * Merge the c/m/atime reported by the MDS into the inode, honouring the
 * caps we hold: with write/excl caps our local times may be newer than
 * the MDS's, so only move timestamps forward; without them the MDS view
 * wins outright (subject to time_warp_seq ordering).
 */
void ceph_fill_file_time(struct inode *inode, int issued,
			 u64 time_warp_seq, struct timespec64 *ctime,
			 struct timespec64 *mtime, struct timespec64 *atime)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int warn = 0;

	if (issued & (CEPH_CAP_FILE_EXCL|
		      CEPH_CAP_FILE_WR|
		      CEPH_CAP_FILE_BUFFER|
		      CEPH_CAP_AUTH_EXCL|
		      CEPH_CAP_XATTR_EXCL)) {
		/* we hold caps that let us mutate times locally */
		if (ci->i_version == 0 ||
		    timespec64_compare(ctime, &inode->i_ctime) > 0) {
			dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
			     ctime->tv_sec, ctime->tv_nsec);
			inode->i_ctime = *ctime;
		}
		if (ci->i_version == 0 ||
		    ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
			/* the MDS did a utimes() */
			dout("mtime %lld.%09ld -> %lld.%09ld "
			     "tw %d -> %d\n",
			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
			     mtime->tv_sec, mtime->tv_nsec,
			     ci->i_time_warp_seq, (int)time_warp_seq);

			inode->i_mtime = *mtime;
			inode->i_atime = *atime;
			ci->i_time_warp_seq = time_warp_seq;
		} else if (time_warp_seq == ci->i_time_warp_seq) {
			/* nobody did utimes(); take the max */
			if (timespec64_compare(mtime, &inode->i_mtime) > 0) {
				dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
				     inode->i_mtime.tv_sec,
				     inode->i_mtime.tv_nsec,
				     mtime->tv_sec, mtime->tv_nsec);
				inode->i_mtime = *mtime;
			}
			if (timespec64_compare(atime, &inode->i_atime) > 0) {
				dout("atime %lld.%09ld -> %lld.%09ld inc\n",
				     inode->i_atime.tv_sec,
				     inode->i_atime.tv_nsec,
				     atime->tv_sec, atime->tv_nsec);
				inode->i_atime = *atime;
			}
		} else if (issued & CEPH_CAP_FILE_EXCL) {
			/* we did a utimes(); ignore mds values */
		} else {
			warn = 1;
		}
	} else {
		/* we have no write|excl caps; whatever the MDS says is true */
		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
			inode->i_ctime = *ctime;
			inode->i_mtime = *mtime;
			inode->i_atime = *atime;
			ci->i_time_warp_seq = time_warp_seq;
		} else {
			warn = 1;
		}
	}
	if (warn) /* time_warp_seq shouldn't go backwards */
		dout("%p mds time_warp_seq %llu < %u\n",
		     inode, time_warp_seq, ci->i_time_warp_seq);
}
839
840/*
841 * Populate an inode based on info from mds. May be called on new or
842 * existing inodes.
843 */
966c7160
JL
844int ceph_fill_inode(struct inode *inode, struct page *locked_page,
845 struct ceph_mds_reply_info_in *iinfo,
846 struct ceph_mds_reply_dirfrag *dirinfo,
847 struct ceph_mds_session *session, int cap_fmode,
848 struct ceph_cap_reservation *caps_reservation)
355da1eb 849{
2678da88 850 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
355da1eb
SW
851 struct ceph_mds_reply_inode *info = iinfo->in;
852 struct ceph_inode_info *ci = ceph_inode(inode);
2af54a72 853 int issued, new_issued, info_caps;
9bbeab41 854 struct timespec64 mtime, atime, ctime;
355da1eb 855 struct ceph_buffer *xattr_blob = NULL;
af8a85a4 856 struct ceph_buffer *old_blob = NULL;
779fe0fb 857 struct ceph_string *pool_ns = NULL;
d9df2783 858 struct ceph_cap *new_cap = NULL;
355da1eb 859 int err = 0;
d9df2783 860 bool wake = false;
f98a128a
YZ
861 bool queue_trunc = false;
862 bool new_version = false;
31c542a1 863 bool fill_inline = false;
ed94f87c
JL
864 umode_t mode = le32_to_cpu(info->mode);
865 dev_t rdev = le32_to_cpu(info->rdev);
355da1eb 866
27171ae6
JL
867 lockdep_assert_held(&mdsc->snap_rwsem);
868
966c7160 869 dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
355da1eb
SW
870 inode, ceph_vinop(inode), le64_to_cpu(info->version),
871 ci->i_version);
872
ed94f87c
JL
873 /* Once I_NEW is cleared, we can't change type or dev numbers */
874 if (inode->i_state & I_NEW) {
875 inode->i_mode = mode;
876 } else {
877 if (inode_wrong_type(inode, mode)) {
878 pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
879 ceph_vinop(inode), inode->i_mode, mode);
880 return -ESTALE;
881 }
882
883 if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
884 pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
885 ceph_vinop(inode), MAJOR(inode->i_rdev),
886 MINOR(inode->i_rdev), MAJOR(rdev),
887 MINOR(rdev));
888 return -ESTALE;
889 }
890 }
891
2af54a72
YZ
892 info_caps = le32_to_cpu(info->cap.caps);
893
d9df2783 894 /* prealloc new cap struct */
9a6bed4f 895 if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
d9df2783 896 new_cap = ceph_get_cap(mdsc, caps_reservation);
9a6bed4f
JL
897 if (!new_cap)
898 return -ENOMEM;
899 }
d9df2783 900
355da1eb
SW
901 /*
902 * prealloc xattr data, if it looks like we'll need it. only
903 * if len > 4 (meaning there are actually xattrs; the first 4
904 * bytes are the xattr count).
905 */
906 if (iinfo->xattr_len > 4) {
b6c1d5b8 907 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
355da1eb 908 if (!xattr_blob)
966c7160 909 pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
355da1eb
SW
910 iinfo->xattr_len);
911 }
912
779fe0fb
YZ
913 if (iinfo->pool_ns_len > 0)
914 pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
915 iinfo->pool_ns_len);
916
75c9627e
YZ
917 if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
918 ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
919
be655596 920 spin_lock(&ci->i_ceph_lock);
355da1eb
SW
921
922 /*
923 * provided version will be odd if inode value is projected,
8bd59e01
SW
924 * even if stable. skip the update if we have newer stable
925 * info (ours>=theirs, e.g. due to racing mds replies), unless
926 * we are getting projected (unstable) info (in which case the
927 * version is odd, and we want ours>theirs).
928 * us them
929 * 2 2 skip
930 * 3 2 skip
931 * 3 3 update
355da1eb 932 */
f98a128a
YZ
933 if (ci->i_version == 0 ||
934 ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
935 le64_to_cpu(info->version) > (ci->i_version & ~1)))
936 new_version = true;
937
a35ead31
JL
938 /* Update change_attribute */
939 inode_set_max_iversion_raw(inode, iinfo->change_attr);
940
2af54a72
YZ
941 __ceph_caps_issued(ci, &issued);
942 issued |= __ceph_caps_dirty(ci);
943 new_issued = ~issued & info_caps;
355da1eb 944
75067034
LH
945 /* directories have fl_stripe_unit set to zero */
946 if (le32_to_cpu(info->layout.fl_stripe_unit))
947 inode->i_blkbits =
948 fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
949 else
950 inode->i_blkbits = CEPH_BLOCK_SHIFT;
355da1eb 951
d557c48d 952 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
fb18a575 953
2d332d5b
JL
954#ifdef CONFIG_FS_ENCRYPTION
955 if (iinfo->fscrypt_auth_len && (inode->i_state & I_NEW)) {
956 kfree(ci->fscrypt_auth);
957 ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
958 ci->fscrypt_auth = iinfo->fscrypt_auth;
959 iinfo->fscrypt_auth = NULL;
960 iinfo->fscrypt_auth_len = 0;
961 inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
962 }
963#endif
964
f98a128a
YZ
965 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
966 (issued & CEPH_CAP_AUTH_EXCL) == 0) {
ed94f87c 967 inode->i_mode = mode;
ab871b90
EB
968 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
969 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
355da1eb 970 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
bd2bae6a
EB
971 from_kuid(&init_user_ns, inode->i_uid),
972 from_kgid(&init_user_ns, inode->i_gid));
245ce991 973 ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
193e7b37 974 ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
355da1eb
SW
975 }
976
f98a128a
YZ
977 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
978 (issued & CEPH_CAP_LINK_EXCL) == 0)
bfe86848 979 set_nlink(inode, le32_to_cpu(info->nlink));
355da1eb 980
f98a128a
YZ
981 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
982 /* be careful with mtime, atime, size */
9bbeab41
AB
983 ceph_decode_timespec64(&atime, &info->atime);
984 ceph_decode_timespec64(&mtime, &info->mtime);
985 ceph_decode_timespec64(&ctime, &info->ctime);
f98a128a
YZ
986 ceph_fill_file_time(inode, issued,
987 le32_to_cpu(info->time_warp_seq),
988 &ctime, &mtime, &atime);
989 }
990
2af54a72
YZ
991 if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
992 ci->i_files = le64_to_cpu(info->files);
993 ci->i_subdirs = le64_to_cpu(info->subdirs);
994 }
995
f98a128a
YZ
996 if (new_version ||
997 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7627151e 998 s64 old_pool = ci->i_layout.pool_id;
779fe0fb
YZ
999 struct ceph_string *old_ns;
1000
7627151e 1001 ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
779fe0fb
YZ
1002 old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
1003 lockdep_is_held(&ci->i_ceph_lock));
1004 rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
1005
1006 if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
10183a69 1007 ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
10183a69 1008
779fe0fb 1009 pool_ns = old_ns;
10183a69 1010
f98a128a
YZ
1011 queue_trunc = ceph_fill_file_size(inode, issued,
1012 le32_to_cpu(info->truncate_seq),
1013 le64_to_cpu(info->truncate_size),
1014 le64_to_cpu(info->size));
1015 /* only update max_size on auth cap */
1016 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1017 ci->i_max_size != le64_to_cpu(info->max_size)) {
1018 dout("max_size %lld -> %llu\n", ci->i_max_size,
1019 le64_to_cpu(info->max_size));
1020 ci->i_max_size = le64_to_cpu(info->max_size);
1021 }
1022 }
355da1eb 1023
49a9f4f6
YZ
1024 /* layout and rstat are not tracked by capability, update them if
1025 * the inode info is from auth mds */
1026 if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
1027 if (S_ISDIR(inode->i_mode)) {
1028 ci->i_dir_layout = iinfo->dir_layout;
1029 ci->i_rbytes = le64_to_cpu(info->rbytes);
1030 ci->i_rfiles = le64_to_cpu(info->rfiles);
1031 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
08796873 1032 ci->i_dir_pin = iinfo->dir_pin;
e7f72952 1033 ci->i_rsnaps = iinfo->rsnaps;
9bbeab41 1034 ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
49a9f4f6
YZ
1035 }
1036 }
1037
355da1eb
SW
1038 /* xattrs */
1039 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
508b32d8 1040 if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
355da1eb
SW
1041 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
1042 if (ci->i_xattrs.blob)
af8a85a4 1043 old_blob = ci->i_xattrs.blob;
355da1eb
SW
1044 ci->i_xattrs.blob = xattr_blob;
1045 if (xattr_blob)
1046 memcpy(ci->i_xattrs.blob->vec.iov_base,
1047 iinfo->xattr_data, iinfo->xattr_len);
1048 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
7221fe4c 1049 ceph_forget_all_cached_acls(inode);
ac6713cc 1050 ceph_security_invalidate_secctx(inode);
a6424e48 1051 xattr_blob = NULL;
355da1eb
SW
1052 }
1053
ffdeec7a 1054 /* finally update i_version */
aae1a442
YZ
1055 if (le64_to_cpu(info->version) > ci->i_version)
1056 ci->i_version = le64_to_cpu(info->version);
ffdeec7a 1057
355da1eb 1058 inode->i_mapping->a_ops = &ceph_aops;
355da1eb
SW
1059
1060 switch (inode->i_mode & S_IFMT) {
1061 case S_IFIFO:
1062 case S_IFBLK:
1063 case S_IFCHR:
1064 case S_IFSOCK:
5ba72e60 1065 inode->i_blkbits = PAGE_SHIFT;
ed94f87c 1066 init_special_inode(inode, inode->i_mode, rdev);
355da1eb
SW
1067 inode->i_op = &ceph_file_iops;
1068 break;
1069 case S_IFREG:
1070 inode->i_op = &ceph_file_iops;
1071 inode->i_fop = &ceph_file_fops;
1072 break;
1073 case S_IFLNK:
1074 inode->i_op = &ceph_symlink_iops;
1075 if (!ci->i_symlink) {
810339ec 1076 u32 symlen = iinfo->symlink_len;
355da1eb
SW
1077 char *sym;
1078
be655596 1079 spin_unlock(&ci->i_ceph_lock);
355da1eb 1080
224a7542 1081 if (symlen != i_size_read(inode)) {
966c7160
JL
1082 pr_err("%s %llx.%llx BAD symlink "
1083 "size %lld\n", __func__,
1084 ceph_vinop(inode),
224a7542
YZ
1085 i_size_read(inode));
1086 i_size_write(inode, symlen);
1087 inode->i_blocks = calc_inode_blocks(symlen);
1088 }
810339ec 1089
355da1eb 1090 err = -ENOMEM;
810339ec 1091 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
355da1eb
SW
1092 if (!sym)
1093 goto out;
355da1eb 1094
be655596 1095 spin_lock(&ci->i_ceph_lock);
355da1eb
SW
1096 if (!ci->i_symlink)
1097 ci->i_symlink = sym;
1098 else
1099 kfree(sym); /* lost a race */
1100 }
ac194dcc 1101 inode->i_link = ci->i_symlink;
355da1eb
SW
1102 break;
1103 case S_IFDIR:
1104 inode->i_op = &ceph_dir_iops;
1105 inode->i_fop = &ceph_dir_fops;
355da1eb
SW
1106 break;
1107 default:
966c7160 1108 pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
355da1eb
SW
1109 ceph_vinop(inode), inode->i_mode);
1110 }
1111
355da1eb 1112 /* were we issued a capability? */
2af54a72 1113 if (info_caps) {
355da1eb
SW
1114 if (ceph_snap(inode) == CEPH_NOSNAP) {
1115 ceph_add_cap(inode, session,
1116 le64_to_cpu(info->cap.cap_id),
135e671e 1117 info_caps,
355da1eb
SW
1118 le32_to_cpu(info->cap.wanted),
1119 le32_to_cpu(info->cap.seq),
1120 le32_to_cpu(info->cap.mseq),
1121 le64_to_cpu(info->cap.realm),
d9df2783 1122 info->cap.flags, &new_cap);
2f92b3d0
YZ
1123
1124 /* set dir completion flag? */
1125 if (S_ISDIR(inode->i_mode) &&
1126 ci->i_files == 0 && ci->i_subdirs == 0 &&
2af54a72 1127 (info_caps & CEPH_CAP_FILE_SHARED) &&
2f92b3d0
YZ
1128 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
1129 !__ceph_dir_is_complete(ci)) {
1130 dout(" marking %p complete (empty)\n", inode);
fdd4e158 1131 i_size_write(inode, 0);
2f92b3d0 1132 __ceph_dir_set_complete(ci,
fdd4e158
YZ
1133 atomic64_read(&ci->i_release_count),
1134 atomic64_read(&ci->i_ordered_count));
2f92b3d0
YZ
1135 }
1136
d9df2783 1137 wake = true;
355da1eb 1138 } else {
355da1eb 1139 dout(" %p got snap_caps %s\n", inode,
2af54a72
YZ
1140 ceph_cap_string(info_caps));
1141 ci->i_snap_caps |= info_caps;
355da1eb
SW
1142 }
1143 }
31c542a1
YZ
1144
1145 if (iinfo->inline_version > 0 &&
1146 iinfo->inline_version >= ci->i_inline_version) {
1147 int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1148 ci->i_inline_version = iinfo->inline_version;
48490776 1149 if (ceph_has_inline_data(ci) &&
2af54a72 1150 (locked_page || (info_caps & cache_caps)))
31c542a1
YZ
1151 fill_inline = true;
1152 }
1153
719a2514
YZ
1154 if (cap_fmode >= 0) {
1155 if (!info_caps)
1156 pr_warn("mds issued no caps on %llx.%llx\n",
1157 ceph_vinop(inode));
1158 __ceph_touch_fmode(ci, mdsc, cap_fmode);
1159 }
1160
be655596 1161 spin_unlock(&ci->i_ceph_lock);
355da1eb 1162
400e1286
JL
1163 ceph_fscache_register_inode_cookie(inode);
1164
31c542a1 1165 if (fill_inline)
01deead0 1166 ceph_fill_inline_data(inode, locked_page,
31c542a1
YZ
1167 iinfo->inline_data, iinfo->inline_len);
1168
d9df2783
YZ
1169 if (wake)
1170 wake_up_all(&ci->i_cap_wq);
1171
355da1eb
SW
1172 /* queue truncate if we saw i_size decrease */
1173 if (queue_trunc)
3c6f6b79 1174 ceph_queue_vmtruncate(inode);
355da1eb
SW
1175
1176 /* populate frag tree */
3e7fbe9c
YZ
1177 if (S_ISDIR(inode->i_mode))
1178 ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
355da1eb
SW
1179
1180 /* update delegation info? */
1181 if (dirinfo)
1182 ceph_fill_dirfrag(inode, dirinfo);
1183
1184 err = 0;
355da1eb 1185out:
d9df2783
YZ
1186 if (new_cap)
1187 ceph_put_cap(mdsc, new_cap);
af8a85a4
LH
1188 ceph_buffer_put(old_blob);
1189 ceph_buffer_put(xattr_blob);
779fe0fb 1190 ceph_put_string(pool_ns);
355da1eb
SW
1191 return err;
1192}
1193
1194/*
543212b3 1195 * caller should hold session s_mutex and dentry->d_lock.
355da1eb 1196 */
543212b3
YZ
/*
 * Apply an MDS dentry lease to 'dentry'.
 *
 * Caller must hold session s_mutex and dentry->d_lock.  If an older
 * lease from a different session is displaced, its session pointer is
 * handed back through *old_lease_session so the caller can drop the
 * reference after releasing d_lock (see update_dentry_lease()).
 */
static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
				  struct ceph_mds_reply_lease *lease,
				  struct ceph_mds_session *session,
				  unsigned long from_time,
				  struct ceph_mds_session **old_lease_session)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	unsigned mask = le16_to_cpu(lease->mask);
	long unsigned duration = le32_to_cpu(lease->duration_ms);
	/* convert the MDS-supplied duration (ms) into jiffies deadlines */
	long unsigned ttl = from_time + (duration * HZ) / 1000;
	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;

	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
	     dentry, duration, ttl);

	/* only track leases on regular dentries */
	if (ceph_snap(dir) != CEPH_NOSNAP)
		return;

	if (mask & CEPH_LEASE_PRIMARY_LINK)
		di->flags |= CEPH_DENTRY_PRIMARY_LINK;
	else
		di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;

	di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
	if (!(mask & CEPH_LEASE_VALID)) {
		/* no valid lease granted; just refresh LRU position */
		__ceph_dentry_dir_lease_touch(di);
		return;
	}

	if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
	    time_before(ttl, di->time))
		return; /* we already have a newer lease. */

	/* displacing a lease from another session: defer the put to caller */
	if (di->lease_session && di->lease_session != session) {
		*old_lease_session = di->lease_session;
		di->lease_session = NULL;
	}

	if (!di->lease_session)
		di->lease_session = ceph_get_mds_session(session);
	di->lease_gen = atomic_read(&session->s_cap_gen);
	di->lease_seq = le32_to_cpu(lease->seq);
	di->lease_renew_after = half_ttl;
	di->lease_renew_from = 0;
	di->time = ttl;

	__ceph_dentry_lease_touch(di);
}
1246
1247static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
1248 struct ceph_mds_reply_lease *lease,
1249 struct ceph_mds_session *session,
1250 unsigned long from_time)
1251{
1252 struct ceph_mds_session *old_lease_session = NULL;
1253 spin_lock(&dentry->d_lock);
1254 __update_dentry_lease(dir, dentry, lease, session, from_time,
1255 &old_lease_session);
1256 spin_unlock(&dentry->d_lock);
7e65624d 1257 ceph_put_mds_session(old_lease_session);
543212b3
YZ
1258}
1259
1260/*
1261 * update dentry lease without having parent inode locked
1262 */
/*
 * Update a dentry lease when the parent inode is NOT locked.  Because
 * the dentry may have been renamed/unlinked concurrently, re-validate
 * under d_lock that it still refers to the (name, parent, target)
 * triple the MDS reply described before applying the lease.
 */
static void update_dentry_lease_careful(struct dentry *dentry,
					struct ceph_mds_reply_lease *lease,
					struct ceph_mds_session *session,
					unsigned long from_time,
					char *dname, u32 dname_len,
					struct ceph_vino *pdvino,
					struct ceph_vino *ptvino)

{
	struct inode *dir;
	struct ceph_mds_session *old_lease_session = NULL;

	spin_lock(&dentry->d_lock);
	/* make sure dentry's name matches target */
	if (dentry->d_name.len != dname_len ||
	    memcmp(dentry->d_name.name, dname, dname_len))
		goto out_unlock;

	dir = d_inode(dentry->d_parent);
	/* make sure parent matches dvino */
	if (!ceph_ino_compare(dir, pdvino))
		goto out_unlock;

	/* make sure dentry's inode matches target. NULL ptvino means that
	 * we expect a negative dentry */
	if (ptvino) {
		if (d_really_is_negative(dentry))
			goto out_unlock;
		if (!ceph_ino_compare(d_inode(dentry), ptvino))
			goto out_unlock;
	} else {
		if (d_really_is_positive(dentry))
			goto out_unlock;
	}

	__update_dentry_lease(dir, dentry, lease, session,
			      from_time, &old_lease_session);
out_unlock:
	spin_unlock(&dentry->d_lock);
	/* drop displaced session ref outside d_lock; no-op if NULL */
	ceph_put_mds_session(old_lease_session);
}
1304
1305/*
1306 * splice a dentry to an inode.
810313c5 1307 * caller must hold directory i_rwsem for this to be safe.
355da1eb 1308 */
2bf996ac 1309static int splice_dentry(struct dentry **pdn, struct inode *in)
355da1eb 1310{
2bf996ac 1311 struct dentry *dn = *pdn;
355da1eb
SW
1312 struct dentry *realdn;
1313
2b0143b5 1314 BUG_ON(d_inode(dn));
1cd3935b 1315
5495c2d0
YZ
1316 if (S_ISDIR(in->i_mode)) {
1317 /* If inode is directory, d_splice_alias() below will remove
1318 * 'realdn' from its origin parent. We need to ensure that
1319 * origin parent's readdir cache will not reference 'realdn'
1320 */
1321 realdn = d_find_any_alias(in);
1322 if (realdn) {
1323 struct ceph_dentry_info *di = ceph_dentry(realdn);
1324 spin_lock(&realdn->d_lock);
1325
1326 realdn->d_op->d_prune(realdn);
1327
1328 di->time = jiffies;
1329 di->lease_shared_gen = 0;
1330 di->offset = 0;
1331
1332 spin_unlock(&realdn->d_lock);
1333 dput(realdn);
1334 }
1335 }
1336
355da1eb
SW
1337 /* dn must be unhashed */
1338 if (!d_unhashed(dn))
1339 d_drop(dn);
41d28bca 1340 realdn = d_splice_alias(in, dn);
355da1eb 1341 if (IS_ERR(realdn)) {
d69ed05a
SW
1342 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
1343 PTR_ERR(realdn), dn, in, ceph_vinop(in));
2bf996ac
YZ
1344 return PTR_ERR(realdn);
1345 }
1346
1347 if (realdn) {
355da1eb
SW
1348 dout("dn %p (%d) spliced with %p (%d) "
1349 "inode %p ino %llx.%llx\n",
84d08fa8
AV
1350 dn, d_count(dn),
1351 realdn, d_count(realdn),
2b0143b5 1352 d_inode(realdn), ceph_vinop(d_inode(realdn)));
355da1eb 1353 dput(dn);
2bf996ac 1354 *pdn = realdn;
355da1eb
SW
1355 } else {
1356 BUG_ON(!ceph_dentry(dn));
355da1eb 1357 dout("dn %p attached to %p ino %llx.%llx\n",
2b0143b5 1358 dn, d_inode(dn), ceph_vinop(d_inode(dn)));
355da1eb 1359 }
2bf996ac 1360 return 0;
355da1eb
SW
1361}
1362
1363/*
1364 * Incorporate results into the local cache. This is either just
1365 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
1366 * after a lookup).
1367 *
1368 * A reply may contain
1369 * a directory inode along with a dentry.
1370 * and/or a target inode
1371 *
1372 * Called with snap_rwsem (read).
1373 */
f5a03b08 1374int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
355da1eb 1375{
f5a03b08 1376 struct ceph_mds_session *session = req->r_session;
355da1eb
SW
1377 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1378 struct inode *in = NULL;
f5d55f03 1379 struct ceph_vino tvino, dvino;
3d14c5d2 1380 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
355da1eb
SW
1381 int err = 0;
1382
1383 dout("fill_trace %p is_dentry %d is_target %d\n", req,
1384 rinfo->head->is_dentry, rinfo->head->is_target);
1385
355da1eb
SW
1386 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
1387 dout("fill_trace reply is empty!\n");
3dd69aab 1388 if (rinfo->head->result == 0 && req->r_parent)
167c9e35 1389 ceph_invalidate_dir_request(req);
355da1eb
SW
1390 return 0;
1391 }
1392
1393 if (rinfo->head->is_dentry) {
3dd69aab 1394 struct inode *dir = req->r_parent;
5b1daecd 1395
6c5e50fa 1396 if (dir) {
966c7160
JL
1397 err = ceph_fill_inode(dir, NULL, &rinfo->diri,
1398 rinfo->dirfrag, session, -1,
1399 &req->r_caps_reservation);
6c5e50fa 1400 if (err < 0)
19913b4e 1401 goto done;
6c5e50fa
SW
1402 } else {
1403 WARN_ON_ONCE(1);
1404 }
19913b4e 1405
74c9e6bf
YZ
1406 if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
1407 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1408 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
19913b4e
YZ
1409 struct qstr dname;
1410 struct dentry *dn, *parent;
1411
1412 BUG_ON(!rinfo->head->is_target);
1413 BUG_ON(req->r_dentry);
1414
1415 parent = d_find_any_alias(dir);
1416 BUG_ON(!parent);
1417
1418 dname.name = rinfo->dname;
1419 dname.len = rinfo->dname_len;
8387ff25 1420 dname.hash = full_name_hash(parent, dname.name, dname.len);
f5d55f03
JL
1421 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1422 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
19913b4e
YZ
1423retry_lookup:
1424 dn = d_lookup(parent, &dname);
1425 dout("d_lookup on parent=%p name=%.*s got %p\n",
1426 parent, dname.len, dname.name, dn);
1427
1428 if (!dn) {
1429 dn = d_alloc(parent, &dname);
1430 dout("d_alloc %p '%.*s' = %p\n", parent,
1431 dname.len, dname.name, dn);
d37b1d99 1432 if (!dn) {
19913b4e
YZ
1433 dput(parent);
1434 err = -ENOMEM;
1435 goto done;
1436 }
ad5cb123 1437 err = 0;
2b0143b5 1438 } else if (d_really_is_positive(dn) &&
f5d55f03
JL
1439 (ceph_ino(d_inode(dn)) != tvino.ino ||
1440 ceph_snap(d_inode(dn)) != tvino.snap)) {
19913b4e 1441 dout(" dn %p points to wrong inode %p\n",
2b0143b5 1442 dn, d_inode(dn));
933ad2c9 1443 ceph_dir_clear_ordered(dir);
19913b4e
YZ
1444 d_delete(dn);
1445 dput(dn);
1446 goto retry_lookup;
1447 }
1448
1449 req->r_dentry = dn;
1450 dput(parent);
1451 }
5b1daecd
SW
1452 }
1453
86b58d13 1454 if (rinfo->head->is_target) {
bca9fc14
JL
1455 /* Should be filled in by handle_reply */
1456 BUG_ON(!req->r_target_inode);
86b58d13 1457
bca9fc14 1458 in = req->r_target_inode;
966c7160
JL
1459 err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
1460 NULL, session,
bc2de10d 1461 (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
3bb48b41 1462 !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
57c21994 1463 rinfo->head->result == 0) ? req->r_fmode : -1,
86b58d13
YZ
1464 &req->r_caps_reservation);
1465 if (err < 0) {
966c7160 1466 pr_err("ceph_fill_inode badness %p %llx.%llx\n",
86b58d13 1467 in, ceph_vinop(in));
bca9fc14 1468 req->r_target_inode = NULL;
893e456b
JL
1469 if (in->i_state & I_NEW)
1470 discard_new_inode(in);
68cbb805
JL
1471 else
1472 iput(in);
86b58d13
YZ
1473 goto done;
1474 }
893e456b
JL
1475 if (in->i_state & I_NEW)
1476 unlock_new_inode(in);
86b58d13
YZ
1477 }
1478
9358c6d4
SW
1479 /*
1480 * ignore null lease/binding on snapdir ENOENT, or else we
1481 * will have trouble splicing in the virtual snapdir later
1482 */
3dd69aab
JL
1483 if (rinfo->head->is_dentry &&
1484 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1485 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
9358c6d4 1486 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
3d14c5d2 1487 fsc->mount_options->snapdir_name,
9358c6d4 1488 req->r_dentry->d_name.len))) {
355da1eb
SW
1489 /*
1490 * lookup link rename : null -> possibly existing inode
1491 * mknod symlink mkdir : null -> new inode
1492 * unlink : linked -> null
1493 */
3dd69aab 1494 struct inode *dir = req->r_parent;
355da1eb
SW
1495 struct dentry *dn = req->r_dentry;
1496 bool have_dir_cap, have_lease;
1497
1498 BUG_ON(!dn);
1499 BUG_ON(!dir);
2b0143b5 1500 BUG_ON(d_inode(dn->d_parent) != dir);
f5d55f03
JL
1501
1502 dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1503 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1504
1505 BUG_ON(ceph_ino(dir) != dvino.ino);
1506 BUG_ON(ceph_snap(dir) != dvino.snap);
355da1eb 1507
355da1eb
SW
1508 /* do we have a lease on the whole dir? */
1509 have_dir_cap =
1510 (le32_to_cpu(rinfo->diri.in->cap.caps) &
1511 CEPH_CAP_FILE_SHARED);
1512
1513 /* do we have a dn lease? */
1514 have_lease = have_dir_cap ||
2f90b852 1515 le32_to_cpu(rinfo->dlease->duration_ms);
355da1eb
SW
1516 if (!have_lease)
1517 dout("fill_trace no dentry lease or dir cap\n");
1518
1519 /* rename? */
1520 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
0a8a70f9
YZ
1521 struct inode *olddir = req->r_old_dentry_dir;
1522 BUG_ON(!olddir);
1523
a455589f 1524 dout(" src %p '%pd' dst %p '%pd'\n",
355da1eb 1525 req->r_old_dentry,
a455589f
AV
1526 req->r_old_dentry,
1527 dn, dn);
355da1eb
SW
1528 dout("fill_trace doing d_move %p -> %p\n",
1529 req->r_old_dentry, dn);
c10f5e12 1530
fdd4e158
YZ
1531 /* d_move screws up sibling dentries' offsets */
1532 ceph_dir_clear_ordered(dir);
1533 ceph_dir_clear_ordered(olddir);
1534
355da1eb 1535 d_move(req->r_old_dentry, dn);
a455589f
AV
1536 dout(" src %p '%pd' dst %p '%pd'\n",
1537 req->r_old_dentry,
355da1eb 1538 req->r_old_dentry,
a455589f 1539 dn, dn);
81a6cf2d 1540
c4a29f26
SW
1541 /* ensure target dentry is invalidated, despite
1542 rehashing bug in vfs_rename_dir */
81a6cf2d
SW
1543 ceph_invalidate_dentry_lease(dn);
1544
99ccbd22 1545 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1cd3935b 1546 ceph_dentry(req->r_old_dentry)->offset);
81a6cf2d 1547
2bf996ac
YZ
1548 /* swap r_dentry and r_old_dentry in case that
1549 * splice_dentry() gets called later. This is safe
1550 * because no other place will use them */
1551 req->r_dentry = req->r_old_dentry;
1552 req->r_old_dentry = dn;
1553 dn = req->r_dentry;
355da1eb
SW
1554 }
1555
1556 /* null dentry? */
1557 if (!rinfo->head->is_target) {
1558 dout("fill_trace null dentry\n");
2b0143b5 1559 if (d_really_is_positive(dn)) {
355da1eb 1560 dout("d_delete %p\n", dn);
5495c2d0 1561 ceph_dir_clear_ordered(dir);
355da1eb 1562 d_delete(dn);
80d025ff
JL
1563 } else if (have_lease) {
1564 if (d_unhashed(dn))
f8b31710 1565 d_add(dn, NULL);
7ffe4fce
XL
1566 }
1567
1568 if (!d_unhashed(dn) && have_lease)
543212b3
YZ
1569 update_dentry_lease(dir, dn,
1570 rinfo->dlease, session,
1571 req->r_request_started);
355da1eb
SW
1572 goto done;
1573 }
1574
1575 /* attach proper inode */
2b0143b5 1576 if (d_really_is_negative(dn)) {
70db4f36 1577 ceph_dir_clear_ordered(dir);
86b58d13 1578 ihold(in);
2bf996ac
YZ
1579 err = splice_dentry(&req->r_dentry, in);
1580 if (err < 0)
355da1eb 1581 goto done;
2bf996ac 1582 dn = req->r_dentry; /* may have spliced */
2b0143b5 1583 } else if (d_really_is_positive(dn) && d_inode(dn) != in) {
355da1eb 1584 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
2b0143b5 1585 dn, d_inode(dn), ceph_vinop(d_inode(dn)),
86b58d13 1586 ceph_vinop(in));
200fd27c 1587 d_invalidate(dn);
355da1eb 1588 have_lease = false;
355da1eb
SW
1589 }
1590
f5d55f03 1591 if (have_lease) {
543212b3
YZ
1592 update_dentry_lease(dir, dn,
1593 rinfo->dlease, session,
1594 req->r_request_started);
f5d55f03 1595 }
355da1eb 1596 dout(" final dn %p\n", dn);
bc2de10d
JL
1597 } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1598 req->r_op == CEPH_MDS_OP_MKSNAP) &&
1f08529c 1599 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
bc2de10d 1600 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
3dd69aab 1601 struct inode *dir = req->r_parent;
355da1eb
SW
1602
1603 /* fill out a snapdir LOOKUPSNAP dentry */
0a8a70f9
YZ
1604 BUG_ON(!dir);
1605 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
2bf996ac
YZ
1606 BUG_ON(!req->r_dentry);
1607 dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
70db4f36 1608 ceph_dir_clear_ordered(dir);
86b58d13 1609 ihold(in);
2bf996ac
YZ
1610 err = splice_dentry(&req->r_dentry, in);
1611 if (err < 0)
355da1eb 1612 goto done;
543212b3
YZ
1613 } else if (rinfo->head->is_dentry && req->r_dentry) {
1614 /* parent inode is not locked, be carefull */
cdde7c43 1615 struct ceph_vino *ptvino = NULL;
543212b3
YZ
1616 dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1617 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1618 if (rinfo->head->is_target) {
1619 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1620 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1621 ptvino = &tvino;
cdde7c43 1622 }
543212b3
YZ
1623 update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
1624 session, req->r_request_started,
1625 rinfo->dname, rinfo->dname_len,
1626 &dvino, ptvino);
355da1eb 1627 }
355da1eb
SW
1628done:
1629 dout("fill_trace done err=%d\n", err);
1630 return err;
1631}
1632
1633/*
1634 * Prepopulate our cache with readdir results, leases, etc.
1635 */
79f9f99a
SW
/*
 * Prepopulate the inode cache from a readdir reply without touching
 * dentries (used for aborted requests — see ceph_readdir_prepopulate()).
 * Continues past per-entry failures; returns the last error seen, or 0.
 */
static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
					   struct ceph_mds_session *session)
{
	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
	int i, err = 0;

	for (i = 0; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino vino;
		struct inode *in;
		int rc;

		vino.ino = le64_to_cpu(rde->inode.in->ino);
		vino.snap = le64_to_cpu(rde->inode.in->snapid);

		in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
		if (IS_ERR(in)) {
			err = PTR_ERR(in);
			dout("new_inode badness got %d\n", err);
			continue;
		}
		rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
				     -1, &req->r_caps_reservation);
		if (rc < 0) {
			pr_err("ceph_fill_inode badness on %p got %d\n",
			       in, rc);
			err = rc;
			if (in->i_state & I_NEW) {
				/* extra hold: discard_new_inode consumes a
				 * reference, and iput below drops ours */
				ihold(in);
				discard_new_inode(in);
			}
		} else if (in->i_state & I_NEW) {
			unlock_new_inode(in);
		}

		iput(in);
	}

	return err;
}
1676
fdd4e158
YZ
1677void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
1678{
1679 if (ctl->page) {
1680 kunmap(ctl->page);
09cbfeaf 1681 put_page(ctl->page);
fdd4e158
YZ
1682 ctl->page = NULL;
1683 }
1684}
1685
/*
 * Record dentry 'dn' in the directory's readdir cache (an array of
 * dentry pointers stored in dir's pagecache) at position ctl->index.
 * The cache is disabled (ctl->index = -1) if the directory changed
 * since the readdir started, or if a cache page cannot be obtained.
 * Returns 0 or -ENOMEM.
 */
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
			      struct ceph_readdir_cache_control *ctl,
			      struct ceph_mds_request *req)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	/* dentry pointers per cache page */
	unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
	unsigned idx = ctl->index % nsize;
	pgoff_t pgoff = ctl->index / nsize;

	if (!ctl->page || pgoff != page_index(ctl->page)) {
		ceph_readdir_cache_release(ctl);
		/* only create a fresh page when starting at slot 0;
		 * otherwise an existing page must already be there */
		if (idx == 0)
			ctl->page = grab_cache_page(&dir->i_data, pgoff);
		else
			ctl->page = find_lock_page(&dir->i_data, pgoff);
		if (!ctl->page) {
			ctl->index = -1;
			return idx == 0 ? -ENOMEM : 0;
		}
		/* reading/filling the cache are serialized by
		 * i_rwsem, no need to use page lock */
		unlock_page(ctl->page);
		ctl->dentries = kmap(ctl->page);
		if (idx == 0)
			memset(ctl->dentries, 0, PAGE_SIZE);
	}

	/* only trust the cache while the dir is unchanged since readdir
	 * start (release/ordered counts still match the request) */
	if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
	    req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
		dout("readdir cache dn %p idx %d\n", dn, ctl->index);
		ctl->dentries[idx] = dn;
		ctl->index++;
	} else {
		dout("disable readdir cache\n");
		ctl->index = -1;
	}
	return 0;
}
1724
355da1eb
SW
/*
 * Prepopulate our dcache/inode cache with readdir results: for every
 * entry in the reply, look up or allocate a dentry, fill the inode,
 * splice, apply the dentry lease, and record it in the readdir cache.
 * Aborted requests fall back to inode-only prepopulation.
 */
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
			     struct ceph_mds_session *session)
{
	struct dentry *parent = req->r_dentry;
	struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
	struct qstr dname;
	struct dentry *dn;
	struct inode *in;
	int err = 0, skipped = 0, ret, i;
	u32 frag = le32_to_cpu(req->r_args.readdir.frag);
	u32 last_hash = 0;
	u32 fpos_offset;
	struct ceph_readdir_cache_control cache_ctl = {};

	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
		return readdir_prepopulate_inodes_only(req, session);

	/* hash-ordered readdir: recover the hash position we resume from */
	if (rinfo->hash_order) {
		if (req->r_path2) {
			last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
						  req->r_path2,
						  strlen(req->r_path2));
			last_hash = ceph_frag_value(last_hash);
		} else if (rinfo->offset_hash) {
			/* mds understands offset_hash */
			WARN_ON_ONCE(req->r_readdir_offset != 2);
			last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
		}
	}

	if (rinfo->dir_dir &&
	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
		/* MDS redirected us to a different frag */
		dout("readdir_prepopulate got new frag %x -> %x\n",
		     frag, le32_to_cpu(rinfo->dir_dir->frag));
		frag = le32_to_cpu(rinfo->dir_dir->frag);
		if (!rinfo->hash_order)
			req->r_readdir_offset = 2;
	}

	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
		dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
		     rinfo->dir_nr, parent);
	} else {
		dout("readdir_prepopulate %d items under dn %p\n",
		     rinfo->dir_nr, parent);
		if (rinfo->dir_dir)
			ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);

		if (ceph_frag_is_leftmost(frag) &&
		    req->r_readdir_offset == 2 &&
		    !(rinfo->hash_order && last_hash)) {
			/* note dir version at start of readdir so we can
			 * tell if any dentries get dropped */
			req->r_dir_release_cnt =
				atomic64_read(&ci->i_release_count);
			req->r_dir_ordered_cnt =
				atomic64_read(&ci->i_ordered_count);
			req->r_readdir_cache_idx = 0;
		}
	}

	cache_ctl.index = req->r_readdir_cache_idx;
	fpos_offset = req->r_readdir_offset;

	/* FIXME: release caps/leases if error occurs */
	for (i = 0; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino tvino;

		dname.name = rde->name;
		dname.len = rde->name_len;
		dname.hash = full_name_hash(parent, dname.name, dname.len);

		tvino.ino = le64_to_cpu(rde->inode.in->ino);
		tvino.snap = le64_to_cpu(rde->inode.in->snapid);

		/* compute the entry's file position (fpos) */
		if (rinfo->hash_order) {
			u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
						 rde->name, rde->name_len);
			hash = ceph_frag_value(hash);
			if (hash != last_hash)
				fpos_offset = 2;
			last_hash = hash;
			rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
		} else {
			rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
		}

retry_lookup:
		dn = d_lookup(parent, &dname);
		dout("d_lookup on parent=%p name=%.*s got %p\n",
		     parent, dname.len, dname.name, dn);

		if (!dn) {
			dn = d_alloc(parent, &dname);
			dout("d_alloc %p '%.*s' = %p\n", parent,
			     dname.len, dname.name, dn);
			if (!dn) {
				dout("d_alloc badness\n");
				err = -ENOMEM;
				goto out;
			}
		} else if (d_really_is_positive(dn) &&
			   (ceph_ino(d_inode(dn)) != tvino.ino ||
			    ceph_snap(d_inode(dn)) != tvino.snap)) {
			/* existing dentry points elsewhere: drop and retry */
			struct ceph_dentry_info *di = ceph_dentry(dn);
			dout(" dn %p points to wrong inode %p\n",
			     dn, d_inode(dn));

			spin_lock(&dn->d_lock);
			if (di->offset > 0 &&
			    di->lease_shared_gen ==
			    atomic_read(&ci->i_shared_gen)) {
				__ceph_dir_clear_ordered(ci);
				di->offset = 0;
			}
			spin_unlock(&dn->d_lock);

			d_delete(dn);
			dput(dn);
			goto retry_lookup;
		}

		/* inode */
		if (d_really_is_positive(dn)) {
			in = d_inode(dn);
		} else {
			in = ceph_get_inode(parent->d_sb, tvino, NULL);
			if (IS_ERR(in)) {
				dout("new_inode badness\n");
				d_drop(dn);
				dput(dn);
				err = PTR_ERR(in);
				goto out;
			}
		}

		ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
				      -1, &req->r_caps_reservation);
		if (ret < 0) {
			pr_err("ceph_fill_inode badness on %p\n", in);
			if (d_really_is_negative(dn)) {
				if (in->i_state & I_NEW) {
					/* extra hold: discard consumes one
					 * ref, iput drops ours */
					ihold(in);
					discard_new_inode(in);
				}
				iput(in);
			}
			d_drop(dn);
			err = ret;
			goto next_item;
		}
		if (in->i_state & I_NEW)
			unlock_new_inode(in);

		if (d_really_is_negative(dn)) {
			if (ceph_security_xattr_deadlock(in)) {
				dout(" skip splicing dn %p to inode %p"
				     " (security xattr deadlock)\n", dn, in);
				iput(in);
				skipped++;
				goto next_item;
			}

			err = splice_dentry(&dn, in);
			if (err < 0)
				goto next_item;
		}

		ceph_dentry(dn)->offset = rde->offset;

		update_dentry_lease(d_inode(parent), dn,
				    rde->lease, req->r_session,
				    req->r_request_started);

		if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
			ret = fill_readdir_cache(d_inode(parent), dn,
						 &cache_ctl, req);
			if (ret < 0)
				err = ret;
		}
next_item:
		dput(dn);
	}
out:
	/* only mark the readdir cache complete if nothing failed or
	 * was skipped */
	if (err == 0 && skipped == 0) {
		set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
		req->r_readdir_cache_idx = cache_ctl.index;
	}
	ceph_readdir_cache_release(&cache_ctl);
	dout("readdir_prepopulate done\n");
	return err;
}
1919
efb0ca76 1920bool ceph_inode_set_size(struct inode *inode, loff_t size)
355da1eb
SW
1921{
1922 struct ceph_inode_info *ci = ceph_inode(inode);
efb0ca76 1923 bool ret;
355da1eb 1924
be655596 1925 spin_lock(&ci->i_ceph_lock);
2d6795fb 1926 dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
99c88e69 1927 i_size_write(inode, size);
400e1286 1928 ceph_fscache_update(inode);
224a7542 1929 inode->i_blocks = calc_inode_blocks(size);
355da1eb 1930
efb0ca76 1931 ret = __ceph_should_report_size(ci);
355da1eb 1932
be655596 1933 spin_unlock(&ci->i_ceph_lock);
400e1286 1934
355da1eb
SW
1935 return ret;
1936}
1937
64f28c62 1938void ceph_queue_inode_work(struct inode *inode, int work_bit)
355da1eb 1939{
64f28c62 1940 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1cf89a8d 1941 struct ceph_inode_info *ci = ceph_inode(inode);
64f28c62 1942 set_bit(work_bit, &ci->i_work_mask);
1cf89a8d
YZ
1943
1944 ihold(inode);
64f28c62
JL
1945 if (queue_work(fsc->inode_wq, &ci->i_work)) {
1946 dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
1cf89a8d 1947 } else {
64f28c62 1948 dout("queue_inode_work %p already queued, mask=%lx\n",
1cf89a8d
YZ
1949 inode, ci->i_work_mask);
1950 iput(inode);
1951 }
1952}
1953
/*
 * Invalidate this inode's page cache in response to CEPH_CAP_FILE_CACHE
 * being revoked.  Runs from the inode work queue.
 *
 * The i_rdcache_gen/i_rdcache_revoking pair detects races: the gen is
 * sampled before dropping i_ceph_lock for the (blocking) invalidation,
 * and only if it is unchanged afterwards do we count the revoke as done.
 */
static void ceph_do_invalidate_pages(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u32 orig_gen;
	int check = 0;	/* run ceph_check_caps() before returning? */

	ceph_fscache_invalidate(inode, false);

	mutex_lock(&ci->i_truncate_mutex);

	if (ceph_inode_is_shutdown(inode)) {
		/* forcibly drop all cached pages; no caps to release */
		pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
				    __func__, ceph_vinop(inode));
		mapping_set_error(inode->i_mapping, -EIO);
		truncate_pagecache(inode, 0);
		mutex_unlock(&ci->i_truncate_mutex);
		goto out;
	}

	spin_lock(&ci->i_ceph_lock);
	dout("invalidate_pages %p gen %d revoking %d\n", inode,
	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
		/* nothing to invalidate for the current generation */
		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
			check = 1;
		spin_unlock(&ci->i_ceph_lock);
		mutex_unlock(&ci->i_truncate_mutex);
		goto out;
	}
	/* remember which generation we are invalidating for */
	orig_gen = ci->i_rdcache_gen;
	spin_unlock(&ci->i_ceph_lock);

	if (invalidate_inode_pages2(inode->i_mapping) < 0) {
		pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
		       ceph_vinop(inode));
	}

	spin_lock(&ci->i_ceph_lock);
	if (orig_gen == ci->i_rdcache_gen &&
	    orig_gen == ci->i_rdcache_revoking) {
		/* no new reads started meanwhile: revoke satisfied */
		dout("invalidate_pages %p gen %d successful\n", inode,
		     ci->i_rdcache_gen);
		ci->i_rdcache_revoking--;
		check = 1;
	} else {
		/* raced with a new reader or a newer revoke; retry later */
		dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
		     inode, orig_gen, ci->i_rdcache_gen,
		     ci->i_rdcache_revoking);
		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
			check = 1;
	}
	spin_unlock(&ci->i_ceph_lock);
	mutex_unlock(&ci->i_truncate_mutex);
out:
	if (check)
		ceph_check_caps(ci, 0);
}
2011
355da1eb 2012/*
355da1eb
SW
2013 * Make sure any pending truncation is applied before doing anything
2014 * that may depend on it.
2015 */
void __ceph_do_pending_vmtruncate(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 to;		/* truncate target size */
	int wrbuffer_refs, finish = 0;

	mutex_lock(&ci->i_truncate_mutex);
retry:
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_truncate_pending == 0) {
		dout("__do_pending_vmtruncate %p none pending\n", inode);
		spin_unlock(&ci->i_ceph_lock);
		mutex_unlock(&ci->i_truncate_mutex);
		return;
	}

	/*
	 * make sure any dirty snapped pages are flushed before we
	 * possibly truncate them.. so write AND block!
	 */
	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
		spin_unlock(&ci->i_ceph_lock);
		dout("__do_pending_vmtruncate %p flushing snaps first\n",
		     inode);
		filemap_write_and_wait_range(&inode->i_data, 0,
					     inode->i_sb->s_maxbytes);
		goto retry;
	}

	/* there should be no reader or writer */
	WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);

	to = ci->i_truncate_size;
	wrbuffer_refs = ci->i_wrbuffer_ref;
	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
	     ci->i_truncate_pending, to);
	spin_unlock(&ci->i_ceph_lock);

	/* drop the pagecache beyond 'to'; must not hold i_ceph_lock here */
	ceph_fscache_resize(inode, to);
	truncate_pagecache(inode, to);

	spin_lock(&ci->i_ceph_lock);
	/* only clear the pending flag if the target did not change meanwhile */
	if (to == ci->i_truncate_size) {
		ci->i_truncate_pending = 0;
		finish = 1;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (!finish)
		goto retry;

	mutex_unlock(&ci->i_truncate_mutex);

	if (wrbuffer_refs == 0)
		ceph_check_caps(ci, 0);

	wake_up_all(&ci->i_cap_wq);
}
2073
1cf89a8d
YZ
2074static void ceph_inode_work(struct work_struct *work)
2075{
2076 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
2077 i_work);
874c8ca1 2078 struct inode *inode = &ci->netfs.inode;
1cf89a8d
YZ
2079
2080 if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
2081 dout("writeback %p\n", inode);
2082 filemap_fdatawrite(&inode->i_data);
2083 }
2084 if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
2085 ceph_do_invalidate_pages(inode);
2086
2087 if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
2088 __ceph_do_pending_vmtruncate(inode);
2089
a8810cdc 2090 if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
e4b731cc 2091 ceph_check_caps(ci, 0);
a8810cdc
JL
2092
2093 if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
2094 ceph_flush_snaps(ci, NULL);
2095
1cf89a8d
YZ
2096 iput(inode);
2097}
2098
355da1eb
SW
2099/*
2100 * symlinks
2101 */
static const struct inode_operations ceph_symlink_iops = {
	.get_link = simple_get_link,	/* VFS helper: serves inode->i_link */
	.setattr = ceph_setattr,
	.getattr = ceph_getattr,
	.listxattr = ceph_listxattr,
};
2108
2d332d5b
JL
/*
 * Apply attribute changes for setattr.
 *
 * For each requested attribute: if we hold the relevant exclusive cap,
 * apply the change locally and mark the cap dirty; otherwise (or when
 * the shared state differs) encode it into an MDS SETATTR request.
 * @cia optionally carries fscrypt authentication metadata to set.
 *
 * Returns 0 or the MDS request result; negative errno on failure.
 */
int __ceph_setattr(struct inode *inode, struct iattr *attr,
		   struct ceph_iattr *cia)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned int ia_valid = attr->ia_valid;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_cap_flush *prealloc_cf;
	int issued;
	int release = 0, dirtied = 0;	/* caps to drop remotely / dirty locally */
	int mask = 0;			/* CEPH_SETATTR_* bits sent to the MDS */
	int err = 0;
	int inode_dirty_flags = 0;
	bool lock_snap_rwsem = false;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
				       USE_AUTH_MDS);
	if (IS_ERR(req)) {
		ceph_free_cap_flush(prealloc_cf);
		return PTR_ERR(req);
	}

	spin_lock(&ci->i_ceph_lock);
	issued = __ceph_caps_issued(ci, NULL);

	/*
	 * Dirtying caps may need snap_rwsem; take it without holding
	 * i_ceph_lock if trylock fails, then re-sample issued caps.
	 */
	if (!ci->i_head_snapc &&
	    (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
		lock_snap_rwsem = true;
		if (!down_read_trylock(&mdsc->snap_rwsem)) {
			spin_unlock(&ci->i_ceph_lock);
			down_read(&mdsc->snap_rwsem);
			spin_lock(&ci->i_ceph_lock);
			issued = __ceph_caps_issued(ci, NULL);
		}
	}

	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
	if (cia && cia->fscrypt_auth) {
		u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);

		if (len > sizeof(*cia->fscrypt_auth)) {
			err = -EINVAL;
			spin_unlock(&ci->i_ceph_lock);
			goto out;
		}

		dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
			ceph_vinop(inode), ci->fscrypt_auth_len, len);

		/* It should never be re-set once set */
		WARN_ON_ONCE(ci->fscrypt_auth);

		if (issued & CEPH_CAP_AUTH_EXCL) {
			/* apply locally; ownership of the buffer moves to ci */
			dirtied |= CEPH_CAP_AUTH_EXCL;
			kfree(ci->fscrypt_auth);
			ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
			ci->fscrypt_auth_len = len;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   ci->fscrypt_auth_len != len ||
			   memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
			/* ship it to the MDS; request takes ownership */
			req->r_fscrypt_auth = cia->fscrypt_auth;
			mask |= CEPH_SETATTR_FSCRYPT_AUTH;
			release |= CEPH_CAP_AUTH_SHARED;
		}
		cia->fscrypt_auth = NULL;
	}
#else
	if (cia && cia->fscrypt_auth) {
		err = -EINVAL;
		spin_unlock(&ci->i_ceph_lock);
		goto out;
	}
#endif /* CONFIG_FS_ENCRYPTION */

	if (ia_valid & ATTR_UID) {
		dout("setattr %p uid %d -> %d\n", inode,
		     from_kuid(&init_user_ns, inode->i_uid),
		     from_kuid(&init_user_ns, attr->ia_uid));
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_uid = attr->ia_uid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   !uid_eq(attr->ia_uid, inode->i_uid)) {
			req->r_args.setattr.uid = cpu_to_le32(
				from_kuid(&init_user_ns, attr->ia_uid));
			mask |= CEPH_SETATTR_UID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_GID) {
		dout("setattr %p gid %d -> %d\n", inode,
		     from_kgid(&init_user_ns, inode->i_gid),
		     from_kgid(&init_user_ns, attr->ia_gid));
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_gid = attr->ia_gid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   !gid_eq(attr->ia_gid, inode->i_gid)) {
			req->r_args.setattr.gid = cpu_to_le32(
				from_kgid(&init_user_ns, attr->ia_gid));
			mask |= CEPH_SETATTR_GID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_MODE) {
		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
		     attr->ia_mode);
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_mode = attr->ia_mode;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_mode != inode->i_mode) {
			/* note: mode is also applied locally here */
			inode->i_mode = attr->ia_mode;
			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
			mask |= CEPH_SETATTR_MODE;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}

	if (ia_valid & ATTR_ATIME) {
		dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
		if (issued & CEPH_CAP_FILE_EXCL) {
			ci->i_time_warp_seq++;
			inode->i_atime = attr->ia_atime;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_WR) &&
			   timespec64_compare(&inode->i_atime,
					      &attr->ia_atime) < 0) {
			/* with Fw we may only move atime forward locally */
			inode->i_atime = attr->ia_atime;
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec64_equal(&inode->i_atime, &attr->ia_atime)) {
			ceph_encode_timespec64(&req->r_args.setattr.atime,
					       &attr->ia_atime);
			mask |= CEPH_SETATTR_ATIME;
			release |= CEPH_CAP_FILE_SHARED |
				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
		}
	}
	if (ia_valid & ATTR_SIZE) {
		loff_t isize = i_size_read(inode);

		dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
		if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
			/* growing (or same-size) with Fx: handle locally */
			if (attr->ia_size > isize) {
				i_size_write(inode, attr->ia_size);
				inode->i_blocks = calc_inode_blocks(attr->ia_size);
				ci->i_reported_size = attr->ia_size;
				dirtied |= CEPH_CAP_FILE_EXCL;
				ia_valid |= ATTR_MTIME;
			}
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   attr->ia_size != isize) {
			/* shrinking (or no caps): must go through the MDS */
			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
			req->r_args.setattr.old_size = cpu_to_le64(isize);
			mask |= CEPH_SETATTR_SIZE;
			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
		}
	}
	if (ia_valid & ATTR_MTIME) {
		dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
		if (issued & CEPH_CAP_FILE_EXCL) {
			ci->i_time_warp_seq++;
			inode->i_mtime = attr->ia_mtime;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_WR) &&
			   timespec64_compare(&inode->i_mtime,
					      &attr->ia_mtime) < 0) {
			/* with Fw we may only move mtime forward locally */
			inode->i_mtime = attr->ia_mtime;
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) {
			ceph_encode_timespec64(&req->r_args.setattr.mtime,
					       &attr->ia_mtime);
			mask |= CEPH_SETATTR_MTIME;
			release |= CEPH_CAP_FILE_SHARED |
				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
		}
	}

	/* these do nothing */
	if (ia_valid & ATTR_CTIME) {
		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
		dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
		     only ? "ctime only" : "ignored");
		if (only) {
			/*
			 * if kernel wants to dirty ctime but nothing else,
			 * we need to choose a cap to dirty under, or do
			 * a almost-no-op setattr
			 */
			if (issued & CEPH_CAP_AUTH_EXCL)
				dirtied |= CEPH_CAP_AUTH_EXCL;
			else if (issued & CEPH_CAP_FILE_EXCL)
				dirtied |= CEPH_CAP_FILE_EXCL;
			else if (issued & CEPH_CAP_XATTR_EXCL)
				dirtied |= CEPH_CAP_XATTR_EXCL;
			else
				mask |= CEPH_SETATTR_CTIME;
		}
	}
	if (ia_valid & ATTR_FILE)
		dout("setattr %p ATTR_FILE ... hrm!\n", inode);

	if (dirtied) {
		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
							   &prealloc_cf);
		inode->i_ctime = attr->ia_ctime;
		inode_inc_iversion_raw(inode);
	}

	release &= issued;
	spin_unlock(&ci->i_ceph_lock);
	if (lock_snap_rwsem)
		up_read(&mdsc->snap_rwsem);

	if (inode_dirty_flags)
		__mark_inode_dirty(inode, inode_dirty_flags);

	/* anything left over needs a synchronous MDS setattr */
	if (mask) {
		req->r_inode = inode;
		ihold(inode);
		req->r_inode_drop = release;
		req->r_args.setattr.mask = cpu_to_le32(mask);
		req->r_num_caps = 1;
		req->r_stamp = attr->ia_ctime;
		err = ceph_mdsc_do_request(mdsc, NULL, req);
	}
out:
	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
	     ceph_cap_string(dirtied), mask);

	ceph_mdsc_put_request(req);
	ceph_free_cap_flush(prealloc_cf);

	if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
		__ceph_do_pending_vmtruncate(inode);

	return err;
}
2362
a26fecca
AG
2363/*
2364 * setattr
2365 */
c1632a0f 2366int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
549c7297 2367 struct iattr *attr)
a26fecca 2368{
fd5472ed 2369 struct inode *inode = d_inode(dentry);
36a4c72d 2370 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
fd5472ed
JK
2371 int err;
2372
2373 if (ceph_snap(inode) != CEPH_NOSNAP)
2374 return -EROFS;
2375
5d6451b1
JL
2376 if (ceph_inode_is_shutdown(inode))
2377 return -ESTALE;
2378
c1632a0f 2379 err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
fd5472ed
JK
2380 if (err != 0)
2381 return err;
2382
36a4c72d 2383 if ((attr->ia_valid & ATTR_SIZE) &&
2d6795fb 2384 attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
36a4c72d
CX
2385 return -EFBIG;
2386
2b83845f
LH
2387 if ((attr->ia_valid & ATTR_SIZE) &&
2388 ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
2389 return -EDQUOT;
2390
2d332d5b 2391 err = __ceph_setattr(inode, attr, NULL);
8179a101
YZ
2392
2393 if (err >= 0 && (attr->ia_valid & ATTR_MODE))
13e83a49 2394 err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
8179a101
YZ
2395
2396 return err;
a26fecca
AG
2397}
2398
5eed80fb
XL
2399int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
2400{
2401 int issued = ceph_caps_issued(ceph_inode(inode));
2402
2403 /*
2404 * If any 'x' caps is issued we can just choose the auth MDS
2405 * instead of the random replica MDSes. Because only when the
2406 * Locker is in LOCK_EXEC state will the loner client could
2407 * get the 'x' caps. And if we send the getattr requests to
2408 * any replica MDS it must auth pin and tries to rdlock from
2409 * the auth MDS, and then the auth MDS need to do the Locker
2410 * state transition to LOCK_SYNC. And after that the lock state
2411 * will change back.
2412 *
2413 * This cost much when doing the Locker state transition and
2414 * usually will need to revoke caps from clients.
8266c4d7
XL
2415 *
2416 * And for the 'Xs' caps for getxattr we will also choose the
2417 * auth MDS, because the MDS side code is buggy due to setxattr
2418 * won't notify the replica MDSes when the values changed and
2419 * the replica MDS will return the old values. Though we will
2420 * fix it in MDS code, but this still makes sense for old ceph.
5eed80fb
XL
2421 */
2422 if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
8266c4d7 2423 || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
5eed80fb
XL
2424 return USE_AUTH_MDS;
2425 else
2426 return USE_ANY_MDS;
2427}
2428
355da1eb
SW
2429/*
2430 * Verify that we have a lease on the given mask. If not,
2431 * do a getattr against an mds.
2432 */
01deead0
YZ
/*
 * Fetch inode attributes from an MDS unless the caps we already hold
 * cover @mask (and @force is not set).
 *
 * If @locked_page is non-NULL the caller wants inline file data: on
 * success the return value is the inline data length, -ENODATA when
 * there is none, or -EINVAL if the reply unexpectedly lacked it.
 * Otherwise returns 0 on success or a negative error.
 */
int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
		      int mask, bool force)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int mode;
	int err;

	if (ceph_snap(inode) == CEPH_SNAPDIR) {
		/* the snapdir is synthetic; nothing to fetch */
		dout("do_getattr inode %p SNAPDIR\n", inode);
		return 0;
	}

	dout("do_getattr inode %p mask %s mode 0%o\n",
	     inode, ceph_cap_string(mask), inode->i_mode);
	/* cached caps may already satisfy the request */
	if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
		return 0;

	mode = ceph_try_to_choose_auth_mds(inode, mask);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;
	req->r_args.getattr.mask = cpu_to_le32(mask);
	req->r_locked_page = locked_page;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (locked_page && err == 0) {
		u64 inline_version = req->r_reply_info.targeti.inline_version;
		if (inline_version == 0) {
			/* the reply is supposed to contain inline data */
			err = -EINVAL;
		} else if (inline_version == CEPH_INLINE_NONE ||
			   inline_version == 1) {
			/* file has no inline data */
			err = -ENODATA;
		} else {
			err = req->r_reply_info.targeti.inline_len;
		}
	}
	ceph_mdsc_put_request(req);
	dout("do_getattr result=%d\n", err);
	return err;
}
2478
6ddf5f16
MC
2479int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
2480 size_t size)
2481{
2482 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
2483 struct ceph_mds_client *mdsc = fsc->mdsc;
2484 struct ceph_mds_request *req;
2485 int mode = USE_AUTH_MDS;
2486 int err;
2487 char *xattr_value;
2488 size_t xattr_value_len;
2489
2490 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
2491 if (IS_ERR(req)) {
2492 err = -ENOMEM;
2493 goto out;
2494 }
2495
6eb06c46 2496 req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
6ddf5f16
MC
2497 req->r_path2 = kstrdup(name, GFP_NOFS);
2498 if (!req->r_path2) {
2499 err = -ENOMEM;
2500 goto put;
2501 }
2502
2503 ihold(inode);
2504 req->r_inode = inode;
2505 err = ceph_mdsc_do_request(mdsc, NULL, req);
2506 if (err < 0)
2507 goto put;
2508
2509 xattr_value = req->r_reply_info.xattr_info.xattr_value;
2510 xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
2511
2512 dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
2513
2514 err = (int)xattr_value_len;
2515 if (size == 0)
2516 goto put;
2517
2518 if (xattr_value_len > size) {
2519 err = -ERANGE;
2520 goto put;
2521 }
2522
2523 memcpy(value, xattr_value, xattr_value_len);
2524put:
2525 ceph_mdsc_put_request(req);
2526out:
2527 dout("do_getvxattr result=%d\n", err);
2528 return err;
2529}
2530
355da1eb
SW
2531
2532/*
2533 * Check inode permissions. We verify we have a valid value for
2534 * the AUTH cap, then call the generic handler.
2535 */
4609e1f1 2536int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
549c7297 2537 int mask)
355da1eb 2538{
b74c79e9
NP
2539 int err;
2540
10556cb2 2541 if (mask & MAY_NOT_BLOCK)
b74c79e9
NP
2542 return -ECHILD;
2543
508b32d8 2544 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
355da1eb
SW
2545
2546 if (!err)
4609e1f1 2547 err = generic_permission(&nop_mnt_idmap, inode, mask);
355da1eb
SW
2548 return err;
2549}
2550
428bb68a 2551/* Craft a mask of needed caps given a set of requested statx attrs. */
04fabb11 2552static int statx_to_caps(u32 want, umode_t mode)
428bb68a
JL
2553{
2554 int mask = 0;
2555
f6102994 2556 if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE))
428bb68a
JL
2557 mask |= CEPH_CAP_AUTH_SHARED;
2558
f6102994 2559 if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) {
04fabb11
JL
2560 /*
2561 * The link count for directories depends on inode->i_subdirs,
2562 * and that is only updated when Fs caps are held.
2563 */
2564 if (S_ISDIR(mode))
2565 mask |= CEPH_CAP_FILE_SHARED;
2566 else
2567 mask |= CEPH_CAP_LINK_SHARED;
2568 }
428bb68a 2569
f6102994 2570 if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE))
428bb68a
JL
2571 mask |= CEPH_CAP_FILE_SHARED;
2572
f6102994 2573 if (want & (STATX_CTIME|STATX_CHANGE_COOKIE))
428bb68a
JL
2574 mask |= CEPH_CAP_XATTR_SHARED;
2575
2576 return mask;
2577}
2578
355da1eb 2579/*
428bb68a
JL
2580 * Get all the attributes. If we have sufficient caps for the requested attrs,
2581 * then we can avoid talking to the MDS at all.
355da1eb 2582 */
int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
		 struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct super_block *sb = inode->i_sb;
	struct ceph_inode_info *ci = ceph_inode(inode);
	u32 valid_mask = STATX_BASIC_STATS;
	int err = 0;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	/* Skip the getattr altogether if we're asked not to sync */
	if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
		err = ceph_do_getattr(inode,
				statx_to_caps(request_mask, inode->i_mode),
				flags & AT_STATX_FORCE_SYNC);
		if (err)
			return err;
	}

	generic_fillattr(&nop_mnt_idmap, inode, stat);
	/* vino-based inode number, not the raw i_ino */
	stat->ino = ceph_present_inode(inode);

	/*
	 * btime on newly-allocated inodes is 0, so if this is still set to
	 * that, then assume that it's not valid.
	 */
	if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
		stat->btime = ci->i_btime;
		valid_mask |= STATX_BTIME;
	}

	if (request_mask & STATX_CHANGE_COOKIE) {
		stat->change_cookie = inode_peek_iversion_raw(inode);
		valid_mask |= STATX_CHANGE_COOKIE;
	}

	if (ceph_snap(inode) == CEPH_NOSNAP)
		stat->dev = sb->s_dev;
	else
		stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;

	if (S_ISDIR(inode->i_mode)) {
		if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
			/* report recursive bytes as directory size */
			stat->size = ci->i_rbytes;
		} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
			/* snapdir size = number of snapshots in the realm */
			struct ceph_inode_info *pci;
			struct ceph_snap_realm *realm;
			struct inode *parent;

			parent = ceph_lookup_inode(sb, ceph_ino(inode));
			if (IS_ERR(parent))
				return PTR_ERR(parent);

			pci = ceph_inode(parent);
			spin_lock(&pci->i_ceph_lock);
			realm = pci->i_snap_realm;
			if (realm)
				stat->size = realm->num_snaps;
			else
				stat->size = 0;
			spin_unlock(&pci->i_ceph_lock);
			iput(parent);
		} else {
			stat->size = ci->i_files + ci->i_subdirs;
		}
		stat->blocks = 0;
		stat->blksize = 65536;
		/*
		 * Some applications rely on the number of st_nlink
		 * value on directories to be either 0 (if unlinked)
		 * or 2 + number of subdirectories.
		 */
		if (stat->nlink == 1)
			/* '.' + '..' + subdirs */
			stat->nlink = 1 + 1 + ci->i_subdirs;
	}

	stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
	stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
	stat->result_mask = request_mask & valid_mask;
	return err;
}
5d6451b1
JL
2667
2668void ceph_inode_shutdown(struct inode *inode)
2669{
2670 struct ceph_inode_info *ci = ceph_inode(inode);
2671 struct rb_node *p;
2672 int iputs = 0;
2673 bool invalidate = false;
2674
2675 spin_lock(&ci->i_ceph_lock);
2676 ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
2677 p = rb_first(&ci->i_caps);
2678 while (p) {
2679 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
2680
2681 p = rb_next(p);
2682 iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
2683 }
2684 spin_unlock(&ci->i_ceph_lock);
2685
2686 if (invalidate)
2687 ceph_queue_invalidate(inode);
2688 while (iputs--)
2689 iput(inode);
2690}