ceph: allow encrypting a directory while not having Ax caps
[linux-block.git] / fs / ceph / dir.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
3d14c5d2 2#include <linux/ceph/ceph_debug.h>
2817b000
SW
3
4#include <linux/spinlock.h>
2817b000 5#include <linux/namei.h>
5a0e3ad6 6#include <linux/slab.h>
2817b000 7#include <linux/sched.h>
2cdeb1e4 8#include <linux/xattr.h>
2817b000
SW
9
10#include "super.h"
3d14c5d2 11#include "mds_client.h"
af9ffa6d 12#include "crypto.h"
2817b000
SW
13
14/*
15 * Directory operations: readdir, lookup, create, link, unlink,
16 * rename, etc.
17 */
18
19/*
20 * Ceph MDS operations are specified in terms of a base ino and
21 * relative path. Thus, the client can specify an operation on a
22 * specific inode (e.g., a getattr due to fstat(2)), or as a path
23 * relative to, say, the root directory.
24 *
25 * Normally, we limit ourselves to strict inode ops (no path component)
26 * or dentry operations (a single path component relative to an ino). The
27 * exception to this is open_root_dentry(), which will open the mount
28 * point by name.
29 */
30
52dfb8ac 31const struct dentry_operations ceph_dentry_ops;
2817b000 32
37c4efc1
YZ
33static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
34static int __dir_lease_try_check(const struct dentry *dentry);
35
2817b000
SW
36/*
37 * Initialize ceph dentry state.
38 */
ad5cb123 39static int ceph_d_init(struct dentry *dentry)
2817b000
SW
40{
41 struct ceph_dentry_info *di;
2678da88 42 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
2817b000 43
99ec2697 44 di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
2817b000
SW
45 if (!di)
46 return -ENOMEM; /* oh well */
47
2817b000
SW
48 di->dentry = dentry;
49 di->lease_session = NULL;
9b16f03c 50 di->time = jiffies;
48d0cbd1 51 dentry->d_fsdata = di;
37c4efc1 52 INIT_LIST_HEAD(&di->lease_list);
f9009efa
XL
53
54 atomic64_inc(&mdsc->metric.total_dentries);
55
2817b000
SW
56 return 0;
57}
58
2817b000 59/*
f3c4ebe6
YZ
60 * for f_pos for readdir:
61 * - hash order:
62 * (0xff << 52) | ((24 bits hash) << 28) |
63 * (the nth entry has hash collision);
64 * - frag+name order;
65 * ((frag value) << 28) | (the nth entry in frag);
2817b000 66 */
f3c4ebe6
YZ
67#define OFFSET_BITS 28
68#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
69#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
70loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
71{
72 loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
73 if (hash_order)
74 fpos |= HASH_ORDER;
75 return fpos;
76}
77
78static bool is_hash_order(loff_t p)
79{
80 return (p & HASH_ORDER) == HASH_ORDER;
81}
82
2817b000
SW
83static unsigned fpos_frag(loff_t p)
84{
f3c4ebe6 85 return p >> OFFSET_BITS;
2817b000 86}
f3c4ebe6
YZ
87
88static unsigned fpos_hash(loff_t p)
89{
90 return ceph_frag_value(fpos_frag(p));
91}
92
2817b000
SW
93static unsigned fpos_off(loff_t p)
94{
f3c4ebe6 95 return p & OFFSET_MASK;
2817b000
SW
96}
97
4d5f5df6
YZ
98static int fpos_cmp(loff_t l, loff_t r)
99{
100 int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
101 if (v)
102 return v;
103 return (int)(fpos_off(l) - fpos_off(r));
104}
105
fdd4e158
YZ
106/*
107 * make note of the last dentry we read, so we can
108 * continue at the same lexicographical point,
109 * regardless of what dir changes take place on the
110 * server.
111 */
bb48bd4d 112static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
fdd4e158
YZ
113 int len, unsigned next_offset)
114{
115 char *buf = kmalloc(len+1, GFP_KERNEL);
116 if (!buf)
117 return -ENOMEM;
bb48bd4d
CX
118 kfree(dfi->last_name);
119 dfi->last_name = buf;
120 memcpy(dfi->last_name, name, len);
121 dfi->last_name[len] = 0;
122 dfi->next_offset = next_offset;
123 dout("note_last_dentry '%s'\n", dfi->last_name);
fdd4e158
YZ
124 return 0;
125}
126
c530cd24
YZ
127
/*
 * Fetch the idx'th dentry pointer for @parent from the readdir cache
 * kept in the dir inode's page cache (an array of dentry pointers).
 *
 * Returns a referenced dentry, NULL when idx is past the cached count
 * (i_size tracks the number of cached entries), or ERR_PTR(-EAGAIN)
 * when the cache page is absent or the dentry is dead — the caller
 * then falls back to a normal (MDS) readdir.
 */
static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
			struct ceph_readdir_cache_control *cache_ctl)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
	loff_t ptr_pos = idx * sizeof(struct dentry *);
	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;

	if (ptr_pos >= i_size_read(dir))
		return NULL;

	/* map in the page holding slot idx, reusing the previous mapping
	 * when idx lands on the same page */
	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
		ceph_readdir_cache_release(cache_ctl);
		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
		if (!cache_ctl->page) {
			dout(" page %lu not found\n", ptr_pgoff);
			return ERR_PTR(-EAGAIN);
		}
		/* reading/filling the cache are serialized by
		   i_rwsem, no need to use page lock */
		unlock_page(cache_ctl->page);
		cache_ctl->dentries = kmap(cache_ctl->page);
	}

	cache_ctl->index = idx & idx_mask;

	rcu_read_lock();
	spin_lock(&parent->d_lock);
	/* check i_size again here, because empty directory can be
	 * marked as complete while not holding the i_rwsem. */
	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
		dentry = cache_ctl->dentries[cache_ctl->index];
	else
		dentry = NULL;
	spin_unlock(&parent->d_lock);
	/* take a real reference; a dead dentry means the slot is stale */
	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
		dentry = NULL;
	rcu_read_unlock();
	return dentry ? : ERR_PTR(-EAGAIN);
}
170
2817b000
SW
171/*
172 * When possible, we try to satisfy a readdir by peeking at the
173 * dcache. We make this work by carefully ordering dentries on
946e51f2 174 * d_child when we initially get results back from the MDS, and
2817b000
SW
175 * falling back to a "normal" sync readdir if any dentries in the dir
176 * are dropped.
177 *
2f276c51 178 * Complete dir indicates that we have all dentries in the dir. It is
2817b000
SW
179 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
180 * the MDS if/when the directory is modified).
181 */
/*
 * Serve readdir entirely from the dcache/readdir cache.
 *
 * Binary-searches the cached dentry-pointer array for the starting
 * position, then walks it emitting entries.  Any inconsistency
 * (missing page, dropped/negative dentry, lease generation mismatch,
 * or a no-key name after the dir's fscrypt key became available)
 * returns -EAGAIN so the caller retries via a sync readdir.
 */
static int __dcache_readdir(struct file *file, struct dir_context *ctx,
			    int shared_gen)
{
	struct ceph_dir_file_info *dfi = file->private_data;
	struct dentry *parent = file->f_path.dentry;
	struct inode *dir = d_inode(parent);
	struct dentry *dentry, *last = NULL;
	struct ceph_dentry_info *di;
	struct ceph_readdir_cache_control cache_ctl = {};
	u64 idx = 0;
	int err = 0;

	dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);

	/* search start position */
	if (ctx->pos > 2) {
		/* binary search over the cached entries for the first one
		 * at or beyond ctx->pos */
		u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
		while (count > 0) {
			u64 step = count >> 1;
			dentry = __dcache_find_get_entry(parent, idx + step,
							 &cache_ctl);
			if (!dentry) {
				/* use linear search */
				idx = 0;
				break;
			}
			if (IS_ERR(dentry)) {
				err = PTR_ERR(dentry);
				goto out;
			}
			di = ceph_dentry(dentry);
			spin_lock(&dentry->d_lock);
			if (fpos_cmp(di->offset, ctx->pos) < 0) {
				idx += step + 1;
				count -= step + 1;
			} else {
				count = step;
			}
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}

		dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
	}


	for (;;) {
		bool emit_dentry = false;
		dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
		if (!dentry) {
			/* ran off the end of the cache: we're done */
			dfi->file_info.flags |= CEPH_F_ATEND;
			err = 0;
			break;
		}
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			goto out;
		}

		spin_lock(&dentry->d_lock);
		di = ceph_dentry(dentry);
		if (d_unhashed(dentry) ||
		    d_really_is_negative(dentry) ||
		    di->lease_shared_gen != shared_gen ||
		    ((dentry->d_flags & DCACHE_NOKEY_NAME) &&
		     fscrypt_has_encryption_key(dir))) {
			/* cached state is unusable; fall back to MDS */
			spin_unlock(&dentry->d_lock);
			dput(dentry);
			err = -EAGAIN;
			goto out;
		}
		if (fpos_cmp(ctx->pos, di->offset) <= 0) {
			__ceph_dentry_dir_lease_touch(di);
			emit_dentry = true;
		}
		spin_unlock(&dentry->d_lock);

		if (emit_dentry) {
			dout(" %llx dentry %p %pd %p\n", di->offset,
			     dentry, dentry, d_inode(dentry));
			ctx->pos = di->offset;
			if (!dir_emit(ctx, dentry->d_name.name,
				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
				      d_inode(dentry)->i_mode >> 12)) {
				dput(dentry);
				err = 0;
				break;
			}
			ctx->pos++;

			/* keep a ref on the most recently emitted dentry so
			 * its name can be recorded below */
			if (last)
				dput(last);
			last = dentry;
		} else {
			dput(dentry);
		}
	}
out:
	ceph_readdir_cache_release(&cache_ctl);
	if (last) {
		int ret;
		di = ceph_dentry(last);
		ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
				       fpos_off(di->offset) + 1);
		if (ret < 0)
			err = ret;
		dput(last);
		/* last_name no longer matches the cache index */
		if (dfi->readdir_cache_idx >= 0) {
			dfi->readdir_cache_idx = -1;
			dfi->dir_release_count = 0;
		}
	}
	return err;
}
297
bb48bd4d 298static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos)
f3c4ebe6 299{
bb48bd4d 300 if (!dfi->last_readdir)
f3c4ebe6
YZ
301 return true;
302 if (is_hash_order(pos))
bb48bd4d 303 return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos));
f3c4ebe6 304 else
bb48bd4d 305 return dfi->frag != fpos_frag(pos);
f3c4ebe6
YZ
306}
307
/*
 * Read directory entries for @file.  Emits "." and ".." locally, then
 * tries the fully-cached dcache path when the dir is complete+ordered
 * and we hold Fs caps; otherwise pages through MDS READDIR replies one
 * frag chunk at a time (see the f_pos layout comment above).
 */
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
	struct ceph_dir_file_info *dfi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	int i;
	int err;
	unsigned frag = -1;
	struct ceph_mds_reply_info_parsed *rinfo;

	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
	if (dfi->file_info.flags & CEPH_F_ATEND)
		return 0;

	/* always start with . and .. */
	if (ctx->pos == 0) {
		dout("readdir off 0 -> '.'\n");
		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
			    inode->i_mode >> 12))
			return 0;
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		u64 ino;
		struct dentry *dentry = file->f_path.dentry;

		/* d_lock stabilizes d_parent while we read its inode */
		spin_lock(&dentry->d_lock);
		ino = ceph_present_inode(dentry->d_parent->d_inode);
		spin_unlock(&dentry->d_lock);

		dout("readdir off 1 -> '..'\n");
		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
			return 0;
		ctx->pos = 2;
	}

	err = fscrypt_prepare_readdir(inode);
	if (err)
		return err;

	spin_lock(&ci->i_ceph_lock);
	/* request Fx cap. if have Fx, we don't need to release Fs cap
	 * for later create/unlink. */
	__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
	/* can we use the dcache? */
	if (ceph_test_mount_opt(fsc, DCACHE) &&
	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    __ceph_dir_is_complete_ordered(ci) &&
	    __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
		int shared_gen = atomic_read(&ci->i_shared_gen);

		spin_unlock(&ci->i_ceph_lock);
		err = __dcache_readdir(file, ctx, shared_gen);
		/* -EAGAIN means the cached walk failed; fall through to a
		 * normal MDS readdir */
		if (err != -EAGAIN)
			return err;
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}

	/* proceed with a normal readdir */
more:
	/* do we have the correct frag content buffered? */
	if (need_send_readdir(dfi, ctx->pos)) {
		struct ceph_mds_request *req;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
		if (dfi->last_readdir) {
			ceph_mdsc_put_request(dfi->last_readdir);
			dfi->last_readdir = NULL;
		}

		if (is_hash_order(ctx->pos)) {
			/* fragtree isn't always accurate. choose frag
			 * based on previous reply when possible. */
			if (frag == (unsigned)-1)
				frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
							NULL, NULL);
		} else {
			frag = fpos_frag(ctx->pos);
		}

		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, dfi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);

		err = ceph_alloc_readdir_reply_buffer(req, inode);
		if (err) {
			ceph_mdsc_put_request(req);
			return err;
		}
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		if (op == CEPH_MDS_OP_READDIR) {
			req->r_direct_hash = ceph_frag_value(frag);
			__set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
			req->r_inode_drop = CEPH_CAP_FILE_EXCL;
		}
		if (dfi->last_name) {
			/* continue listing after last_name; the name may need
			 * fscrypt encoding for the wire */
			struct qstr d_name = { .name = dfi->last_name,
					       .len = strlen(dfi->last_name) };

			req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL);
			if (!req->r_path2) {
				ceph_mdsc_put_request(req);
				return -ENOMEM;
			}

			err = ceph_encode_encrypted_dname(inode, &d_name,
							  req->r_path2);
			if (err < 0) {
				ceph_mdsc_put_request(req);
				return err;
			}
		} else if (is_hash_order(ctx->pos)) {
			req->r_args.readdir.offset_hash =
				cpu_to_le32(fpos_hash(ctx->pos));
		}

		req->r_dir_release_cnt = dfi->dir_release_count;
		req->r_dir_ordered_cnt = dfi->dir_ordered_count;
		req->r_readdir_cache_idx = dfi->readdir_cache_idx;
		req->r_readdir_offset = dfi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
		req->r_args.readdir.flags =
				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);

		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(file->f_path.dentry);
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
		dout("readdir got and parsed readdir result=%d on "
		     "frag %x, end=%d, complete=%d, hash_order=%d\n",
		     err, frag,
		     (int)req->r_reply_info.dir_end,
		     (int)req->r_reply_info.dir_complete,
		     (int)req->r_reply_info.hash_order);

		rinfo = &req->r_reply_info;
		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
			/* MDS answered for a different frag than requested */
			frag = le32_to_cpu(rinfo->dir_dir->frag);
			if (!rinfo->hash_order) {
				dfi->next_offset = req->r_readdir_offset;
				/* adjust ctx->pos to beginning of frag */
				ctx->pos = ceph_make_fpos(frag,
							  dfi->next_offset,
							  false);
			}
		}

		dfi->frag = frag;
		dfi->last_readdir = req;

		if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
			dfi->readdir_cache_idx = req->r_readdir_cache_idx;
			if (dfi->readdir_cache_idx < 0) {
				/* preclude from marking dir ordered */
				dfi->dir_ordered_count = 0;
			} else if (ceph_frag_is_leftmost(frag) &&
				   dfi->next_offset == 2) {
				/* note dir version at start of readdir so
				 * we can tell if any dentries get dropped */
				dfi->dir_release_count = req->r_dir_release_cnt;
				dfi->dir_ordered_count = req->r_dir_ordered_cnt;
			}
		} else {
			dout("readdir !did_prepopulate\n");
			/* disable readdir cache */
			dfi->readdir_cache_idx = -1;
			/* preclude from marking dir complete */
			dfi->dir_release_count = 0;
		}

		/* note next offset and last dentry name */
		if (rinfo->dir_nr > 0) {
			struct ceph_mds_reply_dir_entry *rde =
				rinfo->dir_entries + (rinfo->dir_nr-1);
			unsigned next_offset = req->r_reply_info.dir_end ?
					2 : (fpos_off(rde->offset) + 1);
			err = note_last_dentry(dfi, rde->name, rde->name_len,
					       next_offset);
			if (err) {
				ceph_mdsc_put_request(dfi->last_readdir);
				dfi->last_readdir = NULL;
				return err;
			}
		} else if (req->r_reply_info.dir_end) {
			dfi->next_offset = 2;
			/* keep last name */
		}
	}

	rinfo = &dfi->last_readdir->r_reply_info;
	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
	     dfi->frag, rinfo->dir_nr, ctx->pos,
	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);

	i = 0;
	/* search start position */
	if (rinfo->dir_nr > 0) {
		/* binary search within the buffered chunk for the first
		 * entry at or beyond ctx->pos */
		int step, nr = rinfo->dir_nr;
		while (nr > 0) {
			step = nr >> 1;
			if (rinfo->dir_entries[i + step].offset < ctx->pos) {
				i +=  step + 1;
				nr -= step + 1;
			} else {
				nr = step;
			}
		}
	}
	for (; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;

		if (rde->offset < ctx->pos) {
			pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n",
				__func__, rde->offset, ctx->pos);
			return -EIO;
		}

		if (WARN_ON_ONCE(!rde->inode.in))
			return -EIO;

		ctx->pos = rde->offset;
		dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
		     i, rinfo->dir_nr, ctx->pos,
		     rde->name_len, rde->name, &rde->inode.in);

		if (!dir_emit(ctx, rde->name, rde->name_len,
			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
			/*
			 * NOTE: Here no need to put the 'dfi->last_readdir',
			 * because when dir_emit stops us it's most likely
			 * doesn't have enough memory, etc. So for next readdir
			 * it will continue.
			 */
			dout("filldir stopping us...\n");
			return 0;
		}

		/* Reset the lengths to their original allocated vals */
		ctx->pos++;
	}

	ceph_mdsc_put_request(dfi->last_readdir);
	dfi->last_readdir = NULL;

	if (dfi->next_offset > 2) {
		/* more entries remain in the current frag */
		frag = dfi->frag;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(dfi->frag)) {
		frag = ceph_frag_next(dfi->frag);
		if (is_hash_order(ctx->pos)) {
			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
							dfi->next_offset, true);
			if (new_pos > ctx->pos)
				ctx->pos = new_pos;
			/* keep last_name */
		} else {
			ctx->pos = ceph_make_fpos(frag, dfi->next_offset,
							false);
			kfree(dfi->last_name);
			dfi->last_name = NULL;
		}
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
	dfi->file_info.flags |= CEPH_F_ATEND;

	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
	if (atomic64_read(&ci->i_release_count) ==
			dfi->dir_release_count) {
		spin_lock(&ci->i_ceph_lock);
		if (dfi->dir_ordered_count ==
				atomic64_read(&ci->i_ordered_count)) {
			dout(" marking %p complete and ordered\n", inode);
			/* use i_size to track number of entries in
			 * readdir cache */
			BUG_ON(dfi->readdir_cache_idx < 0);
			i_size_write(inode, dfi->readdir_cache_idx *
				     sizeof(struct dentry*));
		} else {
			dout(" marking %p complete\n", inode);
		}
		__ceph_dir_set_complete(ci, dfi->dir_release_count,
					dfi->dir_ordered_count);
		spin_unlock(&ci->i_ceph_lock);
	}
	dout("readdir %p file %p done.\n", inode, file);
	return 0;
}
617
bb48bd4d 618static void reset_readdir(struct ceph_dir_file_info *dfi)
2817b000 619{
bb48bd4d
CX
620 if (dfi->last_readdir) {
621 ceph_mdsc_put_request(dfi->last_readdir);
622 dfi->last_readdir = NULL;
2817b000 623 }
bb48bd4d
CX
624 kfree(dfi->last_name);
625 dfi->last_name = NULL;
626 dfi->dir_release_count = 0;
627 dfi->readdir_cache_idx = -1;
628 dfi->next_offset = 2; /* compensate for . and .. */
629 dfi->file_info.flags &= ~CEPH_F_ATEND;
2817b000
SW
630}
631
8974eebd
YZ
/*
 * discard buffered readdir content on seekdir(0), or seek to new frag,
 * or seek prior to current chunk
 */
static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos)
{
	struct ceph_mds_reply_info_parsed *rinfo;
	loff_t chunk_offset;
	if (new_pos == 0)
		return true;
	if (is_hash_order(new_pos)) {
		/* no need to reset last_name for a forward seek when
		 * dentries are sorted in hash order */
	} else if (dfi->frag != fpos_frag(new_pos)) {
		/* frag-ordered position in a different frag */
		return true;
	}
	/* without a buffered chunk there is nothing to seek within */
	rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL;
	if (!rinfo || !rinfo->dir_nr)
		return true;
	/* reset on a backward seek, or when the ordering mode (hash vs
	 * frag) of the new position differs from the buffered chunk */
	chunk_offset = rinfo->dir_entries[0].offset;
	return new_pos < chunk_offset ||
	       is_hash_order(new_pos) != is_hash_order(chunk_offset);
}
655
/*
 * llseek for directories: only SEEK_SET and SEEK_CUR are supported;
 * SEEK_END returns -EOPNOTSUPP.  Buffered readdir state is dropped
 * when the new position cannot be served from the current chunk.
 */
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct ceph_dir_file_info *dfi = file->private_data;
	struct inode *inode = file->f_mapping->host;
	loff_t retval;

	inode_lock(inode);
	retval = -EINVAL;
	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		break;
	case SEEK_SET:
		break;
	case SEEK_END:
		retval = -EOPNOTSUPP;
		goto out;
	default:
		goto out;
	}

	if (offset >= 0) {
		if (need_reset_readdir(dfi, offset)) {
			dout("dir_llseek dropping %p content\n", file);
			reset_readdir(dfi);
		} else if (is_hash_order(offset) && offset > file->f_pos) {
			/* for hash offset, we don't know if a forward seek
			 * is within same frag */
			dfi->dir_release_count = 0;
			dfi->readdir_cache_idx = -1;
		}

		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
			dfi->file_info.flags &= ~CEPH_F_ATEND;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
699
700/*
468640e3 701 * Handle lookups for the hidden .snap directory.
2817b000 702 */
struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
				   struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */

	/* .snap dir? */
	if (ceph_snap(parent) == CEPH_NOSNAP &&
	    strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
		struct dentry *res;
		struct inode *inode = ceph_get_snapdir(parent);

		/* splice the virtual snapdir inode onto this dentry; a
		 * non-NULL result replaces the dentry we were given */
		res = d_splice_alias(inode, dentry);
		dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
		     dentry, dentry, inode, res);
		if (res)
			dentry = res;
	}
	/* returns the (possibly replaced) dentry, or an error from
	 * d_splice_alias() propagated via the IS_ERR check in callers */
	return dentry;
}
2817b000 723
468640e3
SW
724/*
725 * Figure out final result of a lookup/open request.
726 *
727 * Mainly, make sure we return the final req->r_dentry (if it already
728 * existed) in place of the original VFS-provided dentry when they
729 * differ.
730 *
731 * Gracefully handle the case where the MDS replies with -ENOENT and
732 * no trace (which it may do, at its discretion, e.g., if it doesn't
733 * care to issue a lease on the negative dentry).
734 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				  struct dentry *dentry, int err)
{
	if (err == -ENOENT) {
		/* no trace? */
		err = 0;
		if (!req->r_reply_info.head->is_dentry) {
			dout("ENOENT and no trace, dentry %p inode %p\n",
			     dentry, d_inode(dentry));
			if (d_really_is_positive(dentry)) {
				/* stale positive dentry: drop it and keep
				 * the ENOENT */
				d_drop(dentry);
				err = -ENOENT;
			} else {
				/* instantiate as a negative dentry */
				d_add(dentry, NULL);
			}
		}
	}
	if (err)
		dentry = ERR_PTR(err);
	else if (dentry != req->r_dentry)
		dentry = dget(req->r_dentry); /* we got spliced */
	else
		/* NULL tells the VFS the passed-in dentry was used as-is */
		dentry = NULL;
	return dentry;
}
760
3b33f692 761static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
1d1de916
SW
762{
763 return ceph_ino(inode) == CEPH_INO_ROOT &&
764 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
765}
766
2817b000
SW
767/*
768 * Look up a single dir entry. If there is a lookup intent, inform
769 * the MDS so that it gets our 'caps wanted' value in a single op.
770 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_mds_request *req;
	int op;
	int mask;
	int err;

	dout("lookup %p dentry %p '%pd'\n",
	     dir, dentry, dentry);

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	if (IS_ENCRYPTED(dir)) {
		err = __fscrypt_prepare_readdir(dir);
		if (err)
			return ERR_PTR(err);
		/* without the dir's key, mark the name as a no-key
		 * (ciphertext) name */
		if (!fscrypt_has_encryption_key(dir)) {
			spin_lock(&dentry->d_lock);
			dentry->d_flags |= DCACHE_NOKEY_NAME;
			spin_unlock(&dentry->d_lock);
		}
	}

	/* can we conclude ENOENT locally? */
	if (d_really_is_negative(dentry)) {
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		spin_lock(&ci->i_ceph_lock);
		dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
		/* a complete dir + Fs caps means a name we don't have
		 * cached truly does not exist (snapdir and .ceph names
		 * excepted) */
		if (strncmp(dentry->d_name.name,
			    fsc->mount_options->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    ceph_test_mount_opt(fsc, DCACHE) &&
		    __ceph_dir_is_complete(ci) &&
		    __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
			__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
			spin_unlock(&ci->i_ceph_lock);
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);
			di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
			return NULL;
		}
		spin_unlock(&ci->i_ceph_lock);
	}

	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_CAST(req);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;

	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.getattr.mask = cpu_to_le32(mask);

	ihold(dir);
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err == -ENOENT) {
		struct dentry *res;

		/* the name may be the virtual .snap dir, which the MDS
		 * does not know about */
		res = ceph_handle_snapdir(req, dentry);
		if (IS_ERR(res)) {
			err = PTR_ERR(res);
		} else {
			dentry = res;
			err = 0;
		}
	}
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req); /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}
855
856/*
857 * If we do a create but get no trace back from the MDS, follow up with
858 * a lookup (the VFS expects us to link up the provided dentry).
859 */
860int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
861{
00cd8dd3 862 struct dentry *result = ceph_lookup(dir, dentry, 0);
2817b000
SW
863
864 if (result && !IS_ERR(result)) {
865 /*
866 * We created the item, then did a lookup, and found
867 * it was already linked to another inode we already
4d41cef2
YZ
868 * had in our cache (and thus got spliced). To not
869 * confuse VFS (especially when inode is a directory),
870 * we don't link our dentry to that inode, return an
871 * error instead.
872 *
873 * This event should be rare and it happens only when
874 * we talk to old MDS. Recent MDS does not send traceless
875 * reply for request that creates new inode.
2817b000 876 */
5cba372c 877 d_drop(result);
4d41cef2 878 return -ESTALE;
2817b000
SW
879 }
880 return PTR_ERR(result);
881}
882
/*
 * VFS ->mknod(): create a device/special file via an MDS MKNOD request.
 * Also backs ceph_create() (rdev == 0).
 */
static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *dentry, umode_t mode, dev_t rdev)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_mds_request *req;
	struct ceph_acl_sec_ctx as_ctx = {};
	int err;

	/* snapshots are read-only */
	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	err = ceph_wait_on_conflict_unlink(dentry);
	if (err)
		return err;

	if (ceph_quota_is_max_files_exceeded(dir)) {
		err = -EDQUOT;
		goto out;
	}

	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	/* pre-allocate the new inode (also sets up ACL/security state
	 * in as_ctx) */
	req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
	if (IS_ERR(req->r_new_inode)) {
		err = PTR_ERR(req->r_new_inode);
		req->r_new_inode = NULL;
		goto out_req;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	ihold(dir);
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
			     CEPH_CAP_XATTR_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;

	ceph_as_ctx_to_req(req, &as_ctx);

	err = ceph_mdsc_do_request(mdsc, dir, req);
	/* a traceless success still needs the dentry linked up */
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
out_req:
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &as_ctx);
	else
		d_drop(dentry);
	ceph_release_acl_sec_ctx(&as_ctx);
	return err;
}
944
/* VFS ->create(): delegate to ceph_mknod() with rdev == 0.  The excl
 * flag is not consulted here. */
static int ceph_create(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode, bool excl)
{
	return ceph_mknod(idmap, dir, dentry, mode, 0);
}
950
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
/*
 * Build the on-wire form of an encrypted symlink target: encrypt the
 * plaintext target for the new inode, then store it base64-encoded
 * (NUL-terminated) in req->r_path2.
 *
 * Returns 0 on success or a negative errno.  osd_link is freed on all
 * paths.
 */
static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
					 const char *dest)
{
	int err;
	int len = strlen(dest);
	struct fscrypt_str osd_link = FSTR_INIT(NULL, 0);

	/* validates length limits and allocates the ciphertext buffer */
	err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX,
				      &osd_link);
	if (err)
		goto out;

	err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link);
	if (err)
		goto out;

	/* +1 for the NUL terminator appended after base64 encoding */
	req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL);
	if (!req->r_path2) {
		err = -ENOMEM;
		goto out;
	}

	len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2);
	req->r_path2[len] = '\0';
out:
	fscrypt_fname_free_buffer(&osd_link);
	return err;
}
#else
/* Stub when fscrypt support is compiled out. */
static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
					 const char *dest)
{
	return -EOPNOTSUPP;
}
#endif
987
/*
 * symlink(2): send a SYMLINK op to the auth MDS.  If the new inode is
 * encrypted, the target string is encrypted and base64-encoded before
 * being placed in r_path2.
 */
static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
			struct dentry *dentry, const char *dest)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_mds_request *req;
	struct ceph_acl_sec_ctx as_ctx = {};
	umode_t mode = S_IFLNK | 0777;
	int err;

	/* no creates inside snapshots */
	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	/* a racing async unlink of the same name must finish first */
	err = ceph_wait_on_conflict_unlink(dentry);
	if (err)
		return err;

	if (ceph_quota_is_max_files_exceeded(dir)) {
		err = -EDQUOT;
		goto out;
	}

	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
	if (IS_ERR(req->r_new_inode)) {
		err = PTR_ERR(req->r_new_inode);
		req->r_new_inode = NULL;
		goto out_req;
	}

	req->r_parent = dir;
	ihold(dir);

	if (IS_ENCRYPTED(req->r_new_inode)) {
		err = prep_encrypted_symlink_target(req, dest);
		if (err)
			goto out_req;
	} else {
		req->r_path2 = kstrdup(dest, GFP_KERNEL);
		if (!req->r_path2) {
			err = -ENOMEM;
			goto out_req;
		}
	}

	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
			     CEPH_CAP_XATTR_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;

	/* attach ACL/security xattr blobs prepared by ceph_new_inode() */
	ceph_as_ctx_to_req(req, &as_ctx);

	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
out_req:
	ceph_mdsc_put_request(req);
out:
	if (err)
		d_drop(dentry);
	ceph_release_acl_sec_ctx(&as_ctx);
	return err;
}
1058
/*
 * mkdir(2).  A mkdir inside a .snap directory is translated into a
 * MKSNAP (snapshot creation) op; otherwise a normal MKDIR is sent to
 * the auth MDS.
 */
static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *dentry, umode_t mode)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_mds_request *req;
	struct ceph_acl_sec_ctx as_ctx = {};
	int err;
	int op;

	/* a racing async unlink of the same name must finish first */
	err = ceph_wait_on_conflict_unlink(dentry);
	if (err)
		return err;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* mkdir .snap/foo is a MKSNAP */
		op = CEPH_MDS_OP_MKSNAP;
		dout("mksnap dir %p snap '%pd' dn %p\n", dir,
		     dentry, dentry);
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
		op = CEPH_MDS_OP_MKDIR;
	} else {
		/* inside a snapshot: read-only */
		err = -EROFS;
		goto out;
	}

	/* quota applies only to real mkdirs, not snapshot creation */
	if (op == CEPH_MDS_OP_MKDIR &&
	    ceph_quota_is_max_files_exceeded(dir)) {
		err = -EDQUOT;
		goto out;
	}

	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	mode |= S_IFDIR;
	req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
	if (IS_ERR(req->r_new_inode)) {
		err = PTR_ERR(req->r_new_inode);
		req->r_new_inode = NULL;
		goto out_req;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	ihold(dir);
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_args.mkdir.mode = cpu_to_le32(mode);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
			     CEPH_CAP_XATTR_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;

	/* attach ACL/security xattr blobs prepared by ceph_new_inode() */
	ceph_as_ctx_to_req(req, &as_ctx);

	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err &&
	    !req->r_reply_info.head->is_target &&
	    !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
out_req:
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &as_ctx);
	else
		d_drop(dentry);
	ceph_release_acl_sec_ctx(&as_ctx);
	return err;
}
1133
/*
 * link(2): create a hard link to old_dentry under dir/dentry via a
 * LINK op on the auth MDS.
 */
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_mds_request *req;
	int err;

	/* the *new* dentry must be a normal, connected one */
	if (dentry->d_flags & DCACHE_DISCONNECTED)
		return -EINVAL;

	/* a racing async unlink of the same name must finish first */
	err = ceph_wait_on_conflict_unlink(dentry);
	if (err)
		return err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	err = fscrypt_prepare_link(old_dentry, dir, dentry);
	if (err)
		return err;

	dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
	     dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	/*
	 * The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we
	 * will just pass the ino# to MDSs.
	 */
	if (old_dentry->d_flags & DCACHE_DISCONNECTED)
		req->r_ino2 = ceph_vino(d_inode(old_dentry));
	req->r_parent = dir;
	ihold(dir);
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_SHARED on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (err) {
		d_drop(dentry);
	} else if (!req->r_reply_info.head->is_dentry) {
		/* no trace in the reply: instantiate the link ourselves */
		ihold(d_inode(old_dentry));
		d_instantiate(dentry, d_inode(old_dentry));
	}
	ceph_mdsc_put_request(req);
	return err;
}
1188
/*
 * Completion callback for an async UNLINK request (r_callback).
 *
 * Removes the dentry from the async-unlink conflict hash, clears the
 * ASYNC_UNLINK flag and wakes waiters, then — unless the MDS returned
 * -EJUKEBOX — propagates any failure by marking the parent directory
 * mapping and the unlinked inode's mapping with the error.
 */
static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
				 struct ceph_mds_request *req)
{
	struct dentry *dentry = req->r_dentry;
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int result = req->r_err ? req->r_err :
		le32_to_cpu(req->r_reply_info.head->result);

	if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
		pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
			__func__, dentry, dentry);

	spin_lock(&fsc->async_unlink_conflict_lock);
	hash_del_rcu(&di->hnode);
	spin_unlock(&fsc->async_unlink_conflict_lock);

	spin_lock(&dentry->d_lock);
	di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
	wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
	spin_unlock(&dentry->d_lock);

	/* wait out RCU readers of the conflict hash before proceeding */
	synchronize_rcu();

	if (result == -EJUKEBOX)
		goto out;

	/* If op failed, mark everyone involved for errors */
	if (result) {
		int pathlen = 0;
		u64 base = 0;
		char *path = ceph_mdsc_build_path(dentry, &pathlen,
						  &base, 0);

		/* mark error on parent + clear complete */
		mapping_set_error(req->r_parent->i_mapping, result);
		ceph_dir_clear_complete(req->r_parent);

		/* drop the dentry -- we don't know its status */
		if (!d_unhashed(dentry))
			d_drop(dentry);

		/* mark inode itself for an error (since metadata is bogus) */
		mapping_set_error(req->r_old_inode->i_mapping, result);

		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
			base, IS_ERR(path) ? "<<bad>>" : path, result);
		ceph_mdsc_free_path(path, pathlen);
	}
out:
	iput(req->r_old_inode);
	ceph_mdsc_release_dir_caps(req);
}
1242
/*
 * Try to take the directory caps needed for an async unlink
 * (Fx + DIR_UNLINK).  Returns the caps taken on success, or 0 if an
 * async unlink is not currently possible, in which case the caller
 * falls back to a synchronous unlink.
 */
static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di;
	int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;

	spin_lock(&ci->i_ceph_lock);
	if ((__ceph_caps_issued(ci, NULL) & want) == want) {
		ceph_take_cap_refs(ci, want, false);
		got = want;
	}
	spin_unlock(&ci->i_ceph_lock);

	/* If we didn't get anything, return 0 */
	if (!got)
		return 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	/*
	 * - We are holding Fx, which implies Fs caps.
	 * - Only support async unlink for primary linkage
	 */
	if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
	    !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
		want = 0;
	spin_unlock(&dentry->d_lock);

	/* Do we still want what we've got? */
	if (want == got)
		return got;

	/* stale lease or non-primary link: give the caps back */
	ceph_put_cap_refs(ci, got);
	return 0;
}
1278
/*
 * rmdir and unlink differ only by the metadata op code.
 *
 * When the ASYNC_DIROPS mount option is set and we hold the needed dir
 * caps, plain unlinks are issued asynchronously and the dcache/inode
 * fixed up optimistically; -EJUKEBOX from the MDS forces a synchronous
 * retry.
 */
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct inode *inode = d_inode(dentry);
	struct ceph_mds_request *req;
	bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* rmdir .snap/foo is RMSNAP */
		dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
		op = CEPH_MDS_OP_RMSNAP;
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("unlink/rmdir dir %p dn %p inode %p\n",
		     dir, dentry, inode);
		op = d_is_dir(dentry) ?
			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
	} else
		goto out;
retry:
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	ihold(dir);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_inode_drop = ceph_drop_caps_for_unlink(inode);

	if (try_async && op == CEPH_MDS_OP_UNLINK &&
	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
		     dentry->d_name.len, dentry->d_name.name,
		     ceph_cap_string(req->r_dir_caps));
		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
		req->r_callback = ceph_async_unlink_cb;
		req->r_old_inode = d_inode(dentry);
		ihold(req->r_old_inode);

		/* flag + conflict-hash entry let later creates of the
		 * same name wait for this unlink to complete */
		spin_lock(&dentry->d_lock);
		di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
		spin_unlock(&dentry->d_lock);

		spin_lock(&fsc->async_unlink_conflict_lock);
		hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
			     dentry->d_name.hash);
		spin_unlock(&fsc->async_unlink_conflict_lock);

		err = ceph_mdsc_submit_request(mdsc, dir, req);
		if (!err) {
			/*
			 * We have enough caps, so we assume that the unlink
			 * will succeed. Fix up the target inode and dcache.
			 */
			drop_nlink(inode);
			d_delete(dentry);
		} else {
			/* undo the conflict-hash/flag setup above */
			spin_lock(&fsc->async_unlink_conflict_lock);
			hash_del_rcu(&di->hnode);
			spin_unlock(&fsc->async_unlink_conflict_lock);

			spin_lock(&dentry->d_lock);
			di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
			spin_unlock(&dentry->d_lock);

			if (err == -EJUKEBOX) {
				/* MDS refused async; retry synchronously */
				try_async = false;
				ceph_mdsc_put_request(req);
				goto retry;
			}
		}
	} else {
		set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
		err = ceph_mdsc_do_request(mdsc, dir, req);
		if (!err && !req->r_reply_info.head->is_dentry)
			d_delete(dentry);
	}

	ceph_mdsc_put_request(req);
out:
	return err;
}
1372
/*
 * rename(2).  Renames within the snapdir become RENAMESNAP; no rename
 * flags (RENAME_NOREPLACE etc.) are supported, and cross-quota-realm
 * renames are rejected with -EXDEV.
 */
static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
		       struct dentry *old_dentry, struct inode *new_dir,
		       struct dentry *new_dentry, unsigned int flags)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
	struct ceph_mds_request *req;
	int op = CEPH_MDS_OP_RENAME;
	int err;

	if (flags)
		return -EINVAL;

	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;
	if (ceph_snap(old_dir) != CEPH_NOSNAP) {
		if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
			op = CEPH_MDS_OP_RENAMESNAP;
		else
			return -EROFS;
	}
	/* don't allow cross-quota renames */
	if ((old_dir != new_dir) &&
	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
		return -EXDEV;

	/* a racing async unlink of the target name must finish first */
	err = ceph_wait_on_conflict_unlink(new_dentry);
	if (err)
		return err;

	err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
				     flags);
	if (err)
		return err;

	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	ihold(old_dir);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_old_dentry_dir = old_dir;
	req->r_parent = new_dir;
	ihold(new_dir);
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_RDCACHE on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
	if (d_really_is_positive(new_dentry)) {
		/* the target is being replaced; drop its caps */
		req->r_inode_drop =
			ceph_drop_caps_for_unlink(d_inode(new_dentry));
	}
	err = ceph_mdsc_do_request(mdsc, old_dir, req);
	if (!err && !req->r_reply_info.head->is_dentry) {
		/*
		 * Normally d_move() is done by fill_trace (called by
		 * do_request, above). If there is no trace, we need
		 * to do it here.
		 */
		d_move(old_dentry, new_dentry);
	}
	ceph_mdsc_put_request(req);
	return err;
}
1442
/*
 * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
 * Leases at front of the list will expire first. (Assume all leases have
 * similar duration)
 *
 * Called under dentry->d_lock.
 */
void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
{
	struct dentry *dn = di->dentry;
	struct ceph_mds_client *mdsc;

	dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);

	di->flags |= CEPH_DENTRY_LEASE_LIST;
	if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
		/* entry is owned by the shrinker's dispose list right
		 * now; just mark it referenced so it is re-listed */
		di->flags |= CEPH_DENTRY_REFERENCED;
		return;
	}

	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_list_lock);
	list_move_tail(&di->lease_list, &mdsc->dentry_leases);
	spin_unlock(&mdsc->dentry_list_lock);
}
1468
/*
 * Reset per-dentry lease state and move the entry to the tail of the
 * dir-lease list.  All callers in this file invoke it with
 * mdsc->dentry_list_lock held.
 */
static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
				     struct ceph_dentry_info *di)
{
	di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
	di->lease_gen = 0;
	di->time = jiffies;
	list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
}
1477
1478/*
1479 * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
1480 * list if it's not in the list, otherwise set 'referenced' flag.
1481 *
1482 * Called under dentry->d_lock.
1483 */
1484void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
1485{
1486 struct dentry *dn = di->dentry;
1487 struct ceph_mds_client *mdsc;
1488
0eb30853 1489 dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
37c4efc1
YZ
1490 di, dn, dn, di->offset);
1491
1492 if (!list_empty(&di->lease_list)) {
1493 if (di->flags & CEPH_DENTRY_LEASE_LIST) {
1494 /* don't remove dentry from dentry lease list
1495 * if its lease is valid */
1496 if (__dentry_lease_is_valid(di))
1497 return;
1498 } else {
1499 di->flags |= CEPH_DENTRY_REFERENCED;
1500 return;
1501 }
1502 }
1503
1504 if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
1505 di->flags |= CEPH_DENTRY_REFERENCED;
1506 di->flags &= ~CEPH_DENTRY_LEASE_LIST;
1507 return;
1508 }
1509
1510 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1511 spin_lock(&mdsc->dentry_list_lock);
1512 __dentry_dir_lease_touch(mdsc, di),
1513 spin_unlock(&mdsc->dentry_list_lock);
1514}
1515
1516static void __dentry_lease_unlist(struct ceph_dentry_info *di)
1517{
1518 struct ceph_mds_client *mdsc;
1519 if (di->flags & CEPH_DENTRY_SHRINK_LIST)
1520 return;
1521 if (list_empty(&di->lease_list))
1522 return;
1523
1524 mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
1525 spin_lock(&mdsc->dentry_list_lock);
1526 list_del_init(&di->lease_list);
1527 spin_unlock(&mdsc->dentry_list_lock);
1528}
1529
/* Verdicts returned by the lease-walk check callbacks; DELETE/TOUCH/STOP
 * are combinable bits, KEEP means leave the entry where it is. */
enum {
	KEEP = 0,
	DELETE = 1,
	TOUCH = 2,
	STOP = 4,
};

/* Per-invocation parameters/state for __dentry_leases_walk(). */
struct ceph_lease_walk_control {
	bool dir_lease;			/* walk dentry_dir_leases, not dentry_leases */
	bool expire_dir_lease;		/* allow expiring otherwise-valid dir leases */
	unsigned long nr_to_scan;	/* scan budget, decremented per entry */
	unsigned long dir_lease_ttl;	/* min age before a dir lease may be expired */
};
1543
/*
 * Walk one of the mdsc lease lists, applying check() to each dentry and
 * acting on its verdict (see KEEP/DELETE/TOUCH/STOP above).
 *
 * Entries chosen for disposal are moved to a private list (with an
 * extra dentry reference taken) and handled after dropping
 * dentry_list_lock; dput() then lets ceph_d_delete() reap them.
 * Returns the number of entries actually freed (not resurrected via
 * the REFERENCED flag).
 */
static unsigned long
__dentry_leases_walk(struct ceph_mds_client *mdsc,
		     struct ceph_lease_walk_control *lwc,
		     int (*check)(struct dentry*, void*))
{
	struct ceph_dentry_info *di, *tmp;
	struct dentry *dentry, *last = NULL;
	struct list_head* list;
	LIST_HEAD(dispose);
	unsigned long freed = 0;
	int ret = 0;

	list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
	spin_lock(&mdsc->dentry_list_lock);
	list_for_each_entry_safe(di, tmp, list, lease_list) {
		if (!lwc->nr_to_scan)
			break;
		--lwc->nr_to_scan;

		dentry = di->dentry;
		/* TOUCH moves entries to the tail; seeing the first
		 * touched dentry again means we've wrapped around */
		if (last == dentry)
			break;

		/* trylock only: never spin against d_lock holders here */
		if (!spin_trylock(&dentry->d_lock))
			continue;

		if (__lockref_is_dead(&dentry->d_lockref)) {
			list_del_init(&di->lease_list);
			goto next;
		}

		ret = check(dentry, lwc);
		if (ret & TOUCH) {
			/* move it into tail of dir lease list */
			__dentry_dir_lease_touch(mdsc, di);
			if (!last)
				last = dentry;
		}
		if (ret & DELETE) {
			/* stale lease */
			di->flags &= ~CEPH_DENTRY_REFERENCED;
			if (dentry->d_lockref.count > 0) {
				/* update_dentry_lease() will re-add
				 * it to lease list, or
				 * ceph_d_delete() will return 1 when
				 * last reference is dropped */
				list_del_init(&di->lease_list);
			} else {
				di->flags |= CEPH_DENTRY_SHRINK_LIST;
				list_move_tail(&di->lease_list, &dispose);
				dget_dlock(dentry);
			}
		}
next:
		spin_unlock(&dentry->d_lock);
		if (ret & STOP)
			break;
	}
	spin_unlock(&mdsc->dentry_list_lock);

	while (!list_empty(&dispose)) {
		di = list_first_entry(&dispose, struct ceph_dentry_info,
				      lease_list);
		dentry = di->dentry;
		spin_lock(&dentry->d_lock);

		list_del_init(&di->lease_list);
		di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
		if (di->flags & CEPH_DENTRY_REFERENCED) {
			/* touched while on the dispose list: re-list
			 * instead of freeing */
			spin_lock(&mdsc->dentry_list_lock);
			if (di->flags & CEPH_DENTRY_LEASE_LIST) {
				list_add_tail(&di->lease_list,
					      &mdsc->dentry_leases);
			} else {
				__dentry_dir_lease_touch(mdsc, di);
			}
			spin_unlock(&mdsc->dentry_list_lock);
		} else {
			freed++;
		}

		spin_unlock(&dentry->d_lock);
		/* ceph_d_delete() does the trick */
		dput(dentry);
	}
	return freed;
}
1631
1632static int __dentry_lease_check(struct dentry *dentry, void *arg)
1633{
1634 struct ceph_dentry_info *di = ceph_dentry(dentry);
1635 int ret;
1636
1637 if (__dentry_lease_is_valid(di))
1638 return STOP;
1639 ret = __dir_lease_try_check(dentry);
1640 if (ret == -EBUSY)
1641 return KEEP;
1642 if (ret > 0)
1643 return TOUCH;
1644 return DELETE;
1645}
1646
/*
 * Walk callback for the dir-lease list.  Deletes entries whose dir
 * lease is gone; for valid leases, either stops (entry younger than
 * dir_lease_ttl — the list is age-ordered), keeps rotating them, or
 * forcibly expires them when the walk is run under cap pressure.
 */
static int __dir_lease_check(struct dentry *dentry, void *arg)
{
	struct ceph_lease_walk_control *lwc = arg;
	struct ceph_dentry_info *di = ceph_dentry(dentry);

	int ret = __dir_lease_try_check(dentry);
	if (ret == -EBUSY)
		return KEEP;
	if (ret > 0) {
		if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
			return STOP;
		/* Move dentry to tail of dir lease list if we don't want
		 * to delete it. So dentries in the list are checked in a
		 * round robin manner */
		if (!lwc->expire_dir_lease)
			return TOUCH;
		if (dentry->d_lockref.count > 0 ||
		    (di->flags & CEPH_DENTRY_REFERENCED))
			return TOUCH;
		/* invalidate dir lease */
		di->lease_shared_gen = 0;
	}
	return DELETE;
}
1671
/*
 * Trim stale dentry leases, and under cap pressure also expire dir
 * leases.  Returns -EAGAIN if the scan budget ran out (more work
 * pending), 1 if anything was freed, 0 otherwise.
 */
int ceph_trim_dentries(struct ceph_mds_client *mdsc)
{
	struct ceph_lease_walk_control lwc;
	unsigned long count;
	unsigned long freed;

	/* how far over the cap-use limit are we?  That excess drives
	 * whether still-valid dir leases get expired below */
	spin_lock(&mdsc->caps_list_lock);
	if (mdsc->caps_use_max > 0 &&
	    mdsc->caps_use_count > mdsc->caps_use_max)
		count = mdsc->caps_use_count - mdsc->caps_use_max;
	else
		count = 0;
	spin_unlock(&mdsc->caps_list_lock);

	/* pass 1: plain dentry leases */
	lwc.dir_lease = false;
	lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
	freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
	if (!lwc.nr_to_scan) /* more invalid leases */
		return -EAGAIN;

	if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
		lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;

	/* pass 2: dir leases */
	lwc.dir_lease = true;
	lwc.expire_dir_lease = freed < count;
	lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
	freed += __dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
	if (!lwc.nr_to_scan) /* more to check */
		return -EAGAIN;

	return freed > 0 ? 1 : 0;
}
1704
/*
 * Ensure a dentry lease will no longer revalidate.
 */
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);

	spin_lock(&dentry->d_lock);
	/* resetting di->time makes __dentry_lease_is_valid() fail its
	 * time_before(jiffies, di->time) check from now on */
	di->time = jiffies;
	di->lease_shared_gen = 0;
	di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
	__dentry_lease_unlist(di);
	spin_unlock(&dentry->d_lock);
}
2817b000
SW
1718
/*
 * Check if dentry lease is valid. If not, delete the lease. Try to
 * renew if the lease is more than half up.
 */
static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
{
	struct ceph_mds_session *session;

	if (!di->lease_gen)
		return false;

	session = di->lease_session;
	if (session) {
		u32 gen;
		unsigned long ttl;

		gen = atomic_read(&session->s_cap_gen);
		ttl = session->s_cap_ttl;

		/* lease generation must match the session's current cap
		 * generation, and neither the session cap ttl nor the
		 * per-dentry expiry may have passed */
		if (di->lease_gen == gen &&
		    time_before(jiffies, ttl) &&
		    time_before(jiffies, di->time))
			return true;
	}
	/* invalid: zero the gen so future checks short-circuit */
	di->lease_gen = 0;
	return false;
}
1746
/*
 * Check the dentry lease under d_lock and, if it is past its renew
 * point, kick off a lease RENEW to the MDS (outside the lock).
 * Returns 1 if valid, 0 if not, or -ECHILD when a renew is needed but
 * we are in RCU-walk mode and cannot block.
 */
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *session = NULL;
	u32 seq = 0;
	int valid = 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (di && __dentry_lease_is_valid(di)) {
		valid = 1;

		if (di->lease_renew_after &&
		    time_after(jiffies, di->lease_renew_after)) {
			/*
			 * We should renew. If we're in RCU walk mode
			 * though, we can't do that so just return
			 * -ECHILD.
			 */
			if (flags & LOOKUP_RCU) {
				valid = -ECHILD;
			} else {
				session = ceph_get_mds_session(di->lease_session);
				seq = di->lease_seq;
				di->lease_renew_after = 0;
				di->lease_renew_from = jiffies;
			}
		}
	}
	spin_unlock(&dentry->d_lock);

	/* send the renew after dropping d_lock */
	if (session) {
		ceph_mdsc_lease_send_msg(session, dentry,
					 CEPH_MDS_LEASE_RENEW, seq);
		ceph_put_mds_session(session);
	}
	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
	return valid;
}
1786
/*
 * Called under dentry->d_lock.
 *
 * Non-blocking check of the parent-dir lease: returns 1 if the dir
 * lease is valid, 0 if not (and clears lease_shared_gen), or -EBUSY
 * if the parent's i_ceph_lock could not be trylocked.
 */
static int __dir_lease_try_check(const struct dentry *dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	struct inode *dir;
	struct ceph_inode_info *ci;
	int valid = 0;

	if (!di->lease_shared_gen)
		return 0;
	if (IS_ROOT(dentry))
		return 0;

	/* d_parent is stable: caller holds our d_lock */
	dir = d_inode(dentry->d_parent);
	ci = ceph_inode(dir);

	if (spin_trylock(&ci->i_ceph_lock)) {
		if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
		    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
			valid = 1;
		spin_unlock(&ci->i_ceph_lock);
	} else {
		valid = -EBUSY;
	}

	/* note: -EBUSY is truthy, so the gen survives a busy parent */
	if (!valid)
		di->lease_shared_gen = 0;
	return valid;
}
1818
/*
 * Check if directory-wide content lease/cap is valid.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
			      struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	int valid;
	int shared_gen;

	spin_lock(&ci->i_ceph_lock);
	valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
	if (valid) {
		__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
		shared_gen = atomic_read(&ci->i_shared_gen);
	}
	spin_unlock(&ci->i_ceph_lock);
	if (valid) {
		struct ceph_dentry_info *di;
		spin_lock(&dentry->d_lock);
		di = ceph_dentry(dentry);
		/* the dentry's recorded shared_gen must match the dir's
		 * current one, and dir must really be its parent */
		if (dir == d_inode(dentry->d_parent) &&
		    di && di->lease_shared_gen == shared_gen)
			__ceph_dentry_dir_lease_touch(di);
		else
			valid = 0;
		spin_unlock(&dentry->d_lock);
	}
	dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
	     dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
	return valid;
}
1851
/*
 * Check if cached dentry can be trusted.
 *
 * Trust order: fscrypt state, snapped/snapdir dentries (always valid),
 * then dentry/dir leases; as a last resort issue a synchronous LOOKUP
 * (or LOOKUPSNAP) to the MDS — which requires dropping out of RCU-walk
 * mode with -ECHILD.
 */
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	int valid = 0;
	struct dentry *parent;
	struct inode *dir, *inode;
	struct ceph_mds_client *mdsc;

	valid = fscrypt_d_revalidate(dentry, flags);
	if (valid <= 0)
		return valid;

	if (flags & LOOKUP_RCU) {
		/* no refs taken in RCU mode; bail if parent inode gone */
		parent = READ_ONCE(dentry->d_parent);
		dir = d_inode_rcu(parent);
		if (!dir)
			return -ECHILD;
		inode = d_inode_rcu(dentry);
	} else {
		parent = dget_parent(dentry);
		dir = d_inode(parent);
		inode = d_inode(dentry);
	}

	dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry,
	     dentry, inode, ceph_dentry(dentry)->offset,
	     !!(dentry->d_flags & DCACHE_NOKEY_NAME));

	mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;

	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
		dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
		     dentry, inode);
		valid = 1;
	} else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
		valid = 1;
	} else {
		valid = dentry_lease_is_valid(dentry, flags);
		/* -ECHILD only happens in RCU mode, so no parent ref
		 * needs dropping on this early return */
		if (valid == -ECHILD)
			return valid;
		if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
			/* positive dentry also needs caps on the inode */
			if (inode)
				valid = ceph_is_any_caps(inode);
			else
				valid = 1;
		}
	}

	if (!valid) {
		/* leases failed: ask the MDS directly */
		struct ceph_mds_request *req;
		int op, err;
		u32 mask;

		if (flags & LOOKUP_RCU)
			return -ECHILD;

		percpu_counter_inc(&mdsc->metric.d_lease_mis);

		op = ceph_snap(dir) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
		if (!IS_ERR(req)) {
			req->r_dentry = dget(dentry);
			req->r_num_caps = 2;
			req->r_parent = dir;
			ihold(dir);

			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
			if (ceph_security_xattr_wanted(dir))
				mask |= CEPH_CAP_XATTR_SHARED;
			req->r_args.getattr.mask = cpu_to_le32(mask);

			err = ceph_mdsc_do_request(mdsc, NULL, req);
			switch (err) {
			case 0:
				/* valid only if lookup agrees with our
				 * cached positive dentry */
				if (d_really_is_positive(dentry) &&
				    d_inode(dentry) == req->r_target_inode)
					valid = 1;
				break;
			case -ENOENT:
				/* valid only if we cached a negative */
				if (d_really_is_negative(dentry))
					valid = 1;
				fallthrough;
			default:
				break;
			}
			ceph_mdsc_put_request(req);
			dout("d_revalidate %p lookup result=%d\n",
			     dentry, err);
		}
	} else {
		percpu_counter_inc(&mdsc->metric.d_lease_hit);
	}

	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
	if (!valid)
		ceph_dir_clear_complete(dir);

	if (!(flags & LOOKUP_RCU))
		dput(parent);
	return valid;
}
1957
1e9c2eb6
YZ
/*
 * Delete unused dentry that doesn't have valid lease
 *
 * Called under dentry->d_lock.
 *
 * Return 1 to tell the VFS to unhash/delete the dentry, 0 to keep it
 * cached.
 */
static int ceph_d_delete(const struct dentry *dentry)
{
	struct ceph_dentry_info *di;

	/* won't release caps */
	if (d_really_is_negative(dentry))
		return 0;
	if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
		return 0;
	/* valid lease? */
	di = ceph_dentry(dentry);
	if (di) {
		/* keep the dentry while either its own lease or the
		 * parent-dir lease still covers it */
		if (__dentry_lease_is_valid(di))
			return 0;
		if (__dir_lease_try_check(dentry))
			return 0;
	}
	return 1;
}
1982
/*
 * Release our ceph_dentry_info.
 */
static void ceph_d_release(struct dentry *dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);

	dout("d_release %p\n", dentry);

	/* keep the per-mount dentry metric in sync (incremented in
	 * ceph_d_init) */
	atomic64_dec(&fsc->mdsc->metric.total_dentries);

	/* detach di under d_lock so concurrent lease code can no longer
	 * reach it through d_fsdata */
	spin_lock(&dentry->d_lock);
	__dentry_lease_unlist(di);
	dentry->d_fsdata = NULL;
	spin_unlock(&dentry->d_lock);

	/* drop our ref on the lease session, if any (may still be the
	 * NULL set in ceph_d_init), then free di itself */
	ceph_put_mds_session(di->lease_session);
	kmem_cache_free(ceph_dentry_cachep, di);
}
2003
/*
 * When the VFS prunes a dentry from the cache, we need to clear the
 * complete flag on the parent directory.
 *
 * Called under dentry->d_lock.
 */
static void ceph_d_prune(struct dentry *dentry)
{
	struct ceph_inode_info *dir_ci;
	struct ceph_dentry_info *di;

	dout("ceph_d_prune %pd %p\n", dentry, dentry);

	/* do we have a valid parent? */
	if (IS_ROOT(dentry))
		return;

	/* we hold d_lock, so d_parent is stable */
	dir_ci = ceph_inode(d_inode(dentry->d_parent));
	/* snapdir entries are synthetic; nothing to invalidate there */
	if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
		return;

	/* who calls d_delete() should also disable dcache readdir */
	if (d_really_is_negative(dentry))
		return;

	/* d_fsdata does not get cleared until d_release */
	if (!d_unhashed(dentry)) {
		__ceph_dir_clear_complete(dir_ci);
		return;
	}

	/* Disable dcache readdir just in case that someone called d_drop()
	 * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
	 * properly (dcache readdir is still enabled) */
	di = ceph_dentry(dentry);
	if (di->offset > 0 &&
	    di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen))
		__ceph_dir_clear_ordered(dir_ci);
}
2817b000
SW
2044
/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.
 */
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
			     loff_t *ppos)
{
	struct ceph_dir_file_info *dfi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int left;
	const int bufsize = 1024;

	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
		return -EISDIR;

	/* format the stats once on first read; the cached buffer serves
	 * all subsequent reads on this open file */
	if (!dfi->dir_info) {
		dfi->dir_info = kmalloc(bufsize, GFP_KERNEL);
		if (!dfi->dir_info)
			return -ENOMEM;
		dfi->dir_info_len =
			snprintf(dfi->dir_info, bufsize,
				"entries: %20lld\n"
				" files: %20lld\n"
				" subdirs: %20lld\n"
				"rentries: %20lld\n"
				" rfiles: %20lld\n"
				" rsubdirs: %20lld\n"
				"rbytes: %20lld\n"
				"rctime: %10lld.%09ld\n",
				ci->i_files + ci->i_subdirs,
				ci->i_files,
				ci->i_subdirs,
				ci->i_rfiles + ci->i_rsubdirs,
				ci->i_rfiles,
				ci->i_rsubdirs,
				ci->i_rbytes,
				ci->i_rctime.tv_sec,
				ci->i_rctime.tv_nsec);
	}

	if (*ppos >= dfi->dir_info_len)
		return 0;
	size = min_t(unsigned, size, dfi->dir_info_len-*ppos);
	left = copy_to_user(buf, dfi->dir_info + *ppos, size);
	/* copy_to_user() returns the number of bytes NOT copied; only a
	 * total failure is an error, a partial copy is a short read */
	if (left == size)
		return -EFAULT;
	*ppos += (size - left);
	return size - left;
}
2095
2817b000 2096
2817b000 2097
6c0f3af7
SW
2098/*
2099 * Return name hash for a given dentry. This is dependent on
2100 * the parent directory's hash function.
2101 */
e5f86dc3 2102unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
6c0f3af7 2103{
6c0f3af7 2104 struct ceph_inode_info *dci = ceph_inode(dir);
76a495d6 2105 unsigned hash;
6c0f3af7
SW
2106
2107 switch (dci->i_dir_layout.dl_dir_hash) {
2108 case 0: /* for backward compat */
2109 case CEPH_STR_HASH_LINUX:
2110 return dn->d_name.hash;
2111
2112 default:
76a495d6
JL
2113 spin_lock(&dn->d_lock);
2114 hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
6c0f3af7 2115 dn->d_name.name, dn->d_name.len);
76a495d6
JL
2116 spin_unlock(&dn->d_lock);
2117 return hash;
6c0f3af7
SW
2118 }
2119}
2120
/* shim so ceph_readdir can be used as .iterate_shared */
WRAP_DIR_ITER(ceph_readdir) // FIXME!
/* file operations for regular directories */
const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,		/* only useful with -o dirstat */
	.iterate_shared = shared_ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.fsync = ceph_fsync,
	.lock = ceph_lock,
	.flock = ceph_flock,
};
2134
/* file operations for snap directories: a reduced set compared to
 * ceph_dir_fops (no ioctl/fsync/locking) */
const struct file_operations ceph_snapdir_fops = {
	.iterate_shared = shared_ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
};
2141
/* inode operations for regular directories */
const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.listxattr = ceph_listxattr,
	.get_inode_acl = ceph_get_acl,
	.set_acl = ceph_set_acl,
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,	/* rmdir shares the unlink implementation */
	.rename = ceph_rename,
	.create = ceph_create,
	.atomic_open = ceph_atomic_open,
};
2160
/* inode operations for snap directories: only lookup/getattr plus the
 * mkdir/rmdir/rename ops used to manipulate snapshot entries */
const struct inode_operations ceph_snapdir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.mkdir = ceph_mkdir,
	.rmdir = ceph_unlink,	/* same handler as regular-dir rmdir */
	.rename = ceph_rename,
};
2169
/* dentry callbacks hooked into the VFS for all ceph dentries */
const struct dentry_operations ceph_dentry_ops = {
	.d_revalidate = ceph_d_revalidate,
	.d_delete = ceph_d_delete,
	.d_release = ceph_d_release,
	.d_prune = ceph_d_prune,
	.d_init = ceph_d_init,
};