ceph: create symlinks with encrypted and base64-encoded targets
[linux-block.git] / fs / ceph / dir.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
3d14c5d2 2#include <linux/ceph/ceph_debug.h>
2817b000
SW
3
4#include <linux/spinlock.h>
2817b000 5#include <linux/namei.h>
5a0e3ad6 6#include <linux/slab.h>
2817b000 7#include <linux/sched.h>
2cdeb1e4 8#include <linux/xattr.h>
2817b000
SW
9
10#include "super.h"
3d14c5d2 11#include "mds_client.h"
af9ffa6d 12#include "crypto.h"
2817b000
SW
13
14/*
15 * Directory operations: readdir, lookup, create, link, unlink,
16 * rename, etc.
17 */
18
19/*
20 * Ceph MDS operations are specified in terms of a base ino and
21 * relative path. Thus, the client can specify an operation on a
22 * specific inode (e.g., a getattr due to fstat(2)), or as a path
23 * relative to, say, the root directory.
24 *
25 * Normally, we limit ourselves to strict inode ops (no path component)
26 * or dentry operations (a single path component relative to an ino). The
27 * exception to this is open_root_dentry(), which will open the mount
28 * point by name.
29 */
30
52dfb8ac 31const struct dentry_operations ceph_dentry_ops;
2817b000 32
37c4efc1
YZ
33static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
34static int __dir_lease_try_check(const struct dentry *dentry);
35
2817b000
SW
/*
 * Initialize ceph dentry state.
 *
 * Called by the dcache (via our dentry_operations) when a dentry for
 * this superblock is first allocated.  Attaches a zeroed
 * ceph_dentry_info to d_fsdata and bumps the per-mount dentry metric.
 * Returns 0 or -ENOMEM.
 */
static int ceph_d_init(struct dentry *dentry)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);

	di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
	if (!di)
		return -ENOMEM;          /* oh well */

	di->dentry = dentry;
	di->lease_session = NULL;	/* no MDS lease yet */
	di->time = jiffies;
	dentry->d_fsdata = di;
	INIT_LIST_HEAD(&di->lease_list);

	/* account this dentry in the client-wide metrics */
	atomic64_inc(&mdsc->metric.total_dentries);

	return 0;
}
58
2817b000 59/*
f3c4ebe6
YZ
60 * for f_pos for readdir:
61 * - hash order:
62 * (0xff << 52) | ((24 bits hash) << 28) |
63 * (the nth entry has hash collision);
64 * - frag+name order;
65 * ((frag value) << 28) | (the nth entry in frag);
2817b000 66 */
f3c4ebe6
YZ
67#define OFFSET_BITS 28
68#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
69#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
70loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
71{
72 loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
73 if (hash_order)
74 fpos |= HASH_ORDER;
75 return fpos;
76}
77
78static bool is_hash_order(loff_t p)
79{
80 return (p & HASH_ORDER) == HASH_ORDER;
81}
82
2817b000
SW
83static unsigned fpos_frag(loff_t p)
84{
f3c4ebe6 85 return p >> OFFSET_BITS;
2817b000 86}
f3c4ebe6
YZ
87
/* Extract the 24-bit name hash encoded in a hash-order f_pos. */
static unsigned fpos_hash(loff_t p)
{
	return ceph_frag_value(fpos_frag(p));
}
92
2817b000
SW
93static unsigned fpos_off(loff_t p)
94{
f3c4ebe6 95 return p & OFFSET_MASK;
2817b000
SW
96}
97
4d5f5df6
YZ
98static int fpos_cmp(loff_t l, loff_t r)
99{
100 int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
101 if (v)
102 return v;
103 return (int)(fpos_off(l) - fpos_off(r));
104}
105
/*
 * make note of the last dentry we read, so we can
 * continue at the same lexicographical point,
 * regardless of what dir changes take place on the
 * server.
 *
 * Copies @name into dfi->last_name (NUL-terminated) and records
 * @next_offset.  Returns 0 or -ENOMEM.
 */
static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
			    int len, unsigned next_offset)
{
	/* allocate first so a failure leaves dfi state untouched */
	char *buf = kmalloc(len+1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	kfree(dfi->last_name);
	dfi->last_name = buf;
	memcpy(dfi->last_name, name, len);
	dfi->last_name[len] = 0;
	dfi->next_offset = next_offset;
	dout("note_last_dentry '%s'\n", dfi->last_name);
	return 0;
}
126
/*
 * Fetch the idx'th cached dentry pointer from the directory's readdir
 * cache (stored in the dir inode's page cache as an array of dentry
 * pointers).  Returns a referenced dentry, NULL when idx is past the
 * end of the cache, or ERR_PTR(-EAGAIN) when the cache page is missing
 * or the entry cannot be safely pinned (caller falls back to a normal
 * MDS readdir).
 */
static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
			struct ceph_readdir_cache_control *cache_ctl)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
	loff_t ptr_pos = idx * sizeof(struct dentry *);
	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;

	/* i_size tracks the number of cached entries (in bytes) */
	if (ptr_pos >= i_size_read(dir))
		return NULL;

	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
		ceph_readdir_cache_release(cache_ctl);
		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
		if (!cache_ctl->page) {
			dout(" page %lu not found\n", ptr_pgoff);
			return ERR_PTR(-EAGAIN);
		}
		/* reading/filling the cache are serialized by
		   i_rwsem, no need to use page lock */
		unlock_page(cache_ctl->page);
		cache_ctl->dentries = kmap(cache_ctl->page);
	}

	cache_ctl->index = idx & idx_mask;

	rcu_read_lock();
	spin_lock(&parent->d_lock);
	/* check i_size again here, because empty directory can be
	 * marked as complete while not holding the i_rwsem. */
	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
		dentry = cache_ctl->dentries[cache_ctl->index];
	else
		dentry = NULL;
	spin_unlock(&parent->d_lock);
	/* RCU keeps the dentry memory valid; lockref_get_not_dead
	 * refuses dentries already on their way to being freed */
	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
		dentry = NULL;
	rcu_read_unlock();
	return dentry ? : ERR_PTR(-EAGAIN);
}
170
/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache. We make this work by carefully ordering dentries on
 * d_child when we initially get results back from the MDS, and
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
 * Complete dir indicates that we have all dentries in the dir. It is
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 *
 * Returns 0 on success/stop, or -EAGAIN to make the caller fall back
 * to a normal MDS readdir.
 */
static int __dcache_readdir(struct file *file, struct dir_context *ctx,
			    int shared_gen)
{
	struct ceph_dir_file_info *dfi = file->private_data;
	struct dentry *parent = file->f_path.dentry;
	struct inode *dir = d_inode(parent);
	struct dentry *dentry, *last = NULL;
	struct ceph_dentry_info *di;
	struct ceph_readdir_cache_control cache_ctl = {};
	u64 idx = 0;
	int err = 0;

	dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);

	/* search start position: binary search the cache for the first
	 * entry at or after ctx->pos */
	if (ctx->pos > 2) {
		u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
		while (count > 0) {
			u64 step = count >> 1;
			dentry = __dcache_find_get_entry(parent, idx + step,
							 &cache_ctl);
			if (!dentry) {
				/* use linear search */
				idx = 0;
				break;
			}
			if (IS_ERR(dentry)) {
				err = PTR_ERR(dentry);
				goto out;
			}
			di = ceph_dentry(dentry);
			spin_lock(&dentry->d_lock);
			if (fpos_cmp(di->offset, ctx->pos) < 0) {
				idx += step + 1;
				count -= step + 1;
			} else {
				count = step;
			}
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}

		dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
	}


	for (;;) {
		bool emit_dentry = false;
		dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
		if (!dentry) {
			/* ran off the end of the cache: readdir done */
			dfi->file_info.flags |= CEPH_F_ATEND;
			err = 0;
			break;
		}
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			goto out;
		}

		spin_lock(&dentry->d_lock);
		di = ceph_dentry(dentry);
		/* bail out to a sync readdir if the dentry is stale,
		 * from an older lease generation, or is a no-key name
		 * for a dir whose fscrypt key has since appeared */
		if (d_unhashed(dentry) ||
		    d_really_is_negative(dentry) ||
		    di->lease_shared_gen != shared_gen ||
		    ((dentry->d_flags & DCACHE_NOKEY_NAME) &&
		     fscrypt_has_encryption_key(dir))) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
			err = -EAGAIN;
			goto out;
		}
		if (fpos_cmp(ctx->pos, di->offset) <= 0) {
			__ceph_dentry_dir_lease_touch(di);
			emit_dentry = true;
		}
		spin_unlock(&dentry->d_lock);

		if (emit_dentry) {
			dout(" %llx dentry %p %pd %p\n", di->offset,
			     dentry, dentry, d_inode(dentry));
			ctx->pos = di->offset;
			if (!dir_emit(ctx, dentry->d_name.name,
				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
				      d_inode(dentry)->i_mode >> 12)) {
				/* user buffer full; resume here next time */
				dput(dentry);
				err = 0;
				break;
			}
			ctx->pos++;

			if (last)
				dput(last);
			last = dentry;
		} else {
			dput(dentry);
		}
	}
out:
	ceph_readdir_cache_release(&cache_ctl);
	if (last) {
		int ret;
		di = ceph_dentry(last);
		/* remember where to continue if we fall back to MDS */
		ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
				       fpos_off(di->offset) + 1);
		if (ret < 0)
			err = ret;
		dput(last);
		/* last_name no longer match cache index */
		if (dfi->readdir_cache_idx >= 0) {
			dfi->readdir_cache_idx = -1;
			dfi->dir_release_count = 0;
		}
	}
	return err;
}
297
bb48bd4d 298static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos)
f3c4ebe6 299{
bb48bd4d 300 if (!dfi->last_readdir)
f3c4ebe6
YZ
301 return true;
302 if (is_hash_order(pos))
bb48bd4d 303 return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos));
f3c4ebe6 304 else
bb48bd4d 305 return dfi->frag != fpos_frag(pos);
f3c4ebe6
YZ
306}
307
/*
 * Main readdir entry point: emit "." and "..", try the dcache fast
 * path, and otherwise page through the directory one frag at a time
 * via READDIR/LSSNAP requests to the MDS, caching results as we go.
 */
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
	struct ceph_dir_file_info *dfi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	int i;
	int err;
	unsigned frag = -1;
	struct ceph_mds_reply_info_parsed *rinfo;

	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
	if (dfi->file_info.flags & CEPH_F_ATEND)
		return 0;

	/* always start with . and .. */
	if (ctx->pos == 0) {
		dout("readdir off 0 -> '.'\n");
		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
			    inode->i_mode >> 12))
			return 0;
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		u64 ino;
		struct dentry *dentry = file->f_path.dentry;

		/* d_lock stabilizes d_parent while we read its inode */
		spin_lock(&dentry->d_lock);
		ino = ceph_present_inode(dentry->d_parent->d_inode);
		spin_unlock(&dentry->d_lock);

		dout("readdir off 1 -> '..'\n");
		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
			return 0;
		ctx->pos = 2;
	}

	err = fscrypt_prepare_readdir(inode);
	if (err)
		return err;

	spin_lock(&ci->i_ceph_lock);
	/* request Fx cap. if have Fx, we don't need to release Fs cap
	 * for later create/unlink. */
	__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
	/* can we use the dcache? */
	if (ceph_test_mount_opt(fsc, DCACHE) &&
	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    __ceph_dir_is_complete_ordered(ci) &&
	    __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
		int shared_gen = atomic_read(&ci->i_shared_gen);

		spin_unlock(&ci->i_ceph_lock);
		err = __dcache_readdir(file, ctx, shared_gen);
		if (err != -EAGAIN)
			return err;
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}

	/* proceed with a normal readdir */
more:
	/* do we have the correct frag content buffered? */
	if (need_send_readdir(dfi, ctx->pos)) {
		struct ceph_mds_request *req;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
		if (dfi->last_readdir) {
			ceph_mdsc_put_request(dfi->last_readdir);
			dfi->last_readdir = NULL;
		}

		if (is_hash_order(ctx->pos)) {
			/* fragtree isn't always accurate. choose frag
			 * based on previous reply when possible. */
			if (frag == (unsigned)-1)
				frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
							NULL, NULL);
		} else {
			frag = fpos_frag(ctx->pos);
		}

		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, dfi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);

		err = ceph_alloc_readdir_reply_buffer(req, inode);
		if (err) {
			ceph_mdsc_put_request(req);
			return err;
		}
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		if (op == CEPH_MDS_OP_READDIR) {
			req->r_direct_hash = ceph_frag_value(frag);
			__set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
			req->r_inode_drop = CEPH_CAP_FILE_EXCL;
		}
		if (dfi->last_name) {
			/* resume after the last name we saw; encode it
			 * in case the directory is encrypted */
			struct qstr d_name = { .name = dfi->last_name,
					       .len = strlen(dfi->last_name) };

			req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL);
			if (!req->r_path2) {
				ceph_mdsc_put_request(req);
				return -ENOMEM;
			}

			err = ceph_encode_encrypted_dname(inode, &d_name,
							  req->r_path2);
			if (err < 0) {
				ceph_mdsc_put_request(req);
				return err;
			}
		} else if (is_hash_order(ctx->pos)) {
			req->r_args.readdir.offset_hash =
				cpu_to_le32(fpos_hash(ctx->pos));
		}

		req->r_dir_release_cnt = dfi->dir_release_count;
		req->r_dir_ordered_cnt = dfi->dir_ordered_count;
		req->r_readdir_cache_idx = dfi->readdir_cache_idx;
		req->r_readdir_offset = dfi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
		req->r_args.readdir.flags =
				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);

		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(file->f_path.dentry);
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
		dout("readdir got and parsed readdir result=%d on "
		     "frag %x, end=%d, complete=%d, hash_order=%d\n",
		     err, frag,
		     (int)req->r_reply_info.dir_end,
		     (int)req->r_reply_info.dir_complete,
		     (int)req->r_reply_info.hash_order);

		rinfo = &req->r_reply_info;
		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
			/* MDS answered for a different (split/merged) frag */
			frag = le32_to_cpu(rinfo->dir_dir->frag);
			if (!rinfo->hash_order) {
				dfi->next_offset = req->r_readdir_offset;
				/* adjust ctx->pos to beginning of frag */
				ctx->pos = ceph_make_fpos(frag,
							  dfi->next_offset,
							  false);
			}
		}

		dfi->frag = frag;
		dfi->last_readdir = req;

		if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
			dfi->readdir_cache_idx = req->r_readdir_cache_idx;
			if (dfi->readdir_cache_idx < 0) {
				/* preclude from marking dir ordered */
				dfi->dir_ordered_count = 0;
			} else if (ceph_frag_is_leftmost(frag) &&
				   dfi->next_offset == 2) {
				/* note dir version at start of readdir so
				 * we can tell if any dentries get dropped */
				dfi->dir_release_count = req->r_dir_release_cnt;
				dfi->dir_ordered_count = req->r_dir_ordered_cnt;
			}
		} else {
			dout("readdir !did_prepopulate\n");
			/* disable readdir cache */
			dfi->readdir_cache_idx = -1;
			/* preclude from marking dir complete */
			dfi->dir_release_count = 0;
		}

		/* note next offset and last dentry name */
		if (rinfo->dir_nr > 0) {
			struct ceph_mds_reply_dir_entry *rde =
				rinfo->dir_entries + (rinfo->dir_nr-1);
			unsigned next_offset = req->r_reply_info.dir_end ?
				2 : (fpos_off(rde->offset) + 1);
			err = note_last_dentry(dfi, rde->name, rde->name_len,
					       next_offset);
			if (err) {
				ceph_mdsc_put_request(dfi->last_readdir);
				dfi->last_readdir = NULL;
				return err;
			}
		} else if (req->r_reply_info.dir_end) {
			dfi->next_offset = 2;
			/* keep last name */
		}
	}

	rinfo = &dfi->last_readdir->r_reply_info;
	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
	     dfi->frag, rinfo->dir_nr, ctx->pos,
	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);

	i = 0;
	/* search start position: binary search within the buffered chunk */
	if (rinfo->dir_nr > 0) {
		int step, nr = rinfo->dir_nr;
		while (nr > 0) {
			step = nr >> 1;
			if (rinfo->dir_entries[i + step].offset < ctx->pos) {
				i += step + 1;
				nr -= step + 1;
			} else {
				nr = step;
			}
		}
	}
	for (; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;

		/* entries must be monotonically increasing in offset */
		if (rde->offset < ctx->pos) {
			pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n",
				__func__, rde->offset, ctx->pos);
			return -EIO;
		}

		if (WARN_ON_ONCE(!rde->inode.in))
			return -EIO;

		ctx->pos = rde->offset;
		dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
		     i, rinfo->dir_nr, ctx->pos,
		     rde->name_len, rde->name, &rde->inode.in);

		if (!dir_emit(ctx, rde->name, rde->name_len,
			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
			/*
			 * NOTE: Here no need to put the 'dfi->last_readdir',
			 * because when dir_emit stops us it's most likely
			 * doesn't have enough memory, etc. So for next readdir
			 * it will continue.
			 */
			dout("filldir stopping us...\n");
			return 0;
		}

		/* NOTE(review): upstream resets rde name lengths here for
		 * encrypted names; no such code is visible in this view —
		 * confirm against the applied commit */
		ctx->pos++;
	}

	ceph_mdsc_put_request(dfi->last_readdir);
	dfi->last_readdir = NULL;

	if (dfi->next_offset > 2) {
		/* more entries remain in the current frag */
		frag = dfi->frag;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(dfi->frag)) {
		frag = ceph_frag_next(dfi->frag);
		if (is_hash_order(ctx->pos)) {
			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
							dfi->next_offset, true);
			if (new_pos > ctx->pos)
				ctx->pos = new_pos;
			/* keep last_name */
		} else {
			ctx->pos = ceph_make_fpos(frag, dfi->next_offset,
						  false);
			kfree(dfi->last_name);
			dfi->last_name = NULL;
		}
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
	dfi->file_info.flags |= CEPH_F_ATEND;

	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
	if (atomic64_read(&ci->i_release_count) ==
			dfi->dir_release_count) {
		spin_lock(&ci->i_ceph_lock);
		if (dfi->dir_ordered_count ==
				atomic64_read(&ci->i_ordered_count)) {
			dout(" marking %p complete and ordered\n", inode);
			/* use i_size to track number of entries in
			 * readdir cache */
			BUG_ON(dfi->readdir_cache_idx < 0);
			i_size_write(inode, dfi->readdir_cache_idx *
				     sizeof(struct dentry*));
		} else {
			dout(" marking %p complete\n", inode);
		}
		__ceph_dir_set_complete(ci, dfi->dir_release_count,
					dfi->dir_ordered_count);
		spin_unlock(&ci->i_ceph_lock);
	}
	dout("readdir %p file %p done.\n", inode, file);
	return 0;
}
617
bb48bd4d 618static void reset_readdir(struct ceph_dir_file_info *dfi)
2817b000 619{
bb48bd4d
CX
620 if (dfi->last_readdir) {
621 ceph_mdsc_put_request(dfi->last_readdir);
622 dfi->last_readdir = NULL;
2817b000 623 }
bb48bd4d
CX
624 kfree(dfi->last_name);
625 dfi->last_name = NULL;
626 dfi->dir_release_count = 0;
627 dfi->readdir_cache_idx = -1;
628 dfi->next_offset = 2; /* compensate for . and .. */
629 dfi->file_info.flags &= ~CEPH_F_ATEND;
2817b000
SW
630}
631
/*
 * discard buffered readdir content on seekdir(0), or seek to new frag,
 * or seek prior to current chunk
 */
static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos)
{
	struct ceph_mds_reply_info_parsed *rinfo;
	loff_t chunk_offset;
	if (new_pos == 0)
		return true;
	if (is_hash_order(new_pos)) {
		/* no need to reset last_name for a forward seek when
		 * dentries are sorted in hash order */
	} else if (dfi->frag != fpos_frag(new_pos)) {
		return true;
	}
	/* reset unless the target position falls inside (or after the
	 * start of) the currently buffered reply chunk */
	rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL;
	if (!rinfo || !rinfo->dir_nr)
		return true;
	chunk_offset = rinfo->dir_entries[0].offset;
	return new_pos < chunk_offset ||
	       is_hash_order(new_pos) != is_hash_order(chunk_offset);
}
655
/*
 * llseek on a directory.  Only SEEK_SET and SEEK_CUR are supported;
 * SEEK_END returns -EOPNOTSUPP because a frag-encoded directory f_pos
 * has no meaningful end offset.  A successful seek may discard the
 * buffered readdir state.
 */
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct ceph_dir_file_info *dfi = file->private_data;
	struct inode *inode = file->f_mapping->host;
	loff_t retval;

	inode_lock(inode);
	retval = -EINVAL;
	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		break;
	case SEEK_SET:
		break;
	case SEEK_END:
		retval = -EOPNOTSUPP;
		goto out;
	default:
		goto out;
	}

	if (offset >= 0) {
		if (need_reset_readdir(dfi, offset)) {
			dout("dir_llseek dropping %p content\n", file);
			reset_readdir(dfi);
		} else if (is_hash_order(offset) && offset > file->f_pos) {
			/* for hash offset, we don't know if a forward seek
			 * is within same frag */
			dfi->dir_release_count = 0;
			dfi->readdir_cache_idx = -1;
		}

		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
			dfi->file_info.flags &= ~CEPH_F_ATEND;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
699
/*
 * Handle lookups for the hidden .snap directory.
 *
 * If @dentry names the configured snapdir within a non-snapped parent,
 * splice in the synthetic snapdir inode.  Returns the dentry to use
 * (possibly a spliced alias), or an ERR_PTR from d_splice_alias.
 */
struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
				   struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */

	/* .snap dir? */
	if (ceph_snap(parent) == CEPH_NOSNAP &&
	    strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
		struct dentry *res;
		struct inode *inode = ceph_get_snapdir(parent);

		res = d_splice_alias(inode, dentry);
		dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
		     dentry, dentry, inode, res);
		if (res)
			dentry = res;
	}
	return dentry;
}
2817b000 723
/*
 * Figure out final result of a lookup/open request.
 *
 * Mainly, make sure we return the final req->r_dentry (if it already
 * existed) in place of the original VFS-provided dentry when they
 * differ.
 *
 * Gracefully handle the case where the MDS replies with -ENOENT and
 * no trace (which it may do, at its discretion, e.g., if it doesn't
 * care to issue a lease on the negative dentry).
 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				  struct dentry *dentry, int err)
{
	if (err == -ENOENT) {
		/* no trace? */
		err = 0;
		if (!req->r_reply_info.head->is_dentry) {
			dout("ENOENT and no trace, dentry %p inode %p\n",
			     dentry, d_inode(dentry));
			if (d_really_is_positive(dentry)) {
				/* stale positive dentry: drop it */
				d_drop(dentry);
				err = -ENOENT;
			} else {
				/* instantiate a negative dentry */
				d_add(dentry, NULL);
			}
		}
	}
	if (err)
		dentry = ERR_PTR(err);
	else if (dentry != req->r_dentry)
		dentry = dget(req->r_dentry); /* we got spliced */
	else
		dentry = NULL;
	return dentry;
}
760
/*
 * Is this a ".ceph"-prefixed entry in the filesystem root?  (Note:
 * strncmp with a fixed length of 5 also matches longer names that
 * merely begin with ".ceph".)
 */
static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
	return ceph_ino(inode) == CEPH_INO_ROOT &&
	       strncmp(dentry->d_name.name, ".ceph", 5) == 0;
}
766
/*
 * Look up a single dir entry. If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 *
 * May answer negatively from the local cache when the directory is
 * known complete; otherwise issues LOOKUP/LOOKUPSNAP to the MDS.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_mds_request *req;
	int op;
	int mask;
	int err;

	dout("lookup %p dentry %p '%pd'\n",
	     dir, dentry, dentry);

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	if (IS_ENCRYPTED(dir)) {
		err = __fscrypt_prepare_readdir(dir);
		if (err)
			return ERR_PTR(err);
		/* without the key, the name is a no-key (ciphertext) name */
		if (!fscrypt_has_encryption_key(dir)) {
			spin_lock(&dentry->d_lock);
			dentry->d_flags |= DCACHE_NOKEY_NAME;
			spin_unlock(&dentry->d_lock);
		}
	}

	/* can we conclude ENOENT locally? */
	if (d_really_is_negative(dentry)) {
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		spin_lock(&ci->i_ceph_lock);
		dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
		/* never short-circuit the snapdir or root ".ceph" names */
		if (strncmp(dentry->d_name.name,
			    fsc->mount_options->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    ceph_test_mount_opt(fsc, DCACHE) &&
		    __ceph_dir_is_complete(ci) &&
		    __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
			__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
			spin_unlock(&ci->i_ceph_lock);
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);
			di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
			return NULL;
		}
		spin_unlock(&ci->i_ceph_lock);
	}

	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_CAST(req);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;

	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.getattr.mask = cpu_to_le32(mask);

	ihold(dir);
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err == -ENOENT) {
		struct dentry *res;

		/* the name may actually be the hidden .snap directory */
		res = ceph_handle_snapdir(req, dentry);
		if (IS_ERR(res)) {
			err = PTR_ERR(res);
		} else {
			dentry = res;
			err = 0;
		}
	}
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req); /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}
855
/*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
 *
 * Returns 0 on success or a negative errno.
 */
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
	struct dentry *result = ceph_lookup(dir, dentry, 0);

	if (result && !IS_ERR(result)) {
		/*
		 * We created the item, then did a lookup, and found
		 * it was already linked to another inode we already
		 * had in our cache (and thus got spliced). To not
		 * confuse VFS (especially when inode is a directory),
		 * we don't link our dentry to that inode, return an
		 * error instead.
		 *
		 * This event should be rare and it happens only when
		 * we talk to old MDS. Recent MDS does not send traceless
		 * reply for request that creates new inode.
		 */
		d_drop(result);
		return -ESTALE;
	}
	/* NULL result means the dentry was linked in place: success (0) */
	return PTR_ERR(result);
}
882
/*
 * Create a device node (also the workhorse for ceph_create) by
 * sending MKNOD to the auth MDS.  Enforces quota and refuses writes
 * into snapshots.  Returns 0 or a negative errno.
 */
static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *dentry, umode_t mode, dev_t rdev)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_mds_request *req;
	struct ceph_acl_sec_ctx as_ctx = {};
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	/* an async unlink on the same name may still be in flight */
	err = ceph_wait_on_conflict_unlink(dentry);
	if (err)
		return err;

	if (ceph_quota_is_max_files_exceeded(dir)) {
		err = -EDQUOT;
		goto out;
	}

	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	/* pre-allocate the inode (also prepares ACL/security context) */
	req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
	if (IS_ERR(req->r_new_inode)) {
		err = PTR_ERR(req->r_new_inode);
		req->r_new_inode = NULL;
		goto out_req;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	ihold(dir);
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
			     CEPH_CAP_XATTR_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;

	ceph_as_ctx_to_req(req, &as_ctx);

	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
out_req:
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &as_ctx);
	else
		d_drop(dentry);
	ceph_release_acl_sec_ctx(&as_ctx);
	return err;
}
944
/* Regular-file create: just a mknod with no device number. */
static int ceph_create(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode, bool excl)
{
	return ceph_mknod(idmap, dir, dentry, mode, 0);
}
950
79f2f6ad
JL
951#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
952static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
953 const char *dest)
954{
955 int err;
956 int len = strlen(dest);
957 struct fscrypt_str osd_link = FSTR_INIT(NULL, 0);
958
959 err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX,
960 &osd_link);
961 if (err)
962 goto out;
963
964 err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link);
965 if (err)
966 goto out;
967
968 req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL);
969 if (!req->r_path2) {
970 err = -ENOMEM;
971 goto out;
972 }
973
974 len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2);
975 req->r_path2[len] = '\0';
976out:
977 fscrypt_fname_free_buffer(&osd_link);
978 return err;
979}
980#else
981static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
982 const char *dest)
983{
984 return -EOPNOTSUPP;
985}
986#endif
987
7a77db95 988static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
549c7297 989 struct dentry *dentry, const char *dest)
2817b000 990{
2678da88 991 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
2817b000 992 struct ceph_mds_request *req;
ac6713cc 993 struct ceph_acl_sec_ctx as_ctx = {};
ec9595c0 994 umode_t mode = S_IFLNK | 0777;
2817b000
SW
995 int err;
996
997 if (ceph_snap(dir) != CEPH_NOSNAP)
998 return -EROFS;
999
4868e537
XL
1000 err = ceph_wait_on_conflict_unlink(dentry);
1001 if (err)
1002 return err;
1003
67fcd151
CX
1004 if (ceph_quota_is_max_files_exceeded(dir)) {
1005 err = -EDQUOT;
1006 goto out;
1007 }
b7a29217 1008
2817b000
SW
1009 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
1010 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
1011 if (IS_ERR(req)) {
b1ee94aa
YZ
1012 err = PTR_ERR(req);
1013 goto out;
2817b000 1014 }
ec9595c0
JL
1015
1016 req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
1017 if (IS_ERR(req->r_new_inode)) {
1018 err = PTR_ERR(req->r_new_inode);
1019 req->r_new_inode = NULL;
1020 goto out_req;
1021 }
1022
3dd69aab 1023 req->r_parent = dir;
4c183472
JL
1024 ihold(dir);
1025
79f2f6ad
JL
1026 if (IS_ENCRYPTED(req->r_new_inode)) {
1027 err = prep_encrypted_symlink_target(req, dest);
1028 if (err)
1029 goto out_req;
1030 } else {
1031 req->r_path2 = kstrdup(dest, GFP_KERNEL);
1032 if (!req->r_path2) {
1033 err = -ENOMEM;
1034 goto out_req;
1035 }
1036 }
1037
3dd69aab 1038 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
a149bb9a
SK
1039 req->r_dentry = dget(dentry);
1040 req->r_num_caps = 2;
d9d00f71
XL
1041 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
1042 CEPH_CAP_XATTR_EXCL;
2817b000 1043 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
ec9595c0
JL
1044
1045 ceph_as_ctx_to_req(req, &as_ctx);
1046
2817b000
SW
1047 err = ceph_mdsc_do_request(mdsc, dir, req);
1048 if (!err && !req->r_reply_info.head->is_dentry)
1049 err = ceph_handle_notrace_create(dir, dentry);
ec9595c0 1050out_req:
2817b000 1051 ceph_mdsc_put_request(req);
b1ee94aa
YZ
1052out:
1053 if (err)
2817b000 1054 d_drop(dentry);
ac6713cc 1055 ceph_release_acl_sec_ctx(&as_ctx);
2817b000
SW
1056 return err;
1057}
1058
c54bd91e 1059static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
549c7297 1060 struct dentry *dentry, umode_t mode)
2817b000 1061{
2678da88 1062 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
2817b000 1063 struct ceph_mds_request *req;
5c31e92d 1064 struct ceph_acl_sec_ctx as_ctx = {};
4868e537 1065 int err;
2817b000
SW
1066 int op;
1067
4868e537
XL
1068 err = ceph_wait_on_conflict_unlink(dentry);
1069 if (err)
1070 return err;
1071
2817b000
SW
1072 if (ceph_snap(dir) == CEPH_SNAPDIR) {
1073 /* mkdir .snap/foo is a MKSNAP */
1074 op = CEPH_MDS_OP_MKSNAP;
a455589f
AV
1075 dout("mksnap dir %p snap '%pd' dn %p\n", dir,
1076 dentry, dentry);
2817b000 1077 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
18bb1db3 1078 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
2817b000
SW
1079 op = CEPH_MDS_OP_MKDIR;
1080 } else {
4868e537 1081 err = -EROFS;
2817b000
SW
1082 goto out;
1083 }
b1ee94aa 1084
25963669
YZ
1085 if (op == CEPH_MDS_OP_MKDIR &&
1086 ceph_quota_is_max_files_exceeded(dir)) {
b7a29217
LH
1087 err = -EDQUOT;
1088 goto out;
1089 }
1090
b1ee94aa 1091
2817b000
SW
1092 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
1093 if (IS_ERR(req)) {
1094 err = PTR_ERR(req);
1095 goto out;
1096 }
1097
ec9595c0
JL
1098 mode |= S_IFDIR;
1099 req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
1100 if (IS_ERR(req->r_new_inode)) {
1101 err = PTR_ERR(req->r_new_inode);
1102 req->r_new_inode = NULL;
1103 goto out_req;
1104 }
1105
2817b000
SW
1106 req->r_dentry = dget(dentry);
1107 req->r_num_caps = 2;
3dd69aab 1108 req->r_parent = dir;
4c183472 1109 ihold(dir);
3dd69aab 1110 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
2817b000 1111 req->r_args.mkdir.mode = cpu_to_le32(mode);
d9d00f71
XL
1112 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
1113 CEPH_CAP_XATTR_EXCL;
2817b000 1114 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
ec9595c0
JL
1115
1116 ceph_as_ctx_to_req(req, &as_ctx);
1117
2817b000 1118 err = ceph_mdsc_do_request(mdsc, dir, req);
275dd19e
YZ
1119 if (!err &&
1120 !req->r_reply_info.head->is_target &&
1121 !req->r_reply_info.head->is_dentry)
2817b000 1122 err = ceph_handle_notrace_create(dir, dentry);
ec9595c0 1123out_req:
2817b000
SW
1124 ceph_mdsc_put_request(req);
1125out:
b20a95a0 1126 if (!err)
5c31e92d 1127 ceph_init_inode_acls(d_inode(dentry), &as_ctx);
b20a95a0 1128 else
2817b000 1129 d_drop(dentry);
5c31e92d 1130 ceph_release_acl_sec_ctx(&as_ctx);
2817b000
SW
1131 return err;
1132}
1133
1134static int ceph_link(struct dentry *old_dentry, struct inode *dir,
1135 struct dentry *dentry)
1136{
2678da88 1137 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
2817b000
SW
1138 struct ceph_mds_request *req;
1139 int err;
1140
a5ffd7b6
XL
1141 if (dentry->d_flags & DCACHE_DISCONNECTED)
1142 return -EINVAL;
1143
4868e537
XL
1144 err = ceph_wait_on_conflict_unlink(dentry);
1145 if (err)
1146 return err;
1147
2817b000
SW
1148 if (ceph_snap(dir) != CEPH_NOSNAP)
1149 return -EROFS;
1150
a5ffd7b6
XL
1151 dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
1152 dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
2817b000
SW
1153 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
1154 if (IS_ERR(req)) {
1155 d_drop(dentry);
1156 return PTR_ERR(req);
1157 }
1158 req->r_dentry = dget(dentry);
1159 req->r_num_caps = 2;
4b58c9b1 1160 req->r_old_dentry = dget(old_dentry);
a5ffd7b6
XL
1161 /*
1162 * The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we
1163 * will just pass the ino# to MDSs.
1164 */
1165 if (old_dentry->d_flags & DCACHE_DISCONNECTED)
1166 req->r_ino2 = ceph_vino(d_inode(old_dentry));
3dd69aab 1167 req->r_parent = dir;
4c183472 1168 ihold(dir);
3dd69aab 1169 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
d9d00f71 1170 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
2817b000 1171 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
ad88f23f 1172 /* release LINK_SHARED on source inode (mds will lock it) */
d19a0b54 1173 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
2817b000 1174 err = ceph_mdsc_do_request(mdsc, dir, req);
70b666c3 1175 if (err) {
2817b000 1176 d_drop(dentry);
70b666c3 1177 } else if (!req->r_reply_info.head->is_dentry) {
2b0143b5
DH
1178 ihold(d_inode(old_dentry));
1179 d_instantiate(dentry, d_inode(old_dentry));
70b666c3 1180 }
2817b000
SW
1181 ceph_mdsc_put_request(req);
1182 return err;
1183}
1184
2ccb4546
JL
1185static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
1186 struct ceph_mds_request *req)
1187{
4868e537
XL
1188 struct dentry *dentry = req->r_dentry;
1189 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
1190 struct ceph_dentry_info *di = ceph_dentry(dentry);
2ccb4546
JL
1191 int result = req->r_err ? req->r_err :
1192 le32_to_cpu(req->r_reply_info.head->result);
1193
4868e537
XL
1194 if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
1195 pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
1196 __func__, dentry, dentry);
1197
1198 spin_lock(&fsc->async_unlink_conflict_lock);
1199 hash_del_rcu(&di->hnode);
1200 spin_unlock(&fsc->async_unlink_conflict_lock);
1201
1202 spin_lock(&dentry->d_lock);
1203 di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
1204 wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
1205 spin_unlock(&dentry->d_lock);
1206
1207 synchronize_rcu();
1208
2ccb4546
JL
1209 if (result == -EJUKEBOX)
1210 goto out;
1211
1212 /* If op failed, mark everyone involved for errors */
1213 if (result) {
2a575f13
JL
1214 int pathlen = 0;
1215 u64 base = 0;
4868e537 1216 char *path = ceph_mdsc_build_path(dentry, &pathlen,
2ccb4546
JL
1217 &base, 0);
1218
1219 /* mark error on parent + clear complete */
1220 mapping_set_error(req->r_parent->i_mapping, result);
1221 ceph_dir_clear_complete(req->r_parent);
1222
1223 /* drop the dentry -- we don't know its status */
4868e537
XL
1224 if (!d_unhashed(dentry))
1225 d_drop(dentry);
2ccb4546
JL
1226
1227 /* mark inode itself for an error (since metadata is bogus) */
1228 mapping_set_error(req->r_old_inode->i_mapping, result);
1229
4868e537 1230 pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
2ccb4546
JL
1231 base, IS_ERR(path) ? "<<bad>>" : path, result);
1232 ceph_mdsc_free_path(path, pathlen);
1233 }
1234out:
1235 iput(req->r_old_inode);
1236 ceph_mdsc_release_dir_caps(req);
1237}
1238
1239static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
1240{
1241 struct ceph_inode_info *ci = ceph_inode(dir);
1242 struct ceph_dentry_info *di;
1243 int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
1244
1245 spin_lock(&ci->i_ceph_lock);
1246 if ((__ceph_caps_issued(ci, NULL) & want) == want) {
1247 ceph_take_cap_refs(ci, want, false);
1248 got = want;
1249 }
1250 spin_unlock(&ci->i_ceph_lock);
1251
1252 /* If we didn't get anything, return 0 */
1253 if (!got)
1254 return 0;
1255
1256 spin_lock(&dentry->d_lock);
1257 di = ceph_dentry(dentry);
1258 /*
1259 * - We are holding Fx, which implies Fs caps.
1260 * - Only support async unlink for primary linkage
1261 */
1262 if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
1263 !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
1264 want = 0;
1265 spin_unlock(&dentry->d_lock);
1266
1267 /* Do we still want what we've got? */
1268 if (want == got)
1269 return got;
1270
1271 ceph_put_cap_refs(ci, got);
1272 return 0;
1273}
1274
2817b000
SW
1275/*
1276 * rmdir and unlink are differ only by the metadata op code
1277 */
1278static int ceph_unlink(struct inode *dir, struct dentry *dentry)
1279{
3d14c5d2
YS
1280 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
1281 struct ceph_mds_client *mdsc = fsc->mdsc;
2b0143b5 1282 struct inode *inode = d_inode(dentry);
2817b000 1283 struct ceph_mds_request *req;
2ccb4546 1284 bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
2817b000
SW
1285 int err = -EROFS;
1286 int op;
1287
1288 if (ceph_snap(dir) == CEPH_SNAPDIR) {
1289 /* rmdir .snap/foo is RMSNAP */
a455589f 1290 dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
2817b000
SW
1291 op = CEPH_MDS_OP_RMSNAP;
1292 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
1293 dout("unlink/rmdir dir %p dn %p inode %p\n",
1294 dir, dentry, inode);
e36cb0b8 1295 op = d_is_dir(dentry) ?
2817b000
SW
1296 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
1297 } else
1298 goto out;
2ccb4546 1299retry:
2817b000
SW
1300 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
1301 if (IS_ERR(req)) {
1302 err = PTR_ERR(req);
1303 goto out;
1304 }
1305 req->r_dentry = dget(dentry);
1306 req->r_num_caps = 2;
3dd69aab 1307 req->r_parent = dir;
4c183472 1308 ihold(dir);
d9d00f71 1309 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
2817b000 1310 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
6ef0bc6d 1311 req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
2ccb4546
JL
1312
1313 if (try_async && op == CEPH_MDS_OP_UNLINK &&
1314 (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
4868e537
XL
1315 struct ceph_dentry_info *di = ceph_dentry(dentry);
1316
ebce3eb2 1317 dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
2ccb4546
JL
1318 dentry->d_name.len, dentry->d_name.name,
1319 ceph_cap_string(req->r_dir_caps));
1320 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
1321 req->r_callback = ceph_async_unlink_cb;
1322 req->r_old_inode = d_inode(dentry);
1323 ihold(req->r_old_inode);
4868e537
XL
1324
1325 spin_lock(&dentry->d_lock);
1326 di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
1327 spin_unlock(&dentry->d_lock);
1328
1329 spin_lock(&fsc->async_unlink_conflict_lock);
1330 hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
1331 dentry->d_name.hash);
1332 spin_unlock(&fsc->async_unlink_conflict_lock);
1333
2ccb4546
JL
1334 err = ceph_mdsc_submit_request(mdsc, dir, req);
1335 if (!err) {
1336 /*
1337 * We have enough caps, so we assume that the unlink
1338 * will succeed. Fix up the target inode and dcache.
1339 */
1340 drop_nlink(inode);
1341 d_delete(dentry);
4868e537
XL
1342 } else {
1343 spin_lock(&fsc->async_unlink_conflict_lock);
1344 hash_del_rcu(&di->hnode);
1345 spin_unlock(&fsc->async_unlink_conflict_lock);
1346
1347 spin_lock(&dentry->d_lock);
1348 di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
1349 spin_unlock(&dentry->d_lock);
1350
1351 if (err == -EJUKEBOX) {
1352 try_async = false;
1353 ceph_mdsc_put_request(req);
1354 goto retry;
1355 }
2ccb4546
JL
1356 }
1357 } else {
1358 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
1359 err = ceph_mdsc_do_request(mdsc, dir, req);
1360 if (!err && !req->r_reply_info.head->is_dentry)
1361 d_delete(dentry);
1362 }
1363
2817b000
SW
1364 ceph_mdsc_put_request(req);
1365out:
1366 return err;
1367}
1368
e18275ae 1369static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
549c7297
CB
1370 struct dentry *old_dentry, struct inode *new_dir,
1371 struct dentry *new_dentry, unsigned int flags)
2817b000 1372{
2678da88 1373 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
2817b000 1374 struct ceph_mds_request *req;
0ea611a3 1375 int op = CEPH_MDS_OP_RENAME;
2817b000
SW
1376 int err;
1377
1cd66c93
MS
1378 if (flags)
1379 return -EINVAL;
1380
2817b000
SW
1381 if (ceph_snap(old_dir) != ceph_snap(new_dir))
1382 return -EXDEV;
0ea611a3
YZ
1383 if (ceph_snap(old_dir) != CEPH_NOSNAP) {
1384 if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
1385 op = CEPH_MDS_OP_RENAMESNAP;
1386 else
1387 return -EROFS;
1388 }
6646ea1c
LH
1389 /* don't allow cross-quota renames */
1390 if ((old_dir != new_dir) &&
1391 (!ceph_quota_is_same_realm(old_dir, new_dir)))
1392 return -EXDEV;
cafe21a4 1393
4868e537
XL
1394 err = ceph_wait_on_conflict_unlink(new_dentry);
1395 if (err)
1396 return err;
1397
2817b000
SW
1398 dout("rename dir %p dentry %p to dir %p dentry %p\n",
1399 old_dir, old_dentry, new_dir, new_dentry);
0ea611a3 1400 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
2817b000
SW
1401 if (IS_ERR(req))
1402 return PTR_ERR(req);
180061a5 1403 ihold(old_dir);
2817b000
SW
1404 req->r_dentry = dget(new_dentry);
1405 req->r_num_caps = 2;
1406 req->r_old_dentry = dget(old_dentry);
180061a5 1407 req->r_old_dentry_dir = old_dir;
3dd69aab 1408 req->r_parent = new_dir;
4c183472 1409 ihold(new_dir);
3dd69aab 1410 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
d9d00f71 1411 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
2817b000 1412 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
d9d00f71 1413 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
2817b000
SW
1414 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
1415 /* release LINK_RDCACHE on source inode (mds will lock it) */
d19a0b54 1416 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
6ef0bc6d
ZZ
1417 if (d_really_is_positive(new_dentry)) {
1418 req->r_inode_drop =
1419 ceph_drop_caps_for_unlink(d_inode(new_dentry));
1420 }
2817b000
SW
1421 err = ceph_mdsc_do_request(mdsc, old_dir, req);
1422 if (!err && !req->r_reply_info.head->is_dentry) {
1423 /*
1424 * Normally d_move() is done by fill_trace (called by
1425 * do_request, above). If there is no trace, we need
1426 * to do it here.
1427 */
1428 d_move(old_dentry, new_dentry);
1429 }
1430 ceph_mdsc_put_request(req);
1431 return err;
1432}
1433
37c4efc1
YZ
1434/*
1435 * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
1436 * Leases at front of the list will expire first. (Assume all leases have
1437 * similar duration)
1438 *
1439 * Called under dentry->d_lock.
1440 */
1441void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
1442{
1443 struct dentry *dn = di->dentry;
1444 struct ceph_mds_client *mdsc;
1445
1446 dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
1447
1448 di->flags |= CEPH_DENTRY_LEASE_LIST;
1449 if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
1450 di->flags |= CEPH_DENTRY_REFERENCED;
1451 return;
1452 }
1453
1454 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1455 spin_lock(&mdsc->dentry_list_lock);
1456 list_move_tail(&di->lease_list, &mdsc->dentry_leases);
1457 spin_unlock(&mdsc->dentry_list_lock);
1458}
1459
1460static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
1461 struct ceph_dentry_info *di)
1462{
1463 di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
1464 di->lease_gen = 0;
1465 di->time = jiffies;
1466 list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
1467}
1468
1469/*
1470 * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
1471 * list if it's not in the list, otherwise set 'referenced' flag.
1472 *
1473 * Called under dentry->d_lock.
1474 */
1475void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
1476{
1477 struct dentry *dn = di->dentry;
1478 struct ceph_mds_client *mdsc;
1479
0eb30853 1480 dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
37c4efc1
YZ
1481 di, dn, dn, di->offset);
1482
1483 if (!list_empty(&di->lease_list)) {
1484 if (di->flags & CEPH_DENTRY_LEASE_LIST) {
1485 /* don't remove dentry from dentry lease list
1486 * if its lease is valid */
1487 if (__dentry_lease_is_valid(di))
1488 return;
1489 } else {
1490 di->flags |= CEPH_DENTRY_REFERENCED;
1491 return;
1492 }
1493 }
1494
1495 if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
1496 di->flags |= CEPH_DENTRY_REFERENCED;
1497 di->flags &= ~CEPH_DENTRY_LEASE_LIST;
1498 return;
1499 }
1500
1501 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1502 spin_lock(&mdsc->dentry_list_lock);
1503 __dentry_dir_lease_touch(mdsc, di),
1504 spin_unlock(&mdsc->dentry_list_lock);
1505}
1506
1507static void __dentry_lease_unlist(struct ceph_dentry_info *di)
1508{
1509 struct ceph_mds_client *mdsc;
1510 if (di->flags & CEPH_DENTRY_SHRINK_LIST)
1511 return;
1512 if (list_empty(&di->lease_list))
1513 return;
1514
1515 mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
1516 spin_lock(&mdsc->dentry_list_lock);
1517 list_del_init(&di->lease_list);
1518 spin_unlock(&mdsc->dentry_list_lock);
1519}
1520
1521enum {
1522 KEEP = 0,
1523 DELETE = 1,
1524 TOUCH = 2,
1525 STOP = 4,
1526};
1527
1528struct ceph_lease_walk_control {
1529 bool dir_lease;
fe33032d 1530 bool expire_dir_lease;
37c4efc1
YZ
1531 unsigned long nr_to_scan;
1532 unsigned long dir_lease_ttl;
1533};
1534
1535static unsigned long
1536__dentry_leases_walk(struct ceph_mds_client *mdsc,
1537 struct ceph_lease_walk_control *lwc,
1538 int (*check)(struct dentry*, void*))
1539{
1540 struct ceph_dentry_info *di, *tmp;
1541 struct dentry *dentry, *last = NULL;
1542 struct list_head* list;
1543 LIST_HEAD(dispose);
1544 unsigned long freed = 0;
1545 int ret = 0;
1546
1547 list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
1548 spin_lock(&mdsc->dentry_list_lock);
1549 list_for_each_entry_safe(di, tmp, list, lease_list) {
1550 if (!lwc->nr_to_scan)
1551 break;
1552 --lwc->nr_to_scan;
1553
1554 dentry = di->dentry;
1555 if (last == dentry)
1556 break;
1557
1558 if (!spin_trylock(&dentry->d_lock))
1559 continue;
1560
516162b9 1561 if (__lockref_is_dead(&dentry->d_lockref)) {
37c4efc1
YZ
1562 list_del_init(&di->lease_list);
1563 goto next;
1564 }
1565
1566 ret = check(dentry, lwc);
1567 if (ret & TOUCH) {
1568 /* move it into tail of dir lease list */
1569 __dentry_dir_lease_touch(mdsc, di);
1570 if (!last)
1571 last = dentry;
1572 }
1573 if (ret & DELETE) {
1574 /* stale lease */
1575 di->flags &= ~CEPH_DENTRY_REFERENCED;
1576 if (dentry->d_lockref.count > 0) {
1577 /* update_dentry_lease() will re-add
1578 * it to lease list, or
1579 * ceph_d_delete() will return 1 when
1580 * last reference is dropped */
1581 list_del_init(&di->lease_list);
1582 } else {
1583 di->flags |= CEPH_DENTRY_SHRINK_LIST;
1584 list_move_tail(&di->lease_list, &dispose);
1585 dget_dlock(dentry);
1586 }
1587 }
1588next:
1589 spin_unlock(&dentry->d_lock);
1590 if (ret & STOP)
1591 break;
1592 }
1593 spin_unlock(&mdsc->dentry_list_lock);
1594
1595 while (!list_empty(&dispose)) {
1596 di = list_first_entry(&dispose, struct ceph_dentry_info,
1597 lease_list);
1598 dentry = di->dentry;
1599 spin_lock(&dentry->d_lock);
1600
1601 list_del_init(&di->lease_list);
1602 di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
1603 if (di->flags & CEPH_DENTRY_REFERENCED) {
1604 spin_lock(&mdsc->dentry_list_lock);
1605 if (di->flags & CEPH_DENTRY_LEASE_LIST) {
1606 list_add_tail(&di->lease_list,
1607 &mdsc->dentry_leases);
1608 } else {
1609 __dentry_dir_lease_touch(mdsc, di);
1610 }
1611 spin_unlock(&mdsc->dentry_list_lock);
1612 } else {
1613 freed++;
1614 }
1615
1616 spin_unlock(&dentry->d_lock);
1617 /* ceph_d_delete() does the trick */
1618 dput(dentry);
1619 }
1620 return freed;
1621}
1622
1623static int __dentry_lease_check(struct dentry *dentry, void *arg)
1624{
1625 struct ceph_dentry_info *di = ceph_dentry(dentry);
1626 int ret;
1627
1628 if (__dentry_lease_is_valid(di))
1629 return STOP;
1630 ret = __dir_lease_try_check(dentry);
1631 if (ret == -EBUSY)
1632 return KEEP;
1633 if (ret > 0)
1634 return TOUCH;
1635 return DELETE;
1636}
1637
1638static int __dir_lease_check(struct dentry *dentry, void *arg)
1639{
1640 struct ceph_lease_walk_control *lwc = arg;
1641 struct ceph_dentry_info *di = ceph_dentry(dentry);
1642
1643 int ret = __dir_lease_try_check(dentry);
1644 if (ret == -EBUSY)
1645 return KEEP;
1646 if (ret > 0) {
1647 if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
1648 return STOP;
1649 /* Move dentry to tail of dir lease list if we don't want
1650 * to delete it. So dentries in the list are checked in a
1651 * round robin manner */
fe33032d
YZ
1652 if (!lwc->expire_dir_lease)
1653 return TOUCH;
1654 if (dentry->d_lockref.count > 0 ||
1655 (di->flags & CEPH_DENTRY_REFERENCED))
1656 return TOUCH;
1657 /* invalidate dir lease */
1658 di->lease_shared_gen = 0;
37c4efc1
YZ
1659 }
1660 return DELETE;
1661}
1662
1663int ceph_trim_dentries(struct ceph_mds_client *mdsc)
1664{
1665 struct ceph_lease_walk_control lwc;
fe33032d 1666 unsigned long count;
37c4efc1
YZ
1667 unsigned long freed;
1668
fe33032d
YZ
1669 spin_lock(&mdsc->caps_list_lock);
1670 if (mdsc->caps_use_max > 0 &&
1671 mdsc->caps_use_count > mdsc->caps_use_max)
1672 count = mdsc->caps_use_count - mdsc->caps_use_max;
1673 else
1674 count = 0;
1675 spin_unlock(&mdsc->caps_list_lock);
1676
37c4efc1
YZ
1677 lwc.dir_lease = false;
1678 lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
1679 freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
1680 if (!lwc.nr_to_scan) /* more invalid leases */
1681 return -EAGAIN;
1682
1683 if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
1684 lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
1685
1686 lwc.dir_lease = true;
fe33032d
YZ
1687 lwc.expire_dir_lease = freed < count;
1688 lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
37c4efc1
YZ
1689 freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
1690 if (!lwc.nr_to_scan) /* more to check */
1691 return -EAGAIN;
1692
1693 return freed > 0 ? 1 : 0;
1694}
1695
81a6cf2d
SW
1696/*
1697 * Ensure a dentry lease will no longer revalidate.
1698 */
1699void ceph_invalidate_dentry_lease(struct dentry *dentry)
1700{
37c4efc1 1701 struct ceph_dentry_info *di = ceph_dentry(dentry);
81a6cf2d 1702 spin_lock(&dentry->d_lock);
37c4efc1
YZ
1703 di->time = jiffies;
1704 di->lease_shared_gen = 0;
f5e17aed 1705 di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
37c4efc1 1706 __dentry_lease_unlist(di);
81a6cf2d
SW
1707 spin_unlock(&dentry->d_lock);
1708}
2817b000
SW
1709
1710/*
1711 * Check if dentry lease is valid. If not, delete the lease. Try to
1712 * renew if the least is more than half up.
1713 */
1e9c2eb6
YZ
1714static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
1715{
1716 struct ceph_mds_session *session;
1717
1718 if (!di->lease_gen)
1719 return false;
1720
1721 session = di->lease_session;
1722 if (session) {
1723 u32 gen;
1724 unsigned long ttl;
1725
52d60f8e 1726 gen = atomic_read(&session->s_cap_gen);
1e9c2eb6 1727 ttl = session->s_cap_ttl;
1e9c2eb6
YZ
1728
1729 if (di->lease_gen == gen &&
1730 time_before(jiffies, ttl) &&
1731 time_before(jiffies, di->time))
1732 return true;
1733 }
1734 di->lease_gen = 0;
1735 return false;
1736}
1737
8f2a98ef 1738static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
2817b000
SW
1739{
1740 struct ceph_dentry_info *di;
2817b000 1741 struct ceph_mds_session *session = NULL;
2817b000 1742 u32 seq = 0;
1e9c2eb6 1743 int valid = 0;
2817b000
SW
1744
1745 spin_lock(&dentry->d_lock);
1746 di = ceph_dentry(dentry);
1e9c2eb6
YZ
1747 if (di && __dentry_lease_is_valid(di)) {
1748 valid = 1;
2817b000 1749
1e9c2eb6
YZ
1750 if (di->lease_renew_after &&
1751 time_after(jiffies, di->lease_renew_after)) {
1752 /*
1753 * We should renew. If we're in RCU walk mode
1754 * though, we can't do that so just return
1755 * -ECHILD.
1756 */
1757 if (flags & LOOKUP_RCU) {
1758 valid = -ECHILD;
1759 } else {
1760 session = ceph_get_mds_session(di->lease_session);
1761 seq = di->lease_seq;
1762 di->lease_renew_after = 0;
1763 di->lease_renew_from = jiffies;
2817b000 1764 }
2817b000
SW
1765 }
1766 }
1767 spin_unlock(&dentry->d_lock);
1768
1769 if (session) {
8f2a98ef 1770 ceph_mdsc_lease_send_msg(session, dentry,
2817b000
SW
1771 CEPH_MDS_LEASE_RENEW, seq);
1772 ceph_put_mds_session(session);
1773 }
1774 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
1775 return valid;
1776}
1777
1e9c2eb6
YZ
1778/*
1779 * Called under dentry->d_lock.
1780 */
1781static int __dir_lease_try_check(const struct dentry *dentry)
1782{
1783 struct ceph_dentry_info *di = ceph_dentry(dentry);
1784 struct inode *dir;
1785 struct ceph_inode_info *ci;
1786 int valid = 0;
1787
1788 if (!di->lease_shared_gen)
1789 return 0;
1790 if (IS_ROOT(dentry))
1791 return 0;
1792
1793 dir = d_inode(dentry->d_parent);
1794 ci = ceph_inode(dir);
1795
1796 if (spin_trylock(&ci->i_ceph_lock)) {
1797 if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
1798 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
1799 valid = 1;
1800 spin_unlock(&ci->i_ceph_lock);
1801 } else {
1802 valid = -EBUSY;
1803 }
1804
1805 if (!valid)
1806 di->lease_shared_gen = 0;
1807 return valid;
1808}
1809
2817b000
SW
1810/*
1811 * Check if directory-wide content lease/cap is valid.
1812 */
719a2514
YZ
1813static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
1814 struct ceph_mds_client *mdsc)
2817b000
SW
1815{
1816 struct ceph_inode_info *ci = ceph_inode(dir);
feab6ac2
YZ
1817 int valid;
1818 int shared_gen;
2817b000 1819
be655596 1820 spin_lock(&ci->i_ceph_lock);
feab6ac2 1821 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
719a2514
YZ
1822 if (valid) {
1823 __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
1824 shared_gen = atomic_read(&ci->i_shared_gen);
1825 }
be655596 1826 spin_unlock(&ci->i_ceph_lock);
feab6ac2
YZ
1827 if (valid) {
1828 struct ceph_dentry_info *di;
1829 spin_lock(&dentry->d_lock);
1830 di = ceph_dentry(dentry);
1831 if (dir == d_inode(dentry->d_parent) &&
1832 di && di->lease_shared_gen == shared_gen)
1833 __ceph_dentry_dir_lease_touch(di);
1834 else
1835 valid = 0;
1836 spin_unlock(&dentry->d_lock);
1837 }
1838 dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
1839 dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
2817b000
SW
1840 return valid;
1841}
1842
1843/*
1844 * Check if cached dentry can be trusted.
1845 */
0b728e19 1846static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
2817b000 1847{
bf1c6aca 1848 int valid = 0;
641235d8 1849 struct dentry *parent;
aa8dd816 1850 struct inode *dir, *inode;
719a2514 1851 struct ceph_mds_client *mdsc;
34286d66 1852
c5267601
JL
1853 valid = fscrypt_d_revalidate(dentry, flags);
1854 if (valid <= 0)
1855 return valid;
1856
f49d1e05 1857 if (flags & LOOKUP_RCU) {
52953d55 1858 parent = READ_ONCE(dentry->d_parent);
f49d1e05
JL
1859 dir = d_inode_rcu(parent);
1860 if (!dir)
1861 return -ECHILD;
aa8dd816 1862 inode = d_inode_rcu(dentry);
f49d1e05
JL
1863 } else {
1864 parent = dget_parent(dentry);
1865 dir = d_inode(parent);
aa8dd816 1866 inode = d_inode(dentry);
f49d1e05 1867 }
34286d66 1868
c5267601
JL
1869 dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry,
1870 dentry, inode, ceph_dentry(dentry)->offset,
1871 !!(dentry->d_flags & DCACHE_NOKEY_NAME));
2817b000 1872
719a2514
YZ
1873 mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
1874
2817b000
SW
1875 /* always trust cached snapped dentries, snapdir dentry */
1876 if (ceph_snap(dir) != CEPH_NOSNAP) {
a455589f 1877 dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
aa8dd816 1878 dentry, inode);
bf1c6aca 1879 valid = 1;
aa8dd816 1880 } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
bf1c6aca 1881 valid = 1;
14fb9c9e 1882 } else {
8f2a98ef 1883 valid = dentry_lease_is_valid(dentry, flags);
14fb9c9e
JL
1884 if (valid == -ECHILD)
1885 return valid;
719a2514 1886 if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
aa8dd816
AV
1887 if (inode)
1888 valid = ceph_is_any_caps(inode);
14fb9c9e
JL
1889 else
1890 valid = 1;
1891 }
2817b000 1892 }
2817b000 1893
200fd27c 1894 if (!valid) {
200fd27c 1895 struct ceph_mds_request *req;
1097680d
JL
1896 int op, err;
1897 u32 mask;
200fd27c 1898
f49d1e05
JL
1899 if (flags & LOOKUP_RCU)
1900 return -ECHILD;
1901
f9009efa
XL
1902 percpu_counter_inc(&mdsc->metric.d_lease_mis);
1903
200fd27c 1904 op = ceph_snap(dir) == CEPH_SNAPDIR ?
5eb9f604 1905 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
200fd27c
YZ
1906 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
1907 if (!IS_ERR(req)) {
1908 req->r_dentry = dget(dentry);
5eb9f604
JL
1909 req->r_num_caps = 2;
1910 req->r_parent = dir;
4c183472 1911 ihold(dir);
200fd27c
YZ
1912
1913 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
1914 if (ceph_security_xattr_wanted(dir))
1915 mask |= CEPH_CAP_XATTR_SHARED;
1097680d 1916 req->r_args.getattr.mask = cpu_to_le32(mask);
200fd27c 1917
200fd27c 1918 err = ceph_mdsc_do_request(mdsc, NULL, req);
c3f4688a
JL
1919 switch (err) {
1920 case 0:
1921 if (d_really_is_positive(dentry) &&
1922 d_inode(dentry) == req->r_target_inode)
1923 valid = 1;
1924 break;
1925 case -ENOENT:
1926 if (d_really_is_negative(dentry))
1927 valid = 1;
df561f66 1928 fallthrough;
c3f4688a
JL
1929 default:
1930 break;
200fd27c
YZ
1931 }
1932 ceph_mdsc_put_request(req);
1933 dout("d_revalidate %p lookup result=%d\n",
1934 dentry, err);
1935 }
f9009efa
XL
1936 } else {
1937 percpu_counter_inc(&mdsc->metric.d_lease_hit);
200fd27c
YZ
1938 }
1939
bf1c6aca 1940 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
37c4efc1 1941 if (!valid)
9215aeea 1942 ceph_dir_clear_complete(dir);
641235d8 1943
f49d1e05
JL
1944 if (!(flags & LOOKUP_RCU))
1945 dput(parent);
bf1c6aca 1946 return valid;
2817b000
SW
1947}
1948
1e9c2eb6
YZ
1949/*
1950 * Delete unused dentry that doesn't have valid lease
1951 *
1952 * Called under dentry->d_lock.
1953 */
1954static int ceph_d_delete(const struct dentry *dentry)
1955{
1956 struct ceph_dentry_info *di;
1957
1958 /* won't release caps */
1959 if (d_really_is_negative(dentry))
1960 return 0;
1961 if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
1962 return 0;
1963 /* vaild lease? */
1964 di = ceph_dentry(dentry);
1965 if (di) {
1966 if (__dentry_lease_is_valid(di))
1967 return 0;
1968 if (__dir_lease_try_check(dentry))
1969 return 0;
1970 }
1971 return 1;
1972}
1973
2817b000 1974/*
147851d2 1975 * Release our ceph_dentry_info.
2817b000 1976 */
147851d2 1977static void ceph_d_release(struct dentry *dentry)
2817b000
SW
1978{
1979 struct ceph_dentry_info *di = ceph_dentry(dentry);
f9009efa 1980 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
2817b000 1981
147851d2 1982 dout("d_release %p\n", dentry);
5b484a51 1983
f9009efa
XL
1984 atomic64_dec(&fsc->mdsc->metric.total_dentries);
1985
5b484a51 1986 spin_lock(&dentry->d_lock);
37c4efc1 1987 __dentry_lease_unlist(di);
5b484a51
JL
1988 dentry->d_fsdata = NULL;
1989 spin_unlock(&dentry->d_lock);
1990
7e65624d 1991 ceph_put_mds_session(di->lease_session);
3d8eb7a9 1992 kmem_cache_free(ceph_dentry_cachep, di);
2817b000
SW
1993}
1994
/*
 * When the VFS prunes a dentry from the cache, we need to clear the
 * complete flag on the parent directory.
 *
 * Called under dentry->d_lock.
 */
static void ceph_d_prune(struct dentry *dentry)
{
	struct ceph_inode_info *dir_ci;
	struct ceph_dentry_info *di;

	dout("ceph_d_prune %pd %p\n", dentry, dentry);

	/* do we have a valid parent? (root has no parent to mark) */
	if (IS_ROOT(dentry))
		return;

	/* we hold d_lock, so d_parent is stable */
	dir_ci = ceph_inode(d_inode(dentry->d_parent));
	/* snapdir parents are skipped — no completeness tracking for them */
	if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
		return;

	/* who calls d_delete() should also disable dcache readdir */
	if (d_really_is_negative(dentry))
		return;

	/* d_fsdata does not get cleared until d_release */
	if (!d_unhashed(dentry)) {
		/* still hashed: the dir can no longer be known-complete */
		__ceph_dir_clear_complete(dir_ci);
		return;
	}

	/* Disable dcache readdir just in case that someone called d_drop()
	 * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
	 * properly (dcache readdir is still enabled) */
	di = ceph_dentry(dentry);
	if (di->offset > 0 &&
	    di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen))
		__ceph_dir_clear_ordered(dir_ci);
}
2817b000
SW
2035
2036/*
2037 * read() on a dir. This weird interface hack only works if mounted
2038 * with '-o dirstat'.
2039 */
2040static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
2041 loff_t *ppos)
2042{
bb48bd4d 2043 struct ceph_dir_file_info *dfi = file->private_data;
496ad9aa 2044 struct inode *inode = file_inode(file);
2817b000
SW
2045 struct ceph_inode_info *ci = ceph_inode(inode);
2046 int left;
ae598083 2047 const int bufsize = 1024;
2817b000 2048
3d14c5d2 2049 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
2817b000
SW
2050 return -EISDIR;
2051
bb48bd4d
CX
2052 if (!dfi->dir_info) {
2053 dfi->dir_info = kmalloc(bufsize, GFP_KERNEL);
2054 if (!dfi->dir_info)
2817b000 2055 return -ENOMEM;
bb48bd4d
CX
2056 dfi->dir_info_len =
2057 snprintf(dfi->dir_info, bufsize,
2817b000
SW
2058 "entries: %20lld\n"
2059 " files: %20lld\n"
2060 " subdirs: %20lld\n"
2061 "rentries: %20lld\n"
2062 " rfiles: %20lld\n"
2063 " rsubdirs: %20lld\n"
2064 "rbytes: %20lld\n"
9bbeab41 2065 "rctime: %10lld.%09ld\n",
2817b000
SW
2066 ci->i_files + ci->i_subdirs,
2067 ci->i_files,
2068 ci->i_subdirs,
2069 ci->i_rfiles + ci->i_rsubdirs,
2070 ci->i_rfiles,
2071 ci->i_rsubdirs,
2072 ci->i_rbytes,
9bbeab41
AB
2073 ci->i_rctime.tv_sec,
2074 ci->i_rctime.tv_nsec);
2817b000
SW
2075 }
2076
bb48bd4d 2077 if (*ppos >= dfi->dir_info_len)
2817b000 2078 return 0;
bb48bd4d
CX
2079 size = min_t(unsigned, size, dfi->dir_info_len-*ppos);
2080 left = copy_to_user(buf, dfi->dir_info + *ppos, size);
2817b000
SW
2081 if (left == size)
2082 return -EFAULT;
2083 *ppos += (size - left);
2084 return size - left;
2085}
2086
2817b000 2087
2817b000 2088
6c0f3af7
SW
2089/*
2090 * Return name hash for a given dentry. This is dependent on
2091 * the parent directory's hash function.
2092 */
e5f86dc3 2093unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
6c0f3af7 2094{
6c0f3af7 2095 struct ceph_inode_info *dci = ceph_inode(dir);
76a495d6 2096 unsigned hash;
6c0f3af7
SW
2097
2098 switch (dci->i_dir_layout.dl_dir_hash) {
2099 case 0: /* for backward compat */
2100 case CEPH_STR_HASH_LINUX:
2101 return dn->d_name.hash;
2102
2103 default:
76a495d6
JL
2104 spin_lock(&dn->d_lock);
2105 hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
6c0f3af7 2106 dn->d_name.name, dn->d_name.len);
76a495d6
JL
2107 spin_unlock(&dn->d_lock);
2108 return hash;
6c0f3af7
SW
2109 }
2110}
2111
/* Adapt ceph_readdir() to the iterate_shared interface. */
WRAP_DIR_ITER(ceph_readdir) // FIXME!
/* File operations for regular ceph directories. */
const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,
	.iterate_shared = shared_ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.fsync = ceph_fsync,
	.lock = ceph_lock,
	.flock = ceph_flock,
};
2125
/* File operations for the .snap pseudo-directory (read-only subset). */
const struct file_operations ceph_snapdir_fops = {
	.iterate_shared = shared_ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
};
2132
/* Inode operations for regular ceph directories. */
const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.listxattr = ceph_listxattr,
	.get_inode_acl = ceph_get_acl,
	.set_acl = ceph_set_acl,
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
	.create = ceph_create,
	.atomic_open = ceph_atomic_open,
};
2151
/* Inode operations for the .snap pseudo-directory (snapshot create/
 * remove/rename map onto mkdir/rmdir/rename). */
const struct inode_operations ceph_snapdir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.mkdir = ceph_mkdir,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
};
2160
/* Dentry operations for all ceph dentries. */
const struct dentry_operations ceph_dentry_ops = {
	.d_revalidate = ceph_d_revalidate,
	.d_delete = ceph_d_delete,
	.d_release = ceph_d_release,
	.d_prune = ceph_d_prune,
	.d_init = ceph_d_init,
};