ceph: add ceph_cap_unlink_work to fire check_caps() immediately
[linux-2.6-block.git] / fs / ceph / caps.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
3d14c5d2 2#include <linux/ceph/ceph_debug.h>
a8599bd8
SW
3
4#include <linux/fs.h>
5#include <linux/kernel.h>
174cd4b1 6#include <linux/sched/signal.h>
5a0e3ad6 7#include <linux/slab.h>
a8599bd8
SW
8#include <linux/vmalloc.h>
9#include <linux/wait.h>
f1a3d572 10#include <linux/writeback.h>
176c77c9 11#include <linux/iversion.h>
5970e15d 12#include <linux/filelock.h>
a8599bd8
SW
13
14#include "super.h"
3d14c5d2 15#include "mds_client.h"
99ccbd22 16#include "cache.h"
2d332d5b 17#include "crypto.h"
3d14c5d2
YS
18#include <linux/ceph/decode.h>
19#include <linux/ceph/messenger.h>
a8599bd8
SW
20
21/*
22 * Capability management
23 *
24 * The Ceph metadata servers control client access to inode metadata
25 * and file data by issuing capabilities, granting clients permission
26 * to read and/or write both inode fields and file data to OSDs
27 * (storage nodes). Each capability consists of a set of bits
28 * indicating which operations are allowed.
29 *
30 * If the client holds a *_SHARED cap, the client has a coherent value
31 * that can be safely read from the cached inode.
32 *
33 * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
34 * client is allowed to change inode attributes (e.g., file size,
35 * mtime), note its dirty state in the ceph_cap, and asynchronously
36 * flush that metadata change to the MDS.
37 *
38 * In the event of a conflicting operation (perhaps by another
39 * client), the MDS will revoke the conflicting client capabilities.
40 *
41 * In order for a client to cache an inode, it must hold a capability
42 * with at least one MDS server. When inodes are released, release
43 * notifications are batched and periodically sent en masse to the MDS
44 * cluster to release server state.
45 */
46
0e294387 47static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
7bc00fdd
YZ
48static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
49 struct ceph_mds_session *session,
50 struct ceph_inode_info *ci,
51 u64 oldest_flush_tid);
a8599bd8
SW
52
/*
 * Scratch storage used by ceph_cap_string() to produce readable cap
 * strings for debugging output.  The buffers form a small ring so that
 * several formatted strings can be in use at the same time.
 */
#define MAX_CAP_STR 20
static DEFINE_SPINLOCK(cap_str_lock);	/* guards last_cap_str */
static int last_cap_str;		/* index of the next ring slot */
static char cap_str[MAX_CAP_STR][40];
60
61static char *gcap_string(char *s, int c)
62{
63 if (c & CEPH_CAP_GSHARED)
64 *s++ = 's';
65 if (c & CEPH_CAP_GEXCL)
66 *s++ = 'x';
67 if (c & CEPH_CAP_GCACHE)
68 *s++ = 'c';
69 if (c & CEPH_CAP_GRD)
70 *s++ = 'r';
71 if (c & CEPH_CAP_GWR)
72 *s++ = 'w';
73 if (c & CEPH_CAP_GBUFFER)
74 *s++ = 'b';
49a9f4f6
YZ
75 if (c & CEPH_CAP_GWREXTEND)
76 *s++ = 'a';
a8599bd8
SW
77 if (c & CEPH_CAP_GLAZYIO)
78 *s++ = 'l';
79 return s;
80}
81
82const char *ceph_cap_string(int caps)
83{
84 int i;
85 char *s;
86 int c;
87
88 spin_lock(&cap_str_lock);
89 i = last_cap_str++;
90 if (last_cap_str == MAX_CAP_STR)
91 last_cap_str = 0;
92 spin_unlock(&cap_str_lock);
93
94 s = cap_str[i];
95
96 if (caps & CEPH_CAP_PIN)
97 *s++ = 'p';
98
99 c = (caps >> CEPH_CAP_SAUTH) & 3;
100 if (c) {
101 *s++ = 'A';
102 s = gcap_string(s, c);
103 }
104
105 c = (caps >> CEPH_CAP_SLINK) & 3;
106 if (c) {
107 *s++ = 'L';
108 s = gcap_string(s, c);
109 }
110
111 c = (caps >> CEPH_CAP_SXATTR) & 3;
112 if (c) {
113 *s++ = 'X';
114 s = gcap_string(s, c);
115 }
116
117 c = caps >> CEPH_CAP_SFILE;
118 if (c) {
119 *s++ = 'F';
120 s = gcap_string(s, c);
121 }
122
123 if (s == cap_str[i])
124 *s++ = '-';
125 *s = 0;
126 return cap_str[i];
127}
128
37151668 129void ceph_caps_init(struct ceph_mds_client *mdsc)
a8599bd8 130{
37151668
YS
131 INIT_LIST_HEAD(&mdsc->caps_list);
132 spin_lock_init(&mdsc->caps_list_lock);
a8599bd8
SW
133}
134
37151668 135void ceph_caps_finalize(struct ceph_mds_client *mdsc)
a8599bd8
SW
136{
137 struct ceph_cap *cap;
138
37151668
YS
139 spin_lock(&mdsc->caps_list_lock);
140 while (!list_empty(&mdsc->caps_list)) {
141 cap = list_first_entry(&mdsc->caps_list,
142 struct ceph_cap, caps_item);
a8599bd8
SW
143 list_del(&cap->caps_item);
144 kmem_cache_free(ceph_cap_cachep, cap);
145 }
37151668
YS
146 mdsc->caps_total_count = 0;
147 mdsc->caps_avail_count = 0;
148 mdsc->caps_use_count = 0;
149 mdsc->caps_reserve_count = 0;
150 mdsc->caps_min_count = 0;
151 spin_unlock(&mdsc->caps_list_lock);
85ccce43
SW
152}
153
fe33032d
YZ
154void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
155 struct ceph_mount_options *fsopt)
85ccce43 156{
37151668 157 spin_lock(&mdsc->caps_list_lock);
fe33032d
YZ
158 mdsc->caps_min_count = fsopt->max_readdir;
159 if (mdsc->caps_min_count < 1024)
160 mdsc->caps_min_count = 1024;
161 mdsc->caps_use_max = fsopt->caps_max;
162 if (mdsc->caps_use_max > 0 &&
163 mdsc->caps_use_max < mdsc->caps_min_count)
164 mdsc->caps_use_max = mdsc->caps_min_count;
37151668 165 spin_unlock(&mdsc->caps_list_lock);
a8599bd8
SW
166}
167
7bf8f736
CX
168static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
169{
170 struct ceph_cap *cap;
171 int i;
172
173 if (nr_caps) {
174 BUG_ON(mdsc->caps_reserve_count < nr_caps);
175 mdsc->caps_reserve_count -= nr_caps;
176 if (mdsc->caps_avail_count >=
177 mdsc->caps_reserve_count + mdsc->caps_min_count) {
178 mdsc->caps_total_count -= nr_caps;
179 for (i = 0; i < nr_caps; i++) {
180 cap = list_first_entry(&mdsc->caps_list,
181 struct ceph_cap, caps_item);
182 list_del(&cap->caps_item);
183 kmem_cache_free(ceph_cap_cachep, cap);
184 }
185 } else {
186 mdsc->caps_avail_count += nr_caps;
187 }
188
38d46409
XL
189 doutc(mdsc->fsc->client,
190 "caps %d = %d used + %d resv + %d avail\n",
191 mdsc->caps_total_count, mdsc->caps_use_count,
192 mdsc->caps_reserve_count, mdsc->caps_avail_count);
7bf8f736
CX
193 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
194 mdsc->caps_reserve_count +
195 mdsc->caps_avail_count);
196 }
197}
198
e30ee581
ZZ
199/*
200 * Called under mdsc->mutex.
201 */
202int ceph_reserve_caps(struct ceph_mds_client *mdsc,
37151668 203 struct ceph_cap_reservation *ctx, int need)
a8599bd8 204{
38d46409 205 struct ceph_client *cl = mdsc->fsc->client;
e30ee581 206 int i, j;
a8599bd8
SW
207 struct ceph_cap *cap;
208 int have;
209 int alloc = 0;
e30ee581 210 int max_caps;
e5bc08d0 211 int err = 0;
e30ee581
ZZ
212 bool trimmed = false;
213 struct ceph_mds_session *s;
a8599bd8 214 LIST_HEAD(newcaps);
a8599bd8 215
38d46409 216 doutc(cl, "ctx=%p need=%d\n", ctx, need);
a8599bd8
SW
217
218 /* first reserve any caps that are already allocated */
37151668
YS
219 spin_lock(&mdsc->caps_list_lock);
220 if (mdsc->caps_avail_count >= need)
a8599bd8
SW
221 have = need;
222 else
37151668
YS
223 have = mdsc->caps_avail_count;
224 mdsc->caps_avail_count -= have;
225 mdsc->caps_reserve_count += have;
226 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
227 mdsc->caps_reserve_count +
228 mdsc->caps_avail_count);
229 spin_unlock(&mdsc->caps_list_lock);
a8599bd8 230
79cd674a 231 for (i = have; i < need; ) {
a8599bd8 232 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
79cd674a
CX
233 if (cap) {
234 list_add(&cap->caps_item, &newcaps);
235 alloc++;
236 i++;
237 continue;
238 }
239
240 if (!trimmed) {
241 for (j = 0; j < mdsc->max_sessions; j++) {
242 s = __ceph_lookup_mds_session(mdsc, j);
243 if (!s)
244 continue;
245 mutex_unlock(&mdsc->mutex);
246
247 mutex_lock(&s->s_mutex);
248 max_caps = s->s_nr_caps - (need - i);
249 ceph_trim_caps(mdsc, s, max_caps);
250 mutex_unlock(&s->s_mutex);
251
252 ceph_put_mds_session(s);
253 mutex_lock(&mdsc->mutex);
e30ee581 254 }
79cd674a
CX
255 trimmed = true;
256
257 spin_lock(&mdsc->caps_list_lock);
258 if (mdsc->caps_avail_count) {
259 int more_have;
260 if (mdsc->caps_avail_count >= need - i)
261 more_have = need - i;
262 else
263 more_have = mdsc->caps_avail_count;
264
265 i += more_have;
266 have += more_have;
267 mdsc->caps_avail_count -= more_have;
268 mdsc->caps_reserve_count += more_have;
269
270 }
271 spin_unlock(&mdsc->caps_list_lock);
272
273 continue;
e30ee581 274 }
79cd674a 275
38d46409
XL
276 pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need,
277 have + alloc);
e5bc08d0
CX
278 err = -ENOMEM;
279 break;
280 }
281
282 if (!err) {
283 BUG_ON(have + alloc != need);
284 ctx->count = need;
fe33032d 285 ctx->used = 0;
a8599bd8 286 }
a8599bd8 287
37151668
YS
288 spin_lock(&mdsc->caps_list_lock);
289 mdsc->caps_total_count += alloc;
290 mdsc->caps_reserve_count += alloc;
291 list_splice(&newcaps, &mdsc->caps_list);
a8599bd8 292
37151668
YS
293 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
294 mdsc->caps_reserve_count +
295 mdsc->caps_avail_count);
e5bc08d0
CX
296
297 if (err)
298 __ceph_unreserve_caps(mdsc, have + alloc);
299
37151668 300 spin_unlock(&mdsc->caps_list_lock);
a8599bd8 301
38d46409
XL
302 doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx,
303 mdsc->caps_total_count, mdsc->caps_use_count,
304 mdsc->caps_reserve_count, mdsc->caps_avail_count);
e5bc08d0 305 return err;
a8599bd8
SW
306}
307
7bf8f736 308void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
fe33032d 309 struct ceph_cap_reservation *ctx)
a8599bd8 310{
38d46409 311 struct ceph_client *cl = mdsc->fsc->client;
fe33032d
YZ
312 bool reclaim = false;
313 if (!ctx->count)
314 return;
315
38d46409 316 doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count);
7bf8f736
CX
317 spin_lock(&mdsc->caps_list_lock);
318 __ceph_unreserve_caps(mdsc, ctx->count);
319 ctx->count = 0;
fe33032d
YZ
320
321 if (mdsc->caps_use_max > 0 &&
322 mdsc->caps_use_count > mdsc->caps_use_max)
323 reclaim = true;
7bf8f736 324 spin_unlock(&mdsc->caps_list_lock);
fe33032d
YZ
325
326 if (reclaim)
327 ceph_reclaim_caps_nr(mdsc, ctx->used);
a8599bd8
SW
328}
329
d9df2783
YZ
330struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
331 struct ceph_cap_reservation *ctx)
a8599bd8 332{
38d46409 333 struct ceph_client *cl = mdsc->fsc->client;
a8599bd8
SW
334 struct ceph_cap *cap = NULL;
335
336 /* temporary, until we do something about cap import/export */
443b3760
SW
337 if (!ctx) {
338 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
339 if (cap) {
4d1d0534 340 spin_lock(&mdsc->caps_list_lock);
37151668
YS
341 mdsc->caps_use_count++;
342 mdsc->caps_total_count++;
4d1d0534 343 spin_unlock(&mdsc->caps_list_lock);
e327ce06
CX
344 } else {
345 spin_lock(&mdsc->caps_list_lock);
346 if (mdsc->caps_avail_count) {
347 BUG_ON(list_empty(&mdsc->caps_list));
348
349 mdsc->caps_avail_count--;
350 mdsc->caps_use_count++;
351 cap = list_first_entry(&mdsc->caps_list,
352 struct ceph_cap, caps_item);
353 list_del(&cap->caps_item);
354
355 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
356 mdsc->caps_reserve_count + mdsc->caps_avail_count);
357 }
358 spin_unlock(&mdsc->caps_list_lock);
443b3760 359 }
e327ce06 360
443b3760
SW
361 return cap;
362 }
a8599bd8 363
37151668 364 spin_lock(&mdsc->caps_list_lock);
38d46409
XL
365 doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx,
366 ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
367 mdsc->caps_reserve_count, mdsc->caps_avail_count);
a8599bd8 368 BUG_ON(!ctx->count);
37151668
YS
369 BUG_ON(ctx->count > mdsc->caps_reserve_count);
370 BUG_ON(list_empty(&mdsc->caps_list));
a8599bd8
SW
371
372 ctx->count--;
fe33032d 373 ctx->used++;
37151668
YS
374 mdsc->caps_reserve_count--;
375 mdsc->caps_use_count++;
a8599bd8 376
37151668 377 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
a8599bd8
SW
378 list_del(&cap->caps_item);
379
37151668
YS
380 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
381 mdsc->caps_reserve_count + mdsc->caps_avail_count);
382 spin_unlock(&mdsc->caps_list_lock);
a8599bd8
SW
383 return cap;
384}
385
37151668 386void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
a8599bd8 387{
38d46409
XL
388 struct ceph_client *cl = mdsc->fsc->client;
389
37151668 390 spin_lock(&mdsc->caps_list_lock);
38d46409
XL
391 doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap,
392 mdsc->caps_total_count, mdsc->caps_use_count,
393 mdsc->caps_reserve_count, mdsc->caps_avail_count);
37151668 394 mdsc->caps_use_count--;
a8599bd8 395 /*
85ccce43
SW
396 * Keep some preallocated caps around (ceph_min_count), to
397 * avoid lots of free/alloc churn.
a8599bd8 398 */
37151668
YS
399 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
400 mdsc->caps_min_count) {
401 mdsc->caps_total_count--;
a8599bd8
SW
402 kmem_cache_free(ceph_cap_cachep, cap);
403 } else {
37151668
YS
404 mdsc->caps_avail_count++;
405 list_add(&cap->caps_item, &mdsc->caps_list);
a8599bd8
SW
406 }
407
37151668
YS
408 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
409 mdsc->caps_reserve_count + mdsc->caps_avail_count);
410 spin_unlock(&mdsc->caps_list_lock);
a8599bd8
SW
411}
412
3d14c5d2 413void ceph_reservation_status(struct ceph_fs_client *fsc,
85ccce43
SW
414 int *total, int *avail, int *used, int *reserved,
415 int *min)
a8599bd8 416{
3d14c5d2 417 struct ceph_mds_client *mdsc = fsc->mdsc;
37151668 418
b884014a
CX
419 spin_lock(&mdsc->caps_list_lock);
420
a8599bd8 421 if (total)
37151668 422 *total = mdsc->caps_total_count;
a8599bd8 423 if (avail)
37151668 424 *avail = mdsc->caps_avail_count;
a8599bd8 425 if (used)
37151668 426 *used = mdsc->caps_use_count;
a8599bd8 427 if (reserved)
37151668 428 *reserved = mdsc->caps_reserve_count;
85ccce43 429 if (min)
37151668 430 *min = mdsc->caps_min_count;
b884014a
CX
431
432 spin_unlock(&mdsc->caps_list_lock);
a8599bd8
SW
433}
434
435/*
436 * Find ceph_cap for given mds, if any.
437 *
be655596 438 * Called with i_ceph_lock held.
a8599bd8 439 */
aaf67de7 440struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
a8599bd8
SW
441{
442 struct ceph_cap *cap;
443 struct rb_node *n = ci->i_caps.rb_node;
444
445 while (n) {
446 cap = rb_entry(n, struct ceph_cap, ci_node);
447 if (mds < cap->mds)
448 n = n->rb_left;
449 else if (mds > cap->mds)
450 n = n->rb_right;
451 else
452 return cap;
453 }
454 return NULL;
455}
456
2bc50259
GF
457struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
458{
459 struct ceph_cap *cap;
460
be655596 461 spin_lock(&ci->i_ceph_lock);
2bc50259 462 cap = __get_cap_for_mds(ci, mds);
be655596 463 spin_unlock(&ci->i_ceph_lock);
2bc50259
GF
464 return cap;
465}
466
a8599bd8 467/*
be655596 468 * Called under i_ceph_lock.
a8599bd8
SW
469 */
470static void __insert_cap_node(struct ceph_inode_info *ci,
471 struct ceph_cap *new)
472{
473 struct rb_node **p = &ci->i_caps.rb_node;
474 struct rb_node *parent = NULL;
475 struct ceph_cap *cap = NULL;
476
477 while (*p) {
478 parent = *p;
479 cap = rb_entry(parent, struct ceph_cap, ci_node);
480 if (new->mds < cap->mds)
481 p = &(*p)->rb_left;
482 else if (new->mds > cap->mds)
483 p = &(*p)->rb_right;
484 else
485 BUG();
486 }
487
488 rb_link_node(&new->ci_node, parent, p);
489 rb_insert_color(&new->ci_node, &ci->i_caps);
490}
491
492/*
493 * (re)set cap hold timeouts, which control the delayed release
494 * of unused caps back to the MDS. Should be called on cap use.
495 */
496static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
497 struct ceph_inode_info *ci)
498{
38d46409 499 struct inode *inode = &ci->netfs.inode;
fe33032d 500 struct ceph_mount_options *opt = mdsc->fsc->mount_options;
38d46409 501
a8599bd8 502 ci->i_hold_caps_max = round_jiffies(jiffies +
fe33032d 503 opt->caps_wanted_delay_max * HZ);
38d46409
XL
504 doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode,
505 ceph_vinop(inode), ci->i_hold_caps_max - jiffies);
a8599bd8
SW
506}
507
508/*
509 * (Re)queue cap at the end of the delayed cap release list.
510 *
511 * If I_FLUSH is set, leave the inode at the front of the list.
512 *
be655596 513 * Caller holds i_ceph_lock
a8599bd8
SW
514 * -> we take mdsc->cap_delay_lock
515 */
516static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
a0d93e32 517 struct ceph_inode_info *ci)
a8599bd8 518{
38d46409
XL
519 struct inode *inode = &ci->netfs.inode;
520
521 doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n",
522 inode, ceph_vinop(inode), ci->i_ceph_flags,
523 ci->i_hold_caps_max);
a8599bd8
SW
524 if (!mdsc->stopping) {
525 spin_lock(&mdsc->cap_delay_lock);
526 if (!list_empty(&ci->i_cap_delay_list)) {
527 if (ci->i_ceph_flags & CEPH_I_FLUSH)
528 goto no_change;
529 list_del_init(&ci->i_cap_delay_list);
530 }
a0d93e32 531 __cap_set_timeouts(mdsc, ci);
a8599bd8
SW
532 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
533no_change:
534 spin_unlock(&mdsc->cap_delay_lock);
535 }
536}
537
538/*
539 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
540 * indicating we should send a cap message to flush dirty metadata
541 * asap, and move to the front of the delayed cap list.
542 */
543static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
544 struct ceph_inode_info *ci)
545{
38d46409
XL
546 struct inode *inode = &ci->netfs.inode;
547
548 doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
a8599bd8
SW
549 spin_lock(&mdsc->cap_delay_lock);
550 ci->i_ceph_flags |= CEPH_I_FLUSH;
551 if (!list_empty(&ci->i_cap_delay_list))
552 list_del_init(&ci->i_cap_delay_list);
553 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
554 spin_unlock(&mdsc->cap_delay_lock);
555}
556
557/*
558 * Cancel delayed work on cap.
559 *
be655596 560 * Caller must hold i_ceph_lock.
a8599bd8
SW
561 */
562static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
563 struct ceph_inode_info *ci)
564{
38d46409
XL
565 struct inode *inode = &ci->netfs.inode;
566
567 doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
a8599bd8
SW
568 if (list_empty(&ci->i_cap_delay_list))
569 return;
570 spin_lock(&mdsc->cap_delay_lock);
571 list_del_init(&ci->i_cap_delay_list);
572 spin_unlock(&mdsc->cap_delay_lock);
573}
574
785892fe 575/* Common issue checks for add_cap, handle_cap_grant. */
a8599bd8
SW
576static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
577 unsigned issued)
578{
38d46409
XL
579 struct inode *inode = &ci->netfs.inode;
580 struct ceph_client *cl = ceph_inode_to_client(inode);
581
a8599bd8
SW
582 unsigned had = __ceph_caps_issued(ci, NULL);
583
785892fe
JL
584 lockdep_assert_held(&ci->i_ceph_lock);
585
a8599bd8
SW
586 /*
587 * Each time we receive FILE_CACHE anew, we increment
588 * i_rdcache_gen.
589 */
874c8ca1 590 if (S_ISREG(ci->netfs.inode.i_mode) &&
525d15e8 591 (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
99ccbd22 592 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
a8599bd8 593 ci->i_rdcache_gen++;
99ccbd22 594 }
a8599bd8
SW
595
596 /*
15b51bd6
YZ
597 * If FILE_SHARED is newly issued, mark dir not complete. We don't
598 * know what happened to this directory while we didn't have the cap.
599 * If FILE_SHARED is being revoked, also mark dir not complete. It
600 * stops on-going cached readdir.
a8599bd8 601 */
15b51bd6
YZ
602 if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
603 if (issued & CEPH_CAP_FILE_SHARED)
97aeb6bf 604 atomic_inc(&ci->i_shared_gen);
874c8ca1 605 if (S_ISDIR(ci->netfs.inode.i_mode)) {
38d46409 606 doutc(cl, " marking %p NOT complete\n", inode);
2f276c51 607 __ceph_dir_clear_complete(ci);
a8673d61 608 }
a8599bd8 609 }
785892fe
JL
610
611 /* Wipe saved layout if we're losing DIR_CREATE caps */
874c8ca1 612 if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
785892fe
JL
613 !(issued & CEPH_CAP_DIR_CREATE)) {
614 ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
615 memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
616 }
a8599bd8
SW
617}
618
1cf03a68
JL
619/**
620 * change_auth_cap_ses - move inode to appropriate lists when auth caps change
621 * @ci: inode to be moved
622 * @session: new auth caps session
623 */
e19feff9
XL
624void change_auth_cap_ses(struct ceph_inode_info *ci,
625 struct ceph_mds_session *session)
1cf03a68
JL
626{
627 lockdep_assert_held(&ci->i_ceph_lock);
628
629 if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
630 return;
631
632 spin_lock(&session->s_mdsc->cap_dirty_lock);
633 if (!list_empty(&ci->i_dirty_item))
634 list_move(&ci->i_dirty_item, &session->s_cap_dirty);
635 if (!list_empty(&ci->i_flushing_item))
636 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
637 spin_unlock(&session->s_mdsc->cap_dirty_lock);
638}
639
a8599bd8
SW
640/*
641 * Add a capability under the given MDS session.
642 *
354c63a0 643 * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
a8599bd8
SW
644 *
645 * @fmode is the open file mode, if we are opening a file, otherwise
646 * it is < 0. (This is so we can atomically add the cap and add an
647 * open file reference to it.)
648 */
d9df2783
YZ
649void ceph_add_cap(struct inode *inode,
650 struct ceph_mds_session *session, u64 cap_id,
135e671e 651 unsigned issued, unsigned wanted,
d9df2783
YZ
652 unsigned seq, unsigned mseq, u64 realmino, int flags,
653 struct ceph_cap **new_cap)
a8599bd8 654{
5995d90d 655 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
38d46409 656 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8 657 struct ceph_inode_info *ci = ceph_inode(inode);
a8599bd8
SW
658 struct ceph_cap *cap;
659 int mds = session->s_mds;
660 int actual_wanted;
606d1023 661 u32 gen;
a8599bd8 662
354c63a0
JL
663 lockdep_assert_held(&ci->i_ceph_lock);
664
38d46409
XL
665 doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode,
666 ceph_vinop(inode), session->s_mds, cap_id,
667 ceph_cap_string(issued), seq);
a8599bd8 668
52d60f8e 669 gen = atomic_read(&session->s_cap_gen);
606d1023 670
a8599bd8
SW
671 cap = __get_cap_for_mds(ci, mds);
672 if (!cap) {
d9df2783
YZ
673 cap = *new_cap;
674 *new_cap = NULL;
a8599bd8
SW
675
676 cap->issued = 0;
677 cap->implemented = 0;
678 cap->mds = mds;
679 cap->mds_wanted = 0;
964266cc 680 cap->mseq = 0;
a8599bd8
SW
681
682 cap->ci = ci;
683 __insert_cap_node(ci, cap);
684
a8599bd8
SW
685 /* add to session cap list */
686 cap->session = session;
687 spin_lock(&session->s_cap_lock);
688 list_add_tail(&cap->session_caps, &session->s_caps);
689 session->s_nr_caps++;
4f1d756d 690 atomic64_inc(&mdsc->metric.total_caps);
a8599bd8 691 spin_unlock(&session->s_cap_lock);
11df2dfb 692 } else {
32f6511a
YZ
693 spin_lock(&session->s_cap_lock);
694 list_move_tail(&cap->session_caps, &session->s_caps);
695 spin_unlock(&session->s_cap_lock);
696
606d1023 697 if (cap->cap_gen < gen)
d2f8bb27
YZ
698 cap->issued = cap->implemented = CEPH_CAP_PIN;
699
11df2dfb
YZ
700 /*
701 * auth mds of the inode changed. we received the cap export
702 * message, but still haven't received the cap import message.
703 * handle_cap_export() updated the new auth MDS' cap.
704 *
705 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
706 * a message that was send before the cap import message. So
707 * don't remove caps.
708 */
709 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
710 WARN_ON(cap != ci->i_auth_cap);
711 WARN_ON(cap->cap_id != cap_id);
712 seq = cap->seq;
713 mseq = cap->mseq;
714 issued |= cap->issued;
715 flags |= CEPH_CAP_FLAG_AUTH;
716 }
717 }
a8599bd8 718
7d9c9193
YZ
719 if (!ci->i_snap_realm ||
720 ((flags & CEPH_CAP_FLAG_AUTH) &&
721 realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
a8599bd8
SW
722 /*
723 * add this inode to the appropriate snap realm
724 */
725 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
726 realmino);
692e1715 727 if (realm)
0ba92e1c 728 ceph_change_snap_realm(inode, realm);
692e1715
JL
729 else
730 WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
731 __func__, realmino, ci->i_vino.ino,
732 ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
a8599bd8
SW
733 }
734
735 __check_cap_issue(ci, cap, issued);
736
737 /*
738 * If we are issued caps we don't want, or the mds' wanted
739 * value appears to be off, queue a check so we'll release
740 * later and/or update the mds wanted value.
741 */
742 actual_wanted = __ceph_caps_wanted(ci);
743 if ((wanted & ~actual_wanted) ||
744 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
38d46409
XL
745 doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n",
746 ceph_cap_string(issued), ceph_cap_string(wanted),
747 ceph_cap_string(actual_wanted));
a0d93e32 748 __cap_delay_requeue(mdsc, ci);
a8599bd8
SW
749 }
750
b8c2f3ae 751 if (flags & CEPH_CAP_FLAG_AUTH) {
d37b1d99 752 if (!ci->i_auth_cap ||
d9ffc4f7 753 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
1cf03a68
JL
754 if (ci->i_auth_cap &&
755 ci->i_auth_cap->session != cap->session)
756 change_auth_cap_ses(ci, cap->session);
b8c2f3ae 757 ci->i_auth_cap = cap;
d9ffc4f7
YZ
758 cap->mds_wanted = wanted;
759 }
11df2dfb
YZ
760 } else {
761 WARN_ON(ci->i_auth_cap == cap);
8a92a119 762 }
a8599bd8 763
38d46409
XL
764 doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n",
765 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
766 ceph_cap_string(issued|cap->issued), seq, mds);
a8599bd8
SW
767 cap->cap_id = cap_id;
768 cap->issued = issued;
769 cap->implemented |= issued;
d1b87809 770 if (ceph_seq_cmp(mseq, cap->mseq) > 0)
964266cc
YZ
771 cap->mds_wanted = wanted;
772 else
773 cap->mds_wanted |= wanted;
a8599bd8
SW
774 cap->seq = seq;
775 cap->issue_seq = seq;
776 cap->mseq = mseq;
606d1023 777 cap->cap_gen = gen;
f7913573 778 wake_up_all(&ci->i_cap_wq);
a8599bd8
SW
779}
780
781/*
782 * Return true if cap has not timed out and belongs to the current
783 * generation of the MDS session (i.e. has not gone 'stale' due to
784 * us losing touch with the mds).
785 */
786static int __cap_is_valid(struct ceph_cap *cap)
787{
38d46409
XL
788 struct inode *inode = &cap->ci->netfs.inode;
789 struct ceph_client *cl = cap->session->s_mdsc->fsc->client;
a8599bd8 790 unsigned long ttl;
cdac8303 791 u32 gen;
a8599bd8 792
52d60f8e 793 gen = atomic_read(&cap->session->s_cap_gen);
a8599bd8 794 ttl = cap->session->s_cap_ttl;
a8599bd8 795
685f9a5d 796 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
38d46409
XL
797 doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n",
798 inode, ceph_vinop(inode), cap,
799 ceph_cap_string(cap->issued), cap->cap_gen, gen);
a8599bd8
SW
800 return 0;
801 }
802
803 return 1;
804}
805
806/*
807 * Return set of valid cap bits issued to us. Note that caps time
808 * out, and may be invalidated in bulk if the client session times out
809 * and session->s_cap_gen is bumped.
810 */
811int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
812{
38d46409
XL
813 struct inode *inode = &ci->netfs.inode;
814 struct ceph_client *cl = ceph_inode_to_client(inode);
d9df2783 815 int have = ci->i_snap_caps;
a8599bd8
SW
816 struct ceph_cap *cap;
817 struct rb_node *p;
818
819 if (implemented)
820 *implemented = 0;
821 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
822 cap = rb_entry(p, struct ceph_cap, ci_node);
823 if (!__cap_is_valid(cap))
824 continue;
38d46409
XL
825 doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode,
826 ceph_vinop(inode), cap, ceph_cap_string(cap->issued));
a8599bd8
SW
827 have |= cap->issued;
828 if (implemented)
829 *implemented |= cap->implemented;
830 }
b1530f57
YZ
831 /*
832 * exclude caps issued by non-auth MDS, but are been revoking
833 * by the auth MDS. The non-auth MDS should be revoking/exporting
834 * these caps, but the message is delayed.
835 */
836 if (ci->i_auth_cap) {
837 cap = ci->i_auth_cap;
838 have &= ~cap->implemented | cap->issued;
839 }
a8599bd8
SW
840 return have;
841}
842
843/*
844 * Get cap bits issued by caps other than @ocap
845 */
846int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
847{
848 int have = ci->i_snap_caps;
849 struct ceph_cap *cap;
850 struct rb_node *p;
851
852 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
853 cap = rb_entry(p, struct ceph_cap, ci_node);
854 if (cap == ocap)
855 continue;
856 if (!__cap_is_valid(cap))
857 continue;
858 have |= cap->issued;
859 }
860 return have;
861}
862
863/*
864 * Move a cap to the end of the LRU (oldest caps at list head, newest
865 * at list tail).
866 */
867static void __touch_cap(struct ceph_cap *cap)
868{
38d46409 869 struct inode *inode = &cap->ci->netfs.inode;
a8599bd8 870 struct ceph_mds_session *s = cap->session;
38d46409 871 struct ceph_client *cl = s->s_mdsc->fsc->client;
a8599bd8 872
a8599bd8 873 spin_lock(&s->s_cap_lock);
d37b1d99 874 if (!s->s_cap_iterator) {
38d46409
XL
875 doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode,
876 ceph_vinop(inode), cap, s->s_mds);
5dacf091
SW
877 list_move_tail(&cap->session_caps, &s->s_caps);
878 } else {
38d46409
XL
879 doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n",
880 inode, ceph_vinop(inode), cap, s->s_mds);
5dacf091 881 }
a8599bd8
SW
882 spin_unlock(&s->s_cap_lock);
883}
884
885/*
886 * Check if we hold the given mask. If so, move the cap(s) to the
887 * front of their respective LRUs. (This is the preferred way for
888 * callers to check for caps they want.)
889 */
890int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
891{
38d46409
XL
892 struct inode *inode = &ci->netfs.inode;
893 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8
SW
894 struct ceph_cap *cap;
895 struct rb_node *p;
896 int have = ci->i_snap_caps;
897
898 if ((have & mask) == mask) {
38d46409
XL
899 doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n",
900 inode, ceph_vinop(inode), ceph_cap_string(have),
901 ceph_cap_string(mask));
a8599bd8
SW
902 return 1;
903 }
904
905 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
906 cap = rb_entry(p, struct ceph_cap, ci_node);
907 if (!__cap_is_valid(cap))
908 continue;
909 if ((cap->issued & mask) == mask) {
38d46409
XL
910 doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n",
911 inode, ceph_vinop(inode), cap,
912 ceph_cap_string(cap->issued),
913 ceph_cap_string(mask));
a8599bd8
SW
914 if (touch)
915 __touch_cap(cap);
916 return 1;
917 }
918
919 /* does a combination of caps satisfy mask? */
920 have |= cap->issued;
921 if ((have & mask) == mask) {
38d46409
XL
922 doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n",
923 inode, ceph_vinop(inode),
924 ceph_cap_string(cap->issued),
925 ceph_cap_string(mask));
a8599bd8
SW
926 if (touch) {
927 struct rb_node *q;
928
25985edc 929 /* touch this + preceding caps */
a8599bd8
SW
930 __touch_cap(cap);
931 for (q = rb_first(&ci->i_caps); q != p;
932 q = rb_next(q)) {
933 cap = rb_entry(q, struct ceph_cap,
934 ci_node);
935 if (!__cap_is_valid(cap))
936 continue;
9f8b72b3
XL
937 if (cap->issued & mask)
938 __touch_cap(cap);
a8599bd8
SW
939 }
940 }
941 return 1;
942 }
943 }
944
945 return 0;
946}
947
1af16d54
XL
948int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
949 int touch)
950{
5995d90d 951 struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
1af16d54
XL
952 int r;
953
954 r = __ceph_caps_issued_mask(ci, mask, touch);
955 if (r)
956 ceph_update_cap_hit(&fsc->mdsc->metric);
957 else
958 ceph_update_cap_mis(&fsc->mdsc->metric);
959 return r;
960}
961
a8599bd8
SW
962/*
963 * Return true if mask caps are currently being revoked by an MDS.
964 */
6ee6b953
YZ
965int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
966 struct ceph_cap *ocap, int mask)
a8599bd8 967{
a8599bd8
SW
968 struct ceph_cap *cap;
969 struct rb_node *p;
a8599bd8 970
a8599bd8
SW
971 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
972 cap = rb_entry(p, struct ceph_cap, ci_node);
9563f88c 973 if (cap != ocap &&
6ee6b953
YZ
974 (cap->implemented & ~cap->issued & mask))
975 return 1;
a8599bd8 976 }
6ee6b953
YZ
977 return 0;
978}
979
980int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
981{
874c8ca1 982 struct inode *inode = &ci->netfs.inode;
38d46409 983 struct ceph_client *cl = ceph_inode_to_client(inode);
6ee6b953
YZ
984 int ret;
985
986 spin_lock(&ci->i_ceph_lock);
987 ret = __ceph_caps_revoking_other(ci, NULL, mask);
be655596 988 spin_unlock(&ci->i_ceph_lock);
38d46409
XL
989 doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode),
990 ceph_cap_string(mask), ret);
a8599bd8
SW
991 return ret;
992}
993
994int __ceph_caps_used(struct ceph_inode_info *ci)
995{
996 int used = 0;
997 if (ci->i_pin_ref)
998 used |= CEPH_CAP_PIN;
999 if (ci->i_rd_ref)
1000 used |= CEPH_CAP_FILE_RD;
fdd4e158 1001 if (ci->i_rdcache_ref ||
874c8ca1
DH
1002 (S_ISREG(ci->netfs.inode.i_mode) &&
1003 ci->netfs.inode.i_data.nrpages))
a8599bd8
SW
1004 used |= CEPH_CAP_FILE_CACHE;
1005 if (ci->i_wr_ref)
1006 used |= CEPH_CAP_FILE_WR;
d3d0720d 1007 if (ci->i_wb_ref || ci->i_wrbuffer_ref)
a8599bd8 1008 used |= CEPH_CAP_FILE_BUFFER;
f85122af
JL
1009 if (ci->i_fx_ref)
1010 used |= CEPH_CAP_FILE_EXCL;
a8599bd8
SW
1011 return used;
1012}
1013
719a2514
YZ
1014#define FMODE_WAIT_BIAS 1000
1015
a8599bd8
SW
1016/*
1017 * wanted, by virtue of open file modes
1018 */
1019int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
1020{
719a2514
YZ
1021 const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
1022 const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
1023 const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
1024 const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
1025 struct ceph_mount_options *opt =
5995d90d 1026 ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
719a2514
YZ
1027 unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
1028 unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
1029
874c8ca1 1030 if (S_ISDIR(ci->netfs.inode.i_mode)) {
719a2514
YZ
1031 int want = 0;
1032
1033 /* use used_cutoff here, to keep dir's wanted caps longer */
1034 if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
1035 time_after(ci->i_last_rd, used_cutoff))
1036 want |= CEPH_CAP_ANY_SHARED;
1037
1038 if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
1039 time_after(ci->i_last_wr, used_cutoff)) {
1040 want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1041 if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1042 want |= CEPH_CAP_ANY_DIR_OPS;
1043 }
1044
1045 if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
1046 want |= CEPH_CAP_PIN;
1047
1048 return want;
1049 } else {
1050 int bits = 0;
1051
1052 if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
1053 if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
1054 time_after(ci->i_last_rd, used_cutoff))
1055 bits |= 1 << RD_SHIFT;
1056 } else if (time_after(ci->i_last_rd, idle_cutoff)) {
1057 bits |= 1 << RD_SHIFT;
1058 }
1059
1060 if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
1061 if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
1062 time_after(ci->i_last_wr, used_cutoff))
1063 bits |= 1 << WR_SHIFT;
1064 } else if (time_after(ci->i_last_wr, idle_cutoff)) {
1065 bits |= 1 << WR_SHIFT;
1066 }
1067
1068 /* check lazyio only when read/write is wanted */
1069 if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
1070 ci->i_nr_by_mode[LAZY_SHIFT] > 0)
1071 bits |= 1 << LAZY_SHIFT;
1072
1073 return bits ? ceph_caps_for_mode(bits >> 1) : 0;
774a6a11 1074 }
a8599bd8
SW
1075}
1076
525d15e8
YZ
1077/*
1078 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1079 */
1080int __ceph_caps_wanted(struct ceph_inode_info *ci)
1081{
1082 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
874c8ca1 1083 if (S_ISDIR(ci->netfs.inode.i_mode)) {
a25949b9
JL
1084 /* we want EXCL if holding caps of dir ops */
1085 if (w & CEPH_CAP_ANY_DIR_OPS)
1086 w |= CEPH_CAP_FILE_EXCL;
1087 } else {
525d15e8
YZ
1088 /* we want EXCL if dirty data */
1089 if (w & CEPH_CAP_FILE_BUFFER)
1090 w |= CEPH_CAP_FILE_EXCL;
1091 }
1092 return w;
1093}
1094
a8599bd8
SW
1095/*
1096 * Return caps we have registered with the MDS(s) as 'wanted'.
1097 */
c1944fed 1098int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
a8599bd8
SW
1099{
1100 struct ceph_cap *cap;
1101 struct rb_node *p;
1102 int mds_wanted = 0;
1103
1104 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1105 cap = rb_entry(p, struct ceph_cap, ci_node);
c1944fed 1106 if (check && !__cap_is_valid(cap))
a8599bd8 1107 continue;
a2550604
YZ
1108 if (cap == ci->i_auth_cap)
1109 mds_wanted |= cap->mds_wanted;
1110 else
1111 mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
a8599bd8
SW
1112 }
1113 return mds_wanted;
1114}
1115
9215aeea
YZ
1116int ceph_is_any_caps(struct inode *inode)
1117{
1118 struct ceph_inode_info *ci = ceph_inode(inode);
1119 int ret;
1120
1121 spin_lock(&ci->i_ceph_lock);
bd84fbcb 1122 ret = __ceph_is_any_real_caps(ci);
9215aeea
YZ
1123 spin_unlock(&ci->i_ceph_lock);
1124
1125 return ret;
1126}
1127
a8599bd8 1128/*
f818a736
SW
1129 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
1130 *
be655596 1131 * caller should hold i_ceph_lock.
a6369741 1132 * caller will not hold session s_mutex if called from destroy_inode.
a8599bd8 1133 */
a096b09a 1134void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
a8599bd8
SW
1135{
1136 struct ceph_mds_session *session = cap->session;
38d46409 1137 struct ceph_client *cl = session->s_mdsc->fsc->client;
a8599bd8 1138 struct ceph_inode_info *ci = cap->ci;
38d46409 1139 struct inode *inode = &ci->netfs.inode;
e5cafce3 1140 struct ceph_mds_client *mdsc;
f818a736 1141 int removed = 0;
a8599bd8 1142
e5cafce3
LH
1143 /* 'ci' being NULL means the remove have already occurred */
1144 if (!ci) {
38d46409 1145 doutc(cl, "inode is NULL\n");
e5cafce3
LH
1146 return;
1147 }
1148
a76d0a9c
XL
1149 lockdep_assert_held(&ci->i_ceph_lock);
1150
38d46409 1151 doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode));
a8599bd8 1152
5995d90d 1153 mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc;
e5cafce3 1154
ea60ed6f
LH
1155 /* remove from inode's cap rbtree, and clear auth cap */
1156 rb_erase(&cap->ci_node, &ci->i_caps);
a76d0a9c 1157 if (ci->i_auth_cap == cap)
ea60ed6f
LH
1158 ci->i_auth_cap = NULL;
1159
7c1332b8
SW
1160 /* remove from session list */
1161 spin_lock(&session->s_cap_lock);
1162 if (session->s_cap_iterator == cap) {
1163 /* not yet, we are iterating over this very cap */
38d46409
XL
1164 doutc(cl, "delaying %p removal from session %p\n", cap,
1165 cap->session);
7c1332b8
SW
1166 } else {
1167 list_del_init(&cap->session_caps);
1168 session->s_nr_caps--;
4f1d756d 1169 atomic64_dec(&mdsc->metric.total_caps);
7c1332b8 1170 cap->session = NULL;
f818a736 1171 removed = 1;
7c1332b8 1172 }
f818a736
SW
1173 /* protect backpointer with s_cap_lock: see iterate_session_caps */
1174 cap->ci = NULL;
745a8e3b
YZ
1175
1176 /*
1177 * s_cap_reconnect is protected by s_cap_lock. no one changes
1178 * s_cap_gen while session is in the reconnect state.
1179 */
1180 if (queue_release &&
52d60f8e
JL
1181 (!session->s_cap_reconnect ||
1182 cap->cap_gen == atomic_read(&session->s_cap_gen))) {
745a8e3b
YZ
1183 cap->queue_release = 1;
1184 if (removed) {
e3ec8d68 1185 __ceph_queue_cap_release(session, cap);
745a8e3b
YZ
1186 removed = 0;
1187 }
1188 } else {
1189 cap->queue_release = 0;
1190 }
1191 cap->cap_ino = ci->i_vino.ino;
1192
7c1332b8
SW
1193 spin_unlock(&session->s_cap_lock);
1194
f818a736 1195 if (removed)
37151668 1196 ceph_put_cap(mdsc, cap);
a8599bd8 1197
bd84fbcb
XL
1198 if (!__ceph_is_any_real_caps(ci)) {
1199 /* when reconnect denied, we remove session caps forcibly,
1200 * i_wr_ref can be non-zero. If there are ongoing write,
1201 * keep i_snap_realm.
1202 */
1203 if (ci->i_wr_ref == 0 && ci->i_snap_realm)
874c8ca1 1204 ceph_change_snap_realm(&ci->netfs.inode, NULL);
db40cc17 1205
a8599bd8 1206 __cap_delay_cancel(mdsc, ci);
bd84fbcb 1207 }
a8599bd8
SW
1208}
1209
197b7d79
XL
1210void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1211 bool queue_release)
a76d0a9c
XL
1212{
1213 struct ceph_inode_info *ci = cap->ci;
1214 struct ceph_fs_client *fsc;
1215
1216 /* 'ci' being NULL means the remove have already occurred */
1217 if (!ci) {
38d46409 1218 doutc(mdsc->fsc->client, "inode is NULL\n");
a76d0a9c
XL
1219 return;
1220 }
1221
1222 lockdep_assert_held(&ci->i_ceph_lock);
1223
5995d90d 1224 fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
a76d0a9c
XL
1225 WARN_ON_ONCE(ci->i_auth_cap == cap &&
1226 !list_empty(&ci->i_dirty_item) &&
1227 !fsc->blocklisted &&
874c8ca1 1228 !ceph_inode_is_shutdown(&ci->netfs.inode));
a76d0a9c
XL
1229
1230 __ceph_remove_cap(cap, queue_release);
1231}
1232
0ff8bfb3
JL
1233struct cap_msg_args {
1234 struct ceph_mds_session *session;
1235 u64 ino, cid, follows;
1236 u64 flush_tid, oldest_flush_tid, size, max_size;
1237 u64 xattr_version;
176c77c9 1238 u64 change_attr;
0ff8bfb3 1239 struct ceph_buffer *xattr_buf;
0a454bdd 1240 struct ceph_buffer *old_xattr_buf;
ec62b894 1241 struct timespec64 atime, mtime, ctime, btime;
0ff8bfb3
JL
1242 int op, caps, wanted, dirty;
1243 u32 seq, issue_seq, mseq, time_warp_seq;
1e4ef0c6 1244 u32 flags;
0ff8bfb3
JL
1245 kuid_t uid;
1246 kgid_t gid;
1247 umode_t mode;
1248 bool inline_data;
0a454bdd 1249 bool wake;
16be62fc 1250 bool encrypted;
2d332d5b 1251 u32 fscrypt_auth_len;
2d332d5b 1252 u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
0ff8bfb3
JL
1253};
1254
16d68903
JL
1255/* Marshal up the cap msg to the MDS */
1256static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
a8599bd8
SW
1257{
1258 struct ceph_mds_caps *fc;
e20d258d 1259 void *p;
38d46409
XL
1260 struct ceph_mds_client *mdsc = arg->session->s_mdsc;
1261 struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
1262
1263 doutc(mdsc->fsc->client,
1264 "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u"
1265 " tid %llu/%llu mseq %u follows %lld size %llu/%llu"
1266 " xattr_ver %llu xattr_len %d\n",
1267 ceph_cap_op_name(arg->op), arg->cid, arg->ino,
1268 ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
1269 ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
1270 arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
1271 arg->size, arg->max_size, arg->xattr_version,
1272 arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
a8599bd8 1273
2d332d5b 1274 msg->hdr.version = cpu_to_le16(12);
0ff8bfb3 1275 msg->hdr.tid = cpu_to_le64(arg->flush_tid);
a8599bd8 1276
6df058c0 1277 fc = msg->front.iov_base;
a8599bd8
SW
1278 memset(fc, 0, sizeof(*fc));
1279
0ff8bfb3
JL
1280 fc->cap_id = cpu_to_le64(arg->cid);
1281 fc->op = cpu_to_le32(arg->op);
1282 fc->seq = cpu_to_le32(arg->seq);
1283 fc->issue_seq = cpu_to_le32(arg->issue_seq);
1284 fc->migrate_seq = cpu_to_le32(arg->mseq);
1285 fc->caps = cpu_to_le32(arg->caps);
1286 fc->wanted = cpu_to_le32(arg->wanted);
1287 fc->dirty = cpu_to_le32(arg->dirty);
1288 fc->ino = cpu_to_le64(arg->ino);
1289 fc->snap_follows = cpu_to_le64(arg->follows);
1290
16be62fc
JL
1291#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1292 if (arg->encrypted)
1293 fc->size = cpu_to_le64(round_up(arg->size,
1294 CEPH_FSCRYPT_BLOCK_SIZE));
1295 else
1296#endif
1297 fc->size = cpu_to_le64(arg->size);
0ff8bfb3 1298 fc->max_size = cpu_to_le64(arg->max_size);
9bbeab41
AB
1299 ceph_encode_timespec64(&fc->mtime, &arg->mtime);
1300 ceph_encode_timespec64(&fc->atime, &arg->atime);
1301 ceph_encode_timespec64(&fc->ctime, &arg->ctime);
0ff8bfb3
JL
1302 fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
1303
1304 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
1305 fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
1306 fc->mode = cpu_to_le32(arg->mode);
1307
1308 fc->xattr_version = cpu_to_le64(arg->xattr_version);
1309 if (arg->xattr_buf) {
1310 msg->middle = ceph_buffer_get(arg->xattr_buf);
1311 fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1312 msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
9670079f
JL
1313 }
1314
e20d258d 1315 p = fc + 1;
43b29673 1316 /* flock buffer size (version 2) */
e20d258d 1317 ceph_encode_32(&p, 0);
43b29673 1318 /* inline version (version 4) */
0ff8bfb3 1319 ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
e20d258d
YZ
1320 /* inline data size */
1321 ceph_encode_32(&p, 0);
92475f05
JL
1322 /*
1323 * osd_epoch_barrier (version 5)
1324 * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
1325 * case it was recently changed
1326 */
1327 ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
43b29673 1328 /* oldest_flush_tid (version 6) */
0ff8bfb3 1329 ceph_encode_64(&p, arg->oldest_flush_tid);
e20d258d 1330
43b29673
JL
1331 /*
1332 * caller_uid/caller_gid (version 7)
1333 *
1334 * Currently, we don't properly track which caller dirtied the caps
1335 * last, and force a flush of them when there is a conflict. For now,
1336 * just set this to 0:0, to emulate how the MDS has worked up to now.
1337 */
1338 ceph_encode_32(&p, 0);
1339 ceph_encode_32(&p, 0);
1340
1341 /* pool namespace (version 8) (mds always ignores this) */
1342 ceph_encode_32(&p, 0);
1343
176c77c9 1344 /* btime and change_attr (version 9) */
ec62b894 1345 ceph_encode_timespec64(p, &arg->btime);
43b29673 1346 p += sizeof(struct ceph_timespec);
176c77c9 1347 ceph_encode_64(&p, arg->change_attr);
43b29673
JL
1348
1349 /* Advisory flags (version 10) */
1e4ef0c6 1350 ceph_encode_32(&p, arg->flags);
2d332d5b
JL
1351
1352 /* dirstats (version 11) - these are r/o on the client */
1353 ceph_encode_64(&p, 0);
1354 ceph_encode_64(&p, 0);
1355
1356#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
16be62fc
JL
1357 /*
1358 * fscrypt_auth and fscrypt_file (version 12)
1359 *
1360 * fscrypt_auth holds the crypto context (if any). fscrypt_file
1361 * tracks the real i_size as an __le64 field (and we use a rounded-up
1362 * i_size in the traditional size field).
1363 */
2d332d5b
JL
1364 ceph_encode_32(&p, arg->fscrypt_auth_len);
1365 ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len);
16be62fc
JL
1366 ceph_encode_32(&p, sizeof(__le64));
1367 ceph_encode_64(&p, arg->size);
2d332d5b
JL
1368#else /* CONFIG_FS_ENCRYPTION */
1369 ceph_encode_32(&p, 0);
1370 ceph_encode_32(&p, 0);
1371#endif /* CONFIG_FS_ENCRYPTION */
a8599bd8
SW
1372}
1373
1374/*
d6e47819 1375 * Queue cap releases when an inode is dropped from our cache.
a8599bd8 1376 */
d6e47819 1377void __ceph_remove_caps(struct ceph_inode_info *ci)
a8599bd8 1378{
197b7d79 1379 struct inode *inode = &ci->netfs.inode;
5995d90d 1380 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
a8599bd8
SW
1381 struct rb_node *p;
1382
d6e47819
YZ
1383 /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
1384 * may call __ceph_caps_issued_mask() on a freeing inode. */
1385 spin_lock(&ci->i_ceph_lock);
a8599bd8
SW
1386 p = rb_first(&ci->i_caps);
1387 while (p) {
1388 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
a8599bd8 1389 p = rb_next(p);
197b7d79 1390 ceph_remove_cap(mdsc, cap, true);
a8599bd8 1391 }
d6e47819 1392 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
1393}
1394
1395/*
0a454bdd
JL
1396 * Prepare to send a cap message to an MDS. Update the cap state, and populate
1397 * the arg struct with the parameters that will need to be sent. This should
1398 * be done under the i_ceph_lock to guard against changes to cap state.
a8599bd8
SW
1399 *
1400 * Make note of max_size reported/requested from mds, revoked caps
1401 * that have now been implemented.
a8599bd8 1402 */
0a454bdd
JL
1403static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
1404 int op, int flags, int used, int want, int retain,
1405 int flushing, u64 flush_tid, u64 oldest_flush_tid)
a8599bd8
SW
1406{
1407 struct ceph_inode_info *ci = cap->ci;
874c8ca1 1408 struct inode *inode = &ci->netfs.inode;
38d46409 1409 struct ceph_client *cl = ceph_inode_to_client(inode);
bb0581f0 1410 int held, revoking;
a8599bd8 1411
0a454bdd 1412 lockdep_assert_held(&ci->i_ceph_lock);
891f3f5a 1413
68c28323
SW
1414 held = cap->issued | cap->implemented;
1415 revoking = cap->implemented & ~cap->issued;
1416 retain &= ~revoking;
68c28323 1417
38d46409
XL
1418 doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n",
1419 inode, ceph_vinop(inode), cap, cap->session,
1420 ceph_cap_string(held), ceph_cap_string(held & retain),
1421 ceph_cap_string(revoking));
a8599bd8
SW
1422 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1423
a0d93e32 1424 ci->i_ceph_flags &= ~CEPH_I_FLUSH;
a8599bd8
SW
1425
1426 cap->issued &= retain; /* drop bits we don't want */
0a454bdd
JL
1427 /*
1428 * Wake up any waiters on wanted -> needed transition. This is due to
1429 * the weird transition from buffered to sync IO... we need to flush
1430 * dirty pages _before_ allowing sync writes to avoid reordering.
1431 */
1432 arg->wake = cap->implemented & ~cap->issued;
a8599bd8
SW
1433 cap->implemented &= cap->issued | used;
1434 cap->mds_wanted = want;
1435
0a454bdd
JL
1436 arg->session = cap->session;
1437 arg->ino = ceph_vino(inode).ino;
1438 arg->cid = cap->cap_id;
1439 arg->follows = flushing ? ci->i_head_snapc->seq : 0;
1440 arg->flush_tid = flush_tid;
1441 arg->oldest_flush_tid = oldest_flush_tid;
2d6795fb 1442 arg->size = i_size_read(inode);
0a454bdd
JL
1443 ci->i_reported_size = arg->size;
1444 arg->max_size = ci->i_wanted_max_size;
6f05b30e
YZ
1445 if (cap == ci->i_auth_cap) {
1446 if (want & CEPH_CAP_ANY_FILE_WR)
1447 ci->i_requested_max_size = arg->max_size;
1448 else
1449 ci->i_requested_max_size = 0;
1450 }
a8599bd8 1451
082afec9 1452 if (flushing & CEPH_CAP_XATTR_EXCL) {
0a454bdd
JL
1453 arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
1454 arg->xattr_version = ci->i_xattrs.version;
cda4672d 1455 arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob);
0ff8bfb3 1456 } else {
0a454bdd
JL
1457 arg->xattr_buf = NULL;
1458 arg->old_xattr_buf = NULL;
a8599bd8
SW
1459 }
1460
c453bdb5
JL
1461 arg->mtime = inode_get_mtime(inode);
1462 arg->atime = inode_get_atime(inode);
7795aef0 1463 arg->ctime = inode_get_ctime(inode);
0a454bdd
JL
1464 arg->btime = ci->i_btime;
1465 arg->change_attr = inode_peek_iversion_raw(inode);
0ff8bfb3 1466
0a454bdd
JL
1467 arg->op = op;
1468 arg->caps = cap->implemented;
1469 arg->wanted = want;
1470 arg->dirty = flushing;
0ff8bfb3 1471
0a454bdd
JL
1472 arg->seq = cap->seq;
1473 arg->issue_seq = cap->issue_seq;
1474 arg->mseq = cap->mseq;
1475 arg->time_warp_seq = ci->i_time_warp_seq;
0ff8bfb3 1476
0a454bdd
JL
1477 arg->uid = inode->i_uid;
1478 arg->gid = inode->i_gid;
1479 arg->mode = inode->i_mode;
0ff8bfb3 1480
0a454bdd 1481 arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
49ada6e8
YZ
1482 if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1483 !list_empty(&ci->i_cap_snaps)) {
1484 struct ceph_cap_snap *capsnap;
1485 list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1486 if (capsnap->cap_flush.tid)
1487 break;
1488 if (capsnap->need_flush) {
1489 flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1490 break;
1491 }
1492 }
1493 }
0a454bdd 1494 arg->flags = flags;
16be62fc 1495 arg->encrypted = IS_ENCRYPTED(inode);
2d332d5b
JL
1496#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1497 if (ci->fscrypt_auth_len &&
1498 WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
1499 /* Don't set this if it's too big */
1500 arg->fscrypt_auth_len = 0;
1501 } else {
1502 arg->fscrypt_auth_len = ci->fscrypt_auth_len;
1503 memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
1504 min_t(size_t, ci->fscrypt_auth_len,
1505 sizeof(arg->fscrypt_auth)));
1506 }
2d332d5b 1507#endif /* CONFIG_FS_ENCRYPTION */
0a454bdd 1508}
a8599bd8 1509
16be62fc 1510#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
2d332d5b 1511#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
16be62fc 1512 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8)
2d332d5b 1513
2d332d5b
JL
1514static inline int cap_msg_size(struct cap_msg_args *arg)
1515{
16be62fc 1516 return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len;
2d332d5b
JL
1517}
1518#else
16be62fc
JL
1519#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
1520 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
1521
2d332d5b
JL
1522static inline int cap_msg_size(struct cap_msg_args *arg)
1523{
1524 return CAP_MSG_FIXED_FIELDS;
1525}
1526#endif /* CONFIG_FS_ENCRYPTION */
1527
0a454bdd
JL
1528/*
1529 * Send a cap msg on the given inode.
1530 *
1531 * Caller should hold snap_rwsem (read), s_mutex.
1532 */
52311980 1533static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
0a454bdd 1534{
16d68903 1535 struct ceph_msg *msg;
874c8ca1 1536 struct inode *inode = &ci->netfs.inode;
38d46409 1537 struct ceph_client *cl = ceph_inode_to_client(inode);
12fe3dda 1538
2d332d5b
JL
1539 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
1540 false);
16d68903 1541 if (!msg) {
38d46409
XL
1542 pr_err_client(cl,
1543 "error allocating cap msg: ino (%llx.%llx)"
1544 " flushing %s tid %llu, requeuing cap.\n",
1545 ceph_vinop(inode), ceph_cap_string(arg->dirty),
1546 arg->flush_tid);
a0d93e32 1547 spin_lock(&ci->i_ceph_lock);
52311980 1548 __cap_delay_requeue(arg->session->s_mdsc, ci);
a0d93e32 1549 spin_unlock(&ci->i_ceph_lock);
16d68903 1550 return;
a8599bd8
SW
1551 }
1552
16d68903
JL
1553 encode_cap_msg(msg, arg);
1554 ceph_con_send(&arg->session->s_con, msg);
0a454bdd 1555 ceph_buffer_put(arg->old_xattr_buf);
cda4672d 1556 ceph_buffer_put(arg->xattr_buf);
0a454bdd
JL
1557 if (arg->wake)
1558 wake_up_all(&ci->i_cap_wq);
a8599bd8
SW
1559}
1560
0e294387
YZ
1561static inline int __send_flush_snap(struct inode *inode,
1562 struct ceph_mds_session *session,
1563 struct ceph_cap_snap *capsnap,
1564 u32 mseq, u64 oldest_flush_tid)
1565{
0ff8bfb3 1566 struct cap_msg_args arg;
16d68903
JL
1567 struct ceph_msg *msg;
1568
0ff8bfb3
JL
1569 arg.session = session;
1570 arg.ino = ceph_vino(inode).ino;
1571 arg.cid = 0;
1572 arg.follows = capsnap->follows;
1573 arg.flush_tid = capsnap->cap_flush.tid;
1574 arg.oldest_flush_tid = oldest_flush_tid;
1575
1576 arg.size = capsnap->size;
1577 arg.max_size = 0;
1578 arg.xattr_version = capsnap->xattr_version;
1579 arg.xattr_buf = capsnap->xattr_blob;
0a454bdd 1580 arg.old_xattr_buf = NULL;
0ff8bfb3
JL
1581
1582 arg.atime = capsnap->atime;
1583 arg.mtime = capsnap->mtime;
1584 arg.ctime = capsnap->ctime;
ec62b894 1585 arg.btime = capsnap->btime;
176c77c9 1586 arg.change_attr = capsnap->change_attr;
0ff8bfb3
JL
1587
1588 arg.op = CEPH_CAP_OP_FLUSHSNAP;
1589 arg.caps = capsnap->issued;
1590 arg.wanted = 0;
1591 arg.dirty = capsnap->dirty;
1592
1593 arg.seq = 0;
1594 arg.issue_seq = 0;
1595 arg.mseq = mseq;
1596 arg.time_warp_seq = capsnap->time_warp_seq;
1597
1598 arg.uid = capsnap->uid;
1599 arg.gid = capsnap->gid;
1600 arg.mode = capsnap->mode;
1601
1602 arg.inline_data = capsnap->inline_data;
1e4ef0c6 1603 arg.flags = 0;
0a454bdd 1604 arg.wake = false;
16be62fc 1605 arg.encrypted = IS_ENCRYPTED(inode);
0ff8bfb3 1606
16be62fc 1607 /* No fscrypt_auth changes from a capsnap.*/
2d332d5b 1608 arg.fscrypt_auth_len = 0;
2d332d5b
JL
1609
1610 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg),
1611 GFP_NOFS, false);
1612 if (!msg)
1613 return -ENOMEM;
0ff8bfb3 1614
16d68903
JL
1615 encode_cap_msg(msg, &arg);
1616 ceph_con_send(&arg.session->s_con, msg);
1617 return 0;
0e294387
YZ
1618}
1619
a8599bd8
SW
1620/*
1621 * When a snapshot is taken, clients accumulate dirty metadata on
1622 * inodes with capabilities in ceph_cap_snaps to describe the file
1623 * state at the time the snapshot was taken. This must be flushed
1624 * asynchronously back to the MDS once sync writes complete and dirty
1625 * data is written out.
1626 *
7732fe16 1627 * Called under i_ceph_lock.
a8599bd8 1628 */
ed9b430c
YZ
1629static void __ceph_flush_snaps(struct ceph_inode_info *ci,
1630 struct ceph_mds_session *session)
be655596
SW
1631 __releases(ci->i_ceph_lock)
1632 __acquires(ci->i_ceph_lock)
a8599bd8 1633{
874c8ca1 1634 struct inode *inode = &ci->netfs.inode;
ed9b430c 1635 struct ceph_mds_client *mdsc = session->s_mdsc;
38d46409 1636 struct ceph_client *cl = mdsc->fsc->client;
a8599bd8 1637 struct ceph_cap_snap *capsnap;
ed9b430c
YZ
1638 u64 oldest_flush_tid = 0;
1639 u64 first_tid = 1, last_tid = 0;
a8599bd8 1640
38d46409
XL
1641 doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode),
1642 session);
a8599bd8 1643
a8599bd8 1644 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
a8599bd8
SW
1645 /*
1646 * we need to wait for sync writes to complete and for dirty
1647 * pages to be written out.
1648 */
1649 if (capsnap->dirty_pages || capsnap->writing)
cfc0bf66 1650 break;
a8599bd8 1651
86056090
YZ
1652 /* should be removed by ceph_try_drop_cap_snap() */
1653 BUG_ON(!capsnap->need_flush);
819ccbfa 1654
e835124c 1655 /* only flush each capsnap once */
0e294387 1656 if (capsnap->cap_flush.tid > 0) {
38d46409 1657 doutc(cl, "already flushed %p, skipping\n", capsnap);
e835124c
SW
1658 continue;
1659 }
1660
553adfd9 1661 spin_lock(&mdsc->cap_dirty_lock);
0e294387
YZ
1662 capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
1663 list_add_tail(&capsnap->cap_flush.g_list,
1664 &mdsc->cap_flush_list);
ed9b430c
YZ
1665 if (oldest_flush_tid == 0)
1666 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
0e294387
YZ
1667 if (list_empty(&ci->i_flushing_item)) {
1668 list_add_tail(&ci->i_flushing_item,
1669 &session->s_cap_flushing);
1670 }
553adfd9
YZ
1671 spin_unlock(&mdsc->cap_dirty_lock);
1672
0e294387
YZ
1673 list_add_tail(&capsnap->cap_flush.i_list,
1674 &ci->i_cap_flush_list);
1675
ed9b430c
YZ
1676 if (first_tid == 1)
1677 first_tid = capsnap->cap_flush.tid;
1678 last_tid = capsnap->cap_flush.tid;
1679 }
1680
1681 ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
1682
1683 while (first_tid <= last_tid) {
1684 struct ceph_cap *cap = ci->i_auth_cap;
57a5df0e 1685 struct ceph_cap_flush *cf = NULL, *iter;
ed9b430c
YZ
1686 int ret;
1687
1688 if (!(cap && cap->session == session)) {
38d46409
XL
1689 doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n",
1690 inode, ceph_vinop(inode), cap, session->s_mds);
ed9b430c
YZ
1691 break;
1692 }
1693
1694 ret = -ENOENT;
57a5df0e
JK
1695 list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
1696 if (iter->tid >= first_tid) {
1697 cf = iter;
ed9b430c
YZ
1698 ret = 0;
1699 break;
1700 }
1701 }
1702 if (ret < 0)
1703 break;
1704
1705 first_tid = cf->tid + 1;
1706
1707 capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
805692d0 1708 refcount_inc(&capsnap->nref);
be655596 1709 spin_unlock(&ci->i_ceph_lock);
a8599bd8 1710
38d46409
XL
1711 doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode,
1712 ceph_vinop(inode), capsnap, cf->tid,
1713 ceph_cap_string(capsnap->dirty));
a8599bd8 1714
ed9b430c
YZ
1715 ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
1716 oldest_flush_tid);
1717 if (ret < 0) {
38d46409
XL
1718 pr_err_client(cl, "error sending cap flushsnap, "
1719 "ino (%llx.%llx) tid %llu follows %llu\n",
1720 ceph_vinop(inode), cf->tid,
1721 capsnap->follows);
ed9b430c 1722 }
a8599bd8 1723
ed9b430c 1724 ceph_put_cap_snap(capsnap);
be655596 1725 spin_lock(&ci->i_ceph_lock);
a8599bd8 1726 }
ed9b430c 1727}
a8599bd8 1728
ed9b430c
YZ
1729void ceph_flush_snaps(struct ceph_inode_info *ci,
1730 struct ceph_mds_session **psession)
1731{
874c8ca1 1732 struct inode *inode = &ci->netfs.inode;
5995d90d 1733 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
38d46409 1734 struct ceph_client *cl = ceph_inode_to_client(inode);
e4d2b16a 1735 struct ceph_mds_session *session = NULL;
409e873e 1736 bool need_put = false;
ed9b430c 1737 int mds;
e4d2b16a 1738
38d46409 1739 doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
e4d2b16a
YZ
1740 if (psession)
1741 session = *psession;
ed9b430c
YZ
1742retry:
1743 spin_lock(&ci->i_ceph_lock);
1744 if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
38d46409 1745 doutc(cl, " no capsnap needs flush, doing nothing\n");
ed9b430c
YZ
1746 goto out;
1747 }
1748 if (!ci->i_auth_cap) {
38d46409 1749 doutc(cl, " no auth cap (migrating?), doing nothing\n");
ed9b430c
YZ
1750 goto out;
1751 }
a8599bd8 1752
ed9b430c
YZ
1753 mds = ci->i_auth_cap->session->s_mds;
1754 if (session && session->s_mds != mds) {
38d46409 1755 doutc(cl, " oops, wrong session %p mutex\n", session);
a8599bd8 1756 ceph_put_mds_session(session);
ed9b430c
YZ
1757 session = NULL;
1758 }
1759 if (!session) {
1760 spin_unlock(&ci->i_ceph_lock);
1761 mutex_lock(&mdsc->mutex);
1762 session = __ceph_lookup_mds_session(mdsc, mds);
1763 mutex_unlock(&mdsc->mutex);
ed9b430c 1764 goto retry;
a8599bd8 1765 }
a8599bd8 1766
24d063ac 1767 // make sure flushsnap messages are sent in proper order.
054f8d41 1768 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
24d063ac 1769 __kick_flushing_caps(mdsc, session, ci, 0);
24d063ac 1770
ed9b430c
YZ
1771 __ceph_flush_snaps(ci, session);
1772out:
be655596 1773 spin_unlock(&ci->i_ceph_lock);
ed9b430c 1774
7732fe16 1775 if (psession)
ed9b430c 1776 *psession = session;
7732fe16 1777 else
ed9b430c 1778 ceph_put_mds_session(session);
ed9b430c
YZ
1779 /* we flushed them all; remove this inode from the queue */
1780 spin_lock(&mdsc->snap_flush_lock);
409e873e
XL
1781 if (!list_empty(&ci->i_snap_flush_item))
1782 need_put = true;
ed9b430c
YZ
1783 list_del_init(&ci->i_snap_flush_item);
1784 spin_unlock(&mdsc->snap_flush_lock);
409e873e
XL
1785
1786 if (need_put)
1787 iput(inode);
a8599bd8
SW
1788}
1789
76e3b390 1790/*
fca65b4a
SW
1791 * Mark caps dirty. If inode is newly dirty, return the dirty flags.
1792 * Caller is then responsible for calling __mark_inode_dirty with the
1793 * returned flags value.
76e3b390 1794 */
f66fd9f0
YZ
1795int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
1796 struct ceph_cap_flush **pcf)
76e3b390 1797{
640ef79d 1798 struct ceph_mds_client *mdsc =
5995d90d 1799 ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc;
874c8ca1 1800 struct inode *inode = &ci->netfs.inode;
38d46409 1801 struct ceph_client *cl = ceph_inode_to_client(inode);
76e3b390
SW
1802 int was = ci->i_dirty_caps;
1803 int dirty = 0;
1804
c7e4f85c
JL
1805 lockdep_assert_held(&ci->i_ceph_lock);
1806
571ade33 1807 if (!ci->i_auth_cap) {
38d46409
XL
1808 pr_warn_client(cl, "%p %llx.%llx mask %s, "
1809 "but no auth cap (session was closed?)\n",
1810 inode, ceph_vinop(inode),
1811 ceph_cap_string(mask));
571ade33
YZ
1812 return 0;
1813 }
1814
38d46409
XL
1815 doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode,
1816 ceph_vinop(inode), ceph_cap_string(mask),
1817 ceph_cap_string(was), ceph_cap_string(was | mask));
76e3b390
SW
1818 ci->i_dirty_caps |= mask;
1819 if (was == 0) {
1cf03a68
JL
1820 struct ceph_mds_session *session = ci->i_auth_cap->session;
1821
f66fd9f0
YZ
1822 WARN_ON_ONCE(ci->i_prealloc_cap_flush);
1823 swap(ci->i_prealloc_cap_flush, *pcf);
1824
604d1b02
YZ
1825 if (!ci->i_head_snapc) {
1826 WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
7d8cb26d
SW
1827 ci->i_head_snapc = ceph_get_snap_context(
1828 ci->i_snap_realm->cached_context);
604d1b02 1829 }
38d46409
XL
1830 doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n",
1831 inode, ceph_vinop(inode), ci->i_head_snapc,
1832 ci->i_auth_cap);
76e3b390
SW
1833 BUG_ON(!list_empty(&ci->i_dirty_item));
1834 spin_lock(&mdsc->cap_dirty_lock);
1cf03a68 1835 list_add(&ci->i_dirty_item, &session->s_cap_dirty);
76e3b390
SW
1836 spin_unlock(&mdsc->cap_dirty_lock);
1837 if (ci->i_flushing_caps == 0) {
3772d26d 1838 ihold(inode);
76e3b390
SW
1839 dirty |= I_DIRTY_SYNC;
1840 }
f66fd9f0
YZ
1841 } else {
1842 WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
76e3b390
SW
1843 }
1844 BUG_ON(list_empty(&ci->i_dirty_item));
1845 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1846 (mask & CEPH_CAP_FILE_BUFFER))
1847 dirty |= I_DIRTY_DATASYNC;
a0d93e32 1848 __cap_delay_requeue(mdsc, ci);
fca65b4a 1849 return dirty;
76e3b390
SW
1850}
1851
f66fd9f0
YZ
1852struct ceph_cap_flush *ceph_alloc_cap_flush(void)
1853{
b2f9fa1f
XL
1854 struct ceph_cap_flush *cf;
1855
1856 cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
05a444d3
CIK
1857 if (!cf)
1858 return NULL;
1859
b2f9fa1f
XL
1860 cf->is_capsnap = false;
1861 return cf;
f66fd9f0
YZ
1862}
1863
1864void ceph_free_cap_flush(struct ceph_cap_flush *cf)
1865{
1866 if (cf)
1867 kmem_cache_free(ceph_cap_flush_cachep, cf);
1868}
1869
a2971c8c
YZ
1870static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1871{
e4500b5e 1872 if (!list_empty(&mdsc->cap_flush_list)) {
a2971c8c 1873 struct ceph_cap_flush *cf =
e4500b5e
YZ
1874 list_first_entry(&mdsc->cap_flush_list,
1875 struct ceph_cap_flush, g_list);
a2971c8c
YZ
1876 return cf->tid;
1877 }
1878 return 0;
1879}
1880
c8799fc4
YZ
1881/*
1882 * Remove cap_flush from the mdsc's or inode's flushing cap list.
1883 * Return true if caller needs to wake up flush waiters.
1884 */
681ac634
JL
1885static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1886 struct ceph_cap_flush *cf)
c8799fc4
YZ
1887{
1888 struct ceph_cap_flush *prev;
1889 bool wake = cf->wake;
681ac634
JL
1890
1891 if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1892 prev = list_prev_entry(cf, g_list);
1893 prev->wake = true;
1894 wake = false;
1895 }
b2f9fa1f 1896 list_del_init(&cf->g_list);
681ac634
JL
1897 return wake;
1898}
1899
1900static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1901 struct ceph_cap_flush *cf)
1902{
1903 struct ceph_cap_flush *prev;
1904 bool wake = cf->wake;
1905
1906 if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1907 prev = list_prev_entry(cf, i_list);
1908 prev->wake = true;
1909 wake = false;
c8799fc4 1910 }
b2f9fa1f 1911 list_del_init(&cf->i_list);
c8799fc4
YZ
1912 return wake;
1913}
1914
a8599bd8
SW
1915/*
1916 * Add dirty inode to the flushing list. Assigned a seq number so we
1917 * can wait for caps to flush without starving.
cdc35f96 1918 *
9f3345d8 1919 * Called under i_ceph_lock. Returns the flush tid.
a8599bd8 1920 */
9f3345d8 1921static u64 __mark_caps_flushing(struct inode *inode,
c8799fc4 1922 struct ceph_mds_session *session, bool wake,
9f3345d8 1923 u64 *oldest_flush_tid)
a8599bd8 1924{
5995d90d 1925 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
38d46409 1926 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8 1927 struct ceph_inode_info *ci = ceph_inode(inode);
f66fd9f0 1928 struct ceph_cap_flush *cf = NULL;
cdc35f96 1929 int flushing;
50b885b9 1930
c7e4f85c 1931 lockdep_assert_held(&ci->i_ceph_lock);
cdc35f96 1932 BUG_ON(ci->i_dirty_caps == 0);
a8599bd8 1933 BUG_ON(list_empty(&ci->i_dirty_item));
f66fd9f0 1934 BUG_ON(!ci->i_prealloc_cap_flush);
cdc35f96
SW
1935
1936 flushing = ci->i_dirty_caps;
38d46409
XL
1937 doutc(cl, "flushing %s, flushing_caps %s -> %s\n",
1938 ceph_cap_string(flushing),
1939 ceph_cap_string(ci->i_flushing_caps),
1940 ceph_cap_string(ci->i_flushing_caps | flushing));
cdc35f96
SW
1941 ci->i_flushing_caps |= flushing;
1942 ci->i_dirty_caps = 0;
38d46409 1943 doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
cdc35f96 1944
f66fd9f0 1945 swap(cf, ci->i_prealloc_cap_flush);
553adfd9 1946 cf->caps = flushing;
c8799fc4 1947 cf->wake = wake;
553adfd9 1948
a8599bd8 1949 spin_lock(&mdsc->cap_dirty_lock);
afcdaea3
SW
1950 list_del_init(&ci->i_dirty_item);
1951
553adfd9 1952 cf->tid = ++mdsc->last_cap_flush_tid;
e4500b5e 1953 list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
a2971c8c 1954 *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
553adfd9 1955
a8599bd8
SW
1956 if (list_empty(&ci->i_flushing_item)) {
1957 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1958 mdsc->num_cap_flushing++;
a8599bd8
SW
1959 }
1960 spin_unlock(&mdsc->cap_dirty_lock);
cdc35f96 1961
e4500b5e 1962 list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
553adfd9 1963
9f3345d8 1964 return cf->tid;
a8599bd8
SW
1965}
1966
5ecad6fd
SW
1967/*
1968 * try to invalidate mapping pages without blocking.
1969 */
5ecad6fd 1970static int try_nonblocking_invalidate(struct inode *inode)
3eaf5aa1
JL
1971 __releases(ci->i_ceph_lock)
1972 __acquires(ci->i_ceph_lock)
5ecad6fd 1973{
38d46409 1974 struct ceph_client *cl = ceph_inode_to_client(inode);
5ecad6fd
SW
1975 struct ceph_inode_info *ci = ceph_inode(inode);
1976 u32 invalidating_gen = ci->i_rdcache_gen;
1977
be655596 1978 spin_unlock(&ci->i_ceph_lock);
400e1286 1979 ceph_fscache_invalidate(inode, false);
5ecad6fd 1980 invalidate_mapping_pages(&inode->i_data, 0, -1);
be655596 1981 spin_lock(&ci->i_ceph_lock);
5ecad6fd 1982
18a38193 1983 if (inode->i_data.nrpages == 0 &&
5ecad6fd
SW
1984 invalidating_gen == ci->i_rdcache_gen) {
1985 /* success. */
38d46409
XL
1986 doutc(cl, "%p %llx.%llx success\n", inode,
1987 ceph_vinop(inode));
cd045cb4
SW
1988 /* save any racing async invalidate some trouble */
1989 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
5ecad6fd
SW
1990 return 0;
1991 }
38d46409 1992 doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode));
5ecad6fd
SW
1993 return -1;
1994}
1995
efb0ca76
YZ
1996bool __ceph_should_report_size(struct ceph_inode_info *ci)
1997{
874c8ca1 1998 loff_t size = i_size_read(&ci->netfs.inode);
efb0ca76
YZ
1999 /* mds will adjust max size according to the reported size */
2000 if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
2001 return false;
2002 if (size >= ci->i_max_size)
2003 return true;
2004 /* half of previous max_size increment has been used */
2005 if (ci->i_max_size > ci->i_reported_size &&
2006 (size << 1) >= ci->i_max_size + ci->i_reported_size)
2007 return true;
2008 return false;
2009}
2010
a8599bd8
SW
2011/*
2012 * Swiss army knife function to examine currently used and wanted
2013 * versus held caps. Release, flush, ack revoked caps to mds as
2014 * appropriate.
2015 *
a8599bd8
SW
2016 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
2017 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
2018 * further delay.
2019 */
/*
 * ceph_check_caps - examine held/used/wanted caps and message the MDS.
 * @ci: inode whose caps are examined; i_ceph_lock is taken and released
 *      inside this function.
 * @flags: CHECK_CAPS_* bits.  CHECK_CAPS_FLUSH is also forced on when
 *         CEPH_I_FLUSH is set in ci->i_ceph_flags.
 *
 * Walks ci->i_caps and, for every cap that needs it, sends a
 * CEPH_CAP_OP_UPDATE message.  The lock is dropped around each send and
 * the whole scan is restarted afterwards; "mds" remembers the last rank
 * handled so the rescan skips caps that were already processed.
 * Writeback and invalidation that cannot be done under the lock are
 * queued once the lock has been released at the end.
 */
e4b731cc 2020void ceph_check_caps(struct ceph_inode_info *ci, int flags)
a8599bd8 2021{
874c8ca1 2022 struct inode *inode = &ci->netfs.inode;
2678da88 2023 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
38d46409 2024 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8 2025 struct ceph_cap *cap;
a2971c8c 2026 u64 flush_tid, oldest_flush_tid;
395c312b 2027 int file_wanted, used, cap_used;
cbd03635 2028 int issued, implemented, want, retain, revoking, flushing = 0;
a8599bd8
SW
2029 int mds = -1; /* keep track of how far we've gone through i_caps list
2030 to avoid an infinite loop on retry */
2031 struct rb_node *p;
3609404f 2032 bool queue_invalidate = false;
3609404f 2033 bool tried_invalidate = false;
a7437954 2034 bool queue_writeback = false;
e4b731cc 2035 struct ceph_mds_session *session = NULL;
6a92b08f 2036
be655596 2037 spin_lock(&ci->i_ceph_lock);
 /* While an async create is pending, only record that a check is owed. */
fbed7045 2038 if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
68c62bee
XL
2039 ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS;
2040
fbed7045
JL
2041 /* Don't send messages until we get async create reply */
2042 spin_unlock(&ci->i_ceph_lock);
fbed7045
JL
2043 return;
2044 }
2045
a8599bd8
SW
2046 if (ci->i_ceph_flags & CEPH_I_FLUSH)
2047 flags |= CHECK_CAPS_FLUSH;
 /* Every path that jumps back here holds i_ceph_lock. */
a8599bd8 2048retry:
c74d79af 2049 /* Caps wanted by virtue of active open files. */
a8599bd8 2050 file_wanted = __ceph_caps_file_wanted(ci);
c74d79af
JL
2051
2052 /* Caps which have active references against them */
a8599bd8 2053 used = __ceph_caps_used(ci);
c74d79af
JL
2054
2055 /*
2056 * "issued" represents the current caps that the MDS wants us to have.
2057 * "implemented" is the set that we have been granted, and includes the
2058 * ones that have not yet been returned to the MDS (the "revoking" set,
2059 * usually because they have outstanding references).
2060 */
cbd03635
SW
2061 issued = __ceph_caps_issued(ci, &implemented);
2062 revoking = implemented & ~issued;
a8599bd8 2063
41445999 2064 want = file_wanted;
c74d79af
JL
2065
2066 /* The ones we currently want to retain (may be adjusted below) */
41445999 2067 retain = file_wanted | used | CEPH_CAP_PIN;
a8599bd8 2068 if (!mdsc->stopping && inode->i_nlink > 0) {
41445999 2069 if (file_wanted) {
a8599bd8 2070 retain |= CEPH_CAP_ANY; /* be greedy */
32ec4397
YZ
2071 } else if (S_ISDIR(inode->i_mode) &&
2072 (issued & CEPH_CAP_FILE_SHARED) &&
8a2ac3a8 2073 __ceph_dir_is_complete(ci)) {
32ec4397
YZ
2074 /*
2075 * If a directory is complete, we want to keep
2076 * the exclusive cap. So that MDS does not end up
2077 * revoking the shared cap on every create/unlink
2078 * operation.
2079 */
a25949b9 2080 if (IS_RDONLY(inode)) {
8a2ac3a8 2081 want = CEPH_CAP_ANY_SHARED;
a25949b9 2082 } else {
719a2514 2083 want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
a25949b9 2084 }
32ec4397 2085 retain |= want;
a8599bd8 2086 } else {
32ec4397 2087
a8599bd8
SW
2088 retain |= CEPH_CAP_ANY_SHARED;
2089 /*
2090 * keep RD only if we didn't have the file open RW,
2091 * because then the mds would revoke it anyway to
2092 * journal max_size=0.
2093 */
2094 if (ci->i_max_size == 0)
2095 retain |= CEPH_CAP_ANY_RD;
2096 }
2097 }
2098
38d46409
XL
2099 doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
2100 "flushing %s issued %s revoking %s retain %s %s%s%s\n",
2101 inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
a8599bd8
SW
2102 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
2103 ceph_cap_string(ci->i_flushing_caps),
cbd03635 2104 ceph_cap_string(issued), ceph_cap_string(revoking),
a8599bd8
SW
2105 ceph_cap_string(retain),
2106 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
e027ddb6
XL
2107 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
2108 (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
a8599bd8
SW
2109
2110 /*
2111 * If we no longer need to hold onto our old caps, and we may
2112 * have cached pages, but don't want them, then try to invalidate.
2113 * If we fail, it's because pages are locked.... try again later.
2114 */
a0d93e32 2115 if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
525d15e8 2116 S_ISREG(inode->i_mode) &&
9abd4db7 2117 !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
fdd4e158 2118 inode->i_data.nrpages && /* have cached pages */
5e804ac4
YZ
2119 (revoking & (CEPH_CAP_FILE_CACHE|
2120 CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
a8599bd8 2121 !tried_invalidate) {
38d46409
XL
2122 doutc(cl, "trying to invalidate on %p %llx.%llx\n",
2123 inode, ceph_vinop(inode));
5ecad6fd 2124 if (try_nonblocking_invalidate(inode) < 0) {
38d46409 2125 doutc(cl, "queuing invalidate\n");
ee612d95
YZ
2126 queue_invalidate = true;
2127 ci->i_rdcache_revoking = ci->i_rdcache_gen;
a8599bd8 2128 }
 /* tried_invalidate ensures this branch runs at most once per call */
3609404f 2129 tried_invalidate = true;
6a92b08f 2130 goto retry;
a8599bd8
SW
2131 }
2132
a8599bd8 2133 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
d67c72e6 2134 int mflags = 0;
0a454bdd
JL
2135 struct cap_msg_args arg;
2136
a8599bd8 2137 cap = rb_entry(p, struct ceph_cap, ci_node);
a8599bd8
SW
2138
2139 /* avoid looping forever */
2140 if (mds >= cap->mds ||
2141 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
2142 continue;
2143
c74d79af
JL
2144 /*
2145 * If we have an auth cap, we don't need to consider any
2146 * overlapping caps as used.
2147 */
395c312b
YZ
2148 cap_used = used;
2149 if (ci->i_auth_cap && cap != ci->i_auth_cap)
2150 cap_used &= ~ci->i_auth_cap->issued;
2151
a8599bd8 2152 revoking = cap->implemented & ~cap->issued;
38d46409
XL
2153 doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n",
2154 cap->mds, cap, ceph_cap_string(cap_used),
2155 ceph_cap_string(cap->issued),
2156 ceph_cap_string(cap->implemented),
2157 ceph_cap_string(revoking));
a8599bd8 2158
902d6d01
XL
2159 /* completed revocation? going down and there are no caps? */
2160 if (revoking) {
2161 if ((revoking & cap_used) == 0) {
2162 doutc(cl, "completed revocation of %s\n",
2163 ceph_cap_string(cap->implemented & ~cap->issued));
2164 goto ack;
2165 }
2166
2167 /*
2168 * If the "i_wrbuffer_ref" was increased by mmap or generic
2169 * cache write just before the ceph_check_caps() is called,
2170 * the Fb capability revoking will fail this time. Then we
2171 * must wait for the BDI's delayed work to flush the dirty
2172 * pages and to release the "i_wrbuffer_ref", which will cost
2173 * at most 5 seconds. That means the MDS needs to wait at
2174 * most 5 seconds to finish the Fb capability's revocation.
2175 *
2176 * Let's queue a writeback for it.
2177 */
2178 if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
2179 (revoking & CEPH_CAP_FILE_BUFFER))
2180 queue_writeback = true;
2181 }
2182
a8599bd8
SW
2183 if (cap == ci->i_auth_cap &&
2184 (cap->issued & CEPH_CAP_FILE_WR)) {
2185 /* request larger max_size from MDS? */
2186 if (ci->i_wanted_max_size > ci->i_max_size &&
2187 ci->i_wanted_max_size > ci->i_requested_max_size) {
38d46409 2188 doutc(cl, "requesting new max_size\n");
a8599bd8
SW
2189 goto ack;
2190 }
2191
2192 /* approaching file_max? */
efb0ca76 2193 if (__ceph_should_report_size(ci)) {
38d46409 2194 doutc(cl, "i_size approaching max_size\n");
a8599bd8
SW
2195 goto ack;
2196 }
2197 }
2198 /* flush anything dirty? */
7bc00fdd
YZ
2199 if (cap == ci->i_auth_cap) {
2200 if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
38d46409 2201 doutc(cl, "flushing dirty caps\n");
7bc00fdd
YZ
2202 goto ack;
2203 }
2204 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
38d46409 2205 doutc(cl, "flushing snap caps\n");
7bc00fdd
YZ
2206 goto ack;
2207 }
a8599bd8
SW
2208 }
2209
a8599bd8 2210 /* want more caps from mds? */
0aa971b6
YZ
2211 if (want & ~cap->mds_wanted) {
2212 if (want & ~(cap->mds_wanted | cap->issued))
2213 goto ack;
2214 if (!__cap_is_valid(cap))
2215 goto ack;
2216 }
a8599bd8
SW
2217
2218 /* things we might delay */
fdac94fa 2219 if ((cap->issued & ~retain) == 0)
a8599bd8
SW
2220 continue; /* nope, all good */
2221
 /*
  * A message is needed for this cap.  Swap the session reference held
  * in "session" for one on this cap's session.
  */
a8599bd8 2222ack:
6a92b08f
JL
2223 ceph_put_mds_session(session);
2224 session = ceph_get_mds_session(cap->session);
7bc00fdd
YZ
2225
2226 /* kick flushing and flush snaps before sending normal
2227 * cap message */
2228 if (cap == ci->i_auth_cap &&
2229 (ci->i_ceph_flags &
2230 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
054f8d41 2231 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
24d063ac 2232 __kick_flushing_caps(mdsc, session, ci, 0);
ed9b430c
YZ
2233 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2234 __ceph_flush_snaps(ci, session);
2235
6a92b08f 2236 goto retry;
a8599bd8
SW
2237 }
2238
 /*
  * Dirty caps on the auth cap become a new flush; otherwise the
  * message carries no flush and only the oldest pending tid.
  */
553adfd9 2239 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
9f3345d8
JL
2240 flushing = ci->i_dirty_caps;
2241 flush_tid = __mark_caps_flushing(inode, session, false,
2242 &oldest_flush_tid);
d67c72e6
JL
2243 if (flags & CHECK_CAPS_FLUSH &&
2244 list_empty(&session->s_cap_dirty))
2245 mflags |= CEPH_CLIENT_CAPS_SYNC;
553adfd9 2246 } else {
24be0c48 2247 flushing = 0;
553adfd9 2248 flush_tid = 0;
a2971c8c
YZ
2249 spin_lock(&mdsc->cap_dirty_lock);
2250 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2251 spin_unlock(&mdsc->cap_dirty_lock);
553adfd9 2252 }
a8599bd8
SW
2253
2254 mds = cap->mds; /* remember mds, so we don't repeat */
a8599bd8 2255
d67c72e6
JL
2256 __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
2257 want, retain, flushing, flush_tid, oldest_flush_tid);
0a454bdd 2258
 /* The message is prepared under the lock but sent without it. */
6a92b08f 2259 spin_unlock(&ci->i_ceph_lock);
52311980 2260 __send_cap(&arg, ci);
6a92b08f 2261 spin_lock(&ci->i_ceph_lock);
0a454bdd 2262
be655596 2263 goto retry; /* retake i_ceph_lock and restart our cap scan. */
a8599bd8
SW
2264 }
2265
a0d93e32
YZ
2266 /* periodically re-calculate caps wanted by open files */
2267 if (__ceph_is_any_real_caps(ci) &&
2268 list_empty(&ci->i_cap_delay_list) &&
2269 (file_wanted & ~CEPH_CAP_PIN) &&
2270 !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
2271 __cap_delay_requeue(mdsc, ci);
719a2514 2272 }
a8599bd8 2273
be655596 2274 spin_unlock(&ci->i_ceph_lock);
a8599bd8 2275
 /* Deferred work is issued only after i_ceph_lock has been dropped. */
6a92b08f 2276 ceph_put_mds_session(session);
a7437954
XL
2277 if (queue_writeback)
2278 ceph_queue_writeback(inode);
cbd03635 2279 if (queue_invalidate)
3c6f6b79 2280 ceph_queue_invalidate(inode);
a8599bd8
SW
2281}
2282
a8599bd8
SW
2283/*
2284 * Try to flush dirty caps back to the auth mds.
2285 */
553adfd9 2286static int try_flush_caps(struct inode *inode, u64 *ptid)
a8599bd8 2287{
5995d90d 2288 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
a8599bd8 2289 struct ceph_inode_info *ci = ceph_inode(inode);
89b52fe1 2290 int flushing = 0;
a2971c8c 2291 u64 flush_tid = 0, oldest_flush_tid = 0;
a8599bd8 2292
be655596 2293 spin_lock(&ci->i_ceph_lock);
d6cee9db 2294retry_locked:
a8599bd8
SW
2295 if (ci->i_dirty_caps && ci->i_auth_cap) {
2296 struct ceph_cap *cap = ci->i_auth_cap;
0a454bdd 2297 struct cap_msg_args arg;
0449a352 2298 struct ceph_mds_session *session = cap->session;
a8599bd8 2299
0449a352 2300 if (session->s_state < CEPH_MDS_SESSION_OPEN) {
6c2838fb 2301 spin_unlock(&ci->i_ceph_lock);
a8599bd8 2302 goto out;
6c2838fb 2303 }
a8599bd8 2304
d6cee9db
YZ
2305 if (ci->i_ceph_flags &
2306 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
2307 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2308 __kick_flushing_caps(mdsc, session, ci, 0);
2309 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2310 __ceph_flush_snaps(ci, session);
2311 goto retry_locked;
2312 }
2313
9f3345d8
JL
2314 flushing = ci->i_dirty_caps;
2315 flush_tid = __mark_caps_flushing(inode, session, true,
2316 &oldest_flush_tid);
a8599bd8 2317
0a454bdd 2318 __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
a0d93e32
YZ
2319 __ceph_caps_used(ci), __ceph_caps_wanted(ci),
2320 (cap->issued | cap->implemented),
2321 flushing, flush_tid, oldest_flush_tid);
0a454bdd
JL
2322 spin_unlock(&ci->i_ceph_lock);
2323
52311980 2324 __send_cap(&arg, ci);
553adfd9 2325 } else {
e4500b5e 2326 if (!list_empty(&ci->i_cap_flush_list)) {
553adfd9 2327 struct ceph_cap_flush *cf =
e4500b5e 2328 list_last_entry(&ci->i_cap_flush_list,
c8799fc4
YZ
2329 struct ceph_cap_flush, i_list);
2330 cf->wake = true;
553adfd9
YZ
2331 flush_tid = cf->tid;
2332 }
2333 flushing = ci->i_flushing_caps;
2334 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
2335 }
2336out:
553adfd9 2337 *ptid = flush_tid;
a8599bd8
SW
2338 return flushing;
2339}
2340
2341/*
2342 * Return true if we've flushed caps through the given flush_tid.
2343 */
553adfd9 2344static int caps_are_flushed(struct inode *inode, u64 flush_tid)
a8599bd8
SW
2345{
2346 struct ceph_inode_info *ci = ceph_inode(inode);
553adfd9 2347 int ret = 1;
a8599bd8 2348
be655596 2349 spin_lock(&ci->i_ceph_lock);
e4500b5e
YZ
2350 if (!list_empty(&ci->i_cap_flush_list)) {
2351 struct ceph_cap_flush * cf =
2352 list_first_entry(&ci->i_cap_flush_list,
2353 struct ceph_cap_flush, i_list);
553adfd9 2354 if (cf->tid <= flush_tid)
a8599bd8 2355 ret = 0;
89b52fe1 2356 }
be655596 2357 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
2358 return ret;
2359}
2360
da819c81 2361/*
ae067063 2362 * flush the mdlog and wait for any unsafe requests to complete.
da819c81 2363 */
ae067063 2364static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
da819c81 2365{
5995d90d 2366 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
38d46409 2367 struct ceph_client *cl = ceph_inode_to_client(inode);
da819c81 2368 struct ceph_inode_info *ci = ceph_inode(inode);
68cd5b4b
YZ
2369 struct ceph_mds_request *req1 = NULL, *req2 = NULL;
2370 int ret, err = 0;
da819c81
YZ
2371
2372 spin_lock(&ci->i_unsafe_lock);
68cd5b4b
YZ
2373 if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
2374 req1 = list_last_entry(&ci->i_unsafe_dirops,
2375 struct ceph_mds_request,
2376 r_unsafe_dir_item);
2377 ceph_mdsc_get_request(req1);
2378 }
2379 if (!list_empty(&ci->i_unsafe_iops)) {
2380 req2 = list_last_entry(&ci->i_unsafe_iops,
2381 struct ceph_mds_request,
2382 r_unsafe_target_item);
2383 ceph_mdsc_get_request(req2);
2384 }
2385 spin_unlock(&ci->i_unsafe_lock);
da819c81 2386
e1a4541e
XL
2387 /*
2388 * Trigger to flush the journal logs in all the relevant MDSes
2389 * manually, or in the worst case we must wait at most 5 seconds
2390 * to wait the journal logs to be flushed by the MDSes periodically.
2391 */
5bd76b8d 2392 if (req1 || req2) {
e1a4541e 2393 struct ceph_mds_request *req;
5bd76b8d
XL
2394 struct ceph_mds_session **sessions;
2395 struct ceph_mds_session *s;
2396 unsigned int max_sessions;
e1a4541e
XL
2397 int i;
2398
5bd76b8d
XL
2399 mutex_lock(&mdsc->mutex);
2400 max_sessions = mdsc->max_sessions;
2401
aa1d6272 2402 sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
89d43d05 2403 if (!sessions) {
5bd76b8d 2404 mutex_unlock(&mdsc->mutex);
89d43d05
XL
2405 err = -ENOMEM;
2406 goto out;
2407 }
e1a4541e
XL
2408
2409 spin_lock(&ci->i_unsafe_lock);
2410 if (req1) {
2411 list_for_each_entry(req, &ci->i_unsafe_dirops,
2412 r_unsafe_dir_item) {
2413 s = req->r_session;
7acae618
XL
2414 if (!s)
2415 continue;
e1a4541e
XL
2416 if (!sessions[s->s_mds]) {
2417 s = ceph_get_mds_session(s);
2418 sessions[s->s_mds] = s;
2419 }
2420 }
2421 }
2422 if (req2) {
2423 list_for_each_entry(req, &ci->i_unsafe_iops,
2424 r_unsafe_target_item) {
2425 s = req->r_session;
7acae618
XL
2426 if (!s)
2427 continue;
e1a4541e
XL
2428 if (!sessions[s->s_mds]) {
2429 s = ceph_get_mds_session(s);
2430 sessions[s->s_mds] = s;
2431 }
2432 }
2433 }
2434 spin_unlock(&ci->i_unsafe_lock);
2435
2436 /* the auth MDS */
2437 spin_lock(&ci->i_ceph_lock);
2438 if (ci->i_auth_cap) {
5bd76b8d
XL
2439 s = ci->i_auth_cap->session;
2440 if (!sessions[s->s_mds])
2441 sessions[s->s_mds] = ceph_get_mds_session(s);
e1a4541e
XL
2442 }
2443 spin_unlock(&ci->i_ceph_lock);
5bd76b8d 2444 mutex_unlock(&mdsc->mutex);
e1a4541e
XL
2445
2446 /* send flush mdlog request to MDSes */
89d43d05 2447 for (i = 0; i < max_sessions; i++) {
e1a4541e
XL
2448 s = sessions[i];
2449 if (s) {
2450 send_flush_mdlog(s);
2451 ceph_put_mds_session(s);
2452 }
2453 }
2454 kfree(sessions);
2455 }
2456
38d46409
XL
2457 doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode,
2458 ceph_vinop(inode), req1 ? req1->r_tid : 0ULL,
2459 req2 ? req2->r_tid : 0ULL);
68cd5b4b
YZ
2460 if (req1) {
2461 ret = !wait_for_completion_timeout(&req1->r_safe_completion,
2462 ceph_timeout_jiffies(req1->r_timeout));
da819c81 2463 if (ret)
68cd5b4b 2464 err = -EIO;
68cd5b4b
YZ
2465 }
2466 if (req2) {
2467 ret = !wait_for_completion_timeout(&req2->r_safe_completion,
2468 ceph_timeout_jiffies(req2->r_timeout));
2469 if (ret)
2470 err = -EIO;
68cd5b4b 2471 }
89d43d05
XL
2472
2473out:
2474 if (req1)
2475 ceph_mdsc_put_request(req1);
2476 if (req2)
2477 ceph_mdsc_put_request(req2);
68cd5b4b 2478 return err;
da819c81
YZ
2479}
2480
02c24a82 2481int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
a8599bd8 2482{
7ea80859 2483 struct inode *inode = file->f_mapping->host;
a8599bd8 2484 struct ceph_inode_info *ci = ceph_inode(inode);
38d46409 2485 struct ceph_client *cl = ceph_inode_to_client(inode);
553adfd9 2486 u64 flush_tid;
f4b97866 2487 int ret, err;
a8599bd8
SW
2488 int dirty;
2489
38d46409
XL
2490 doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode),
2491 datasync ? " datasync" : "");
9a5530c6 2492
b74fceae 2493 ret = file_write_and_wait_range(file, start, end);
da819c81
YZ
2494 if (datasync)
2495 goto out;
2496
891f3f5a
JL
2497 ret = ceph_wait_on_async_create(inode);
2498 if (ret)
2499 goto out;
2500
553adfd9 2501 dirty = try_flush_caps(inode, &flush_tid);
38d46409 2502 doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty));
a8599bd8 2503
ae067063 2504 err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
da819c81 2505
a8599bd8
SW
2506 /*
2507 * only wait on non-file metadata writeback (the mds
2508 * can recover size and mtime, so we don't need to
2509 * wait for that)
2510 */
f4b97866
YZ
2511 if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2512 err = wait_event_interruptible(ci->i_cap_wq,
da819c81 2513 caps_are_flushed(inode, flush_tid));
a8599bd8 2514 }
f4b97866
YZ
2515
2516 if (err < 0)
2517 ret = err;
2518
1bd85aa6
JL
2519 err = file_check_and_advance_wb_err(file);
2520 if (err < 0)
2521 ret = err;
da819c81 2522out:
38d46409
XL
2523 doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode),
2524 datasync ? " datasync" : "", ret);
a8599bd8
SW
2525 return ret;
2526}
2527
2528/*
2529 * Flush any dirty caps back to the mds. If we aren't asked to wait,
2530 * queue inode for flush but don't do so immediately, because we can
2531 * get by with fewer MDS messages if we wait for data writeback to
2532 * complete first.
2533 */
f1a3d572 2534int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
a8599bd8
SW
2535{
2536 struct ceph_inode_info *ci = ceph_inode(inode);
38d46409 2537 struct ceph_client *cl = ceph_inode_to_client(inode);
553adfd9 2538 u64 flush_tid;
a8599bd8
SW
2539 int err = 0;
2540 int dirty;
16515a6d 2541 int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
a8599bd8 2542
38d46409 2543 doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait);
400e1286 2544 ceph_fscache_unpin_writeback(inode, wbc);
a8599bd8 2545 if (wait) {
fbed7045
JL
2546 err = ceph_wait_on_async_create(inode);
2547 if (err)
2548 return err;
553adfd9 2549 dirty = try_flush_caps(inode, &flush_tid);
a8599bd8
SW
2550 if (dirty)
2551 err = wait_event_interruptible(ci->i_cap_wq,
2552 caps_are_flushed(inode, flush_tid));
2553 } else {
640ef79d 2554 struct ceph_mds_client *mdsc =
5995d90d 2555 ceph_sb_to_fs_client(inode->i_sb)->mdsc;
a8599bd8 2556
be655596 2557 spin_lock(&ci->i_ceph_lock);
a8599bd8
SW
2558 if (__ceph_caps_dirty(ci))
2559 __cap_delay_requeue_front(mdsc, ci);
be655596 2560 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
2561 }
2562 return err;
2563}
2564
0e294387
YZ
/*
 * __kick_flushing_caps - re-send the messages for all pending cap flushes
 * of one inode.
 * @mdsc: mds client the inode belongs to
 * @session: session the flushes are sent on; the walk stops with an error
 *           message if the inode's auth cap is not on this session
 * @ci: inode whose i_cap_flush_list is walked
 * @oldest_flush_tid: passed through into every message that is sent
 *
 * Entered and left with ci->i_ceph_lock held.  The lock is dropped around
 * each individual send and retaken before the walk continues.  Does
 * nothing while CEPH_I_ASYNC_CREATE is set; otherwise clears
 * CEPH_I_KICK_FLUSH.
 */
2565static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2566 struct ceph_mds_session *session,
2567 struct ceph_inode_info *ci,
2568 u64 oldest_flush_tid)
2569 __releases(ci->i_ceph_lock)
2570 __acquires(ci->i_ceph_lock)
553adfd9 2571{
874c8ca1 2572 struct inode *inode = &ci->netfs.inode;
38d46409 2573 struct ceph_client *cl = mdsc->fsc->client;
553adfd9
YZ
2574 struct ceph_cap *cap;
2575 struct ceph_cap_flush *cf;
0e294387 2576 int ret;
553adfd9 2577 u64 first_tid = 0;
49ada6e8 2578 u64 last_snap_flush = 0;
553adfd9 2579
fbed7045
JL
2580 /* Don't do anything until create reply comes in */
2581 if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
2582 return;
2583
054f8d41
YZ
2584 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2585
 /* Find the tid of the last capsnap flush on the list (0 if none). */
49ada6e8 2586 list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
b2f9fa1f 2587 if (cf->is_capsnap) {
49ada6e8
YZ
2588 last_snap_flush = cf->tid;
2589 break;
2590 }
2591 }
2592
e4500b5e
YZ
2593 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
 /*
  * Skip entries below the next tid still to be sent.
  * NOTE(review): presumably protects against list changes
  * while the lock is dropped below - confirm.
  */
2594 if (cf->tid < first_tid)
2595 continue;
2596
553adfd9
YZ
2597 cap = ci->i_auth_cap;
2598 if (!(cap && cap->session == session)) {
38d46409
XL
2599 pr_err_client(cl, "%p auth cap %p not mds%d ???\n",
2600 inode, cap, session->s_mds);
553adfd9
YZ
2601 break;
2602 }
2603
553adfd9
YZ
2604 first_tid = cf->tid + 1;
2605
b2f9fa1f 2606 if (!cf->is_capsnap) {
0a454bdd
JL
2607 struct cap_msg_args arg;
2608
38d46409
XL
2609 doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n",
2610 inode, ceph_vinop(inode), cap, cf->tid,
2611 ceph_cap_string(cf->caps));
 /*
  * A regular flush older than the last capsnap flush is
  * tagged CEPH_CLIENT_CAPS_PENDING_CAPSNAP.
  */
0a454bdd 2612 __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
49ada6e8
YZ
2613 (cf->tid < last_snap_flush ?
2614 CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
2615 __ceph_caps_used(ci),
0e294387 2616 __ceph_caps_wanted(ci),
49ada6e8 2617 (cap->issued | cap->implemented),
0e294387 2618 cf->caps, cf->tid, oldest_flush_tid);
0a454bdd 2619 spin_unlock(&ci->i_ceph_lock);
52311980 2620 __send_cap(&arg, ci);
0e294387
YZ
2621 } else {
2622 struct ceph_cap_snap *capsnap =
2623 container_of(cf, struct ceph_cap_snap,
2624 cap_flush);
38d46409
XL
2625 doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n",
2626 inode, ceph_vinop(inode), capsnap, cf->tid,
2627 ceph_cap_string(capsnap->dirty));
0e294387 2628
 /* Hold a capsnap reference across the unlocked send. */
805692d0 2629 refcount_inc(&capsnap->nref);
0e294387
YZ
2630 spin_unlock(&ci->i_ceph_lock);
2631
2632 ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
2633 oldest_flush_tid);
2634 if (ret < 0) {
38d46409
XL
2635 pr_err_client(cl, "error sending cap flushsnap,"
2636 " %p %llx.%llx tid %llu follows %llu\n",
2637 inode, ceph_vinop(inode), cf->tid,
2638 capsnap->follows);
0e294387
YZ
2639 }
2640
2641 ceph_put_cap_snap(capsnap);
2642 }
e4500b5e
YZ
2643
 /* Both branches dropped the lock; retake it before moving on. */
2644 spin_lock(&ci->i_ceph_lock);
553adfd9 2645 }
553adfd9
YZ
2646}
2647
e548e9b9
YZ
2648void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2649 struct ceph_mds_session *session)
2650{
38d46409 2651 struct ceph_client *cl = mdsc->fsc->client;
e548e9b9
YZ
2652 struct ceph_inode_info *ci;
2653 struct ceph_cap *cap;
0e294387 2654 u64 oldest_flush_tid;
e548e9b9 2655
38d46409 2656 doutc(cl, "mds%d\n", session->s_mds);
0e294387
YZ
2657
2658 spin_lock(&mdsc->cap_dirty_lock);
2659 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2660 spin_unlock(&mdsc->cap_dirty_lock);
2661
e548e9b9 2662 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
38d46409
XL
2663 struct inode *inode = &ci->netfs.inode;
2664
e548e9b9
YZ
2665 spin_lock(&ci->i_ceph_lock);
2666 cap = ci->i_auth_cap;
2667 if (!(cap && cap->session == session)) {
38d46409
XL
2668 pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
2669 inode, ceph_vinop(inode), cap,
2670 session->s_mds);
e548e9b9
YZ
2671 spin_unlock(&ci->i_ceph_lock);
2672 continue;
2673 }
2674
2675
2676 /*
2677 * if flushing caps were revoked, we re-send the cap flush
2678 * in client reconnect stage. This guarantees MDS * processes
2679 * the cap flush message before issuing the flushing caps to
2680 * other client.
2681 */
2682 if ((cap->issued & ci->i_flushing_caps) !=
2683 ci->i_flushing_caps) {
81c5a148
YZ
2684 /* encode_caps_cb() also will reset these sequence
2685 * numbers. make sure sequence numbers in cap flush
2686 * message match later reconnect message */
2687 cap->seq = 0;
2688 cap->issue_seq = 0;
2689 cap->mseq = 0;
0e294387
YZ
2690 __kick_flushing_caps(mdsc, session, ci,
2691 oldest_flush_tid);
13c2b57d
YZ
2692 } else {
2693 ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
e548e9b9
YZ
2694 }
2695
e548e9b9
YZ
2696 spin_unlock(&ci->i_ceph_lock);
2697 }
2698}
2699
a8599bd8
SW
2700void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2701 struct ceph_mds_session *session)
2702{
38d46409 2703 struct ceph_client *cl = mdsc->fsc->client;
a8599bd8 2704 struct ceph_inode_info *ci;
13c2b57d 2705 struct ceph_cap *cap;
0e294387 2706 u64 oldest_flush_tid;
a8599bd8 2707
829ad4db
JL
2708 lockdep_assert_held(&session->s_mutex);
2709
38d46409 2710 doutc(cl, "mds%d\n", session->s_mds);
0e294387
YZ
2711
2712 spin_lock(&mdsc->cap_dirty_lock);
2713 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2714 spin_unlock(&mdsc->cap_dirty_lock);
2715
a8599bd8 2716 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
38d46409
XL
2717 struct inode *inode = &ci->netfs.inode;
2718
0e294387 2719 spin_lock(&ci->i_ceph_lock);
13c2b57d
YZ
2720 cap = ci->i_auth_cap;
2721 if (!(cap && cap->session == session)) {
38d46409
XL
2722 pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
2723 inode, ceph_vinop(inode), cap,
2724 session->s_mds);
13c2b57d
YZ
2725 spin_unlock(&ci->i_ceph_lock);
2726 continue;
2727 }
2728 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
13c2b57d
YZ
2729 __kick_flushing_caps(mdsc, session, ci,
2730 oldest_flush_tid);
2731 }
0e294387 2732 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
2733 }
2734}
2735
e8a4d267
JL
2736void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2737 struct ceph_inode_info *ci)
088b3f5e 2738{
e8a4d267
JL
2739 struct ceph_mds_client *mdsc = session->s_mdsc;
2740 struct ceph_cap *cap = ci->i_auth_cap;
38d46409 2741 struct inode *inode = &ci->netfs.inode;
e8a4d267
JL
2742
2743 lockdep_assert_held(&ci->i_ceph_lock);
088b3f5e 2744
38d46409
XL
2745 doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n",
2746 inode, ceph_vinop(inode),
2747 ceph_cap_string(ci->i_flushing_caps));
005c4697 2748
0e294387
YZ
2749 if (!list_empty(&ci->i_cap_flush_list)) {
2750 u64 oldest_flush_tid;
005c4697
YZ
2751 spin_lock(&mdsc->cap_dirty_lock);
2752 list_move_tail(&ci->i_flushing_item,
2753 &cap->session->s_cap_flushing);
0e294387 2754 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
005c4697
YZ
2755 spin_unlock(&mdsc->cap_dirty_lock);
2756
0e294387 2757 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
088b3f5e
SW
2758 }
2759}
2760
a8599bd8
SW
2761
2762/*
2763 * Take references to capabilities we hold, so that we don't release
2764 * them to the MDS prematurely.
a8599bd8 2765 */
40dcf75e 2766void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
5dda377c 2767 bool snap_rwsem_locked)
a8599bd8 2768{
38d46409
XL
2769 struct inode *inode = &ci->netfs.inode;
2770 struct ceph_client *cl = ceph_inode_to_client(inode);
2771
40dcf75e
JL
2772 lockdep_assert_held(&ci->i_ceph_lock);
2773
a8599bd8
SW
2774 if (got & CEPH_CAP_PIN)
2775 ci->i_pin_ref++;
2776 if (got & CEPH_CAP_FILE_RD)
2777 ci->i_rd_ref++;
2778 if (got & CEPH_CAP_FILE_CACHE)
2779 ci->i_rdcache_ref++;
f85122af
JL
2780 if (got & CEPH_CAP_FILE_EXCL)
2781 ci->i_fx_ref++;
5dda377c
YZ
2782 if (got & CEPH_CAP_FILE_WR) {
2783 if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
2784 BUG_ON(!snap_rwsem_locked);
2785 ci->i_head_snapc = ceph_get_snap_context(
2786 ci->i_snap_realm->cached_context);
2787 }
a8599bd8 2788 ci->i_wr_ref++;
5dda377c 2789 }
a8599bd8 2790 if (got & CEPH_CAP_FILE_BUFFER) {
d3d0720d 2791 if (ci->i_wb_ref == 0)
38d46409 2792 ihold(inode);
d3d0720d 2793 ci->i_wb_ref++;
38d46409
XL
2794 doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
2795 ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref);
a8599bd8
SW
2796 }
2797}
2798
2799/*
2800 * Try to grab cap references. Specify those refs we @want, and the
2801 * minimal set we @need. Also include the larger offset we are writing
2802 * to (when applicable), and check against max_size here as well.
2803 * Note that caller is responsible for ensuring max_size increases are
2804 * requested from the MDS.
1199d7da 2805 *
546d4020
YZ
2806 * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
2807 * or a negative error code. There are 3 special error codes:
8006daff
JL
2808 * -EAGAIN: need to sleep but non-blocking is specified
2809 * -EFBIG: ask caller to call check_max_size() and try again.
2810 * -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
a8599bd8 2811 */
/*
 * Flag bits for the @flags argument of try_get_cap_refs().
 * The low 8 bits are reserved for CEPH_FILE_MODE_FOO.
 */
enum {
	NON_BLOCKING   = 0x100,	/* bit 8 */
	CHECK_FILELOCK = 0x200,	/* bit 9 */
};
2817
/*
 * try_get_cap_refs - make one attempt to take references on caps.
 *
 * @inode:  inode whose caps are being requested
 * @need:   caps that must all be issued for the attempt to succeed
 * @want:   extra caps; they are added to *got (minus any that are being
 *          revoked) only when every one of them is currently issued
 * @endoff: end offset of the write; compared with i_max_size when FILE_WR
 *          is both needed and issued (the comparison is skipped if < 0)
 * @flags:  NON_BLOCKING and/or CHECK_FILELOCK, plus the bits that are
 *          passed through to __ceph_touch_fmode()
 * @got:    written with the set of referenced caps, on success only
 *
 * Takes and releases ci->i_ceph_lock itself, and takes mdsc->snap_rwsem
 * for read around the reference grab when a FILE_WR reference needs a
 * head snap context.
 *
 * Returns 1 when references were taken, 0 when nothing could be taken
 * yet, or a negative errno: -EIO, -EAGAIN, -EFBIG, -EUCLEAN, -EROFS or
 * -ESTALE.
 */
5e3ded1b 2818static int try_get_cap_refs(struct inode *inode, int need, int want,
ff5d913d 2819 loff_t endoff, int flags, int *got)
a8599bd8 2820{
5e3ded1b 2821 struct ceph_inode_info *ci = ceph_inode(inode);
5995d90d 2822 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
38d46409 2823 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8 2824 int ret = 0;
c4d4a582 2825 int have, implemented;
5dda377c 2826 bool snap_rwsem_locked = false;
a8599bd8 2827
38d46409
XL
2828 doutc(cl, "%p %llx.%llx need %s want %s\n", inode,
2829 ceph_vinop(inode), ceph_cap_string(need),
2830 ceph_cap_string(want));
c4d4a582 2831
/*
 * Re-entered from below after i_ceph_lock was dropped to sleep in
 * down_read(&mdsc->snap_rwsem); every check is redone from scratch.
 */
5dda377c 2832again:
be655596 2833 spin_lock(&ci->i_ceph_lock);
a8599bd8 2834
ff5d913d
YZ
/* fail with -EIO when the caller asked for the file lock error check */
2835 if ((flags & CHECK_FILELOCK) &&
2836 (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
38d46409
XL
2837 doutc(cl, "%p %llx.%llx error filelock\n", inode,
2838 ceph_vinop(inode));
ff5d913d
YZ
2839 ret = -EIO;
2840 goto out_unlock;
2841 }
2842
37505d57
YZ
2843 /* finish pending truncate */
2844 while (ci->i_truncate_pending) {
2845 spin_unlock(&ci->i_ceph_lock);
5dda377c
YZ
/* snap_rwsem is released as well before running the truncate */
2846 if (snap_rwsem_locked) {
2847 up_read(&mdsc->snap_rwsem);
2848 snap_rwsem_locked = false;
2849 }
b415bf4f 2850 __ceph_do_pending_vmtruncate(inode);
37505d57
YZ
2851 spin_lock(&ci->i_ceph_lock);
2852 }
2853
3871cbb9
YZ
2854 have = __ceph_caps_issued(ci, &implemented);
2855
2856 if (have & need & CEPH_CAP_FILE_WR) {
a8599bd8 2857 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
38d46409
XL
2858 doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n",
2859 inode, ceph_vinop(inode), endoff, ci->i_max_size);
/*
 * ret stays 0 when endoff is within i_requested_max_size; otherwise
 * -EFBIG is reported if there is an auth cap, -EUCLEAN if not.
 */
1199d7da 2860 if (endoff > ci->i_requested_max_size)
8006daff 2861 ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
3738daa6 2862 goto out_unlock;
a8599bd8
SW
2863 }
2864 /*
2865 * If a sync write is in progress, we must wait, so that we
2866 * can get a final snapshot value for size+mtime.
2867 */
2868 if (__ceph_have_pending_cap_snap(ci)) {
38d46409
XL
2869 doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode,
2870 ceph_vinop(inode));
3738daa6 2871 goto out_unlock;
a8599bd8
SW
2872 }
2873 }
a8599bd8 2874
a8599bd8
SW
2875 if ((have & need) == need) {
2876 /*
2877 * Look at (implemented & ~have & not) so that we keep waiting
2878 * on transition from wanted -> needed caps. This is needed
2879 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
2880 * going before a prior buffered writeback happens.
7c3ea987
XL
2881 *
2882 * For RDCACHE|RD -> RD, there is no need to wait and we can
2883 * just exclude the revoking caps and force to sync read.
a8599bd8
SW
2884 */
2885 int not = want & ~(have & need);
2886 int revoking = implemented & ~have;
7c3ea987 2887 int exclude = revoking & not;
38d46409
XL
2888 doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n",
2889 inode, ceph_vinop(inode), ceph_cap_string(have),
2890 ceph_cap_string(not), ceph_cap_string(revoking));
/* when FILE_BUFFER is in the excluded set, ret stays 0 and no refs are taken */
7c3ea987 2891 if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
5dda377c
YZ
/*
 * A FILE_WR reference taken while i_head_snapc is unset makes
 * ceph_take_cap_refs() install a snap context, so snap_rwsem is
 * acquired for read first.
 */
2892 if (!snap_rwsem_locked &&
2893 !ci->i_head_snapc &&
2894 (need & CEPH_CAP_FILE_WR)) {
2895 if (!down_read_trylock(&mdsc->snap_rwsem)) {
2896 /*
2897 * we can not call down_read() when
2898 * task isn't in TASK_RUNNING state
2899 */
ff5d913d 2900 if (flags & NON_BLOCKING) {
1199d7da 2901 ret = -EAGAIN;
5dda377c
YZ
2902 goto out_unlock;
2903 }
2904
2905 spin_unlock(&ci->i_ceph_lock);
2906 down_read(&mdsc->snap_rwsem);
2907 snap_rwsem_locked = true;
2908 goto again;
2909 }
2910 snap_rwsem_locked = true;
2911 }
173e70e8 2912 if ((have & want) == want)
7c3ea987 2913 *got = need | (want & ~exclude);
173e70e8
YZ
2914 else
2915 *got = need;
40dcf75e 2916 ceph_take_cap_refs(ci, *got, true);
a8599bd8
SW
2917 ret = 1;
2918 }
2919 } else {
/* not every needed cap is issued: report an error or leave ret at 0 */
03f4fcb0 2920 int session_readonly = false;
c0e385b1 2921 int mds_wanted;
525d15e8
YZ
2922 if (ci->i_auth_cap &&
2923 (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
03f4fcb0
YZ
2924 struct ceph_mds_session *s = ci->i_auth_cap->session;
2925 spin_lock(&s->s_cap_lock);
2926 session_readonly = s->s_readonly;
2927 spin_unlock(&s->s_cap_lock);
2928 }
2929 if (session_readonly) {
38d46409
XL
2930 doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n",
2931 inode, ceph_vinop(inode), ceph_cap_string(need),
2932 ci->i_auth_cap->mds);
1199d7da 2933 ret = -EROFS;
03f4fcb0
YZ
2934 goto out_unlock;
2935 }
2936
5d6451b1 2937 if (ceph_inode_is_shutdown(inode)) {
38d46409
XL
2938 doutc(cl, "%p %llx.%llx inode is shutdown\n",
2939 inode, ceph_vinop(inode));
5d6451b1 2940 ret = -ESTALE;
c0e385b1
YZ
2941 goto out_unlock;
2942 }
2943 mds_wanted = __ceph_caps_mds_wanted(ci, false);
/* some needed cap is missing from mds_wanted: -EUCLEAN */
2944 if (need & ~mds_wanted) {
38d46409
XL
2945 doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n",
2946 inode, ceph_vinop(inode), ceph_cap_string(need),
2947 ceph_cap_string(mds_wanted));
8006daff 2948 ret = -EUCLEAN;
c0e385b1 2949 goto out_unlock;
48fec5d0
YZ
2950 }
2951
38d46409
XL
2952 doutc(cl, "%p %llx.%llx have %s need %s\n", inode,
2953 ceph_vinop(inode), ceph_cap_string(have),
2954 ceph_cap_string(need));
a8599bd8 2955 }
/* common exit: reached on success and on every failure path alike */
3738daa6 2956out_unlock:
719a2514
YZ
2957
2958 __ceph_touch_fmode(ci, mdsc, flags);
2959
be655596 2960 spin_unlock(&ci->i_ceph_lock);
5dda377c
YZ
2961 if (snap_rwsem_locked)
2962 up_read(&mdsc->snap_rwsem);
3738daa6 2963
1af16d54
XL
/* metrics: 0 counts as a cap miss, 1 as a hit; errors are not counted */
2964 if (!ret)
2965 ceph_update_cap_mis(&mdsc->metric);
2966 else if (ret == 1)
2967 ceph_update_cap_hit(&mdsc->metric);
2968
38d46409
XL
/*
 * NOTE(review): *got is read here even on paths that never wrote it
 * above, so this relies on the caller having initialized it - confirm
 * for all callers.
 */
2969 doutc(cl, "%p %llx.%llx ret %d got %s\n", inode,
2970 ceph_vinop(inode), ret, ceph_cap_string(*got));
a8599bd8
SW
2971 return ret;
2972}
2973
2974/*
2975 * Check the offset we are writing up to against our current
2976 * max_size. If necessary, tell the MDS we want to write to
2977 * a larger offset.
2978 */
2979static void check_max_size(struct inode *inode, loff_t endoff)
2980{
2981 struct ceph_inode_info *ci = ceph_inode(inode);
38d46409 2982 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8
SW
2983 int check = 0;
2984
2985 /* do we need to explicitly request a larger max_size? */
be655596 2986 spin_lock(&ci->i_ceph_lock);
3871cbb9 2987 if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
38d46409
XL
2988 doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n",
2989 inode, ceph_vinop(inode), endoff);
a8599bd8 2990 ci->i_wanted_max_size = endoff;
a8599bd8 2991 }
3871cbb9
YZ
2992 /* duplicate ceph_check_caps()'s logic */
2993 if (ci->i_auth_cap &&
2994 (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
2995 ci->i_wanted_max_size > ci->i_max_size &&
2996 ci->i_wanted_max_size > ci->i_requested_max_size)
2997 check = 1;
be655596 2998 spin_unlock(&ci->i_ceph_lock);
a8599bd8 2999 if (check)
e4b731cc 3000 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
a8599bd8
SW
3001}
3002
719a2514
YZ
3003static inline int get_used_fmode(int caps)
3004{
3005 int fmode = 0;
3006 if (caps & CEPH_CAP_FILE_RD)
3007 fmode |= CEPH_FILE_MODE_RD;
3008 if (caps & CEPH_CAP_FILE_WR)
3009 fmode |= CEPH_FILE_MODE_WR;
3010 return fmode;
3011}
3012
5e3ded1b 3013int ceph_try_get_caps(struct inode *inode, int need, int want,
2ee9dd95 3014 bool nonblock, int *got)
2b1ac852 3015{
719a2514 3016 int ret, flags;
2b1ac852
YZ
3017
3018 BUG_ON(need & ~CEPH_CAP_FILE_RD);
a25949b9
JL
3019 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
3020 CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
3021 CEPH_CAP_ANY_DIR_OPS));
3022 if (need) {
3023 ret = ceph_pool_perm_check(inode, need);
3024 if (ret < 0)
3025 return ret;
3026 }
2b1ac852 3027
719a2514
YZ
3028 flags = get_used_fmode(need | want);
3029 if (nonblock)
3030 flags |= NON_BLOCKING;
3031
3032 ret = try_get_cap_refs(inode, need, want, 0, flags, got);
546d4020 3033 /* three special error codes */
8006daff 3034 if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN)
546d4020
YZ
3035 ret = 0;
3036 return ret;
2b1ac852
YZ
3037}
3038
a8599bd8
SW
3039/*
3040 * Wait for caps, and take cap references. If we can't get a WR cap
3041 * due to a small max_size, make sure we check_max_size (and possibly
3042 * ask the mds) so we don't get hung up indefinitely.
3043 */
5c64737d
XL
/*
 * __ceph_get_caps - wait until cap references can be taken.
 *
 * @inode:  inode whose caps are requested
 * @fi:     optional file info; when it has CEPH_FILE_MODE_WR set, its
 *          filp_gen is compared with fsc->filp_gen before and after each
 *          attempt and -EBADF is returned on mismatch
 * @need:   caps that are required
 * @want:   additional caps that are desired
 * @endoff: write end offset, passed on to try_get_cap_refs() and
 *          check_max_size()
 * @got:    written with the referenced caps when 0 is returned
 *
 * Returns 0 on success or a negative errno.  -ERESTARTSYS is returned
 * when a signal is pending while waiting.  Errors from
 * ceph_pool_perm_check(), ceph_wait_on_async_create(), ceph_renew_caps()
 * and __ceph_do_getattr() are passed through.
 */
3044int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
3045 int want, loff_t endoff, int *got)
a8599bd8 3046{
5e3ded1b 3047 struct ceph_inode_info *ci = ceph_inode(inode);
5995d90d 3048 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
ff5d913d 3049 int ret, _got, flags;
a8599bd8 3050
5e3ded1b 3051 ret = ceph_pool_perm_check(inode, need);
10183a69
YZ
3052 if (ret < 0)
3053 return ret;
3054
5c64737d 3055 if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
81f148a9
YZ
3056 fi->filp_gen != READ_ONCE(fsc->filp_gen))
3057 return -EBADF;
3058
719a2514
YZ
3059 flags = get_used_fmode(need | want);
3060
5dda377c 3061 while (true) {
/*
 * Clear the NON_BLOCKING / CHECK_FILELOCK bits that a previous
 * iteration may have set (presumably CEPH_FILE_MODE_MASK covers only
 * the file mode bits - confirm).
 */
719a2514 3062 flags &= CEPH_FILE_MODE_MASK;
461ab10e 3063 if (vfs_inode_has_locks(inode))
719a2514 3064 flags |= CHECK_FILELOCK;
5dda377c 3065 _got = 0;
/* first attempt runs without NON_BLOCKING, so -EAGAIN is unexpected */
5e3ded1b 3066 ret = try_get_cap_refs(inode, need, want, endoff,
ff5d913d 3067 flags, &_got);
546d4020 3068 WARN_ON_ONCE(ret == -EAGAIN);
7b2f936f 3069 if (!ret) {
3a3430af
JL
3070 struct ceph_mds_client *mdsc = fsc->mdsc;
3071 struct cap_wait cw;
5c341ee3 3072 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3a3430af 3073
/* cw stays on mdsc->cap_wait_list for the duration of the wait */
ebce3eb2 3074 cw.ino = ceph_ino(inode);
3a3430af
JL
3075 cw.tgid = current->tgid;
3076 cw.need = need;
3077 cw.want = want;
3078
3079 spin_lock(&mdsc->caps_list_lock);
3080 list_add(&cw.list, &mdsc->cap_wait_list);
3081 spin_unlock(&mdsc->caps_list_lock);
3082
719a2514
YZ
3083 /* make sure used fmode not timeout */
3084 ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
5c341ee3
NB
3085 add_wait_queue(&ci->i_cap_wq, &wait);
3086
/* retries made while on the wait queue use NON_BLOCKING */
ff5d913d 3087 flags |= NON_BLOCKING;
5e3ded1b 3088 while (!(ret = try_get_cap_refs(inode, need, want,
ff5d913d 3089 endoff, flags, &_got))) {
6e09d0fb
YZ
3090 if (signal_pending(current)) {
3091 ret = -ERESTARTSYS;
3092 break;
3093 }
5c341ee3 3094 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
6e09d0fb 3095 }
5c341ee3
NB
3096
3097 remove_wait_queue(&ci->i_cap_wq, &wait);
719a2514 3098 ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
3a3430af
JL
3099
3100 spin_lock(&mdsc->caps_list_lock);
3101 list_del(&cw.list);
3102 spin_unlock(&mdsc->caps_list_lock);
3103
/*
 * -EAGAIN: the non-blocking attempt could not get snap_rwsem; restart
 * the outer loop, whose first attempt runs without NON_BLOCKING.
 */
7b2f936f 3104 if (ret == -EAGAIN)
5dda377c 3105 continue;
77310320 3106 }
81f148a9 3107
/* re-check filp_gen after the attempt; drop any refs taken before failing */
5c64737d 3108 if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
81f148a9
YZ
3109 fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
3110 if (ret >= 0 && _got)
3111 ceph_put_cap_refs(ci, _got);
3112 return -EBADF;
3113 }
3114
7b2f936f 3115 if (ret < 0) {
/* both retryable codes first wait for a pending async create */
8006daff 3116 if (ret == -EFBIG || ret == -EUCLEAN) {
9bccb765
YZ
3117 int ret2 = ceph_wait_on_async_create(inode);
3118 if (ret2 < 0)
3119 return ret2;
3120 }
546d4020
YZ
3121 if (ret == -EFBIG) {
3122 check_max_size(inode, endoff);
3123 continue;
3124 }
8006daff 3125 if (ret == -EUCLEAN) {
7b2f936f 3126 /* session was killed, try renew caps */
719a2514 3127 ret = ceph_renew_caps(inode, flags);
7b2f936f
YZ
3128 if (ret == 0)
3129 continue;
3130 }
77310320 3131 return ret;
5dda377c 3132 }
c4d4a582 3133
/*
 * Regular file with inline data, a non-zero size and Fc/Fl
 * references: page 0 must be uptodate in the page cache, otherwise
 * drop the refs, fetch the inline data and start over.
 */
874c8ca1 3134 if (S_ISREG(ci->netfs.inode.i_mode) &&
48490776 3135 ceph_has_inline_data(ci) &&
5dda377c 3136 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
5e3ded1b 3137 i_size_read(inode) > 0) {
5dda377c 3138 struct page *page =
5e3ded1b 3139 find_get_page(inode->i_mapping, 0);
5dda377c 3140 if (page) {
e72968e1
JL
3141 bool uptodate = PageUptodate(page);
3142
09cbfeaf 3143 put_page(page);
e72968e1
JL
3144 if (uptodate)
3145 break;
c4d4a582 3146 }
5dda377c
YZ
3147 /*
3148 * drop cap refs first because getattr while
3149 * holding caps refs can cause deadlock.
3150 */
3151 ceph_put_cap_refs(ci, _got);
3152 _got = 0;
c4d4a582 3153
5dda377c
YZ
3154 /*
3155 * getattr request will bring inline data into
3156 * page cache
3157 */
5e3ded1b 3158 ret = __ceph_do_getattr(inode, NULL,
5dda377c
YZ
3159 CEPH_STAT_CAP_INLINE_DATA,
3160 true);
3161 if (ret < 0)
3162 return ret;
3163 continue;
3164 }
3165 break;
c4d4a582 3166 }
c4d4a582
YZ
3167 *got = _got;
3168 return 0;
a8599bd8
SW
3169}
3170
5c64737d
XL
3171int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff,
3172 int *got)
3173{
3174 struct ceph_file_info *fi = filp->private_data;
3175 struct inode *inode = file_inode(filp);
3176
3177 return __ceph_get_caps(inode, fi, need, want, endoff, got);
3178}
3179
a8599bd8
SW
3180/*
3181 * Take cap refs. Caller must already know we hold at least one ref
3182 * on the caps in question or we don't know this is safe.
3183 */
3184void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
3185{
be655596 3186 spin_lock(&ci->i_ceph_lock);
40dcf75e 3187 ceph_take_cap_refs(ci, caps, false);
be655596 3188 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
3189}
3190
86056090
YZ
3191
3192/*
3193 * drop cap_snap that is not associated with any snapshot.
3194 * we don't need to send FLUSHSNAP message for it.
3195 */
70220ac8
YZ
3196static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
3197 struct ceph_cap_snap *capsnap)
86056090 3198{
38d46409
XL
3199 struct inode *inode = &ci->netfs.inode;
3200 struct ceph_client *cl = ceph_inode_to_client(inode);
3201
86056090
YZ
3202 if (!capsnap->need_flush &&
3203 !capsnap->writing && !capsnap->dirty_pages) {
38d46409 3204 doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows);
0e294387 3205 BUG_ON(capsnap->cap_flush.tid > 0);
86056090 3206 ceph_put_snap_context(capsnap->context);
70220ac8
YZ
3207 if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
3208 ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
3209
86056090 3210 list_del(&capsnap->ci_item);
86056090
YZ
3211 ceph_put_cap_snap(capsnap);
3212 return 1;
3213 }
3214 return 0;
3215}
3216
a8810cdc
JL
/* How __ceph_put_cap_refs() performs its follow-up cap check / snap flush. */
enum put_cap_refs_mode {
	PUT_CAP_REFS_SYNC  = 0,	/* call ceph_check_caps()/ceph_flush_snaps() */
	PUT_CAP_REFS_ASYNC = 1,	/* use the ceph_queue_*() variants instead */
};
3221
a8599bd8
SW
3222/*
3223 * Release cap refs.
3224 *
3225 * If we released the last ref on any given cap, call ceph_check_caps
3226 * to release (or schedule a release).
3227 *
3228 * If we are releasing a WR cap (from a sync write), finalize any affected
3229 * cap_snap, and wake up any waiters.
3230 */
e64f44a8 3231static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
a8810cdc 3232 enum put_cap_refs_mode mode)
a8599bd8 3233{
874c8ca1 3234 struct inode *inode = &ci->netfs.inode;
38d46409 3235 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8 3236 int last = 0, put = 0, flushsnaps = 0, wake = 0;
558b4510 3237 bool check_flushsnaps = false;
a8599bd8 3238
be655596 3239 spin_lock(&ci->i_ceph_lock);
a8599bd8
SW
3240 if (had & CEPH_CAP_PIN)
3241 --ci->i_pin_ref;
3242 if (had & CEPH_CAP_FILE_RD)
3243 if (--ci->i_rd_ref == 0)
3244 last++;
3245 if (had & CEPH_CAP_FILE_CACHE)
3246 if (--ci->i_rdcache_ref == 0)
3247 last++;
f85122af
JL
3248 if (had & CEPH_CAP_FILE_EXCL)
3249 if (--ci->i_fx_ref == 0)
3250 last++;
a8599bd8 3251 if (had & CEPH_CAP_FILE_BUFFER) {
d3d0720d 3252 if (--ci->i_wb_ref == 0) {
a8599bd8 3253 last++;
558b4510 3254 /* put the ref held by ceph_take_cap_refs() */
a8599bd8 3255 put++;
558b4510 3256 check_flushsnaps = true;
a8599bd8 3257 }
38d46409
XL
3258 doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
3259 ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref);
a8599bd8 3260 }
558b4510 3261 if (had & CEPH_CAP_FILE_WR) {
a8599bd8 3262 if (--ci->i_wr_ref == 0) {
2d12ad95
XL
3263 /*
3264 * The Fb caps will always be took and released
3265 * together with the Fw caps.
3266 */
3267 WARN_ON_ONCE(ci->i_wb_ref);
3268
a8599bd8 3269 last++;
558b4510 3270 check_flushsnaps = true;
5dda377c
YZ
3271 if (ci->i_wrbuffer_ref_head == 0 &&
3272 ci->i_dirty_caps == 0 &&
3273 ci->i_flushing_caps == 0) {
3274 BUG_ON(!ci->i_head_snapc);
3275 ceph_put_snap_context(ci->i_head_snapc);
3276 ci->i_head_snapc = NULL;
3277 }
db40cc17 3278 /* see comment in __ceph_remove_cap() */
bd84fbcb 3279 if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
0ba92e1c 3280 ceph_change_snap_realm(inode, NULL);
a8599bd8 3281 }
558b4510
XL
3282 }
3283 if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
3284 struct ceph_cap_snap *capsnap =
3285 list_last_entry(&ci->i_cap_snaps,
3286 struct ceph_cap_snap,
3287 ci_item);
3288
3289 capsnap->writing = 0;
3290 if (ceph_try_drop_cap_snap(ci, capsnap))
3291 /* put the ref held by ceph_queue_cap_snap() */
3292 put++;
3293 else if (__ceph_finish_cap_snap(ci, capsnap))
3294 flushsnaps = 1;
3295 wake = 1;
3296 }
be655596 3297 spin_unlock(&ci->i_ceph_lock);
a8599bd8 3298
38d46409
XL
3299 doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode),
3300 ceph_cap_string(had), last ? " last" : "", put ? " put" : "");
a8599bd8 3301
a8810cdc
JL
3302 switch (mode) {
3303 case PUT_CAP_REFS_SYNC:
64f36da5 3304 if (last)
e4b731cc 3305 ceph_check_caps(ci, 0);
64f36da5
JL
3306 else if (flushsnaps)
3307 ceph_flush_snaps(ci, NULL);
a8810cdc
JL
3308 break;
3309 case PUT_CAP_REFS_ASYNC:
3310 if (last)
3311 ceph_queue_check_caps(inode);
3312 else if (flushsnaps)
3313 ceph_queue_flush_snaps(inode);
3314 break;
3315 default:
3316 break;
64f36da5 3317 }
a8599bd8 3318 if (wake)
03066f23 3319 wake_up_all(&ci->i_cap_wq);
86056090 3320 while (put-- > 0)
a8599bd8
SW
3321 iput(inode);
3322}
3323
e64f44a8
XL
3324void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
3325{
a8810cdc
JL
3326 __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC);
3327}
3328
3329void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
3330{
3331 __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
e64f44a8
XL
3332}
3333
a8599bd8
SW
3334/*
3335 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
3336 * context. Adjust per-snap dirty page accounting as appropriate.
3337 * Once all dirty data for a cap_snap is flushed, flush snapped file
3338 * metadata back to the MDS. If we dropped the last ref, call
3339 * ceph_check_caps.
3340 */
3341void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
3342 struct ceph_snap_context *snapc)
3343{
874c8ca1 3344 struct inode *inode = &ci->netfs.inode;
38d46409 3345 struct ceph_client *cl = ceph_inode_to_client(inode);
3ffa9d6f 3346 struct ceph_cap_snap *capsnap = NULL, *iter;
70220ac8
YZ
3347 int put = 0;
3348 bool last = false;
70220ac8
YZ
3349 bool flush_snaps = false;
3350 bool complete_capsnap = false;
a8599bd8 3351
be655596 3352 spin_lock(&ci->i_ceph_lock);
a8599bd8 3353 ci->i_wrbuffer_ref -= nr;
70220ac8
YZ
3354 if (ci->i_wrbuffer_ref == 0) {
3355 last = true;
3356 put++;
3357 }
a8599bd8
SW
3358
3359 if (ci->i_head_snapc == snapc) {
3360 ci->i_wrbuffer_ref_head -= nr;
7d8cb26d 3361 if (ci->i_wrbuffer_ref_head == 0 &&
5dda377c
YZ
3362 ci->i_wr_ref == 0 &&
3363 ci->i_dirty_caps == 0 &&
3364 ci->i_flushing_caps == 0) {
7d8cb26d 3365 BUG_ON(!ci->i_head_snapc);
a8599bd8
SW
3366 ceph_put_snap_context(ci->i_head_snapc);
3367 ci->i_head_snapc = NULL;
3368 }
38d46409
XL
3369 doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n",
3370 inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr,
3371 ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref,
3372 ci->i_wrbuffer_ref_head, last ? " LAST" : "");
a8599bd8 3373 } else {
3ffa9d6f
JK
3374 list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
3375 if (iter->context == snapc) {
3376 capsnap = iter;
a8599bd8
SW
3377 break;
3378 }
3379 }
a6d37ccd 3380
3ffa9d6f 3381 if (!capsnap) {
a6d37ccd
XL
3382 /*
3383 * The capsnap should already be removed when removing
3384 * auth cap in the case of a forced unmount.
3385 */
3386 WARN_ON_ONCE(ci->i_auth_cap);
3387 goto unlock;
3388 }
3389
819ccbfa
SW
3390 capsnap->dirty_pages -= nr;
3391 if (capsnap->dirty_pages == 0) {
70220ac8
YZ
3392 complete_capsnap = true;
3393 if (!capsnap->writing) {
3394 if (ceph_try_drop_cap_snap(ci, capsnap)) {
3395 put++;
3396 } else {
3397 ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
3398 flush_snaps = true;
3399 }
3400 }
819ccbfa 3401 }
38d46409
XL
3402 doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n",
3403 inode, ceph_vinop(inode), capsnap, capsnap->context->seq,
3404 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
3405 ci->i_wrbuffer_ref, capsnap->dirty_pages,
3406 last ? " (wrbuffer last)" : "",
3407 complete_capsnap ? " (complete capsnap)" : "");
a8599bd8
SW
3408 }
3409
a6d37ccd 3410unlock:
be655596 3411 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
3412
3413 if (last) {
e4b731cc 3414 ceph_check_caps(ci, 0);
70220ac8 3415 } else if (flush_snaps) {
ed9b430c 3416 ceph_flush_snaps(ci, NULL);
a8599bd8 3417 }
70220ac8
YZ
3418 if (complete_capsnap)
3419 wake_up_all(&ci->i_cap_wq);
3e1d0452 3420 while (put-- > 0) {
23c2c76e 3421 iput(inode);
3e1d0452 3422 }
a8599bd8
SW
3423}
3424
ca20c991
YZ
3425/*
3426 * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
3427 */
3428static void invalidate_aliases(struct inode *inode)
3429{
38d46409 3430 struct ceph_client *cl = ceph_inode_to_client(inode);
ca20c991
YZ
3431 struct dentry *dn, *prev = NULL;
3432
38d46409 3433 doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
ca20c991
YZ
3434 d_prune_aliases(inode);
3435 /*
3436 * For non-directory inode, d_find_alias() only returns
fc12c80a
BF
3437 * hashed dentry. After calling d_invalidate(), the
3438 * dentry becomes unhashed.
ca20c991 3439 *
a8d436f0 3440 * For directory inode, d_find_alias() can return
fc12c80a 3441 * unhashed dentry. But directory inode should have
ca20c991
YZ
3442 * one alias at most.
3443 */
3444 while ((dn = d_find_alias(inode))) {
3445 if (dn == prev) {
3446 dput(dn);
3447 break;
3448 }
a8d436f0 3449 d_invalidate(dn);
ca20c991
YZ
3450 if (prev)
3451 dput(prev);
3452 prev = dn;
3453 }
3454 if (prev)
3455 dput(prev);
3456}
3457
a1c6b835
YZ
3458struct cap_extra_info {
3459 struct ceph_string *pool_ns;
3460 /* inline data */
3461 u64 inline_version;
3462 void *inline_data;
3463 u32 inline_len;
4985d6f9
YZ
3464 /* dirstat */
3465 bool dirstat_valid;
3466 u64 nfiles;
3467 u64 nsubdirs;
176c77c9 3468 u64 change_attr;
a1c6b835
YZ
3469 /* currently issued */
3470 int issued;
ec62b894 3471 struct timespec64 btime;
0d91f0ad
JL
3472 u8 *fscrypt_auth;
3473 u32 fscrypt_auth_len;
3474 u64 fscrypt_file_size;
a1c6b835
YZ
3475};
3476
a8599bd8
SW
3477/*
3478 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
3479 * actually be a revocation if it specifies a smaller cap set.)
3480 *
be655596 3481 * caller holds s_mutex and i_ceph_lock, we drop both.
a8599bd8 3482 */
a1c6b835 3483static void handle_cap_grant(struct inode *inode,
15637c8b 3484 struct ceph_mds_session *session,
a1c6b835
YZ
3485 struct ceph_cap *cap,
3486 struct ceph_mds_caps *grant,
3487 struct ceph_buffer *xattr_buf,
3488 struct cap_extra_info *extra_info)
2cd698be 3489 __releases(ci->i_ceph_lock)
a1c6b835 3490 __releases(session->s_mdsc->snap_rwsem)
a8599bd8 3491{
38d46409 3492 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8 3493 struct ceph_inode_info *ci = ceph_inode(inode);
2f56f56a 3494 int seq = le32_to_cpu(grant->seq);
a8599bd8 3495 int newcaps = le32_to_cpu(grant->caps);
2cd698be 3496 int used, wanted, dirty;
a8599bd8
SW
3497 u64 size = le64_to_cpu(grant->size);
3498 u64 max_size = le64_to_cpu(grant->max_size);
fdac94fa 3499 unsigned char check_caps = 0;
52d60f8e 3500 bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen);
ab6c2c3e
FF
3501 bool wake = false;
3502 bool writeback = false;
3503 bool queue_trunc = false;
3504 bool queue_invalidate = false;
ab6c2c3e 3505 bool deleted_inode = false;
31c542a1 3506 bool fill_inline = false;
a8599bd8 3507
0d91f0ad
JL
3508 /*
3509 * If there is at least one crypto block then we'll trust
3510 * fscrypt_file_size. If the real length of the file is 0, then
3511 * ignore it (it has probably been truncated down to 0 by the MDS).
3512 */
3513 if (IS_ENCRYPTED(inode) && size)
3514 size = extra_info->fscrypt_file_size;
3515
38d46409
XL
3516 doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode,
3517 ceph_vinop(inode), cap, session->s_mds, seq,
3518 ceph_cap_string(newcaps));
3519 doutc(cl, " size %llu max_size %llu, i_size %llu\n", size,
3520 max_size, i_size_read(inode));
a8599bd8 3521
11df2dfb 3522
a8599bd8
SW
3523 /*
3524 * If CACHE is being revoked, and we have no dirty buffers,
3525 * try to invalidate (once). (If there are dirty buffers, we
3526 * will invalidate _after_ writeback.)
3527 */
525d15e8 3528 if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
fdd4e158 3529 ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3b454c49 3530 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
9abd4db7 3531 !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
e9075743 3532 if (try_nonblocking_invalidate(inode)) {
a8599bd8
SW
3533 /* there were locked pages.. invalidate later
3534 in a separate thread. */
3535 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
ab6c2c3e 3536 queue_invalidate = true;
a8599bd8
SW
3537 ci->i_rdcache_revoking = ci->i_rdcache_gen;
3538 }
a8599bd8 3539 }
a8599bd8
SW
3540 }
3541
d2f8bb27
YZ
3542 if (was_stale)
3543 cap->issued = cap->implemented = CEPH_CAP_PIN;
3544
3545 /*
3546 * auth mds of the inode changed. we received the cap export message,
3547 * but still haven't received the cap import message. handle_cap_export
3548 * updated the new auth MDS' cap.
3549 *
3550 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
3551 * that was sent before the cap import message. So don't remove caps.
3552 */
3553 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
3554 WARN_ON(cap != ci->i_auth_cap);
3555 WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
3556 seq = cap->seq;
3557 newcaps |= cap->issued;
3558 }
3559
a8599bd8 3560 /* side effects now are allowed */
52d60f8e 3561 cap->cap_gen = atomic_read(&session->s_cap_gen);
11df2dfb 3562 cap->seq = seq;
a8599bd8
SW
3563
3564 __check_cap_issue(ci, cap, newcaps);
3565
176c77c9
JL
3566 inode_set_max_iversion_raw(inode, extra_info->change_attr);
3567
f98a128a 3568 if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
a1c6b835 3569 (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
ed94f87c
JL
3570 umode_t mode = le32_to_cpu(grant->mode);
3571
3572 if (inode_wrong_type(inode, mode))
3573 pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
3574 ceph_vinop(inode), inode->i_mode, mode);
3575 else
3576 inode->i_mode = mode;
05cb11c1
EB
3577 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
3578 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
ec62b894 3579 ci->i_btime = extra_info->btime;
38d46409
XL
3580 doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
3581 ceph_vinop(inode), inode->i_mode,
3582 from_kuid(&init_user_ns, inode->i_uid),
3583 from_kgid(&init_user_ns, inode->i_gid));
0d91f0ad
JL
3584#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
3585 if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
3586 memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
3587 ci->fscrypt_auth_len))
38d46409
XL
3588 pr_warn_ratelimited_client(cl,
3589 "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
3590 ci->fscrypt_auth_len,
0d91f0ad
JL
3591 extra_info->fscrypt_auth_len);
3592#endif
a8599bd8
SW
3593 }
3594
fa466743 3595 if ((newcaps & CEPH_CAP_LINK_SHARED) &&
a1c6b835 3596 (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
bfe86848 3597 set_nlink(inode, le32_to_cpu(grant->nlink));
76bdbc7a 3598 if (inode->i_nlink == 0)
ab6c2c3e 3599 deleted_inode = true;
ca20c991 3600 }
a8599bd8 3601
a1c6b835
YZ
3602 if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
3603 grant->xattr_len) {
a8599bd8
SW
3604 int len = le32_to_cpu(grant->xattr_len);
3605 u64 version = le64_to_cpu(grant->xattr_version);
3606
3607 if (version > ci->i_xattrs.version) {
38d46409
XL
3608 doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n",
3609 version, inode, ceph_vinop(inode), len);
a8599bd8
SW
3610 if (ci->i_xattrs.blob)
3611 ceph_buffer_put(ci->i_xattrs.blob);
3612 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
3613 ci->i_xattrs.version = version;
7221fe4c 3614 ceph_forget_all_cached_acls(inode);
ac6713cc 3615 ceph_security_invalidate_secctx(inode);
a8599bd8
SW
3616 }
3617 }
3618
f98a128a 3619 if (newcaps & CEPH_CAP_ANY_RD) {
9bbeab41 3620 struct timespec64 mtime, atime, ctime;
f98a128a 3621 /* ctime/mtime/atime? */
9bbeab41
AB
3622 ceph_decode_timespec64(&mtime, &grant->mtime);
3623 ceph_decode_timespec64(&atime, &grant->atime);
3624 ceph_decode_timespec64(&ctime, &grant->ctime);
a1c6b835 3625 ceph_fill_file_time(inode, extra_info->issued,
f98a128a
YZ
3626 le32_to_cpu(grant->time_warp_seq),
3627 &ctime, &mtime, &atime);
3628 }
3629
4985d6f9
YZ
3630 if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
3631 ci->i_files = extra_info->nfiles;
3632 ci->i_subdirs = extra_info->nsubdirs;
3633 }
3634
f98a128a
YZ
3635 if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
3636 /* file layout may have changed */
7627151e 3637 s64 old_pool = ci->i_layout.pool_id;
779fe0fb
YZ
3638 struct ceph_string *old_ns;
3639
7627151e 3640 ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
779fe0fb
YZ
3641 old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
3642 lockdep_is_held(&ci->i_ceph_lock));
a1c6b835 3643 rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
779fe0fb 3644
a1c6b835
YZ
3645 if (ci->i_layout.pool_id != old_pool ||
3646 extra_info->pool_ns != old_ns)
7627151e 3647 ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
5ea5c5e0 3648
a1c6b835 3649 extra_info->pool_ns = old_ns;
779fe0fb 3650
f98a128a 3651 /* size/truncate_seq? */
a1c6b835 3652 queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
f98a128a
YZ
3653 le32_to_cpu(grant->truncate_seq),
3654 le64_to_cpu(grant->truncate_size),
3655 size);
84eea8c7
YZ
3656 }
3657
3658 if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
3659 if (max_size != ci->i_max_size) {
38d46409
XL
3660 doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size,
3661 max_size);
f98a128a
YZ
3662 ci->i_max_size = max_size;
3663 if (max_size >= ci->i_wanted_max_size) {
3664 ci->i_wanted_max_size = 0; /* reset */
3665 ci->i_requested_max_size = 0;
3666 }
ab6c2c3e 3667 wake = true;
a8599bd8 3668 }
a8599bd8
SW
3669 }
3670
3671 /* check cap bits */
3672 wanted = __ceph_caps_wanted(ci);
3673 used = __ceph_caps_used(ci);
3674 dirty = __ceph_caps_dirty(ci);
38d46409
XL
3675 doutc(cl, " my wanted = %s, used = %s, dirty %s\n",
3676 ceph_cap_string(wanted), ceph_cap_string(used),
3677 ceph_cap_string(dirty));
fdac94fa
YZ
3678
3679 if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3680 (wanted & ~(cap->mds_wanted | newcaps))) {
3681 /*
3682 * If mds is importing cap, prior cap messages that update
3683 * 'wanted' may get dropped by mds (migrate seq mismatch).
3684 *
3685 * We don't send cap message to update 'wanted' if what we
3686 * want are already issued. If mds revokes caps, cap message
3687 * that releases caps also tells mds what we want. But if
3688 * caps got revoked by mds forcedly (session stale). We may
3689 * haven't told mds what we want.
3690 */
3691 check_caps = 1;
a8599bd8
SW
3692 }
3693
a8599bd8
SW
3694 /* revocation, grant, or no-op? */
3695 if (cap->issued & ~newcaps) {
3b454c49
SW
3696 int revoking = cap->issued & ~newcaps;
3697
38d46409
XL
3698 doutc(cl, "revocation: %s -> %s (revoking %s)\n",
3699 ceph_cap_string(cap->issued), ceph_cap_string(newcaps),
3700 ceph_cap_string(revoking));
525d15e8
YZ
3701 if (S_ISREG(inode->i_mode) &&
3702 (revoking & used & CEPH_CAP_FILE_BUFFER))
ab6c2c3e 3703 writeback = true; /* initiate writeback; will delay ack */
525d15e8
YZ
3704 else if (queue_invalidate &&
3705 revoking == CEPH_CAP_FILE_CACHE &&
3706 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
3b454c49
SW
3707 ; /* do nothing yet, invalidation will be queued */
3708 else if (cap == ci->i_auth_cap)
3709 check_caps = 1; /* check auth cap only */
3710 else
3711 check_caps = 2; /* check all caps */
f7913573
XL
3712 /* If there is new caps, try to wake up the waiters */
3713 if (~cap->issued & newcaps)
3714 wake = true;
a8599bd8 3715 cap->issued = newcaps;
978097c9 3716 cap->implemented |= newcaps;
a8599bd8 3717 } else if (cap->issued == newcaps) {
38d46409
XL
3718 doutc(cl, "caps unchanged: %s -> %s\n",
3719 ceph_cap_string(cap->issued),
3720 ceph_cap_string(newcaps));
a8599bd8 3721 } else {
38d46409
XL
3722 doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued),
3723 ceph_cap_string(newcaps));
6ee6b953
YZ
3724 /* non-auth MDS is revoking the newly grant caps ? */
3725 if (cap == ci->i_auth_cap &&
3726 __ceph_caps_revoking_other(ci, cap, newcaps))
3727 check_caps = 2;
3728
a8599bd8
SW
3729 cap->issued = newcaps;
3730 cap->implemented |= newcaps; /* add bits only, to
3731 * avoid stepping on a
3732 * pending revocation */
ab6c2c3e 3733 wake = true;
a8599bd8 3734 }
978097c9 3735 BUG_ON(cap->issued & ~cap->implemented);
a8599bd8 3736
257e6172
XL
3737 /* don't let check_caps skip sending a response to MDS for revoke msgs */
3738 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
3739 cap->mds_wanted = 0;
3740 if (cap == ci->i_auth_cap)
3741 check_caps = 1; /* check auth cap only */
3742 else
3743 check_caps = 2; /* check all caps */
3744 }
3745
a1c6b835
YZ
3746 if (extra_info->inline_version > 0 &&
3747 extra_info->inline_version >= ci->i_inline_version) {
3748 ci->i_inline_version = extra_info->inline_version;
31c542a1
YZ
3749 if (ci->i_inline_version != CEPH_INLINE_NONE &&
3750 (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
3751 fill_inline = true;
3752 }
3753
58dd4385
JL
3754 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
3755 if (ci->i_auth_cap == cap) {
3756 if (newcaps & ~extra_info->issued)
3757 wake = true;
3758
3759 if (ci->i_requested_max_size > max_size ||
3760 !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
3761 /* re-request max_size if necessary */
3762 ci->i_requested_max_size = 0;
3763 wake = true;
3764 }
6f05b30e 3765
58dd4385 3766 ceph_kick_flushing_inode_caps(session, ci);
6f05b30e 3767 }
a1c6b835 3768 up_read(&session->s_mdsc->snap_rwsem);
2cd698be 3769 }
58dd4385 3770 spin_unlock(&ci->i_ceph_lock);
2cd698be 3771
31c542a1 3772 if (fill_inline)
a1c6b835
YZ
3773 ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
3774 extra_info->inline_len);
31c542a1 3775
14649758 3776 if (queue_trunc)
c6bcda6f 3777 ceph_queue_vmtruncate(inode);
c6bcda6f 3778
3c6f6b79 3779 if (writeback)
a8599bd8
SW
3780 /*
3781 * queue inode for writeback: we can't actually call
3782 * filemap_write_and_wait, etc. from message handler
3783 * context.
3784 */
3c6f6b79
SW
3785 ceph_queue_writeback(inode);
3786 if (queue_invalidate)
3787 ceph_queue_invalidate(inode);
ca20c991
YZ
3788 if (deleted_inode)
3789 invalidate_aliases(inode);
a8599bd8 3790 if (wake)
03066f23 3791 wake_up_all(&ci->i_cap_wq);
15637c8b 3792
6a92b08f 3793 mutex_unlock(&session->s_mutex);
15637c8b 3794 if (check_caps == 1)
e4b731cc 3795 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
15637c8b 3796 else if (check_caps == 2)
e4b731cc 3797 ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
a8599bd8
SW
3798}
3799
3800/*
3801 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
3802 * MDS has been safely committed.
3803 */
6df058c0 3804static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
a8599bd8
SW
3805 struct ceph_mds_caps *m,
3806 struct ceph_mds_session *session,
3807 struct ceph_cap *cap)
be655596 3808 __releases(ci->i_ceph_lock)
a8599bd8
SW
3809{
3810 struct ceph_inode_info *ci = ceph_inode(inode);
5995d90d 3811 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
38d46409 3812 struct ceph_client *cl = mdsc->fsc->client;
e4500b5e 3813 struct ceph_cap_flush *cf, *tmp_cf;
553adfd9 3814 LIST_HEAD(to_remove);
a8599bd8
SW
3815 unsigned seq = le32_to_cpu(m->seq);
3816 int dirty = le32_to_cpu(m->dirty);
3817 int cleaned = 0;
c8799fc4 3818 bool drop = false;
7271efa7
TM
3819 bool wake_ci = false;
3820 bool wake_mdsc = false;
a8599bd8 3821
e4500b5e 3822 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
d7dbfb4f 3823 /* Is this the one that was flushed? */
553adfd9
YZ
3824 if (cf->tid == flush_tid)
3825 cleaned = cf->caps;
d7dbfb4f
JL
3826
3827 /* Is this a capsnap? */
b2f9fa1f 3828 if (cf->is_capsnap)
0e294387 3829 continue;
d7dbfb4f 3830
553adfd9 3831 if (cf->tid <= flush_tid) {
d7dbfb4f
JL
3832 /*
3833 * An earlier or current tid. The FLUSH_ACK should
3834 * represent a superset of this flush's caps.
3835 */
681ac634 3836 wake_ci |= __detach_cap_flush_from_ci(ci, cf);
e4500b5e 3837 list_add_tail(&cf->i_list, &to_remove);
553adfd9 3838 } else {
d7dbfb4f
JL
3839 /*
3840 * This is a later one. Any caps in it are still dirty
3841 * so don't count them as cleaned.
3842 */
553adfd9
YZ
3843 cleaned &= ~cf->caps;
3844 if (!cleaned)
3845 break;
3846 }
3847 }
a8599bd8 3848
38d46409
XL
3849 doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n",
3850 inode, ceph_vinop(inode), session->s_mds, seq,
3851 ceph_cap_string(dirty), ceph_cap_string(cleaned),
3852 ceph_cap_string(ci->i_flushing_caps),
3853 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
a8599bd8 3854
8310b089 3855 if (list_empty(&to_remove) && !cleaned)
a8599bd8
SW
3856 goto out;
3857
a8599bd8 3858 ci->i_flushing_caps &= ~cleaned;
a8599bd8
SW
3859
3860 spin_lock(&mdsc->cap_dirty_lock);
8310b089 3861
681ac634
JL
3862 list_for_each_entry(cf, &to_remove, i_list)
3863 wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
8310b089 3864
a8599bd8 3865 if (ci->i_flushing_caps == 0) {
0e294387
YZ
3866 if (list_empty(&ci->i_cap_flush_list)) {
3867 list_del_init(&ci->i_flushing_item);
3868 if (!list_empty(&session->s_cap_flushing)) {
38d46409
XL
3869 struct inode *inode =
3870 &list_first_entry(&session->s_cap_flushing,
3871 struct ceph_inode_info,
3872 i_flushing_item)->netfs.inode;
3873 doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n",
3874 session->s_mds, inode, ceph_vinop(inode));
0e294387
YZ
3875 }
3876 }
a8599bd8 3877 mdsc->num_cap_flushing--;
38d46409
XL
3878 doutc(cl, " %p %llx.%llx now !flushing\n", inode,
3879 ceph_vinop(inode));
afcdaea3
SW
3880
3881 if (ci->i_dirty_caps == 0) {
38d46409
XL
3882 doutc(cl, " %p %llx.%llx now clean\n", inode,
3883 ceph_vinop(inode));
afcdaea3 3884 BUG_ON(!list_empty(&ci->i_dirty_item));
c8799fc4 3885 drop = true;
5dda377c
YZ
3886 if (ci->i_wr_ref == 0 &&
3887 ci->i_wrbuffer_ref_head == 0) {
7d8cb26d
SW
3888 BUG_ON(!ci->i_head_snapc);
3889 ceph_put_snap_context(ci->i_head_snapc);
3890 ci->i_head_snapc = NULL;
3891 }
76e3b390
SW
3892 } else {
3893 BUG_ON(list_empty(&ci->i_dirty_item));
afcdaea3 3894 }
a8599bd8
SW
3895 }
3896 spin_unlock(&mdsc->cap_dirty_lock);
a8599bd8
SW
3897
3898out:
be655596 3899 spin_unlock(&ci->i_ceph_lock);
553adfd9
YZ
3900
3901 while (!list_empty(&to_remove)) {
3902 cf = list_first_entry(&to_remove,
e4500b5e 3903 struct ceph_cap_flush, i_list);
b2f9fa1f
XL
3904 list_del_init(&cf->i_list);
3905 if (!cf->is_capsnap)
3906 ceph_free_cap_flush(cf);
553adfd9 3907 }
c8799fc4
YZ
3908
3909 if (wake_ci)
3910 wake_up_all(&ci->i_cap_wq);
3911 if (wake_mdsc)
3912 wake_up_all(&mdsc->cap_flushing_wq);
afcdaea3 3913 if (drop)
a8599bd8
SW
3914 iput(inode);
3915}
3916
a6d37ccd
XL
3917void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
3918 bool *wake_ci, bool *wake_mdsc)
3919{
3920 struct ceph_inode_info *ci = ceph_inode(inode);
5995d90d 3921 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
38d46409 3922 struct ceph_client *cl = mdsc->fsc->client;
a6d37ccd
XL
3923 bool ret;
3924
3925 lockdep_assert_held(&ci->i_ceph_lock);
3926
38d46409
XL
3927 doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap,
3928 inode, ceph_vinop(inode), ci);
a6d37ccd
XL
3929
3930 list_del_init(&capsnap->ci_item);
3931 ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
3932 if (wake_ci)
3933 *wake_ci = ret;
3934
3935 spin_lock(&mdsc->cap_dirty_lock);
3936 if (list_empty(&ci->i_cap_flush_list))
3937 list_del_init(&ci->i_flushing_item);
3938
3939 ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
3940 if (wake_mdsc)
3941 *wake_mdsc = ret;
3942 spin_unlock(&mdsc->cap_dirty_lock);
3943}
3944
3945void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
3946 bool *wake_ci, bool *wake_mdsc)
3947{
3948 struct ceph_inode_info *ci = ceph_inode(inode);
3949
3950 lockdep_assert_held(&ci->i_ceph_lock);
3951
3952 WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
3953 __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
3954}
3955
a8599bd8
SW
3956/*
3957 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
3958 * throw away our cap_snap.
3959 *
3960 * Caller hold s_mutex.
3961 */
6df058c0 3962static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
a8599bd8
SW
3963 struct ceph_mds_caps *m,
3964 struct ceph_mds_session *session)
3965{
3966 struct ceph_inode_info *ci = ceph_inode(inode);
5995d90d 3967 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
38d46409 3968 struct ceph_client *cl = mdsc->fsc->client;
a8599bd8 3969 u64 follows = le64_to_cpu(m->snap_follows);
3ffa9d6f 3970 struct ceph_cap_snap *capsnap = NULL, *iter;
c8799fc4
YZ
3971 bool wake_ci = false;
3972 bool wake_mdsc = false;
a8599bd8 3973
38d46409
XL
3974 doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode,
3975 ceph_vinop(inode), ci, session->s_mds, follows);
a8599bd8 3976
be655596 3977 spin_lock(&ci->i_ceph_lock);
3ffa9d6f
JK
3978 list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
3979 if (iter->follows == follows) {
3980 if (iter->cap_flush.tid != flush_tid) {
38d46409
XL
3981 doutc(cl, " cap_snap %p follows %lld "
3982 "tid %lld != %lld\n", iter,
3983 follows, flush_tid,
3984 iter->cap_flush.tid);
a8599bd8
SW
3985 break;
3986 }
3ffa9d6f 3987 capsnap = iter;
a8599bd8
SW
3988 break;
3989 } else {
38d46409
XL
3990 doutc(cl, " skipping cap_snap %p follows %lld\n",
3991 iter, iter->follows);
a8599bd8
SW
3992 }
3993 }
3ffa9d6f 3994 if (capsnap)
a6d37ccd 3995 ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
be655596 3996 spin_unlock(&ci->i_ceph_lock);
a6d37ccd 3997
3ffa9d6f 3998 if (capsnap) {
0e294387
YZ
3999 ceph_put_snap_context(capsnap->context);
4000 ceph_put_cap_snap(capsnap);
c8799fc4
YZ
4001 if (wake_ci)
4002 wake_up_all(&ci->i_cap_wq);
4003 if (wake_mdsc)
4004 wake_up_all(&mdsc->cap_flushing_wq);
a8599bd8 4005 iput(inode);
0e294387 4006 }
a8599bd8
SW
4007}
4008
4009/*
4010 * Handle TRUNC from MDS, indicating file truncation.
4011 *
4012 * caller hold s_mutex.
4013 */
7391fba2 4014static bool handle_cap_trunc(struct inode *inode,
a8599bd8 4015 struct ceph_mds_caps *trunc,
0d91f0ad
JL
4016 struct ceph_mds_session *session,
4017 struct cap_extra_info *extra_info)
a8599bd8
SW
4018{
4019 struct ceph_inode_info *ci = ceph_inode(inode);
38d46409 4020 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8
SW
4021 int mds = session->s_mds;
4022 int seq = le32_to_cpu(trunc->seq);
4023 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
4024 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
4025 u64 size = le64_to_cpu(trunc->size);
4026 int implemented = 0;
4027 int dirty = __ceph_caps_dirty(ci);
4028 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
7391fba2
JL
4029 bool queue_trunc = false;
4030
4031 lockdep_assert_held(&ci->i_ceph_lock);
a8599bd8
SW
4032
4033 issued |= implemented | dirty;
4034
0d91f0ad
JL
4035 /*
4036 * If there is at least one crypto block then we'll trust
4037 * fscrypt_file_size. If the real length of the file is 0, then
4038 * ignore it (it has probably been truncated down to 0 by the MDS).
4039 */
4040 if (IS_ENCRYPTED(inode) && size)
4041 size = extra_info->fscrypt_file_size;
4042
38d46409
XL
4043 doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n",
4044 inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq);
a8599bd8
SW
4045 queue_trunc = ceph_fill_file_size(inode, issued,
4046 truncate_seq, truncate_size, size);
7391fba2 4047 return queue_trunc;
a8599bd8
SW
4048}
4049
4050/*
4051 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
4052 * different one. If we are the most recent migration we've seen (as
4053 * indicated by mseq), make note of the migrating cap bits for the
4054 * duration (until we see the corresponding IMPORT).
4055 *
4056 * caller holds s_mutex
4057 */
4058static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
11df2dfb
YZ
4059 struct ceph_mds_cap_peer *ph,
4060 struct ceph_mds_session *session)
a8599bd8 4061{
5995d90d 4062 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
38d46409 4063 struct ceph_client *cl = mdsc->fsc->client;
11df2dfb 4064 struct ceph_mds_session *tsession = NULL;
d9df2783 4065 struct ceph_cap *cap, *tcap, *new_cap = NULL;
a8599bd8 4066 struct ceph_inode_info *ci = ceph_inode(inode);
11df2dfb 4067 u64 t_cap_id;
a8599bd8 4068 unsigned mseq = le32_to_cpu(ex->migrate_seq);
11df2dfb
YZ
4069 unsigned t_seq, t_mseq;
4070 int target, issued;
4071 int mds = session->s_mds;
a8599bd8 4072
11df2dfb
YZ
4073 if (ph) {
4074 t_cap_id = le64_to_cpu(ph->cap_id);
4075 t_seq = le32_to_cpu(ph->seq);
4076 t_mseq = le32_to_cpu(ph->mseq);
4077 target = le32_to_cpu(ph->mds);
4078 } else {
4079 t_cap_id = t_seq = t_mseq = 0;
4080 target = -1;
4081 }
a8599bd8 4082
38d46409
XL
4083 doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n",
4084 inode, ceph_vinop(inode), ci, mds, mseq, target);
11df2dfb 4085retry:
7f47f7f3 4086 down_read(&mdsc->snap_rwsem);
be655596 4087 spin_lock(&ci->i_ceph_lock);
11df2dfb 4088 cap = __get_cap_for_mds(ci, mds);
ca665e02 4089 if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
11df2dfb 4090 goto out_unlock;
a8599bd8 4091
11df2dfb 4092 if (target < 0) {
197b7d79 4093 ceph_remove_cap(mdsc, cap, false);
11df2dfb 4094 goto out_unlock;
a8599bd8
SW
4095 }
4096
11df2dfb
YZ
4097 /*
4098 * now we know we haven't received the cap import message yet
4099 * because the exported cap still exist.
4100 */
db354052 4101
11df2dfb 4102 issued = cap->issued;
d84b37f9 4103 if (issued != cap->implemented)
38d46409
XL
4104 pr_err_ratelimited_client(cl, "issued != implemented: "
4105 "%p %llx.%llx mds%d seq %d mseq %d"
4106 " issued %s implemented %s\n",
4107 inode, ceph_vinop(inode), mds,
4108 cap->seq, cap->mseq,
4109 ceph_cap_string(issued),
4110 ceph_cap_string(cap->implemented));
d84b37f9 4111
11df2dfb
YZ
4112
4113 tcap = __get_cap_for_mds(ci, target);
4114 if (tcap) {
4115 /* already have caps from the target */
fa0aa3b8 4116 if (tcap->cap_id == t_cap_id &&
11df2dfb 4117 ceph_seq_cmp(tcap->seq, t_seq) < 0) {
38d46409
XL
4118 doutc(cl, " updating import cap %p mds%d\n", tcap,
4119 target);
11df2dfb
YZ
4120 tcap->cap_id = t_cap_id;
4121 tcap->seq = t_seq - 1;
4122 tcap->issue_seq = t_seq - 1;
11df2dfb
YZ
4123 tcap->issued |= issued;
4124 tcap->implemented |= issued;
1cf03a68 4125 if (cap == ci->i_auth_cap) {
11df2dfb 4126 ci->i_auth_cap = tcap;
1cf03a68 4127 change_auth_cap_ses(ci, tcap->session);
db354052 4128 }
a8599bd8 4129 }
197b7d79 4130 ceph_remove_cap(mdsc, cap, false);
11df2dfb 4131 goto out_unlock;
d9df2783 4132 } else if (tsession) {
11df2dfb 4133 /* add placeholder for the export tagert */
d9df2783 4134 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
00f06cba 4135 tcap = new_cap;
135e671e 4136 ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
d9df2783
YZ
4137 t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
4138
00f06cba
YZ
4139 if (!list_empty(&ci->i_cap_flush_list) &&
4140 ci->i_auth_cap == tcap) {
4141 spin_lock(&mdsc->cap_dirty_lock);
4142 list_move_tail(&ci->i_flushing_item,
4143 &tcap->session->s_cap_flushing);
4144 spin_unlock(&mdsc->cap_dirty_lock);
4145 }
4146
197b7d79 4147 ceph_remove_cap(mdsc, cap, false);
d9df2783 4148 goto out_unlock;
a8599bd8
SW
4149 }
4150
be655596 4151 spin_unlock(&ci->i_ceph_lock);
7f47f7f3 4152 up_read(&mdsc->snap_rwsem);
11df2dfb
YZ
4153 mutex_unlock(&session->s_mutex);
4154
4155 /* open target session */
4156 tsession = ceph_mdsc_open_export_target_session(mdsc, target);
4157 if (!IS_ERR(tsession)) {
4158 if (mds > target) {
4159 mutex_lock(&session->s_mutex);
4160 mutex_lock_nested(&tsession->s_mutex,
4161 SINGLE_DEPTH_NESTING);
4162 } else {
4163 mutex_lock(&tsession->s_mutex);
4164 mutex_lock_nested(&session->s_mutex,
4165 SINGLE_DEPTH_NESTING);
4166 }
d9df2783 4167 new_cap = ceph_get_cap(mdsc, NULL);
11df2dfb
YZ
4168 } else {
4169 WARN_ON(1);
4170 tsession = NULL;
4171 target = -1;
4d8e28ff 4172 mutex_lock(&session->s_mutex);
11df2dfb
YZ
4173 }
4174 goto retry;
4175
4176out_unlock:
4177 spin_unlock(&ci->i_ceph_lock);
7f47f7f3 4178 up_read(&mdsc->snap_rwsem);
11df2dfb
YZ
4179 mutex_unlock(&session->s_mutex);
4180 if (tsession) {
4181 mutex_unlock(&tsession->s_mutex);
4182 ceph_put_mds_session(tsession);
4183 }
d9df2783
YZ
4184 if (new_cap)
4185 ceph_put_cap(mdsc, new_cap);
a8599bd8
SW
4186}
4187
4188/*
2cd698be 4189 * Handle cap IMPORT.
a8599bd8 4190 *
2cd698be 4191 * caller holds s_mutex. acquires i_ceph_lock
a8599bd8
SW
4192 */
4193static void handle_cap_import(struct ceph_mds_client *mdsc,
4194 struct inode *inode, struct ceph_mds_caps *im,
4ee6a914 4195 struct ceph_mds_cap_peer *ph,
a8599bd8 4196 struct ceph_mds_session *session,
2cd698be 4197 struct ceph_cap **target_cap, int *old_issued)
a8599bd8
SW
4198{
4199 struct ceph_inode_info *ci = ceph_inode(inode);
38d46409 4200 struct ceph_client *cl = mdsc->fsc->client;
2cd698be 4201 struct ceph_cap *cap, *ocap, *new_cap = NULL;
a8599bd8 4202 int mds = session->s_mds;
2cd698be
YZ
4203 int issued;
4204 unsigned caps = le32_to_cpu(im->caps);
a8599bd8
SW
4205 unsigned wanted = le32_to_cpu(im->wanted);
4206 unsigned seq = le32_to_cpu(im->seq);
4207 unsigned mseq = le32_to_cpu(im->migrate_seq);
4208 u64 realmino = le64_to_cpu(im->realm);
4209 u64 cap_id = le64_to_cpu(im->cap_id);
4ee6a914
YZ
4210 u64 p_cap_id;
4211 int peer;
a8599bd8 4212
4ee6a914
YZ
4213 if (ph) {
4214 p_cap_id = le64_to_cpu(ph->cap_id);
4215 peer = le32_to_cpu(ph->mds);
4216 } else {
4217 p_cap_id = 0;
4218 peer = -1;
4219 }
db354052 4220
38d46409
XL
4221 doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n",
4222 inode, ceph_vinop(inode), ci, mds, mseq, peer);
d9df2783 4223retry:
d9df2783
YZ
4224 cap = __get_cap_for_mds(ci, mds);
4225 if (!cap) {
4226 if (!new_cap) {
4227 spin_unlock(&ci->i_ceph_lock);
4228 new_cap = ceph_get_cap(mdsc, NULL);
78333233 4229 spin_lock(&ci->i_ceph_lock);
d9df2783
YZ
4230 goto retry;
4231 }
2cd698be
YZ
4232 cap = new_cap;
4233 } else {
4234 if (new_cap) {
4235 ceph_put_cap(mdsc, new_cap);
4236 new_cap = NULL;
4237 }
d9df2783
YZ
4238 }
4239
2cd698be
YZ
4240 __ceph_caps_issued(ci, &issued);
4241 issued |= __ceph_caps_dirty(ci);
4242
135e671e 4243 ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
d9df2783
YZ
4244 realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
4245
2cd698be
YZ
4246 ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
4247 if (ocap && ocap->cap_id == p_cap_id) {
38d46409
XL
4248 doutc(cl, " remove export cap %p mds%d flags %d\n",
4249 ocap, peer, ph->flags);
4ee6a914 4250 if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
2cd698be
YZ
4251 (ocap->seq != le32_to_cpu(ph->seq) ||
4252 ocap->mseq != le32_to_cpu(ph->mseq))) {
38d46409
XL
4253 pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
4254 "%p %llx.%llx mds%d seq %d mseq %d"
4255 " importer mds%d has peer seq %d mseq %d\n",
4256 inode, ceph_vinop(inode), peer,
4257 ocap->seq, ocap->mseq, mds,
4258 le32_to_cpu(ph->seq),
d84b37f9 4259 le32_to_cpu(ph->mseq));
db354052 4260 }
197b7d79 4261 ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
a8599bd8
SW
4262 }
4263
2cd698be
YZ
4264 *old_issued = issued;
4265 *target_cap = cap;
a8599bd8
SW
4266}
4267
0d91f0ad
JL
4268#ifdef CONFIG_FS_ENCRYPTION
4269static int parse_fscrypt_fields(void **p, void *end,
4270 struct cap_extra_info *extra)
4271{
4272 u32 len;
4273
4274 ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad);
4275 if (extra->fscrypt_auth_len) {
4276 ceph_decode_need(p, end, extra->fscrypt_auth_len, bad);
4277 extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len,
4278 GFP_KERNEL);
4279 if (!extra->fscrypt_auth)
4280 return -ENOMEM;
4281 ceph_decode_copy_safe(p, end, extra->fscrypt_auth,
4282 extra->fscrypt_auth_len, bad);
4283 }
4284
4285 ceph_decode_32_safe(p, end, len, bad);
4286 if (len >= sizeof(u64)) {
4287 ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad);
4288 len -= sizeof(u64);
4289 }
4290 ceph_decode_skip_n(p, end, len, bad);
4291 return 0;
4292bad:
4293 return -EIO;
4294}
4295#else
4296static int parse_fscrypt_fields(void **p, void *end,
4297 struct cap_extra_info *extra)
4298{
4299 u32 len;
4300
4301 /* Don't care about these fields unless we're encryption-capable */
4302 ceph_decode_32_safe(p, end, len, bad);
4303 if (len)
4304 ceph_decode_skip_n(p, end, len, bad);
4305 ceph_decode_32_safe(p, end, len, bad);
4306 if (len)
4307 ceph_decode_skip_n(p, end, len, bad);
4308 return 0;
4309bad:
4310 return -EIO;
4311}
4312#endif
4313
a8599bd8
SW
4314/*
4315 * Handle a caps message from the MDS.
4316 *
4317 * Identify the appropriate session, inode, and call the right handler
4318 * based on the cap op.
4319 */
4320void ceph_handle_caps(struct ceph_mds_session *session,
4321 struct ceph_msg *msg)
4322{
4323 struct ceph_mds_client *mdsc = session->s_mdsc;
38d46409 4324 struct ceph_client *cl = mdsc->fsc->client;
a8599bd8 4325 struct inode *inode;
be655596 4326 struct ceph_inode_info *ci;
a8599bd8
SW
4327 struct ceph_cap *cap;
4328 struct ceph_mds_caps *h;
4ee6a914 4329 struct ceph_mds_cap_peer *peer = NULL;
779fe0fb 4330 struct ceph_snap_realm *realm = NULL;
a1c6b835 4331 int op;
4985d6f9 4332 int msg_version = le16_to_cpu(msg->hdr.version);
3d7ded4d 4333 u32 seq, mseq;
a8599bd8 4334 struct ceph_vino vino;
70edb55b 4335 void *snaptrace;
ce1fbc8d 4336 size_t snaptrace_len;
fb01d1f8 4337 void *p, *end;
a1c6b835 4338 struct cap_extra_info extra_info = {};
7391fba2 4339 bool queue_trunc;
a68e564a 4340 bool close_sessions = false;
ce72d4e0 4341 bool do_cap_release = false;
a8599bd8 4342
38d46409 4343 doutc(cl, "from mds%d\n", session->s_mds);
a8599bd8 4344
e3dfcab2
XL
4345 if (!ceph_inc_mds_stopping_blocker(mdsc, session))
4346 return;
4347
a8599bd8 4348 /* decode */
4ee6a914 4349 end = msg->front.iov_base + msg->front.iov_len;
a8599bd8
SW
4350 if (msg->front.iov_len < sizeof(*h))
4351 goto bad;
4352 h = msg->front.iov_base;
4353 op = le32_to_cpu(h->op);
4354 vino.ino = le64_to_cpu(h->ino);
4355 vino.snap = CEPH_NOSNAP;
a8599bd8 4356 seq = le32_to_cpu(h->seq);
3d7ded4d 4357 mseq = le32_to_cpu(h->migrate_seq);
a8599bd8 4358
ce1fbc8d
SW
4359 snaptrace = h + 1;
4360 snaptrace_len = le32_to_cpu(h->snap_trace_len);
fb01d1f8 4361 p = snaptrace + snaptrace_len;
ce1fbc8d 4362
4985d6f9 4363 if (msg_version >= 2) {
fb01d1f8 4364 u32 flock_len;
ce1fbc8d 4365 ceph_decode_32_safe(&p, end, flock_len, bad);
4ee6a914
YZ
4366 if (p + flock_len > end)
4367 goto bad;
fb01d1f8 4368 p += flock_len;
ce1fbc8d
SW
4369 }
4370
4985d6f9 4371 if (msg_version >= 3) {
4ee6a914 4372 if (op == CEPH_CAP_OP_IMPORT) {
4ee6a914
YZ
4373 if (p + sizeof(*peer) > end)
4374 goto bad;
4375 peer = p;
fb01d1f8 4376 p += sizeof(*peer);
11df2dfb
YZ
4377 } else if (op == CEPH_CAP_OP_EXPORT) {
4378 /* recorded in unused fields */
4379 peer = (void *)&h->size;
4ee6a914
YZ
4380 }
4381 }
4382
4985d6f9 4383 if (msg_version >= 4) {
a1c6b835
YZ
4384 ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
4385 ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
4386 if (p + extra_info.inline_len > end)
fb01d1f8 4387 goto bad;
a1c6b835
YZ
4388 extra_info.inline_data = p;
4389 p += extra_info.inline_len;
fb01d1f8
YZ
4390 }
4391
4985d6f9 4392 if (msg_version >= 5) {
92475f05
JL
4393 struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
4394 u32 epoch_barrier;
4395
4396 ceph_decode_32_safe(&p, end, epoch_barrier, bad);
4397 ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
4398 }
4399
4985d6f9 4400 if (msg_version >= 8) {
779fe0fb 4401 u32 pool_ns_len;
92475f05 4402
5ea5c5e0 4403 /* version >= 6 */
06a1ad43 4404 ceph_decode_skip_64(&p, end, bad); // flush_tid
5ea5c5e0 4405 /* version >= 7 */
06a1ad43
JL
4406 ceph_decode_skip_32(&p, end, bad); // caller_uid
4407 ceph_decode_skip_32(&p, end, bad); // caller_gid
5ea5c5e0
YZ
4408 /* version >= 8 */
4409 ceph_decode_32_safe(&p, end, pool_ns_len, bad);
779fe0fb
YZ
4410 if (pool_ns_len > 0) {
4411 ceph_decode_need(&p, end, pool_ns_len, bad);
a1c6b835
YZ
4412 extra_info.pool_ns =
4413 ceph_find_or_create_string(p, pool_ns_len);
779fe0fb
YZ
4414 p += pool_ns_len;
4415 }
5ea5c5e0
YZ
4416 }
4417
ec62b894 4418 if (msg_version >= 9) {
4985d6f9 4419 struct ceph_timespec *btime;
4985d6f9 4420
4985d6f9
YZ
4421 if (p + sizeof(*btime) > end)
4422 goto bad;
4423 btime = p;
ec62b894 4424 ceph_decode_timespec64(&extra_info.btime, btime);
4985d6f9 4425 p += sizeof(*btime);
176c77c9 4426 ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
ec62b894
JL
4427 }
4428
4429 if (msg_version >= 11) {
4985d6f9 4430 /* version >= 10 */
06a1ad43 4431 ceph_decode_skip_32(&p, end, bad); // flags
4985d6f9
YZ
4432 /* version >= 11 */
4433 extra_info.dirstat_valid = true;
4434 ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
4435 ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
4436 }
4437
0d91f0ad
JL
4438 if (msg_version >= 12) {
4439 if (parse_fscrypt_fields(&p, end, &extra_info))
4440 goto bad;
4441 }
4442
6cd3bcad 4443 /* lookup ino */
a1c6b835 4444 inode = ceph_find_inode(mdsc->fsc->sb, vino);
38d46409
XL
4445 doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op),
4446 vino.ino, vino.snap, inode);
6cd3bcad 4447
a8599bd8 4448 mutex_lock(&session->s_mutex);
38d46409
XL
4449 doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds,
4450 session->s_seq, (unsigned)seq);
a8599bd8 4451
a8599bd8 4452 if (!inode) {
38d46409 4453 doutc(cl, " i don't have ino %llx\n", vino.ino);
3d7ded4d 4454
ce72d4e0
XL
4455 switch (op) {
4456 case CEPH_CAP_OP_IMPORT:
4457 case CEPH_CAP_OP_REVOKE:
4458 case CEPH_CAP_OP_GRANT:
4459 do_cap_release = true;
4460 break;
4461 default:
4462 break;
a096b09a 4463 }
fb33c114 4464 goto flush_cap_releases;
a8599bd8 4465 }
1ad3bb28 4466 ci = ceph_inode(inode);
a8599bd8
SW
4467
4468 /* these will work even if we don't have a cap yet */
4469 switch (op) {
4470 case CEPH_CAP_OP_FLUSHSNAP_ACK:
a1c6b835
YZ
4471 handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
4472 h, session);
a8599bd8
SW
4473 goto done;
4474
4475 case CEPH_CAP_OP_EXPORT:
11df2dfb
YZ
4476 handle_cap_export(inode, h, peer, session);
4477 goto done_unlocked;
a8599bd8
SW
4478
4479 case CEPH_CAP_OP_IMPORT:
982d6011
YZ
4480 realm = NULL;
4481 if (snaptrace_len) {
4482 down_write(&mdsc->snap_rwsem);
a68e564a
XL
4483 if (ceph_update_snap_trace(mdsc, snaptrace,
4484 snaptrace + snaptrace_len,
4485 false, &realm)) {
4486 up_write(&mdsc->snap_rwsem);
4487 close_sessions = true;
4488 goto done;
4489 }
982d6011
YZ
4490 downgrade_write(&mdsc->snap_rwsem);
4491 } else {
4492 down_read(&mdsc->snap_rwsem);
4493 }
78333233 4494 spin_lock(&ci->i_ceph_lock);
4ee6a914 4495 handle_cap_import(mdsc, inode, h, peer, session,
a1c6b835
YZ
4496 &cap, &extra_info.issued);
4497 handle_cap_grant(inode, session, cap,
4498 h, msg->middle, &extra_info);
982d6011
YZ
4499 if (realm)
4500 ceph_put_snap_realm(mdsc, realm);
2cd698be 4501 goto done_unlocked;
a8599bd8
SW
4502 }
4503
4504 /* the rest require a cap */
be655596 4505 spin_lock(&ci->i_ceph_lock);
a1c6b835 4506 cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
a8599bd8 4507 if (!cap) {
38d46409
XL
4508 doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n",
4509 inode, ceph_ino(inode), ceph_snap(inode),
4510 session->s_mds);
be655596 4511 spin_unlock(&ci->i_ceph_lock);
ce72d4e0
XL
4512 switch (op) {
4513 case CEPH_CAP_OP_REVOKE:
4514 case CEPH_CAP_OP_GRANT:
4515 do_cap_release = true;
4516 break;
4517 default:
4518 break;
4519 }
21b559de 4520 goto flush_cap_releases;
a8599bd8
SW
4521 }
4522
be655596 4523 /* note that each of these drops i_ceph_lock for us */
a8599bd8
SW
4524 switch (op) {
4525 case CEPH_CAP_OP_REVOKE:
4526 case CEPH_CAP_OP_GRANT:
a1c6b835
YZ
4527 __ceph_caps_issued(ci, &extra_info.issued);
4528 extra_info.issued |= __ceph_caps_dirty(ci);
4529 handle_cap_grant(inode, session, cap,
4530 h, msg->middle, &extra_info);
15637c8b 4531 goto done_unlocked;
a8599bd8
SW
4532
4533 case CEPH_CAP_OP_FLUSH_ACK:
a1c6b835
YZ
4534 handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
4535 h, session, cap);
a8599bd8
SW
4536 break;
4537
4538 case CEPH_CAP_OP_TRUNC:
0d91f0ad
JL
4539 queue_trunc = handle_cap_trunc(inode, h, session,
4540 &extra_info);
7391fba2
JL
4541 spin_unlock(&ci->i_ceph_lock);
4542 if (queue_trunc)
4543 ceph_queue_vmtruncate(inode);
a8599bd8
SW
4544 break;
4545
4546 default:
be655596 4547 spin_unlock(&ci->i_ceph_lock);
38d46409
XL
4548 pr_err_client(cl, "unknown cap op %d %s\n", op,
4549 ceph_cap_op_name(op));
a8599bd8
SW
4550 }
4551
e3ec8d68
YZ
4552done:
4553 mutex_unlock(&session->s_mutex);
4554done_unlocked:
23c2c76e 4555 iput(inode);
2ad32cf0 4556out:
e3dfcab2
XL
4557 ceph_dec_mds_stopping_blocker(mdsc);
4558
2ad32cf0 4559 ceph_put_string(extra_info.pool_ns);
a68e564a
XL
4560
4561 /* Defer closing the sessions after s_mutex lock being released */
4562 if (close_sessions)
4563 ceph_mdsc_close_sessions(mdsc);
4564
0d91f0ad 4565 kfree(extra_info.fscrypt_auth);
e3ec8d68 4566 return;
21b559de
GF
4567
4568flush_cap_releases:
4569 /*
745a8e3b 4570 * send any cap release message to try to move things
21b559de
GF
4571 * along for the mds (who clearly thinks we still have this
4572 * cap).
4573 */
ce72d4e0
XL
4574 if (do_cap_release) {
4575 cap = ceph_get_cap(mdsc, NULL);
4576 cap->cap_ino = vino.ino;
4577 cap->queue_release = 1;
4578 cap->cap_id = le64_to_cpu(h->cap_id);
4579 cap->mseq = mseq;
4580 cap->seq = seq;
4581 cap->issue_seq = seq;
4582 spin_lock(&session->s_cap_lock);
4583 __ceph_queue_cap_release(session, cap);
4584 spin_unlock(&session->s_cap_lock);
4585 }
e3ec8d68
YZ
4586 ceph_flush_cap_releases(mdsc, session);
4587 goto done;
a8599bd8
SW
4588
4589bad:
38d46409 4590 pr_err_client(cl, "corrupt message\n");
9ec7cab1 4591 ceph_msg_dump(msg);
2ad32cf0 4592 goto out;
a8599bd8
SW
4593}
4594
4595/*
4596 * Delayed work handler to process end of delayed cap release LRU list.
bf2ba432
LH
4597 *
4598 * If new caps are added to the list while processing it, these won't get
4599 * processed in this run. In this case, the ci->i_hold_caps_max will be
4600 * returned so that the work can be scheduled accordingly.
a8599bd8 4601 */
bf2ba432 4602unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
a8599bd8 4603{
38d46409 4604 struct ceph_client *cl = mdsc->fsc->client;
4b9f2042 4605 struct inode *inode;
a8599bd8 4606 struct ceph_inode_info *ci;
bf2ba432
LH
4607 struct ceph_mount_options *opt = mdsc->fsc->mount_options;
4608 unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
4609 unsigned long loop_start = jiffies;
4610 unsigned long delay = 0;
a8599bd8 4611
38d46409 4612 doutc(cl, "begin\n");
585d72f3
JL
4613 spin_lock(&mdsc->cap_delay_lock);
4614 while (!list_empty(&mdsc->cap_delay_list)) {
a8599bd8
SW
4615 ci = list_first_entry(&mdsc->cap_delay_list,
4616 struct ceph_inode_info,
4617 i_cap_delay_list);
bf2ba432 4618 if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
38d46409 4619 doutc(cl, "caps added recently. Exiting loop");
bf2ba432
LH
4620 delay = ci->i_hold_caps_max;
4621 break;
4622 }
a8599bd8
SW
4623 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
4624 time_before(jiffies, ci->i_hold_caps_max))
4625 break;
4626 list_del_init(&ci->i_cap_delay_list);
4b9f2042 4627
874c8ca1 4628 inode = igrab(&ci->netfs.inode);
4b9f2042 4629 if (inode) {
585d72f3 4630 spin_unlock(&mdsc->cap_delay_lock);
38d46409
XL
4631 doutc(cl, "on %p %llx.%llx\n", inode,
4632 ceph_vinop(inode));
e4b731cc 4633 ceph_check_caps(ci, 0);
23c2c76e 4634 iput(inode);
585d72f3 4635 spin_lock(&mdsc->cap_delay_lock);
4b9f2042 4636 }
a8599bd8
SW
4637 }
4638 spin_unlock(&mdsc->cap_delay_lock);
38d46409 4639 doutc(cl, "done\n");
bf2ba432
LH
4640
4641 return delay;
a8599bd8
SW
4642}
4643
afcdaea3
SW
4644/*
4645 * Flush all dirty caps to the mds
4646 */
1cf03a68 4647static void flush_dirty_session_caps(struct ceph_mds_session *s)
afcdaea3 4648{
1cf03a68 4649 struct ceph_mds_client *mdsc = s->s_mdsc;
38d46409 4650 struct ceph_client *cl = mdsc->fsc->client;
db354052
SW
4651 struct ceph_inode_info *ci;
4652 struct inode *inode;
afcdaea3 4653
38d46409 4654 doutc(cl, "begin\n");
afcdaea3 4655 spin_lock(&mdsc->cap_dirty_lock);
1cf03a68
JL
4656 while (!list_empty(&s->s_cap_dirty)) {
4657 ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
db354052 4658 i_dirty_item);
874c8ca1 4659 inode = &ci->netfs.inode;
70b666c3 4660 ihold(inode);
38d46409 4661 doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
afcdaea3 4662 spin_unlock(&mdsc->cap_dirty_lock);
8692969e 4663 ceph_wait_on_async_create(inode);
e4b731cc 4664 ceph_check_caps(ci, CHECK_CAPS_FLUSH);
70b666c3 4665 iput(inode);
afcdaea3
SW
4666 spin_lock(&mdsc->cap_dirty_lock);
4667 }
4668 spin_unlock(&mdsc->cap_dirty_lock);
38d46409 4669 doutc(cl, "done\n");
afcdaea3
SW
4670}
4671
1cf03a68
JL
4672void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
4673{
59b312f3 4674 ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
1cf03a68
JL
4675}
4676
719a2514
YZ
4677void __ceph_touch_fmode(struct ceph_inode_info *ci,
4678 struct ceph_mds_client *mdsc, int fmode)
4679{
4680 unsigned long now = jiffies;
4681 if (fmode & CEPH_FILE_MODE_RD)
4682 ci->i_last_rd = now;
4683 if (fmode & CEPH_FILE_MODE_WR)
4684 ci->i_last_wr = now;
4685 /* queue periodic check */
4686 if (fmode &&
4687 __ceph_is_any_real_caps(ci) &&
4688 list_empty(&ci->i_cap_delay_list))
a0d93e32 4689 __cap_delay_requeue(mdsc, ci);
719a2514
YZ
4690}
4691
4692void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
4693{
874c8ca1 4694 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
719a2514 4695 int bits = (fmode << 1) | 1;
973e5245 4696 bool already_opened = false;
1dd8d470
XL
4697 int i;
4698
4699 if (count == 1)
4700 atomic64_inc(&mdsc->metric.opened_files);
4701
719a2514
YZ
4702 spin_lock(&ci->i_ceph_lock);
4703 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
1dd8d470 4704 /*
973e5245 4705 * If any of the mode ref is larger than 0,
1dd8d470
XL
4706 * that means it has been already opened by
4707 * others. Just skip checking the PIN ref.
4708 */
973e5245
HW
4709 if (i && ci->i_nr_by_mode[i])
4710 already_opened = true;
4711
4712 if (bits & (1 << i))
4713 ci->i_nr_by_mode[i] += count;
719a2514 4714 }
1dd8d470 4715
973e5245 4716 if (!already_opened)
1dd8d470 4717 percpu_counter_inc(&mdsc->metric.opened_inodes);
719a2514
YZ
4718 spin_unlock(&ci->i_ceph_lock);
4719}
4720
a8599bd8
SW
4721/*
4722 * Drop open file reference. If we were the last open file,
4723 * we may need to release capabilities to the MDS (or schedule
4724 * their delayed release).
4725 */
719a2514 4726void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
a8599bd8 4727{
874c8ca1 4728 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
774a6a11 4729 int bits = (fmode << 1) | 1;
1dd8d470
XL
4730 bool is_closed = true;
4731 int i;
4732
4733 if (count == 1)
4734 atomic64_dec(&mdsc->metric.opened_files);
4735
be655596 4736 spin_lock(&ci->i_ceph_lock);
774a6a11
YZ
4737 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4738 if (bits & (1 << i)) {
719a2514
YZ
4739 BUG_ON(ci->i_nr_by_mode[i] < count);
4740 ci->i_nr_by_mode[i] -= count;
774a6a11 4741 }
1dd8d470
XL
4742
4743 /*
4744 * If any of the mode ref is not 0 after
4745 * decreased, that means it is still opened
4746 * by others. Just skip checking the PIN ref.
4747 */
4748 if (i && ci->i_nr_by_mode[i])
4749 is_closed = false;
774a6a11 4750 }
1dd8d470
XL
4751
4752 if (is_closed)
4753 percpu_counter_dec(&mdsc->metric.opened_inodes);
be655596 4754 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
4755}
4756
6ef0bc6d 4757/*
a452bc06 4758 * For a soon-to-be unlinked file, drop the LINK caps. If it
6ef0bc6d
ZZ
4759 * looks like the link count will hit 0, drop any other caps (other
4760 * than PIN) we don't specifically want (due to the file still being
4761 * open).
4762 */
4763int ceph_drop_caps_for_unlink(struct inode *inode)
4764{
4765 struct ceph_inode_info *ci = ceph_inode(inode);
4766 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
4767
4768 spin_lock(&ci->i_ceph_lock);
4769 if (inode->i_nlink == 1) {
4770 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
4771
6ef0bc6d
ZZ
4772 if (__ceph_caps_dirty(ci)) {
4773 struct ceph_mds_client *mdsc =
5995d90d 4774 ceph_inode_to_fs_client(inode)->mdsc;
dbc347ef
XL
4775
4776 doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
4777 ceph_vinop(inode));
4778 spin_lock(&mdsc->cap_unlink_delay_lock);
4779 ci->i_ceph_flags |= CEPH_I_FLUSH;
4780 if (!list_empty(&ci->i_cap_delay_list))
4781 list_del_init(&ci->i_cap_delay_list);
4782 list_add_tail(&ci->i_cap_delay_list,
4783 &mdsc->cap_unlink_delay_list);
4784 spin_unlock(&mdsc->cap_unlink_delay_lock);
4785
4786 /*
4787 * Fire the work immediately, because the MDS maybe
4788 * waiting for caps release.
4789 */
4790 ceph_queue_cap_unlink_work(mdsc);
6ef0bc6d
ZZ
4791 }
4792 }
4793 spin_unlock(&ci->i_ceph_lock);
4794 return drop;
4795}
4796
a8599bd8
SW
4797/*
4798 * Helpers for embedding cap and dentry lease releases into mds
4799 * requests.
4800 *
4801 * @force is used by dentry_release (below) to force inclusion of a
4802 * record for the directory inode, even when there aren't any caps to
4803 * drop.
4804 */
4805int ceph_encode_inode_release(void **p, struct inode *inode,
4806 int mds, int drop, int unless, int force)
4807{
4808 struct ceph_inode_info *ci = ceph_inode(inode);
38d46409 4809 struct ceph_client *cl = ceph_inode_to_client(inode);
a8599bd8
SW
4810 struct ceph_cap *cap;
4811 struct ceph_mds_request_release *rel = *p;
ec97f88b 4812 int used, dirty;
a8599bd8 4813 int ret = 0;
a8599bd8 4814
be655596 4815 spin_lock(&ci->i_ceph_lock);
916623da 4816 used = __ceph_caps_used(ci);
ec97f88b 4817 dirty = __ceph_caps_dirty(ci);
916623da 4818
38d46409
XL
4819 doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n",
4820 inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty),
4821 ceph_cap_string(drop), ceph_cap_string(unless));
916623da 4822
ec97f88b
SW
4823 /* only drop unused, clean caps */
4824 drop &= ~(used | dirty);
916623da 4825
a8599bd8
SW
4826 cap = __get_cap_for_mds(ci, mds);
4827 if (cap && __cap_is_valid(cap)) {
222b7f90
YZ
4828 unless &= cap->issued;
4829 if (unless) {
4830 if (unless & CEPH_CAP_AUTH_EXCL)
4831 drop &= ~CEPH_CAP_AUTH_SHARED;
4832 if (unless & CEPH_CAP_LINK_EXCL)
4833 drop &= ~CEPH_CAP_LINK_SHARED;
4834 if (unless & CEPH_CAP_XATTR_EXCL)
4835 drop &= ~CEPH_CAP_XATTR_SHARED;
4836 if (unless & CEPH_CAP_FILE_EXCL)
4837 drop &= ~CEPH_CAP_FILE_SHARED;
4838 }
4839
4840 if (force || (cap->issued & drop)) {
4841 if (cap->issued & drop) {
bb137f84 4842 int wanted = __ceph_caps_wanted(ci);
38d46409
XL
4843 doutc(cl, "%p %llx.%llx cap %p %s -> %s, "
4844 "wanted %s -> %s\n", inode,
4845 ceph_vinop(inode), cap,
4846 ceph_cap_string(cap->issued),
4847 ceph_cap_string(cap->issued & ~drop),
4848 ceph_cap_string(cap->mds_wanted),
4849 ceph_cap_string(wanted));
bb137f84 4850
a8599bd8
SW
4851 cap->issued &= ~drop;
4852 cap->implemented &= ~drop;
bb137f84 4853 cap->mds_wanted = wanted;
6f05b30e
YZ
4854 if (cap == ci->i_auth_cap &&
4855 !(wanted & CEPH_CAP_ANY_FILE_WR))
4856 ci->i_requested_max_size = 0;
a8599bd8 4857 } else {
38d46409
XL
4858 doutc(cl, "%p %llx.%llx cap %p %s (force)\n",
4859 inode, ceph_vinop(inode), cap,
4860 ceph_cap_string(cap->issued));
a8599bd8
SW
4861 }
4862
4863 rel->ino = cpu_to_le64(ceph_ino(inode));
4864 rel->cap_id = cpu_to_le64(cap->cap_id);
4865 rel->seq = cpu_to_le32(cap->seq);
08a0f24e 4866 rel->issue_seq = cpu_to_le32(cap->issue_seq);
a8599bd8 4867 rel->mseq = cpu_to_le32(cap->mseq);
fd7b95cd 4868 rel->caps = cpu_to_le32(cap->implemented);
a8599bd8
SW
4869 rel->wanted = cpu_to_le32(cap->mds_wanted);
4870 rel->dname_len = 0;
4871 rel->dname_seq = 0;
4872 *p += sizeof(*rel);
4873 ret = 1;
4874 } else {
38d46409
XL
4875 doutc(cl, "%p %llx.%llx cap %p %s (noop)\n",
4876 inode, ceph_vinop(inode), cap,
4877 ceph_cap_string(cap->issued));
a8599bd8
SW
4878 }
4879 }
be655596 4880 spin_unlock(&ci->i_ceph_lock);
a8599bd8
SW
4881 return ret;
4882}
4883
3fd945a7
JL
4884/**
4885 * ceph_encode_dentry_release - encode a dentry release into an outgoing request
4886 * @p: outgoing request buffer
4887 * @dentry: dentry to release
4888 * @dir: dir to release it from
4889 * @mds: mds that we're speaking to
4890 * @drop: caps being dropped
4891 * @unless: unless we have these caps
4892 *
4893 * Encode a dentry release into an outgoing request buffer. Returns 1 if the
4894 * thing was released, or a negative error code otherwise.
4895 */
a8599bd8 4896int ceph_encode_dentry_release(void **p, struct dentry *dentry,
ca6c8ae0 4897 struct inode *dir,
a8599bd8
SW
4898 int mds, int drop, int unless)
4899{
a8599bd8
SW
4900 struct ceph_mds_request_release *rel = *p;
4901 struct ceph_dentry_info *di = ceph_dentry(dentry);
38d46409 4902 struct ceph_client *cl;
a8599bd8
SW
4903 int force = 0;
4904 int ret;
4905
b493ad71
XL
4906 /* This shouldn't happen */
4907 BUG_ON(!dir);
4908
a8599bd8
SW
4909 /*
4910 * force an record for the directory caps if we have a dentry lease.
be655596 4911 * this is racy (can't take i_ceph_lock and d_lock together), but it
a8599bd8
SW
4912 * doesn't have to be perfect; the mds will revoke anything we don't
4913 * release.
4914 */
4915 spin_lock(&dentry->d_lock);
4916 if (di->lease_session && di->lease_session->s_mds == mds)
4917 force = 1;
4918 spin_unlock(&dentry->d_lock);
4919
ca6c8ae0 4920 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
a8599bd8 4921
38d46409 4922 cl = ceph_inode_to_client(dir);
a8599bd8
SW
4923 spin_lock(&dentry->d_lock);
4924 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
38d46409
XL
4925 doutc(cl, "%p mds%d seq %d\n", dentry, mds,
4926 (int)di->lease_seq);
a8599bd8 4927 rel->dname_seq = cpu_to_le32(di->lease_seq);
1dadcce3 4928 __ceph_mdsc_drop_dentry_lease(dentry);
3fd945a7
JL
4929 spin_unlock(&dentry->d_lock);
4930 if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
4931 int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p);
4932
4933 if (ret2 < 0)
4934 return ret2;
4935
4936 rel->dname_len = cpu_to_le32(ret2);
4937 *p += ret2;
4938 } else {
4939 rel->dname_len = cpu_to_le32(dentry->d_name.len);
4940 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
4941 *p += dentry->d_name.len;
4942 }
4943 } else {
4944 spin_unlock(&dentry->d_lock);
a8599bd8 4945 }
a8599bd8
SW
4946 return ret;
4947}
36e6da98
JL
4948
4949static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
4950{
4951 struct ceph_inode_info *ci = ceph_inode(inode);
38d46409 4952 struct ceph_client *cl = mdsc->fsc->client;
36e6da98
JL
4953 struct ceph_cap_snap *capsnap;
4954 int capsnap_release = 0;
4955
4956 lockdep_assert_held(&ci->i_ceph_lock);
4957
38d46409
XL
4958 doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n",
4959 ci, inode, ceph_vinop(inode));
36e6da98
JL
4960
4961 while (!list_empty(&ci->i_cap_snaps)) {
4962 capsnap = list_first_entry(&ci->i_cap_snaps,
4963 struct ceph_cap_snap, ci_item);
4964 __ceph_remove_capsnap(inode, capsnap, NULL, NULL);
4965 ceph_put_snap_context(capsnap->context);
4966 ceph_put_cap_snap(capsnap);
4967 capsnap_release++;
4968 }
4969 wake_up_all(&ci->i_cap_wq);
4970 wake_up_all(&mdsc->cap_flushing_wq);
4971 return capsnap_release;
4972}
4973
4974int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
4975{
5995d90d 4976 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
36e6da98 4977 struct ceph_mds_client *mdsc = fsc->mdsc;
38d46409 4978 struct ceph_client *cl = fsc->client;
36e6da98
JL
4979 struct ceph_inode_info *ci = ceph_inode(inode);
4980 bool is_auth;
4981 bool dirty_dropped = false;
4982 int iputs = 0;
4983
4984 lockdep_assert_held(&ci->i_ceph_lock);
4985
38d46409
XL
4986 doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n",
4987 cap, ci, inode, ceph_vinop(inode));
36e6da98
JL
4988
4989 is_auth = (cap == ci->i_auth_cap);
4990 __ceph_remove_cap(cap, false);
4991 if (is_auth) {
4992 struct ceph_cap_flush *cf;
4993
5d6451b1 4994 if (ceph_inode_is_shutdown(inode)) {
36e6da98
JL
4995 if (inode->i_data.nrpages > 0)
4996 *invalidate = true;
4997 if (ci->i_wrbuffer_ref > 0)
4998 mapping_set_error(&inode->i_data, -EIO);
4999 }
5000
5001 spin_lock(&mdsc->cap_dirty_lock);
5002
5003 /* trash all of the cap flushes for this inode */
5004 while (!list_empty(&ci->i_cap_flush_list)) {
5005 cf = list_first_entry(&ci->i_cap_flush_list,
5006 struct ceph_cap_flush, i_list);
5007 list_del_init(&cf->g_list);
5008 list_del_init(&cf->i_list);
5009 if (!cf->is_capsnap)
5010 ceph_free_cap_flush(cf);
5011 }
5012
5013 if (!list_empty(&ci->i_dirty_item)) {
38d46409
XL
5014 pr_warn_ratelimited_client(cl,
5015 " dropping dirty %s state for %p %llx.%llx\n",
36e6da98 5016 ceph_cap_string(ci->i_dirty_caps),
38d46409 5017 inode, ceph_vinop(inode));
36e6da98
JL
5018 ci->i_dirty_caps = 0;
5019 list_del_init(&ci->i_dirty_item);
5020 dirty_dropped = true;
5021 }
5022 if (!list_empty(&ci->i_flushing_item)) {
38d46409
XL
5023 pr_warn_ratelimited_client(cl,
5024 " dropping dirty+flushing %s state for %p %llx.%llx\n",
36e6da98 5025 ceph_cap_string(ci->i_flushing_caps),
38d46409 5026 inode, ceph_vinop(inode));
36e6da98
JL
5027 ci->i_flushing_caps = 0;
5028 list_del_init(&ci->i_flushing_item);
5029 mdsc->num_cap_flushing--;
5030 dirty_dropped = true;
5031 }
5032 spin_unlock(&mdsc->cap_dirty_lock);
5033
5034 if (dirty_dropped) {
5035 mapping_set_error(inode->i_mapping, -EIO);
5036
5037 if (ci->i_wrbuffer_ref_head == 0 &&
5038 ci->i_wr_ref == 0 &&
5039 ci->i_dirty_caps == 0 &&
5040 ci->i_flushing_caps == 0) {
5041 ceph_put_snap_context(ci->i_head_snapc);
5042 ci->i_head_snapc = NULL;
5043 }
5044 }
5045
5046 if (atomic_read(&ci->i_filelock_ref) > 0) {
5047 /* make further file lock syscall return -EIO */
5048 ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
38d46409
XL
5049 pr_warn_ratelimited_client(cl,
5050 " dropping file locks for %p %llx.%llx\n",
5051 inode, ceph_vinop(inode));
36e6da98
JL
5052 }
5053
5054 if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
5055 cf = ci->i_prealloc_cap_flush;
5056 ci->i_prealloc_cap_flush = NULL;
5057 if (!cf->is_capsnap)
5058 ceph_free_cap_flush(cf);
5059 }
5060
5061 if (!list_empty(&ci->i_cap_snaps))
5062 iputs = remove_capsnaps(mdsc, inode);
5063 }
5064 if (dirty_dropped)
5065 ++iputs;
5066 return iputs;
5067}