// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/file.h>
#include <linux/namei.h>
#include <linux/random.h>

#include "super.h"
#include "mds_client.h"
#include <linux/filelock.h>
#include <linux/ceph/pagelist.h>

static u64 lock_secret;
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req);

static inline u64 secure_addr(void *addr)
{
	u64 v = lock_secret ^ (u64)(unsigned long)addr;
	/*
	 * Set the most significant bit so that the MDS knows 'owner'
	 * alone is sufficient to identify the lock's owner (old code
	 * used both 'owner' and 'pid').
	 */
	v |= (1ULL << 63);
	return v;
}
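
/*
 * Illustrative note (not from the original source): two file_lock
 * structs that share the same flc_owner pointer map to the same
 * 64-bit token, e.g.
 *
 *	u64 owner = secure_addr(fl->c.flc_owner);
 *
 * so the MDS can match and release locks by 'owner' alone, while the
 * random lock_secret keeps raw kernel addresses off the wire.
 */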

void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}

static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
	struct inode *inode = file_inode(dst->c.flc_file);

	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
	dst->fl_u.ceph.inode = igrab(inode);
}

/*
 * Do not use 'fl->fl_file' in the release function; it may already
 * have been released by another thread.
 */
static void ceph_fl_release_lock(struct file_lock *fl)
{
	struct inode *inode = fl->fl_u.ceph.inode;
	struct ceph_inode_info *ci;

	/*
	 * If inode is NULL this should be a request file_lock;
	 * nothing to do here.
	 */
	if (!inode)
		return;

	ci = ceph_inode(inode);
	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
		/* clear error when all locks are released */
		spin_lock(&ci->i_ceph_lock);
		ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
		spin_unlock(&ci->i_ceph_lock);
	}
	fl->fl_u.ceph.inode = NULL;
	iput(inode);
}

static const struct file_lock_operations ceph_fl_lock_ops = {
	.fl_copy_lock = ceph_fl_copy_lock,
	.fl_release_private = ceph_fl_release_lock,
};
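
/*
 * A sketch of the lifetime rules (an assumption based on the generic
 * VFS file_lock_operations contract): the VFS calls ->fl_copy_lock
 * each time it duplicates a lock into its own lists and
 * ->fl_release_private when a copy is freed, so every live copy holds
 * both an inode reference and an i_filelock_ref count taken in
 * ceph_fl_copy_lock().
 */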

/*
 * Implement fcntl and flock locking functions.
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	if (operation == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * Increasing i_filelock_ref closes the race window between
		 * handling the request reply and adding the file_lock struct
		 * to the inode. Otherwise, the auth caps may get trimmed in
		 * that window. The caller will decrease the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		fl->fl_ops->fl_copy_lock(fl, NULL);
	}

	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
		wait = 0;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* mds requires start and length rather than start and end */
	if (LLONG_MAX == fl->fl_end)
		length = 0;
	else
		length = fl->fl_end - fl->fl_start + 1;

	owner = secure_addr(fl->c.flc_owner);

	doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
		    "start: %llu, length: %llu, wait: %d, type: %d\n",
		    (int)lock_type, (int)operation, owner,
		    (u64) fl->c.flc_pid,
		    fl->fl_start, length, wait, fl->c.flc_type);

	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

	err = ceph_mdsc_submit_request(mdsc, inode, req);
	if (!err)
		err = ceph_mdsc_wait_request(mdsc, req, wait ?
					ceph_lock_wait_for_completion : NULL);
	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
		fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_WRLCK;
		else
			fl->c.flc_type = F_UNLCK;

		/* convert the reply's start+length back to an inclusive end */
		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
				le64_to_cpu(req->r_reply_info.filelock_reply->length);
		if (length >= 1)
			fl->fl_end = length - 1;
		else
			fl->fl_end = 0;
	}
	ceph_mdsc_put_request(req);
	doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
	      "length: %llu, wait: %d, type: %d, err code %d\n",
	      (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
	      fl->fl_start, length, wait, fl->c.flc_type, err);
	return err;
}

/*
 * A waiting SETFILELOCK request was interrupted: send a matching
 * *_INTR unlock so the MDS drops the pending lock attempt, then wait
 * for the original request to become safe.
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req)
{
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *intr_req;
	struct inode *inode = req->r_inode;
	int err, lock_type;

	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
		lock_type = CEPH_LOCK_FCNTL_INTR;
	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
		lock_type = CEPH_LOCK_FLOCK_INTR;
	else
		BUG_ON(1);
	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);

	err = wait_for_completion_interruptible(&req->r_completion);
	if (!err)
		return 0;

	doutc(cl, "request %llu was interrupted\n", req->r_tid);

	mutex_lock(&mdsc->mutex);
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		err = 0;
	} else {
		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (!req->r_session) {
			/* haven't sent the request */
			err = 0;
		}
	}
	mutex_unlock(&mdsc->mutex);
	if (!err)
		return 0;

	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
					    USE_AUTH_MDS);
	if (IS_ERR(intr_req))
		return PTR_ERR(intr_req);

	intr_req->r_inode = inode;
	ihold(inode);
	intr_req->r_num_caps = 1;

	intr_req->r_args.filelock_change = req->r_args.filelock_change;
	intr_req->r_args.filelock_change.rule = lock_type;
	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;

	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
	ceph_mdsc_put_request(intr_req);

	if (err && err != -ERESTARTSYS)
		return err;

	wait_for_completion_killable(&req->r_safe_completion);
	return 0;
}

/*
 * Try to drop the lock locally first. Returns 1 if the caller should
 * go on and ask the MDS to drop it too, or 0/an error if the unlock
 * was fully handled here. FL_EXISTS makes locks_lock_file_wait()
 * return -ENOENT when there was no matching local lock.
 */
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
	int err;
	unsigned int orig_flags = fl->c.flc_flags;

	fl->c.flc_flags |= FL_EXISTS;
	err = locks_lock_file_wait(file, fl);
	fl->c.flc_flags = orig_flags;
	if (err == -ENOENT) {
		if (!(orig_flags & FL_EXISTS))
			err = 0;
		return err;
	}
	return 1;
}
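
/*
 * Typical use, as in ceph_lock()/ceph_flock() below:
 *
 *	if (lock_is_unlock(fl)) {
 *		err = try_unlock_file(file, fl);
 *		if (err <= 0)
 *			return err;
 *	}
 *
 * err <= 0 means the unlock was handled (or failed) locally; err == 1
 * means the caller should also send CEPH_LOCK_UNLOCK to the MDS.
 */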

/*
 * Attempt to set an fcntl lock.
 * For now, this just goes away to the server. Later it may be more awesome.
 */
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u16 op = CEPH_MDS_OP_SETFILELOCK;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->c.flc_flags & FL_POSIX))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);

	/* set wait bit as appropriate, then make the command as Ceph expects it */
	if (IS_GETLK(cmd))
		op = CEPH_MDS_OP_GETFILELOCK;
	else if (IS_SETLKW(cmd))
		wait = 1;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)
		err = -EIO;
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
			posix_lock_file(file, fl, NULL);
		return err;
	}

	if (lock_is_read(fl))
		lock_cmd = CEPH_LOCK_SHARED;
	else if (lock_is_write(fl))
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
	if (!err) {
		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
			doutc(cl, "locking locally\n");
			err = posix_lock_file(file, fl, NULL);
			if (err) {
				/*
				 * Undo! This should only happen if the
				 * kernel detects local deadlock.
				 */
				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
						  CEPH_LOCK_UNLOCK, 0, fl);
				doutc(cl, "got %d on posix_lock_file, undid lock\n",
				      err);
			}
		}
	}
	return err;
}
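
/*
 * How this is reached (a sketch, assuming the generic VFS locking
 * path): fcntl(fd, F_SETLK/F_SETLKW/F_GETLK, ...) lands in
 * vfs_lock_file(), which invokes the filesystem ->lock() hook, i.e.
 * ceph_lock(). Note the ordering: the MDS is asked first and the lock
 * is only then mirrored locally, so a local posix_lock_file() failure
 * needs just the single undo message above.
 */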

int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->c.flc_flags & FL_FLOCK))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_file: %p\n", fl->c.flc_file);

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)
		err = -EIO;
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		if (lock_is_unlock(fl))
			locks_lock_file_wait(file, fl);
		return err;
	}

	if (IS_SETLKW(cmd))
		wait = 1;

	if (lock_is_read(fl))
		lock_cmd = CEPH_LOCK_SHARED;
	else if (lock_is_write(fl))
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	if (lock_is_unlock(fl)) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
				inode, lock_cmd, wait, fl);
	if (!err && F_UNLCK != fl->c.flc_type) {
		err = locks_lock_file_wait(file, fl);
		if (err) {
			ceph_lock_message(CEPH_LOCK_FLOCK,
					  CEPH_MDS_OP_SETFILELOCK,
					  inode, CEPH_LOCK_UNLOCK, 0, fl);
			doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
			      err);
		}
	}
	return err;
}
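
/*
 * Likewise for flock(2) (a sketch, assuming ceph_flock() is wired up
 * as the ->flock() hook in ceph's file_operations): the MDS lock is
 * taken first, then mirrored locally via locks_lock_file_wait(), with
 * the same undo-on-local-failure pattern as ceph_lock().
 */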

/*
 * Fills in the passed counter variables, so you can prepare pagelist
 * metadata before calling ceph_encode_locks_to_buffer().
 */
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct file_lock *lock;
	struct file_lock_context *ctx;

	*fcntl_count = 0;
	*flock_count = 0;

	ctx = locks_inode_context(inode);
	if (ctx) {
		spin_lock(&ctx->flc_lock);
		for_each_file_lock(lock, &ctx->flc_posix)
			++(*fcntl_count);
		for_each_file_lock(lock, &ctx->flc_flock)
			++(*flock_count);
		spin_unlock(&ctx->flc_lock);
	}
	doutc(cl, "counted %d flock locks and %d fcntl locks\n",
	      *flock_count, *fcntl_count);
}
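
/*
 * The lock lists may change between ceph_count_locks() and
 * ceph_encode_locks_to_buffer() since ctx->flc_lock is dropped in
 * between, which is presumably why the encoder below re-checks the
 * counts and returns -ENOSPC when it finds more locks than were
 * counted.
 */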

/*
 * Given a pointer to a lock, convert it to a ceph filelock.
 */
static int lock_to_ceph_filelock(struct inode *inode,
				 struct file_lock *lock,
				 struct ceph_filelock *cephlock)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;

	cephlock->start = cpu_to_le64(lock->fl_start);
	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
	cephlock->client = cpu_to_le64(0);
	cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
	cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));

	switch (lock->c.flc_type) {
	case F_RDLCK:
		cephlock->type = CEPH_LOCK_SHARED;
		break;
	case F_WRLCK:
		cephlock->type = CEPH_LOCK_EXCL;
		break;
	case F_UNLCK:
		cephlock->type = CEPH_LOCK_UNLOCK;
		break;
	default:
		doutc(cl, "Have unknown lock type %d\n",
		      lock->c.flc_type);
		err = -EINVAL;
	}

	return err;
}

/*
 * Encode the flock and fcntl locks for the given inode into the
 * ceph_filelock array. Must be called with inode->i_lock already held.
 * If we encounter more of a specific lock type than expected, return
 * -ENOSPC.
 */
int ceph_encode_locks_to_buffer(struct inode *inode,
				struct ceph_filelock *flocks,
				int num_fcntl_locks, int num_flock_locks)
{
	struct file_lock *lock;
	struct file_lock_context *ctx = locks_inode_context(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	int seen_fcntl = 0;
	int seen_flock = 0;
	int l = 0;

	doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
	      num_fcntl_locks);

	if (!ctx)
		return 0;

	spin_lock(&ctx->flc_lock);
	for_each_file_lock(lock, &ctx->flc_posix) {
		++seen_fcntl;
		if (seen_fcntl > num_fcntl_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
	for_each_file_lock(lock, &ctx->flc_flock) {
		++seen_flock;
		if (seen_flock > num_flock_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
fail:
	spin_unlock(&ctx->flc_lock);
	return err;
}

/*
 * Copy the encoded flock and fcntl locks into the pagelist.
 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 * sequential flock locks.
 * Returns zero on success.
 */
int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
			   struct ceph_pagelist *pagelist,
			   int num_fcntl_locks, int num_flock_locks)
{
	int err = 0;
	__le32 nlocks;

	nlocks = cpu_to_le32(num_fcntl_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	if (num_fcntl_locks > 0) {
		err = ceph_pagelist_append(pagelist, flocks,
					   num_fcntl_locks * sizeof(*flocks));
		if (err)
			goto out_fail;
	}

	nlocks = cpu_to_le32(num_flock_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	if (num_flock_locks > 0) {
		err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
					   num_flock_locks * sizeof(*flocks));
	}
out_fail:
	return err;
}
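
/*
 * Resulting pagelist layout, following the append order above:
 *
 *	__le32			num_fcntl_locks
 *	struct ceph_filelock	fcntl_locks[num_fcntl_locks]
 *	__le32			num_flock_locks
 *	struct ceph_filelock	flock_locks[num_flock_locks]
 */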