ceph: use ceph_evict_inode to cleanup inode's resource
[linux-2.6-block.git] / fs / ceph / super.c
1 // SPDX-License-Identifier: GPL-2.0-only
2
3 #include <linux/ceph/ceph_debug.h>
4
5 #include <linux/backing-dev.h>
6 #include <linux/ctype.h>
7 #include <linux/fs.h>
8 #include <linux/inet.h>
9 #include <linux/in6.h>
10 #include <linux/module.h>
11 #include <linux/mount.h>
12 #include <linux/parser.h>
13 #include <linux/sched.h>
14 #include <linux/seq_file.h>
15 #include <linux/slab.h>
16 #include <linux/statfs.h>
17 #include <linux/string.h>
18
19 #include "super.h"
20 #include "mds_client.h"
21 #include "cache.h"
22
23 #include <linux/ceph/ceph_features.h>
24 #include <linux/ceph/decode.h>
25 #include <linux/ceph/mon_client.h>
26 #include <linux/ceph/auth.h>
27 #include <linux/ceph/debugfs.h>
28
29 /*
30  * Ceph superblock operations
31  *
32  * Handle the basics of mounting, unmounting.
33  */
34
35 /*
36  * super ops
37  */
38 static void ceph_put_super(struct super_block *s)
39 {
40         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
41
42         dout("put_super\n");
43         ceph_mdsc_close_sessions(fsc->mdsc);
44 }
45
46 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
47 {
48         struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
49         struct ceph_mon_client *monc = &fsc->client->monc;
50         struct ceph_statfs st;
51         u64 fsid;
52         int err;
53         u64 data_pool;
54
55         if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
56                 data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
57         } else {
58                 data_pool = CEPH_NOPOOL;
59         }
60
61         dout("statfs\n");
62         err = ceph_monc_do_statfs(monc, data_pool, &st);
63         if (err < 0)
64                 return err;
65
66         /* fill in kstatfs */
67         buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
68
69         /*
70          * express utilization in terms of large blocks to avoid
71          * overflow on 32-bit machines.
72          *
73          * NOTE: for the time being, we make bsize == frsize to humor
74          * not-yet-ancient versions of glibc that are broken.
75          * Someday, we will probably want to report a real block
76          * size...  whatever that may mean for a network file system!
77          */
78         buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
79         buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
80
81         /*
82          * By default use root quota for stats; fallback to overall filesystem
83          * usage if using 'noquotadf' mount option or if the root dir doesn't
84          * have max_bytes quota set.
85          */
86         if (ceph_test_mount_opt(fsc, NOQUOTADF) ||
87             !ceph_quota_update_statfs(fsc, buf)) {
88                 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
89                 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
90                 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
91         }
92
93         buf->f_files = le64_to_cpu(st.num_objects);
94         buf->f_ffree = -1;
95         buf->f_namelen = NAME_MAX;
96
97         /* Must convert the fsid, for consistent values across arches */
98         mutex_lock(&monc->mutex);
99         fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^
100                le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
101         mutex_unlock(&monc->mutex);
102
103         buf->f_fsid.val[0] = fsid & 0xffffffff;
104         buf->f_fsid.val[1] = fsid >> 32;
105
106         return 0;
107 }
108
109
110 static int ceph_sync_fs(struct super_block *sb, int wait)
111 {
112         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
113
114         if (!wait) {
115                 dout("sync_fs (non-blocking)\n");
116                 ceph_flush_dirty_caps(fsc->mdsc);
117                 dout("sync_fs (non-blocking) done\n");
118                 return 0;
119         }
120
121         dout("sync_fs (blocking)\n");
122         ceph_osdc_sync(&fsc->client->osdc);
123         ceph_mdsc_sync(fsc->mdsc);
124         dout("sync_fs (blocking) done\n");
125         return 0;
126 }
127
128 /*
129  * mount options
130  */
/*
 * Mount option token ids.
 *
 * The order matters: parse_fsopt_token() treats every id below
 * Opt_last_int as taking an integer argument, and every id between
 * Opt_last_int and Opt_last_string as taking a string argument.
 */
enum {
        /* options taking an int argument */
        Opt_wsize,
        Opt_rsize,
        Opt_rasize,
        Opt_caps_wanted_delay_min,
        Opt_caps_wanted_delay_max,
        Opt_caps_max,
        Opt_readdir_max_entries,
        Opt_readdir_max_bytes,
        Opt_congestion_kb,
        Opt_last_int,           /* marker: int args above */

        /* options taking a string argument */
        Opt_snapdirname,
        Opt_mds_namespace,
        Opt_fscache_uniq,
        Opt_last_string,        /* marker: string args above */

        /* options without an argument */
        Opt_dirstat,
        Opt_nodirstat,
        Opt_rbytes,
        Opt_norbytes,
        Opt_asyncreaddir,
        Opt_noasyncreaddir,
        Opt_dcache,
        Opt_nodcache,
        Opt_ino32,
        Opt_noino32,
        Opt_fscache,
        Opt_nofscache,
        Opt_poolperm,
        Opt_nopoolperm,
        Opt_require_active_mds,
        Opt_norequire_active_mds,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
        Opt_acl,
#endif
        Opt_noacl,
        Opt_quotadf,
        Opt_noquotadf,
        Opt_copyfrom,
        Opt_nocopyfrom,
};
173
174 static match_table_t fsopt_tokens = {
175         {Opt_wsize, "wsize=%d"},
176         {Opt_rsize, "rsize=%d"},
177         {Opt_rasize, "rasize=%d"},
178         {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
179         {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
180         {Opt_caps_max, "caps_max=%d"},
181         {Opt_readdir_max_entries, "readdir_max_entries=%d"},
182         {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
183         {Opt_congestion_kb, "write_congestion_kb=%d"},
184         /* int args above */
185         {Opt_snapdirname, "snapdirname=%s"},
186         {Opt_mds_namespace, "mds_namespace=%s"},
187         {Opt_fscache_uniq, "fsc=%s"},
188         /* string args above */
189         {Opt_dirstat, "dirstat"},
190         {Opt_nodirstat, "nodirstat"},
191         {Opt_rbytes, "rbytes"},
192         {Opt_norbytes, "norbytes"},
193         {Opt_asyncreaddir, "asyncreaddir"},
194         {Opt_noasyncreaddir, "noasyncreaddir"},
195         {Opt_dcache, "dcache"},
196         {Opt_nodcache, "nodcache"},
197         {Opt_ino32, "ino32"},
198         {Opt_noino32, "noino32"},
199         {Opt_fscache, "fsc"},
200         {Opt_nofscache, "nofsc"},
201         {Opt_poolperm, "poolperm"},
202         {Opt_nopoolperm, "nopoolperm"},
203         {Opt_require_active_mds, "require_active_mds"},
204         {Opt_norequire_active_mds, "norequire_active_mds"},
205 #ifdef CONFIG_CEPH_FS_POSIX_ACL
206         {Opt_acl, "acl"},
207 #endif
208         {Opt_noacl, "noacl"},
209         {Opt_quotadf, "quotadf"},
210         {Opt_noquotadf, "noquotadf"},
211         {Opt_copyfrom, "copyfrom"},
212         {Opt_nocopyfrom, "nocopyfrom"},
213         {-1, NULL}
214 };
215
216 static int parse_fsopt_token(char *c, void *private)
217 {
218         struct ceph_mount_options *fsopt = private;
219         substring_t argstr[MAX_OPT_ARGS];
220         int token, intval, ret;
221
222         token = match_token((char *)c, fsopt_tokens, argstr);
223         if (token < 0)
224                 return -EINVAL;
225
226         if (token < Opt_last_int) {
227                 ret = match_int(&argstr[0], &intval);
228                 if (ret < 0) {
229                         pr_err("bad option arg (not int) at '%s'\n", c);
230                         return ret;
231                 }
232                 dout("got int token %d val %d\n", token, intval);
233         } else if (token > Opt_last_int && token < Opt_last_string) {
234                 dout("got string token %d val %s\n", token,
235                      argstr[0].from);
236         } else {
237                 dout("got token %d\n", token);
238         }
239
240         switch (token) {
241         case Opt_snapdirname:
242                 kfree(fsopt->snapdir_name);
243                 fsopt->snapdir_name = kstrndup(argstr[0].from,
244                                                argstr[0].to-argstr[0].from,
245                                                GFP_KERNEL);
246                 if (!fsopt->snapdir_name)
247                         return -ENOMEM;
248                 break;
249         case Opt_mds_namespace:
250                 kfree(fsopt->mds_namespace);
251                 fsopt->mds_namespace = kstrndup(argstr[0].from,
252                                                 argstr[0].to-argstr[0].from,
253                                                 GFP_KERNEL);
254                 if (!fsopt->mds_namespace)
255                         return -ENOMEM;
256                 break;
257         case Opt_fscache_uniq:
258                 kfree(fsopt->fscache_uniq);
259                 fsopt->fscache_uniq = kstrndup(argstr[0].from,
260                                                argstr[0].to-argstr[0].from,
261                                                GFP_KERNEL);
262                 if (!fsopt->fscache_uniq)
263                         return -ENOMEM;
264                 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
265                 break;
266                 /* misc */
267         case Opt_wsize:
268                 if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
269                         return -EINVAL;
270                 fsopt->wsize = ALIGN(intval, PAGE_SIZE);
271                 break;
272         case Opt_rsize:
273                 if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE)
274                         return -EINVAL;
275                 fsopt->rsize = ALIGN(intval, PAGE_SIZE);
276                 break;
277         case Opt_rasize:
278                 if (intval < 0)
279                         return -EINVAL;
280                 fsopt->rasize = ALIGN(intval, PAGE_SIZE);
281                 break;
282         case Opt_caps_wanted_delay_min:
283                 if (intval < 1)
284                         return -EINVAL;
285                 fsopt->caps_wanted_delay_min = intval;
286                 break;
287         case Opt_caps_wanted_delay_max:
288                 if (intval < 1)
289                         return -EINVAL;
290                 fsopt->caps_wanted_delay_max = intval;
291                 break;
292         case Opt_caps_max:
293                 if (intval < 0)
294                         return -EINVAL;
295                 fsopt->caps_max = intval;
296                 break;
297         case Opt_readdir_max_entries:
298                 if (intval < 1)
299                         return -EINVAL;
300                 fsopt->max_readdir = intval;
301                 break;
302         case Opt_readdir_max_bytes:
303                 if (intval < (int)PAGE_SIZE && intval != 0)
304                         return -EINVAL;
305                 fsopt->max_readdir_bytes = intval;
306                 break;
307         case Opt_congestion_kb:
308                 if (intval < 1024) /* at least 1M */
309                         return -EINVAL;
310                 fsopt->congestion_kb = intval;
311                 break;
312         case Opt_dirstat:
313                 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
314                 break;
315         case Opt_nodirstat:
316                 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
317                 break;
318         case Opt_rbytes:
319                 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
320                 break;
321         case Opt_norbytes:
322                 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
323                 break;
324         case Opt_asyncreaddir:
325                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
326                 break;
327         case Opt_noasyncreaddir:
328                 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
329                 break;
330         case Opt_dcache:
331                 fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
332                 break;
333         case Opt_nodcache:
334                 fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
335                 break;
336         case Opt_ino32:
337                 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
338                 break;
339         case Opt_noino32:
340                 fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
341                 break;
342         case Opt_fscache:
343                 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
344                 kfree(fsopt->fscache_uniq);
345                 fsopt->fscache_uniq = NULL;
346                 break;
347         case Opt_nofscache:
348                 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
349                 kfree(fsopt->fscache_uniq);
350                 fsopt->fscache_uniq = NULL;
351                 break;
352         case Opt_poolperm:
353                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
354                 break;
355         case Opt_nopoolperm:
356                 fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
357                 break;
358         case Opt_require_active_mds:
359                 fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT;
360                 break;
361         case Opt_norequire_active_mds:
362                 fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT;
363                 break;
364         case Opt_quotadf:
365                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF;
366                 break;
367         case Opt_noquotadf:
368                 fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
369                 break;
370         case Opt_copyfrom:
371                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
372                 break;
373         case Opt_nocopyfrom:
374                 fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
375                 break;
376 #ifdef CONFIG_CEPH_FS_POSIX_ACL
377         case Opt_acl:
378                 fsopt->sb_flags |= SB_POSIXACL;
379                 break;
380 #endif
381         case Opt_noacl:
382                 fsopt->sb_flags &= ~SB_POSIXACL;
383                 break;
384         default:
385                 BUG_ON(token);
386         }
387         return 0;
388 }
389
390 static void destroy_mount_options(struct ceph_mount_options *args)
391 {
392         dout("destroy_mount_options %p\n", args);
393         kfree(args->snapdir_name);
394         kfree(args->mds_namespace);
395         kfree(args->server_path);
396         kfree(args->fscache_uniq);
397         kfree(args);
398 }
399
/*
 * strcmp() that tolerates NULL arguments.
 *
 * Two NULLs compare equal.  If only @s2 is NULL the result is -1, and if
 * only @s1 is NULL the result is 1.  Otherwise the strcmp() result is
 * returned.
 */
static int strcmp_null(const char *s1, const char *s2)
{
        if (s1 && s2)
                return strcmp(s1, s2);

        /* at least one side is NULL from here on */
        if (s1)
                return -1;
        if (s2)
                return 1;
        return 0;
}
410
411 static int compare_mount_options(struct ceph_mount_options *new_fsopt,
412                                  struct ceph_options *new_opt,
413                                  struct ceph_fs_client *fsc)
414 {
415         struct ceph_mount_options *fsopt1 = new_fsopt;
416         struct ceph_mount_options *fsopt2 = fsc->mount_options;
417         int ofs = offsetof(struct ceph_mount_options, snapdir_name);
418         int ret;
419
420         ret = memcmp(fsopt1, fsopt2, ofs);
421         if (ret)
422                 return ret;
423
424         ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
425         if (ret)
426                 return ret;
427         ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
428         if (ret)
429                 return ret;
430         ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
431         if (ret)
432                 return ret;
433         ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
434         if (ret)
435                 return ret;
436
437         return ceph_compare_options(new_opt, fsc->client);
438 }
439
440 static int parse_mount_options(struct ceph_mount_options **pfsopt,
441                                struct ceph_options **popt,
442                                int flags, char *options,
443                                const char *dev_name)
444 {
445         struct ceph_mount_options *fsopt;
446         const char *dev_name_end;
447         int err;
448
449         if (!dev_name || !*dev_name)
450                 return -EINVAL;
451
452         fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
453         if (!fsopt)
454                 return -ENOMEM;
455
456         dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
457
458         fsopt->sb_flags = flags;
459         fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
460
461         fsopt->wsize = CEPH_MAX_WRITE_SIZE;
462         fsopt->rsize = CEPH_MAX_READ_SIZE;
463         fsopt->rasize = CEPH_RASIZE_DEFAULT;
464         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
465         if (!fsopt->snapdir_name) {
466                 err = -ENOMEM;
467                 goto out;
468         }
469
470         fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
471         fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
472         fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
473         fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
474         fsopt->congestion_kb = default_congestion_kb();
475
476         /*
477          * Distinguish the server list from the path in "dev_name".
478          * Internally we do not include the leading '/' in the path.
479          *
480          * "dev_name" will look like:
481          *     <server_spec>[,<server_spec>...]:[<path>]
482          * where
483          *     <server_spec> is <ip>[:<port>]
484          *     <path> is optional, but if present must begin with '/'
485          */
486         dev_name_end = strchr(dev_name, '/');
487         if (dev_name_end) {
488                 if (strlen(dev_name_end) > 1) {
489                         fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
490                         if (!fsopt->server_path) {
491                                 err = -ENOMEM;
492                                 goto out;
493                         }
494                 }
495         } else {
496                 dev_name_end = dev_name + strlen(dev_name);
497         }
498         err = -EINVAL;
499         dev_name_end--;         /* back up to ':' separator */
500         if (dev_name_end < dev_name || *dev_name_end != ':') {
501                 pr_err("device name is missing path (no : separator in %s)\n",
502                                 dev_name);
503                 goto out;
504         }
505         dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
506         if (fsopt->server_path)
507                 dout("server path '%s'\n", fsopt->server_path);
508
509         *popt = ceph_parse_options(options, dev_name, dev_name_end,
510                                  parse_fsopt_token, (void *)fsopt);
511         if (IS_ERR(*popt)) {
512                 err = PTR_ERR(*popt);
513                 goto out;
514         }
515
516         /* success */
517         *pfsopt = fsopt;
518         return 0;
519
520 out:
521         destroy_mount_options(fsopt);
522         return err;
523 }
524
525 /**
526  * ceph_show_options - Show mount options in /proc/mounts
527  * @m: seq_file to write to
528  * @root: root of that (sub)tree
529  */
530 static int ceph_show_options(struct seq_file *m, struct dentry *root)
531 {
532         struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
533         struct ceph_mount_options *fsopt = fsc->mount_options;
534         size_t pos;
535         int ret;
536
537         /* a comma between MNT/MS and client options */
538         seq_putc(m, ',');
539         pos = m->count;
540
541         ret = ceph_print_client_options(m, fsc->client, false);
542         if (ret)
543                 return ret;
544
545         /* retract our comma if no client options */
546         if (m->count == pos)
547                 m->count--;
548
549         if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
550                 seq_puts(m, ",dirstat");
551         if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
552                 seq_puts(m, ",rbytes");
553         if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
554                 seq_puts(m, ",noasyncreaddir");
555         if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
556                 seq_puts(m, ",nodcache");
557         if (fsopt->flags & CEPH_MOUNT_OPT_INO32)
558                 seq_puts(m, ",ino32");
559         if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
560                 seq_show_option(m, "fsc", fsopt->fscache_uniq);
561         }
562         if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
563                 seq_puts(m, ",nopoolperm");
564         if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF)
565                 seq_puts(m, ",noquotadf");
566
567 #ifdef CONFIG_CEPH_FS_POSIX_ACL
568         if (fsopt->sb_flags & SB_POSIXACL)
569                 seq_puts(m, ",acl");
570         else
571                 seq_puts(m, ",noacl");
572 #endif
573
574         if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
575                 seq_puts(m, ",copyfrom");
576
577         if (fsopt->mds_namespace)
578                 seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
579         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
580                 seq_printf(m, ",wsize=%d", fsopt->wsize);
581         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
582                 seq_printf(m, ",rsize=%d", fsopt->rsize);
583         if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
584                 seq_printf(m, ",rasize=%d", fsopt->rasize);
585         if (fsopt->congestion_kb != default_congestion_kb())
586                 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
587         if (fsopt->caps_max)
588                 seq_printf(m, ",caps_max=%d", fsopt->caps_max);
589         if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
590                 seq_printf(m, ",caps_wanted_delay_min=%d",
591                          fsopt->caps_wanted_delay_min);
592         if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
593                 seq_printf(m, ",caps_wanted_delay_max=%d",
594                            fsopt->caps_wanted_delay_max);
595         if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
596                 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
597         if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
598                 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
599         if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
600                 seq_show_option(m, "snapdirname", fsopt->snapdir_name);
601
602         return 0;
603 }
604
605 /*
606  * handle any mon messages the standard library doesn't understand.
607  * return error if we don't either.
608  */
609 static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
610 {
611         struct ceph_fs_client *fsc = client->private;
612         int type = le16_to_cpu(msg->hdr.type);
613
614         switch (type) {
615         case CEPH_MSG_MDS_MAP:
616                 ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
617                 return 0;
618         case CEPH_MSG_FS_MAP_USER:
619                 ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
620                 return 0;
621         default:
622                 return -1;
623         }
624 }
625
626 /*
627  * create a new fs client
628  *
629  * Success or not, this function consumes @fsopt and @opt.
630  */
631 static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
632                                         struct ceph_options *opt)
633 {
634         struct ceph_fs_client *fsc;
635         int page_count;
636         size_t size;
637         int err;
638
639         fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
640         if (!fsc) {
641                 err = -ENOMEM;
642                 goto fail;
643         }
644
645         fsc->client = ceph_create_client(opt, fsc);
646         if (IS_ERR(fsc->client)) {
647                 err = PTR_ERR(fsc->client);
648                 goto fail;
649         }
650         opt = NULL; /* fsc->client now owns this */
651
652         fsc->client->extra_mon_dispatch = extra_mon_dispatch;
653         ceph_set_opt(fsc->client, ABORT_ON_FULL);
654
655         if (!fsopt->mds_namespace) {
656                 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
657                                    0, true);
658         } else {
659                 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
660                                    0, false);
661         }
662
663         fsc->mount_options = fsopt;
664
665         fsc->sb = NULL;
666         fsc->mount_state = CEPH_MOUNT_MOUNTING;
667
668         atomic_long_set(&fsc->writeback_count, 0);
669
670         err = -ENOMEM;
671         /*
672          * The number of concurrent works can be high but they don't need
673          * to be processed in parallel, limit concurrency.
674          */
675         fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
676         if (!fsc->inode_wq)
677                 goto fail_client;
678         fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
679         if (!fsc->cap_wq)
680                 goto fail_inode_wq;
681
682         /* set up mempools */
683         err = -ENOMEM;
684         page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
685         size = sizeof (struct page *) * (page_count ? page_count : 1);
686         fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
687         if (!fsc->wb_pagevec_pool)
688                 goto fail_cap_wq;
689
690         return fsc;
691
692 fail_cap_wq:
693         destroy_workqueue(fsc->cap_wq);
694 fail_inode_wq:
695         destroy_workqueue(fsc->inode_wq);
696 fail_client:
697         ceph_destroy_client(fsc->client);
698 fail:
699         kfree(fsc);
700         if (opt)
701                 ceph_destroy_options(opt);
702         destroy_mount_options(fsopt);
703         return ERR_PTR(err);
704 }
705
706 static void flush_fs_workqueues(struct ceph_fs_client *fsc)
707 {
708         flush_workqueue(fsc->inode_wq);
709         flush_workqueue(fsc->cap_wq);
710 }
711
712 static void destroy_fs_client(struct ceph_fs_client *fsc)
713 {
714         dout("destroy_fs_client %p\n", fsc);
715
716         destroy_workqueue(fsc->inode_wq);
717         destroy_workqueue(fsc->cap_wq);
718
719         mempool_destroy(fsc->wb_pagevec_pool);
720
721         destroy_mount_options(fsc->mount_options);
722
723         ceph_destroy_client(fsc->client);
724
725         kfree(fsc);
726         dout("destroy_fs_client %p done\n", fsc);
727 }
728
/*
 * caches
 *
 * Slab caches used by the ceph filesystem.  They are created in
 * init_caches() and destroyed in destroy_caches().
 */
struct kmem_cache *ceph_inode_cachep;           /* struct ceph_inode_info */
struct kmem_cache *ceph_cap_cachep;             /* struct ceph_cap */
struct kmem_cache *ceph_cap_flush_cachep;       /* struct ceph_cap_flush */
struct kmem_cache *ceph_dentry_cachep;          /* struct ceph_dentry_info */
struct kmem_cache *ceph_file_cachep;            /* struct ceph_file_info */
struct kmem_cache *ceph_dir_file_cachep;        /* struct ceph_dir_file_info */
738
739 static void ceph_inode_init_once(void *foo)
740 {
741         struct ceph_inode_info *ci = foo;
742         inode_init_once(&ci->vfs_inode);
743 }
744
745 static int __init init_caches(void)
746 {
747         int error = -ENOMEM;
748
749         ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
750                                       sizeof(struct ceph_inode_info),
751                                       __alignof__(struct ceph_inode_info),
752                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
753                                       SLAB_ACCOUNT, ceph_inode_init_once);
754         if (!ceph_inode_cachep)
755                 return -ENOMEM;
756
757         ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
758         if (!ceph_cap_cachep)
759                 goto bad_cap;
760         ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
761                                            SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
762         if (!ceph_cap_flush_cachep)
763                 goto bad_cap_flush;
764
765         ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
766                                         SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
767         if (!ceph_dentry_cachep)
768                 goto bad_dentry;
769
770         ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
771         if (!ceph_file_cachep)
772                 goto bad_file;
773
774         ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD);
775         if (!ceph_dir_file_cachep)
776                 goto bad_dir_file;
777
778         error = ceph_fscache_register();
779         if (error)
780                 goto bad_fscache;
781
782         return 0;
783
784 bad_fscache:
785         kmem_cache_destroy(ceph_dir_file_cachep);
786 bad_dir_file:
787         kmem_cache_destroy(ceph_file_cachep);
788 bad_file:
789         kmem_cache_destroy(ceph_dentry_cachep);
790 bad_dentry:
791         kmem_cache_destroy(ceph_cap_flush_cachep);
792 bad_cap_flush:
793         kmem_cache_destroy(ceph_cap_cachep);
794 bad_cap:
795         kmem_cache_destroy(ceph_inode_cachep);
796         return error;
797 }
798
799 static void destroy_caches(void)
800 {
801         /*
802          * Make sure all delayed rcu free inodes are flushed before we
803          * destroy cache.
804          */
805         rcu_barrier();
806
807         kmem_cache_destroy(ceph_inode_cachep);
808         kmem_cache_destroy(ceph_cap_cachep);
809         kmem_cache_destroy(ceph_cap_flush_cachep);
810         kmem_cache_destroy(ceph_dentry_cachep);
811         kmem_cache_destroy(ceph_file_cachep);
812         kmem_cache_destroy(ceph_dir_file_cachep);
813
814         ceph_fscache_unregister();
815 }
816
817
818 /*
819  * ceph_umount_begin - initiate forced umount.  Tear down down the
820  * mount, skipping steps that may hang while waiting for server(s).
821  */
822 static void ceph_umount_begin(struct super_block *sb)
823 {
824         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
825
826         dout("ceph_umount_begin - starting forced umount\n");
827         if (!fsc)
828                 return;
829         fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
830         ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
831         ceph_mdsc_force_umount(fsc->mdsc);
832         return;
833 }
834
/*
 * ->remount_fs hook: sync the filesystem and report success.  Neither
 * @flags nor @data is examined.
 */
static int ceph_remount(struct super_block *sb, int *flags, char *data)
{
        /* the return value of sync_filesystem() is not checked */
        sync_filesystem(sb);

        return 0;
}
840
841 static const struct super_operations ceph_super_ops = {
842         .alloc_inode    = ceph_alloc_inode,
843         .free_inode     = ceph_free_inode,
844         .write_inode    = ceph_write_inode,
845         .drop_inode     = ceph_drop_inode,
846         .evict_inode    = ceph_evict_inode,
847         .sync_fs        = ceph_sync_fs,
848         .put_super      = ceph_put_super,
849         .remount_fs     = ceph_remount,
850         .show_options   = ceph_show_options,
851         .statfs         = ceph_statfs,
852         .umount_begin   = ceph_umount_begin,
853 };
854
855 /*
856  * Bootstrap mount by opening the root directory.  Note the mount
857  * @started time from caller, and time out if this takes too long.
858  */
859 static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
860                                        const char *path,
861                                        unsigned long started)
862 {
863         struct ceph_mds_client *mdsc = fsc->mdsc;
864         struct ceph_mds_request *req = NULL;
865         int err;
866         struct dentry *root;
867
868         /* open dir */
869         dout("open_root_inode opening '%s'\n", path);
870         req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
871         if (IS_ERR(req))
872                 return ERR_CAST(req);
873         req->r_path1 = kstrdup(path, GFP_NOFS);
874         if (!req->r_path1) {
875                 root = ERR_PTR(-ENOMEM);
876                 goto out;
877         }
878
879         req->r_ino1.ino = CEPH_INO_ROOT;
880         req->r_ino1.snap = CEPH_NOSNAP;
881         req->r_started = started;
882         req->r_timeout = fsc->client->options->mount_timeout;
883         req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
884         req->r_num_caps = 2;
885         err = ceph_mdsc_do_request(mdsc, NULL, req);
886         if (err == 0) {
887                 struct inode *inode = req->r_target_inode;
888                 req->r_target_inode = NULL;
889                 dout("open_root_inode success\n");
890                 root = d_make_root(inode);
891                 if (!root) {
892                         root = ERR_PTR(-ENOMEM);
893                         goto out;
894                 }
895                 dout("open_root_inode success, root dentry is %p\n", root);
896         } else {
897                 root = ERR_PTR(err);
898         }
899 out:
900         ceph_mdsc_put_request(req);
901         return root;
902 }
903
904
905
906
907 /*
908  * mount: join the ceph cluster, and open root directory.
909  */
910 static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
911 {
912         int err;
913         unsigned long started = jiffies;  /* note the start time */
914         struct dentry *root;
915
916         dout("mount start %p\n", fsc);
917         mutex_lock(&fsc->client->mount_mutex);
918
919         if (!fsc->sb->s_root) {
920                 const char *path;
921                 err = __ceph_open_session(fsc->client, started);
922                 if (err < 0)
923                         goto out;
924
925                 /* setup fscache */
926                 if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
927                         err = ceph_fscache_register_fs(fsc);
928                         if (err < 0)
929                                 goto out;
930                 }
931
932                 if (!fsc->mount_options->server_path) {
933                         path = "";
934                         dout("mount opening path \\t\n");
935                 } else {
936                         path = fsc->mount_options->server_path + 1;
937                         dout("mount opening path %s\n", path);
938                 }
939
940                 err = ceph_fs_debugfs_init(fsc);
941                 if (err < 0)
942                         goto out;
943
944                 root = open_root_dentry(fsc, path, started);
945                 if (IS_ERR(root)) {
946                         err = PTR_ERR(root);
947                         goto out;
948                 }
949                 fsc->sb->s_root = dget(root);
950         } else {
951                 root = dget(fsc->sb->s_root);
952         }
953
954         fsc->mount_state = CEPH_MOUNT_MOUNTED;
955         dout("mount success\n");
956         mutex_unlock(&fsc->client->mount_mutex);
957         return root;
958
959 out:
960         mutex_unlock(&fsc->client->mount_mutex);
961         return ERR_PTR(err);
962 }
963
964 static int ceph_set_super(struct super_block *s, void *data)
965 {
966         struct ceph_fs_client *fsc = data;
967         int ret;
968
969         dout("set_super %p data %p\n", s, data);
970
971         s->s_flags = fsc->mount_options->sb_flags;
972         s->s_maxbytes = MAX_LFS_FILESIZE;
973
974         s->s_xattr = ceph_xattr_handlers;
975         s->s_fs_info = fsc;
976         fsc->sb = s;
977         fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
978
979         s->s_op = &ceph_super_ops;
980         s->s_d_op = &ceph_dentry_ops;
981         s->s_export_op = &ceph_export_ops;
982
983         s->s_time_gran = 1;
984
985         ret = set_anon_super(s, NULL);  /* what is that second arg for? */
986         if (ret != 0)
987                 goto fail;
988
989         return ret;
990
991 fail:
992         s->s_fs_info = NULL;
993         fsc->sb = NULL;
994         return ret;
995 }
996
997 /*
998  * share superblock if same fs AND options
999  */
1000 static int ceph_compare_super(struct super_block *sb, void *data)
1001 {
1002         struct ceph_fs_client *new = data;
1003         struct ceph_mount_options *fsopt = new->mount_options;
1004         struct ceph_options *opt = new->client->options;
1005         struct ceph_fs_client *other = ceph_sb_to_client(sb);
1006
1007         dout("ceph_compare_super %p\n", sb);
1008
1009         if (compare_mount_options(fsopt, opt, other)) {
1010                 dout("monitor(s)/mount options don't match\n");
1011                 return 0;
1012         }
1013         if ((opt->flags & CEPH_OPT_FSID) &&
1014             ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
1015                 dout("fsid doesn't match\n");
1016                 return 0;
1017         }
1018         if (fsopt->sb_flags != other->mount_options->sb_flags) {
1019                 dout("flags differ\n");
1020                 return 0;
1021         }
1022         return 1;
1023 }
1024
1025 /*
1026  * construct our own bdi so we can control readahead, etc.
1027  */
1028 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
1029
1030 static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
1031 {
1032         int err;
1033
1034         err = super_setup_bdi_name(sb, "ceph-%ld",
1035                                    atomic_long_inc_return(&bdi_seq));
1036         if (err)
1037                 return err;
1038
1039         /* set ra_pages based on rasize mount option? */
1040         sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT;
1041
1042         /* set io_pages based on max osd read size */
1043         sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT;
1044
1045         return 0;
1046 }
1047
1048 static struct dentry *ceph_mount(struct file_system_type *fs_type,
1049                        int flags, const char *dev_name, void *data)
1050 {
1051         struct super_block *sb;
1052         struct ceph_fs_client *fsc;
1053         struct dentry *res;
1054         int err;
1055         int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
1056         struct ceph_mount_options *fsopt = NULL;
1057         struct ceph_options *opt = NULL;
1058
1059         dout("ceph_mount\n");
1060
1061 #ifdef CONFIG_CEPH_FS_POSIX_ACL
1062         flags |= SB_POSIXACL;
1063 #endif
1064         err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
1065         if (err < 0) {
1066                 res = ERR_PTR(err);
1067                 goto out_final;
1068         }
1069
1070         /* create client (which we may/may not use) */
1071         fsc = create_fs_client(fsopt, opt);
1072         if (IS_ERR(fsc)) {
1073                 res = ERR_CAST(fsc);
1074                 goto out_final;
1075         }
1076
1077         err = ceph_mdsc_init(fsc);
1078         if (err < 0) {
1079                 res = ERR_PTR(err);
1080                 goto out;
1081         }
1082
1083         if (ceph_test_opt(fsc->client, NOSHARE))
1084                 compare_super = NULL;
1085         sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
1086         if (IS_ERR(sb)) {
1087                 res = ERR_CAST(sb);
1088                 goto out;
1089         }
1090
1091         if (ceph_sb_to_client(sb) != fsc) {
1092                 ceph_mdsc_destroy(fsc);
1093                 destroy_fs_client(fsc);
1094                 fsc = ceph_sb_to_client(sb);
1095                 dout("get_sb got existing client %p\n", fsc);
1096         } else {
1097                 dout("get_sb using new client %p\n", fsc);
1098                 err = ceph_setup_bdi(sb, fsc);
1099                 if (err < 0) {
1100                         res = ERR_PTR(err);
1101                         goto out_splat;
1102                 }
1103         }
1104
1105         res = ceph_real_mount(fsc);
1106         if (IS_ERR(res))
1107                 goto out_splat;
1108         dout("root %p inode %p ino %llx.%llx\n", res,
1109              d_inode(res), ceph_vinop(d_inode(res)));
1110         return res;
1111
1112 out_splat:
1113         ceph_mdsc_close_sessions(fsc->mdsc);
1114         deactivate_locked_super(sb);
1115         goto out_final;
1116
1117 out:
1118         ceph_mdsc_destroy(fsc);
1119         destroy_fs_client(fsc);
1120 out_final:
1121         dout("ceph_mount fail %ld\n", PTR_ERR(res));
1122         return res;
1123 }
1124
1125 static void ceph_kill_sb(struct super_block *s)
1126 {
1127         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1128         dev_t dev = s->s_dev;
1129
1130         dout("kill_sb %p\n", s);
1131
1132         ceph_mdsc_pre_umount(fsc->mdsc);
1133         flush_fs_workqueues(fsc);
1134
1135         generic_shutdown_super(s);
1136
1137         fsc->client->extra_mon_dispatch = NULL;
1138         ceph_fs_debugfs_cleanup(fsc);
1139
1140         ceph_fscache_unregister_fs(fsc);
1141
1142         ceph_mdsc_destroy(fsc);
1143
1144         destroy_fs_client(fsc);
1145         free_anon_bdev(dev);
1146 }
1147
1148 static struct file_system_type ceph_fs_type = {
1149         .owner          = THIS_MODULE,
1150         .name           = "ceph",
1151         .mount          = ceph_mount,
1152         .kill_sb        = ceph_kill_sb,
1153         .fs_flags       = FS_RENAME_DOES_D_MOVE,
1154 };
1155 MODULE_ALIAS_FS("ceph");
1156
1157 static int __init init_ceph(void)
1158 {
1159         int ret = init_caches();
1160         if (ret)
1161                 goto out;
1162
1163         ceph_flock_init();
1164         ret = register_filesystem(&ceph_fs_type);
1165         if (ret)
1166                 goto out_caches;
1167
1168         pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1169
1170         return 0;
1171
1172 out_caches:
1173         destroy_caches();
1174 out:
1175         return ret;
1176 }
1177
1178 static void __exit exit_ceph(void)
1179 {
1180         dout("exit_ceph\n");
1181         unregister_filesystem(&ceph_fs_type);
1182         destroy_caches();
1183 }
1184
1185 module_init(init_ceph);
1186 module_exit(exit_ceph);
1187
1188 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1189 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1190 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1191 MODULE_DESCRIPTION("Ceph filesystem for Linux");
1192 MODULE_LICENSE("GPL");