bcachefs: Reduce/kill BKEY_PADDED use
[linux-block.git] / fs / bcachefs / fs.c
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "extents.h"
12 #include "fs.h"
13 #include "fs-common.h"
14 #include "fs-io.h"
15 #include "fs-ioctl.h"
16 #include "fsck.h"
17 #include "inode.h"
18 #include "io.h"
19 #include "journal.h"
20 #include "keylist.h"
21 #include "quota.h"
22 #include "super.h"
23 #include "xattr.h"
24
25 #include <linux/aio.h>
26 #include <linux/backing-dev.h>
27 #include <linux/exportfs.h>
28 #include <linux/fiemap.h>
29 #include <linux/module.h>
30 #include <linux/pagemap.h>
31 #include <linux/posix_acl.h>
32 #include <linux/random.h>
33 #include <linux/seq_file.h>
34 #include <linux/statfs.h>
35 #include <linux/xattr.h>
36
37 static struct kmem_cache *bch2_inode_cache;
38
39 static void bch2_vfs_inode_init(struct bch_fs *,
40                                 struct bch_inode_info *,
41                                 struct bch_inode_unpacked *);
42
/*
 * Ratchet dst->ei_journal_seq forward to @journal_seq (never backwards),
 * then record in the journal that this inode has data at that sequence
 * number.
 */
static void journal_seq_copy(struct bch_fs *c,
                             struct bch_inode_info *dst,
                             u64 journal_seq)
{
        /*
         * atomic64_cmpxchg has a fallback for archs that don't support it,
         * cmpxchg does not:
         */
        atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
        u64 old, v = READ_ONCE(dst->ei_journal_seq);

        do {
                old = v;

                /* already at or past journal_seq: nothing to update */
                if (old >= journal_seq)
                        break;
        } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);

        bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
}
63
/*
 * Release a hold of weight @i on the pagecache lock (callers use +1 for the
 * "add" side, -1 for the "block" side); wake all waiters once the count
 * returns to zero.
 */
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
{
        /* releasing a lock that isn't held */
        BUG_ON(atomic_long_read(&lock->v) == 0);

        if (atomic_long_sub_return_release(i, &lock->v) == 0)
                wake_up_all(&lock->wait);
}
71
/*
 * Try to take a hold of weight @i: positive holders (add side) and negative
 * holders (block side) mutually exclude each other, while holders of the
 * same sign can share.  Returns false if the opposite side currently holds
 * the lock.
 */
static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
{
        long v = atomic_long_read(&lock->v), old;

        do {
                old = v;

                /* opposite sign already holds the lock: fail */
                if (i > 0 ? v < 0 : v > 0)
                        return false;
        } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
                                        old, old + i)) != old);
        return true;
}
85
/* Block until a hold of weight @i can be taken. */
static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
{
        wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
}
90
/* Shared ("add") side of the pagecache lock: holds of weight +1. */
void bch2_pagecache_add_put(struct pagecache_lock *lock)
{
        __pagecache_lock_put(lock, 1);
}

bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
{
        return __pagecache_lock_tryget(lock, 1);
}

void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
        __pagecache_lock_get(lock, 1);
}

/* Exclusive ("block") side: holds of weight -1, excludes the add side. */
void bch2_pagecache_block_put(struct pagecache_lock *lock)
{
        __pagecache_lock_put(lock, -1);
}

void bch2_pagecache_block_get(struct pagecache_lock *lock)
{
        __pagecache_lock_get(lock, -1);
}
115
/*
 * Sync the VFS inode with a freshly written bch_inode_unpacked: copy
 * nlink/uid/gid/mode unconditionally, timestamps only for the ATTR_* bits
 * set in @fields, then cache the unpacked inode in inode->ei_inode and
 * refresh the VFS-visible flags.
 */
void bch2_inode_update_after_write(struct bch_fs *c,
                                   struct bch_inode_info *inode,
                                   struct bch_inode_unpacked *bi,
                                   unsigned fields)
{
        set_nlink(&inode->v, bch2_inode_nlink_get(bi));
        i_uid_write(&inode->v, bi->bi_uid);
        i_gid_write(&inode->v, bi->bi_gid);
        inode->v.i_mode = bi->bi_mode;

        if (fields & ATTR_ATIME)
                inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
        if (fields & ATTR_MTIME)
                inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
        if (fields & ATTR_CTIME)
                inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));

        inode->ei_inode         = *bi;

        bch2_inode_flags_to_vfs(inode);
}
137
/*
 * Update an inode on disk: peek the current on-disk inode, apply the
 * optional @set callback (with argument @p), write it back, and commit —
 * retrying the whole transaction on -EINTR.  On success the VFS inode is
 * refreshed for the ATTR_* bits in @fields.
 */
int __must_check bch2_write_inode(struct bch_fs *c,
                                  struct bch_inode_info *inode,
                                  inode_set_fn set,
                                  void *p, unsigned fields)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bch_inode_unpacked inode_u;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);
retry:
        bch2_trans_begin(&trans);

        iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
                               BTREE_ITER_INTENT);
        /* each step short-circuits on the first error */
        ret   = PTR_ERR_OR_ZERO(iter) ?:
                (set ? set(inode, &inode_u, p) : 0) ?:
                bch2_inode_write(&trans, iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOFAIL);

        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
         * this is important for inode updates via bchfs_write_index_update
         */
        if (!ret)
                bch2_inode_update_after_write(c, inode, &inode_u, fields);

        bch2_trans_iter_put(&trans, iter);

        if (ret == -EINTR)
                goto retry;

        bch2_trans_exit(&trans);
        return ret < 0 ? ret : 0;
}
177
178 int bch2_fs_quota_transfer(struct bch_fs *c,
179                            struct bch_inode_info *inode,
180                            struct bch_qid new_qid,
181                            unsigned qtypes,
182                            enum quota_acct_mode mode)
183 {
184         unsigned i;
185         int ret;
186
187         qtypes &= enabled_qtypes(c);
188
189         for (i = 0; i < QTYP_NR; i++)
190                 if (new_qid.q[i] == inode->ei_qid.q[i])
191                         qtypes &= ~(1U << i);
192
193         if (!qtypes)
194                 return 0;
195
196         mutex_lock(&inode->ei_quota_lock);
197
198         ret = bch2_quota_transfer(c, qtypes, new_qid,
199                                   inode->ei_qid,
200                                   inode->v.i_blocks +
201                                   inode->ei_quota_reserved,
202                                   mode);
203         if (!ret)
204                 for (i = 0; i < QTYP_NR; i++)
205                         if (qtypes & (1 << i))
206                                 inode->ei_qid.q[i] = new_qid.q[i];
207
208         mutex_unlock(&inode->ei_quota_lock);
209
210         return ret;
211 }
212
/*
 * Look up (or create) the VFS inode for inode number @inum: returns the
 * cached inode if present, otherwise reads the inode from the btree,
 * initializes the new VFS inode and unlocks it.  Returns an ERR_PTR() on
 * allocation or lookup failure.
 */
struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
{
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
        int ret;

        inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
        if (unlikely(!inode))
                return ERR_PTR(-ENOMEM);
        /* already in cache and fully initialized */
        if (!(inode->v.i_state & I_NEW))
                return &inode->v;

        ret = bch2_inode_find_by_inum(c, inum, &inode_u);
        if (ret) {
                iget_failed(&inode->v);
                return ERR_PTR(ret);
        }

        bch2_vfs_inode_init(c, inode, &inode_u);

        inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);

        unlock_new_inode(&inode->v);

        return &inode->v;
}
239
240 static int inum_test(struct inode *inode, void *p)
241 {
242         unsigned long *ino = p;
243
244         return *ino == inode->i_ino;
245 }
246
/*
 * Common create path for mknod/mkdir/symlink/tmpfile: allocate the VFS
 * inode and ACLs up front, create the new inode (and, unless @tmpfile, the
 * dirent in @dir) in one btree transaction, then insert the new inode into
 * the inode cache before dropping btree locks.
 *
 * Returns the new bch_inode_info, or an ERR_PTR() on failure.
 */
static struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
              struct bch_inode_info *dir, struct dentry *dentry,
              umode_t mode, dev_t rdev, bool tmpfile)
{
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
        struct btree_trans trans;
        struct bch_inode_unpacked dir_u;
        struct bch_inode_info *inode, *old;
        struct bch_inode_unpacked inode_u;
        struct posix_acl *default_acl = NULL, *acl = NULL;
        u64 journal_seq = 0;
        int ret;

        /*
         * preallocate acls + vfs inode before btree transaction, so that
         * nothing can fail after the transaction succeeds:
         */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
        if (ret)
                return ERR_PTR(ret);
#endif
        inode = to_bch_ei(new_inode(c->vfs_sb));
        if (unlikely(!inode)) {
                inode = ERR_PTR(-ENOMEM);
                goto err;
        }

        bch2_inode_init_early(c, &inode_u);

        /* tmpfiles get no dirent, so the directory isn't modified */
        if (!tmpfile)
                mutex_lock(&dir->ei_update_lock);

        bch2_trans_init(&trans, c, 8,
                        2048 + (!tmpfile ? dentry->d_name.len : 0));
retry:
        bch2_trans_begin(&trans);

        ret   = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
                                  !tmpfile ? &dentry->d_name : NULL,
                                  from_kuid(i_user_ns(&dir->v), current_fsuid()),
                                  from_kgid(i_user_ns(&dir->v), current_fsgid()),
                                  mode, rdev,
                                  default_acl, acl) ?:
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
                                KEY_TYPE_QUOTA_PREALLOC);
        if (unlikely(ret))
                goto err_before_quota;

        ret   = bch2_trans_commit(&trans, NULL, &journal_seq,
                                  BTREE_INSERT_NOUNLOCK);
        if (unlikely(ret)) {
                /* undo the quota preallocation taken above */
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
err_before_quota:
                if (ret == -EINTR)
                        goto retry;
                goto err_trans;
        }

        if (!tmpfile) {
                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                journal_seq_copy(c, dir, journal_seq);
                mutex_unlock(&dir->ei_update_lock);
        }

        bch2_vfs_inode_init(c, inode, &inode_u);
        journal_seq_copy(c, inode, journal_seq);

        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

        /*
         * we must insert the new inode into the inode cache before calling
         * bch2_trans_exit() and dropping locks, else we could race with another
         * thread pulling the inode in and modifying it:
         */

        inode->v.i_state |= I_CREATING;
        old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
                                      inum_test, NULL, &inode->v.i_ino));
        BUG_ON(!old);

        if (unlikely(old != inode)) {
                /*
                 * We raced, another process pulled the new inode into cache
                 * before us:
                 */
                journal_seq_copy(c, old, journal_seq);
                make_bad_inode(&inode->v);
                iput(&inode->v);

                inode = old;
        } else {
                /*
                 * we really don't want insert_inode_locked2() to be setting
                 * I_NEW...
                 */
                unlock_new_inode(&inode->v);
        }

        bch2_trans_exit(&trans);
err:
        /* posix_acl_release() accepts NULL */
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return inode;
err_trans:
        if (!tmpfile)
                mutex_unlock(&dir->ei_update_lock);

        bch2_trans_exit(&trans);
        make_bad_inode(&inode->v);
        iput(&inode->v);
        inode = ERR_PTR(ret);
        goto err;
}
365
366 /* methods */
367
/*
 * ->lookup: resolve @dentry in directory @vdir via the dirent btree; a hit
 * pulls in the target inode, a miss leaves vinode NULL (negative dentry).
 */
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
                                  unsigned int flags)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct inode *vinode = NULL;
        u64 inum;

        inum = bch2_dirent_lookup(c, dir->v.i_ino,
                                  &dir->ei_str_hash,
                                  &dentry->d_name);

        if (inum)
                vinode = bch2_vfs_inode_get(c, inum);

        return d_splice_alias(vinode, dentry);
}
385
386 static int bch2_mknod(struct mnt_idmap *idmap,
387                       struct inode *vdir, struct dentry *dentry,
388                       umode_t mode, dev_t rdev)
389 {
390         struct bch_inode_info *inode =
391                 __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, false);
392
393         if (IS_ERR(inode))
394                 return PTR_ERR(inode);
395
396         d_instantiate(dentry, &inode->v);
397         return 0;
398 }
399
/* ->create: regular-file creation, forwarded to bch2_mknod() with S_IFREG. */
static int bch2_create(struct mnt_idmap *idmap,
                       struct inode *vdir, struct dentry *dentry,
                       umode_t mode, bool excl)
{
        return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
}
406
/*
 * Create a hard link to @inode in directory @dir under @dentry's name,
 * retrying the transaction on -EINTR.  On success both the directory and
 * the target inode's VFS state are refreshed.
 */
static int __bch2_link(struct bch_fs *c,
                       struct bch_inode_info *inode,
                       struct bch_inode_info *dir,
                       struct dentry *dentry)
{
        struct btree_trans trans;
        struct bch_inode_unpacked dir_u, inode_u;
        int ret;

        mutex_lock(&inode->ei_update_lock);
        bch2_trans_init(&trans, c, 4, 1024);

        do {
                bch2_trans_begin(&trans);
                ret   = bch2_link_trans(&trans,
                                        dir->v.i_ino,
                                        inode->v.i_ino, &dir_u, &inode_u,
                                        &dentry->d_name) ?:
                        bch2_trans_commit(&trans, NULL,
                                        &inode->ei_journal_seq,
                                        BTREE_INSERT_NOUNLOCK);
        } while (ret == -EINTR);

        if (likely(!ret)) {
                BUG_ON(inode_u.bi_inum != inode->v.i_ino);

                journal_seq_copy(c, inode, dir->ei_journal_seq);
                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
        }

        bch2_trans_exit(&trans);
        mutex_unlock(&inode->ei_update_lock);
        return ret;
}
443
/*
 * ->link: hard-link old_dentry's inode into @vdir at @dentry, taking an
 * extra reference for the new dentry.
 */
static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
                     struct dentry *dentry)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                return ret;

        ihold(&inode->v);
        d_instantiate(dentry, &inode->v);
        return 0;
}
462
/*
 * ->unlink: remove @dentry from @vdir, retrying the transaction on -EINTR.
 * Both inodes are locked via bch2_lock_inodes() for the duration; on
 * success directory mtime/ctime and the victim's mtime are refreshed.
 */
static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_inode_unpacked dir_u, inode_u;
        struct btree_trans trans;
        int ret;

        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
        bch2_trans_init(&trans, c, 4, 1024);

        do {
                bch2_trans_begin(&trans);

                ret   = bch2_unlink_trans(&trans,
                                          dir->v.i_ino, &dir_u,
                                          &inode_u, &dentry->d_name) ?:
                        bch2_trans_commit(&trans, NULL,
                                          &dir->ei_journal_seq,
                                          BTREE_INSERT_NOUNLOCK|
                                          BTREE_INSERT_NOFAIL);
        } while (ret == -EINTR);

        if (likely(!ret)) {
                BUG_ON(inode_u.bi_inum != inode->v.i_ino);

                journal_seq_copy(c, inode, dir->ei_journal_seq);
                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                bch2_inode_update_after_write(c, inode, &inode_u,
                                              ATTR_MTIME);
        }

        bch2_trans_exit(&trans);
        bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);

        return ret;
}
502
/*
 * ->symlink: create the symlink inode as a tmpfile, write @symname through
 * the pagecache and flush it, then link the inode into @vdir.  On any
 * failure the half-built inode is dropped via iput().
 */
static int bch2_symlink(struct mnt_idmap *idmap,
                        struct inode *vdir, struct dentry *dentry,
                        const char *symname)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;

        inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);

        inode_lock(&inode->v);
        ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
        inode_unlock(&inode->v);

        if (unlikely(ret))
                goto err;

        /* make sure the symlink contents are on disk before linking it in */
        ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
        if (unlikely(ret))
                goto err;

        journal_seq_copy(c, dir, inode->ei_journal_seq);

        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                goto err;

        d_instantiate(dentry, &inode->v);
        return 0;
err:
        iput(&inode->v);
        return ret;
}
538
/* ->mkdir: directory creation, forwarded to bch2_mknod() with S_IFDIR. */
static int bch2_mkdir(struct mnt_idmap *idmap,
                      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
        return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
}
544
/*
 * ->rename (with flags): plain rename, RENAME_OVERWRITE and
 * RENAME_EXCHANGE.  All involved inodes are locked together, project-quota
 * transfers are set up before the transaction, and the rename itself runs
 * as one btree transaction retried on -EINTR.  The quota transfers at the
 * end re-account against ei_inode, which reflects the committed state
 * (i.e. they undo the prealloc on failure and are no-ops on success).
 */
static int bch2_rename2(struct mnt_idmap *idmap,
                        struct inode *src_vdir, struct dentry *src_dentry,
                        struct inode *dst_vdir, struct dentry *dst_dentry,
                        unsigned flags)
{
        struct bch_fs *c = src_vdir->i_sb->s_fs_info;
        struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
        struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
        struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
        struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
        struct bch_inode_unpacked dst_dir_u, src_dir_u;
        struct bch_inode_unpacked src_inode_u, dst_inode_u;
        struct btree_trans trans;
        enum bch_rename_mode mode = flags & RENAME_EXCHANGE
                ? BCH_RENAME_EXCHANGE
                : dst_dentry->d_inode
                ? BCH_RENAME_OVERWRITE : BCH_RENAME;
        u64 journal_seq = 0;
        int ret;

        if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
                return -EINVAL;

        if (mode == BCH_RENAME_OVERWRITE) {
                ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
                                                   0, LLONG_MAX);
                if (ret)
                        return ret;
        }

        bch2_trans_init(&trans, c, 8, 2048);

        bch2_lock_inodes(INODE_UPDATE_LOCK,
                         src_dir,
                         dst_dir,
                         src_inode,
                         dst_inode);

        /* moving into a directory with a different project id? */
        if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
                ret = bch2_fs_quota_transfer(c, src_inode,
                                             dst_dir->ei_qid,
                                             1 << QTYP_PRJ,
                                             KEY_TYPE_QUOTA_PREALLOC);
                if (ret)
                        goto err;
        }

        /* for exchange, the destination moves the other way too */
        if (mode == BCH_RENAME_EXCHANGE &&
            inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
                ret = bch2_fs_quota_transfer(c, dst_inode,
                                             src_dir->ei_qid,
                                             1 << QTYP_PRJ,
                                             KEY_TYPE_QUOTA_PREALLOC);
                if (ret)
                        goto err;
        }

retry:
        bch2_trans_begin(&trans);
        ret   = bch2_rename_trans(&trans,
                                  src_dir->v.i_ino, &src_dir_u,
                                  dst_dir->v.i_ino, &dst_dir_u,
                                  &src_inode_u,
                                  &dst_inode_u,
                                  &src_dentry->d_name,
                                  &dst_dentry->d_name,
                                  mode) ?:
                bch2_trans_commit(&trans, NULL,
                                  &journal_seq,
                                  BTREE_INSERT_NOUNLOCK);
        if (ret == -EINTR)
                goto retry;
        if (unlikely(ret))
                goto err;

        BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
        BUG_ON(dst_inode &&
               dst_inode->v.i_ino != dst_inode_u.bi_inum);

        bch2_inode_update_after_write(c, src_dir, &src_dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
        journal_seq_copy(c, src_dir, journal_seq);

        if (src_dir != dst_dir) {
                bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                journal_seq_copy(c, dst_dir, journal_seq);
        }

        bch2_inode_update_after_write(c, src_inode, &src_inode_u,
                                      ATTR_CTIME);
        journal_seq_copy(c, src_inode, journal_seq);

        if (dst_inode) {
                bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
                                              ATTR_CTIME);
                journal_seq_copy(c, dst_inode, journal_seq);
        }
err:
        bch2_trans_exit(&trans);

        bch2_fs_quota_transfer(c, src_inode,
                               bch_qid(&src_inode->ei_inode),
                               1 << QTYP_PRJ,
                               KEY_TYPE_QUOTA_NOCHECK);
        if (dst_inode)
                bch2_fs_quota_transfer(c, dst_inode,
                                       bch_qid(&dst_inode->ei_inode),
                                       1 << QTYP_PRJ,
                                       KEY_TYPE_QUOTA_NOCHECK);

        bch2_unlock_inodes(INODE_UPDATE_LOCK,
                           src_dir,
                           dst_dir,
                           src_inode,
                           dst_inode);

        return ret;
}
664
/*
 * Copy the attributes selected by attr->ia_valid into the unpacked inode
 * @bi.  For ATTR_MODE, clear the setgid bit when the caller isn't in the
 * (new) owning group and lacks CAP_FSETID.
 */
static void bch2_setattr_copy(struct mnt_idmap *idmap,
                              struct bch_inode_info *inode,
                              struct bch_inode_unpacked *bi,
                              struct iattr *attr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        unsigned int ia_valid = attr->ia_valid;

        if (ia_valid & ATTR_UID)
                bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
        if (ia_valid & ATTR_GID)
                bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);

        if (ia_valid & ATTR_ATIME)
                bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
        if (ia_valid & ATTR_MTIME)
                bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
        if (ia_valid & ATTR_CTIME)
                bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

        if (ia_valid & ATTR_MODE) {
                umode_t mode = attr->ia_mode;
                /* use the gid being set, if any, else the current one */
                kgid_t gid = ia_valid & ATTR_GID
                        ? attr->ia_gid
                        : inode->v.i_gid;

                if (!in_group_p(gid) &&
                    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
                        mode &= ~S_ISGID;
                bi->bi_mode = mode;
        }
}
697
/*
 * Apply all non-size attribute changes: transfer quotas first if the
 * uid/gid is changing, then update the on-disk inode (and, for ATTR_MODE,
 * recompute the access ACL) in a transaction retried on -EINTR.
 */
static int bch2_setattr_nonsize(struct mnt_idmap *idmap,
                                struct bch_inode_info *inode,
                                struct iattr *attr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid;
        struct btree_trans trans;
        struct btree_iter *inode_iter;
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl = NULL;
        int ret;

        mutex_lock(&inode->ei_update_lock);

        qid = inode->ei_qid;

        if (attr->ia_valid & ATTR_UID)
                qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);

        if (attr->ia_valid & ATTR_GID)
                qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);

        ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
                                     KEY_TYPE_QUOTA_PREALLOC);
        if (ret)
                goto err;

        bch2_trans_init(&trans, c, 0, 0);
retry:
        bch2_trans_begin(&trans);
        /* drop any acl from a previous iteration of the retry loop */
        kfree(acl);
        acl = NULL;

        inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
                                     BTREE_ITER_INTENT);
        ret = PTR_ERR_OR_ZERO(inode_iter);
        if (ret)
                goto btree_err;

        bch2_setattr_copy(idmap, inode, &inode_u, attr);

        if (attr->ia_valid & ATTR_MODE) {
                ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }

        ret =   bch2_inode_write(&trans, inode_iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOFAIL);
btree_err:
        if (ret == -EINTR)
                goto retry;
        if (unlikely(ret))
                goto err_trans;

        bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);

        if (acl)
                set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
        bch2_trans_exit(&trans);
err:
        mutex_unlock(&inode->ei_update_lock);

        return ret;
}
767
/*
 * ->getattr: fill @stat from the VFS inode, plus btime and the
 * immutable/append/nodump statx attributes from the cached ei_inode flags.
 */
static int bch2_getattr(struct mnt_idmap *idmap,
                        const struct path *path, struct kstat *stat,
                        u32 request_mask, unsigned query_flags)
{
        struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
        struct bch_fs *c = inode->v.i_sb->s_fs_info;

        stat->dev       = inode->v.i_sb->s_dev;
        stat->ino       = inode->v.i_ino;
        stat->mode      = inode->v.i_mode;
        stat->nlink     = inode->v.i_nlink;
        stat->uid       = inode->v.i_uid;
        stat->gid       = inode->v.i_gid;
        stat->rdev      = inode->v.i_rdev;
        stat->size      = i_size_read(&inode->v);
        stat->atime     = inode->v.i_atime;
        stat->mtime     = inode->v.i_mtime;
        stat->ctime     = inode_get_ctime(&inode->v);
        stat->blksize   = block_bytes(c);
        stat->blocks    = inode->v.i_blocks;

        if (request_mask & STATX_BTIME) {
                stat->result_mask |= STATX_BTIME;
                stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
        }

        if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;

        if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
                stat->attributes |= STATX_ATTR_APPEND;
        stat->attributes_mask    |= STATX_ATTR_APPEND;

        if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;
        stat->attributes_mask    |= STATX_ATTR_NODUMP;

        return 0;
}
808
809 static int bch2_setattr(struct mnt_idmap *idmap,
810                         struct dentry *dentry, struct iattr *iattr)
811 {
812         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
813         int ret;
814
815         lockdep_assert_held(&inode->v.i_rwsem);
816
817         ret = setattr_prepare(idmap, dentry, iattr);
818         if (ret)
819                 return ret;
820
821         return iattr->ia_valid & ATTR_SIZE
822                 ? bch2_truncate(inode, iattr)
823                 : bch2_setattr_nonsize(idmap, inode, iattr);
824 }
825
/*
 * ->tmpfile: create an unlinked inode (no dirent) and attach it to the
 * open file.
 */
static int bch2_tmpfile(struct mnt_idmap *idmap,
                        struct inode *vdir, struct file *file, umode_t mode)
{
        struct bch_inode_info *inode =
                __bch2_create(idmap, to_bch_ei(vdir),
                              file->f_path.dentry, mode, 0, true);

        if (IS_ERR(inode))
                return PTR_ERR(inode);

        d_mark_tmpfile(file, &inode->v);
        d_instantiate(file->f_path.dentry, &inode->v);
        return finish_open_simple(file, 0);
}
840
/*
 * Report one extent key to fiemap: direct-data extents emit one fiemap
 * extent per pointer, inline data is reported as DATA_INLINE, and
 * reservations as DELALLOC|UNWRITTEN.  Sector offsets/sizes are converted
 * to bytes with << 9.
 */
static int bch2_fill_extent(struct bch_fs *c,
                            struct fiemap_extent_info *info,
                            struct bkey_s_c k, unsigned flags)
{
        if (bkey_extent_is_direct_data(k.k)) {
                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
                int ret;

                if (k.k->type == KEY_TYPE_reflink_v)
                        flags |= FIEMAP_EXTENT_SHARED;

                bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        int flags2 = 0;
                        u64 offset = p.ptr.offset;

                        if (p.crc.compression_type)
                                flags2 |= FIEMAP_EXTENT_ENCODED;
                        else
                                /* uncompressed: account for the crc offset */
                                offset += p.crc.offset;

                        if ((offset & (c->opts.block_size - 1)) ||
                            (k.k->size & (c->opts.block_size - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

                        ret = fiemap_fill_next_extent(info,
                                                bkey_start_offset(k.k) << 9,
                                                offset << 9,
                                                k.k->size << 9, flags|flags2);
                        if (ret)
                                return ret;
                }

                return 0;
        } else if (bkey_extent_is_inline_data(k.k)) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DATA_INLINE);
        } else if (k.k->type == KEY_TYPE_reservation) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DELALLOC|
                                               FIEMAP_EXTENT_UNWRITTEN);
        } else {
                /* callers only pass extent-ish keys; anything else is a bug */
                BUG();
        }
}
893
/*
 * ->fiemap: walk the extents btree for [start, start + len) and report
 * each data/reservation extent to userspace.
 *
 * Extents are emitted one behind the iterator (via @prev) so that the
 * final extent can be flagged FIEMAP_EXTENT_LAST. On -EINTR (transaction
 * restart) the whole scan is retried from the iterator's current
 * position.
 */
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	/* btree positions are in 512-byte sectors */
	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	if (start + len < start)
		return -EINVAL;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
				   POS(ei->v.i_ino, start >> 9), 0);
retry:
	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(iter->pos, end) < 0) {
		/* skip holes and other non-reportable key types */
		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_next(iter);
			continue;
		}

		offset_into_extent	= iter->pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		/* copy the key: the indirect-extent lookup may clobber it */
		bch2_bkey_buf_reassemble(&cur, c, k);

		ret = bch2_read_indirect_extent(&trans,
					&offset_into_extent, &cur);
		if (ret)
			break;

		k = bkey_i_to_s_c(cur.k);
		/* make sure prev has room before we copy into it below */
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		/*
		 * Trim the (possibly indirect) extent down to the part
		 * actually covered by the iterator position:
		 */
		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter->pos;
		cur.k->k.p.offset += cur.k->k.size;

		/* emit the previous extent now that we know it isn't last */
		if (have_extent) {
			ret = bch2_fill_extent(c, info,
					bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(iter,
			POS(iter->pos.inode, iter->pos.offset + sectors));
	}

	if (ret == -EINTR)
		goto retry;

	/* flush the held-back extent, marked as the last one */
	if (!ret && have_extent)
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);

	ret = bch2_trans_exit(&trans) ?: ret;
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	/* fiemap_fill_next_extent() returns 1 when the buffer is full */
	return ret < 0 ? ret : 0;
}
981
/* vm operations for mmap'd files; faulting goes through the page cache */
static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= bch2_page_mkwrite,
};
987
988 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
989 {
990         file_accessed(file);
991
992         vma->vm_ops = &bch_vm_ops;
993         return 0;
994 }
995
996 /* Directories: */
997
/* llseek for directories: allow the full s64 range for dirent offsets */
static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	return generic_file_llseek_size(file, offset, whence,
					S64_MAX, S64_MAX);
}
1003
1004 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1005 {
1006         struct bch_inode_info *inode = file_bch_inode(file);
1007         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1008
1009         if (!dir_emit_dots(file, ctx))
1010                 return 0;
1011
1012         return bch2_readdir(c, inode->v.i_ino, ctx);
1013 }
1014
/* file operations for regular files */
static const struct file_operations bch_file_operations = {
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.open		= generic_file_open,
	.fsync		= bch2_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};
1031
/* inode operations for regular files */
static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1042
/* inode operations for directories */
static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	/* rmdir shares the unlink handler; presumably the common
	 * unlink path handles directories — verify in fs-common.c */
	.rmdir		= bch2_unlink,
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1062
/* file operations for directories */
static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};
1073
/* inode operations for symlinks (target stored via the page cache) */
static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1084
/* inode operations for special files (devices, fifos, sockets) */
static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1094
/* address_space operations: buffered and direct I/O entry points */
static const struct address_space_operations bch_address_space_operations = {
	.writepage	= bch2_writepage,
	.read_folio	= bch2_read_folio,
	.writepages	= bch2_writepages,
	.readahead	= bch2_readahead,
	.dirty_folio	= filemap_dirty_folio,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	.invalidate_folio = bch2_invalidate_folio,
	.release_folio	= bch2_release_folio,
	/* DIO is dispatched from read_iter/write_iter, not via ->direct_IO */
	.direct_IO	= noop_direct_IO,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= filemap_migrate_folio,
#endif
	.error_remove_page = generic_error_remove_page,
};
1111
1112 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1113                 u64 ino, u32 generation)
1114 {
1115         struct bch_fs *c = sb->s_fs_info;
1116         struct inode *vinode;
1117
1118         if (ino < BCACHEFS_ROOT_INO)
1119                 return ERR_PTR(-ESTALE);
1120
1121         vinode = bch2_vfs_inode_get(c, ino);
1122         if (IS_ERR(vinode))
1123                 return ERR_CAST(vinode);
1124         if (generation && vinode->i_generation != generation) {
1125                 /* we didn't find the right inode.. */
1126                 iput(vinode);
1127                 return ERR_PTR(-ESTALE);
1128         }
1129         return vinode;
1130 }
1131
/* decode an NFS filehandle into the dentry for the target inode */
static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    bch2_nfs_get_inode);
}
1138
/* decode an NFS filehandle into the dentry for the target's parent */
static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    bch2_nfs_get_inode);
}
1145
/* NFS export operations (get_parent not implemented yet) */
static const struct export_operations bch_export_ops = {
	.fh_to_dentry	= bch2_fh_to_dentry,
	.fh_to_parent	= bch2_fh_to_parent,
	//.get_parent	= bch2_get_parent,
};
1151
/*
 * Initialize a freshly allocated VFS inode from the unpacked on-disk
 * inode @bi: copy over the stat fields, reset per-inode bookkeeping,
 * and wire up the ops tables for the inode's file type.
 */
static void bch2_vfs_inode_init(struct bch_fs *c,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi)
{
	/* ~0: copy all attribute groups from bi into the vfs inode */
	bch2_inode_update_after_write(c, inode, bi, ~0);

	inode->v.i_blocks	= bi->bi_sectors;
	inode->v.i_ino		= bi->bi_inum;
	inode->v.i_rdev		= bi->bi_dev;
	inode->v.i_generation	= bi->bi_generation;
	inode->v.i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_journal_seq	= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_str_hash	= bch2_hash_info_init(c, bi);
	inode->ei_qid		= bch_qid(bi);

	inode->v.i_mapping->a_ops = &bch_address_space_operations;

	/* select ops tables by file type */
	switch (inode->v.i_mode & S_IFMT) {
	case S_IFREG:
		inode->v.i_op	= &bch_file_inode_operations;
		inode->v.i_fop	= &bch_file_operations;
		break;
	case S_IFDIR:
		inode->v.i_op	= &bch_dir_inode_operations;
		inode->v.i_fop	= &bch_dir_file_operations;
		break;
	case S_IFLNK:
		inode_nohighmem(&inode->v);
		inode->v.i_op	= &bch_symlink_inode_operations;
		break;
	default:
		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
		inode->v.i_op	= &bch_special_inode_operations;
		break;
	}
}
1191
1192 static struct inode *bch2_alloc_inode(struct super_block *sb)
1193 {
1194         struct bch_inode_info *inode;
1195
1196         inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1197         if (!inode)
1198                 return NULL;
1199
1200         inode_init_once(&inode->v);
1201         mutex_init(&inode->ei_update_lock);
1202         pagecache_lock_init(&inode->ei_pagecache_lock);
1203         mutex_init(&inode->ei_quota_lock);
1204         inode->ei_journal_seq = 0;
1205
1206         return &inode->v;
1207 }
1208
/* RCU callback: actually free the inode after a grace period */
static void bch2_i_callback(struct rcu_head *head)
{
	struct inode *vinode = container_of(head, struct inode, i_rcu);
	struct bch_inode_info *inode = to_bch_ei(vinode);

	kmem_cache_free(bch2_inode_cache, inode);
}
1216
/* ->destroy_inode: defer freeing to RCU (RCU-walk path lookup) */
static void bch2_destroy_inode(struct inode *vinode)
{
	call_rcu(&vinode->i_rcu, bch2_i_callback);
}
1221
/*
 * bch2_write_inode() callback: copy the VFS inode's timestamps into the
 * unpacked on-disk inode. @p is unused. Always returns 0.
 */
static int inode_update_times_fn(struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_atime	= timespec_to_bch2_time(c, inode->v.i_atime);
	bi->bi_mtime	= timespec_to_bch2_time(c, inode->v.i_mtime);
	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));

	return 0;
}
1234
/*
 * ->write_inode: push the in-memory timestamps back to the btree.
 * ei_update_lock serializes updates to this inode's on-disk state.
 */
static int bch2_vfs_write_inode(struct inode *vinode,
				struct writeback_control *wbc)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	int ret;

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);

	return ret;
}
1249
/*
 * ->evict_inode: tear down the page cache, and if the inode has been
 * unlinked, release its quota charges and delete it from the btree.
 */
static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	/* all reservations should have been released by now */
	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
		/* negative deltas: give back this inode's space and count */
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode->v.i_ino, true);
	}
}
1269
1270 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1271 {
1272         struct super_block *sb = dentry->d_sb;
1273         struct bch_fs *c = sb->s_fs_info;
1274         struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1275         unsigned shift = sb->s_blocksize_bits - 9;
1276         /*
1277          * this assumes inodes take up 64 bytes, which is a decent average
1278          * number:
1279          */
1280         u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1281         u64 fsid;
1282
1283         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1284         buf->f_bsize    = sb->s_blocksize;
1285         buf->f_blocks   = usage.capacity >> shift;
1286         buf->f_bfree    = (usage.capacity - usage.used) >> shift;
1287         buf->f_bavail   = buf->f_bfree;
1288
1289         buf->f_files    = usage.nr_inodes + avail_inodes;
1290         buf->f_ffree    = avail_inodes;
1291
1292         fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
1293                le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
1294         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1295         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1296         buf->f_namelen  = BCH_NAME_MAX;
1297
1298         return 0;
1299 }
1300
1301 static int bch2_sync_fs(struct super_block *sb, int wait)
1302 {
1303         struct bch_fs *c = sb->s_fs_info;
1304
1305         if (c->opts.journal_flush_disabled)
1306                 return 0;
1307
1308         if (!wait) {
1309                 bch2_journal_flush_async(&c->journal, NULL);
1310                 return 0;
1311         }
1312
1313         return bch2_journal_flush(&c->journal);
1314 }
1315
1316 static struct bch_fs *bch2_path_to_fs(const char *path)
1317 {
1318         struct bch_fs *c;
1319         dev_t dev;
1320         int ret;
1321
1322         ret = lookup_bdev(path, &dev);
1323         if (ret)
1324                 return ERR_PTR(ret);
1325
1326         c = bch2_dev_to_fs(dev);
1327         if (c)
1328                 closure_put(&c->cl);
1329         return c ?: ERR_PTR(-ENOENT);
1330 }
1331
1332 static char **split_devs(const char *_dev_name, unsigned *nr)
1333 {
1334         char *dev_name = NULL, **devs = NULL, *s;
1335         size_t i, nr_devs = 0;
1336
1337         dev_name = kstrdup(_dev_name, GFP_KERNEL);
1338         if (!dev_name)
1339                 return NULL;
1340
1341         for (s = dev_name; s; s = strchr(s + 1, ':'))
1342                 nr_devs++;
1343
1344         devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
1345         if (!devs) {
1346                 kfree(dev_name);
1347                 return NULL;
1348         }
1349
1350         for (i = 0, s = dev_name;
1351              s;
1352              (s = strchr(s, ':')) && (*s++ = '\0'))
1353                 devs[i++] = s;
1354
1355         *nr = nr_devs;
1356         return devs;
1357 }
1358
/*
 * ->remount_fs: re-parse mount options and apply read-only/read-write
 * transitions and error-policy changes. state_lock serializes the
 * ro/rw transition against other filesystem state changes.
 */
static int bch2_remount(struct super_block *sb, int *flags, char *data)
{
	struct bch_fs *c = sb->s_fs_info;
	struct bch_opts opts = bch2_opts_empty();
	int ret;

	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);

	ret = bch2_parse_mount_opts(c, &opts, data);
	if (ret)
		return ret;

	if (opts.read_only != c->opts.read_only) {
		down_write(&c->state_lock);

		if (opts.read_only) {
			bch2_fs_read_only(c);

			sb->s_flags |= SB_RDONLY;
		} else {
			ret = bch2_fs_read_write(c);
			if (ret) {
				bch_err(c, "error going rw: %i", ret);
				up_write(&c->state_lock);
				return -EINVAL;
			}

			sb->s_flags &= ~SB_RDONLY;
		}

		c->opts.read_only = opts.read_only;

		up_write(&c->state_lock);
	}

	/* errors option was explicitly set on this remount */
	if (opts.errors >= 0)
		c->opts.errors = opts.errors;

	return ret;
}
1399
1400 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1401 {
1402         struct bch_fs *c = root->d_sb->s_fs_info;
1403         struct bch_dev *ca;
1404         unsigned i;
1405         bool first = true;
1406
1407         for_each_online_member(ca, c, i) {
1408                 if (!first)
1409                         seq_putc(seq, ':');
1410                 first = false;
1411                 seq_puts(seq, "/dev/");
1412                 seq_puts(seq, ca->name);
1413         }
1414
1415         return 0;
1416 }
1417
/*
 * ->show_options: print every mount option that differs from its
 * default, in mount(8) style.
 */
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	enum bch_opt_id i;
	char buf[512];

	for (i = 0; i < bch2_opts_nr; i++) {
		const struct bch_option *opt = &bch2_opt_table[i];
		u64 v = bch2_opt_get_by_id(&c->opts, i);

		/* only options settable at mount time */
		if (!(opt->mode & OPT_MOUNT))
			continue;

		/* skip options still at their default value */
		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
			continue;

		bch2_opt_to_text(&PBUF(buf), c, opt, v,
				 OPT_SHOW_MOUNT_STYLE);
		seq_putc(seq, ',');
		seq_puts(seq, buf);
	}

	return 0;
}
1442
/* ->put_super: begin filesystem shutdown; final free is in bch2_kill_sb() */
static void bch2_put_super(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	__bch2_fs_stop(c);
}
1449
/* superblock operations */
static const struct super_operations bch_super_operations = {
	.alloc_inode	= bch2_alloc_inode,
	.destroy_inode	= bch2_destroy_inode,
	.write_inode	= bch2_vfs_write_inode,
	.evict_inode	= bch2_evict_inode,
	.sync_fs	= bch2_sync_fs,
	.statfs		= bch2_statfs,
	.show_devname	= bch2_show_devname,
	.show_options	= bch2_show_options,
	.remount_fs	= bch2_remount,
	.put_super	= bch2_put_super,
#if 0
	.freeze_fs	= bch2_freeze,
	.unfreeze_fs	= bch2_unfreeze,
#endif
};
1466
/* sget() set callback: attach the bch_fs to a newly created superblock */
static int bch2_set_super(struct super_block *s, void *data)
{
	s->s_fs_info = data;
	return 0;
}
1472
/* sget() set callback that refuses to create a new superblock */
static int bch2_noset_super(struct super_block *s, void *data)
{
	return -EBUSY;
}
1477
/*
 * sget() test callback: @data is a NULL-terminated array of bch_fs
 * pointers, one per device path given at mount time. Match only when
 * every entry is this superblock's filesystem, i.e. all requested
 * devices belong to the already-mounted fs.
 */
static int bch2_test_super(struct super_block *s, void *data)
{
	struct bch_fs *c = s->s_fs_info;
	struct bch_fs **devs = data;
	unsigned i;

	if (!c)
		return false;

	for (i = 0; devs[i]; i++)
		if (c != devs[i])
			return false;
	return true;
}
1492
/*
 * ->mount: mount a (possibly multi-device) bcachefs filesystem.
 *
 * First try to find an existing superblock whose fs matches every
 * device path; only if that fails, open the devices and create a new
 * superblock. On success returns a dget'd root dentry.
 */
static struct dentry *bch2_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name, void *data)
{
	struct bch_fs *c;
	struct bch_dev *ca;
	struct super_block *sb;
	struct inode *vinode;
	struct bch_opts opts = bch2_opts_empty();
	char **devs;
	struct bch_fs **devs_to_fs = NULL;
	unsigned i, nr_devs;
	int ret;

	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);

	/* first pass: options that don't need a running fs */
	ret = bch2_parse_mount_opts(NULL, &opts, data);
	if (ret)
		return ERR_PTR(ret);

	/* "dev1:dev2:..." -> array of paths */
	devs = split_devs(dev_name, &nr_devs);
	if (!devs)
		return ERR_PTR(-ENOMEM);

	devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
	if (!devs_to_fs) {
		sb = ERR_PTR(-ENOMEM);
		goto got_sb;
	}

	for (i = 0; i < nr_devs; i++)
		devs_to_fs[i] = bch2_path_to_fs(devs[i]);

	/* look for an existing superblock; bch2_noset_super prevents creation */
	sb = sget(fs_type, bch2_test_super, bch2_noset_super,
		  flags|SB_NOSEC, devs_to_fs);
	if (!IS_ERR(sb))
		goto got_sb;

	c = bch2_fs_open(devs, nr_devs, opts);
	if (IS_ERR(c)) {
		sb = ERR_CAST(c);
		goto got_sb;
	}

	/* Some options can't be parsed until after the fs is started: */
	ret = bch2_parse_mount_opts(c, &opts, data);
	if (ret) {
		bch2_fs_stop(c);
		sb = ERR_PTR(ret);
		goto got_sb;
	}

	bch2_opts_apply(&c->opts, opts);

	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
	if (IS_ERR(sb))
		bch2_fs_stop(c);
got_sb:
	kfree(devs_to_fs);
	kfree(devs[0]);	/* the kstrdup'd buffer backing all the paths */
	kfree(devs);

	if (IS_ERR(sb))
		return ERR_CAST(sb);

	c = sb->s_fs_info;

	if (sb->s_root) {
		/* existing mount: ro/rw mismatch can't be reconciled here */
		if ((flags ^ sb->s_flags) & SB_RDONLY) {
			ret = -EBUSY;
			goto err_put_super;
		}
		goto out;
	}

	/* new superblock: finish VFS-level setup */
	sb->s_blocksize		= block_bytes(c);
	sb->s_blocksize_bits	= ilog2(block_bytes(c));
	sb->s_maxbytes		= MAX_LFS_FILESIZE;
	sb->s_op		= &bch_super_operations;
	sb->s_export_op		= &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
	sb->s_xattr		= bch2_xattr_handlers;
	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
	sb->s_time_gran		= c->sb.time_precision;
	c->vfs_sb		= sb;
	strlcpy(sb->s_id, c->name, sizeof(sb->s_id));

	ret = super_setup_bdi(sb);
	if (ret)
		goto err_put_super;

	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;

	/* use the first online member as the nominal backing device */
	for_each_online_member(ca, c, i) {
		struct block_device *bdev = ca->disk_sb.bdev;

		/* XXX: create an anonymous device for multi device filesystems */
		sb->s_bdev	= bdev;
		sb->s_dev	= bdev->bd_dev;
		percpu_ref_put(&ca->io_ref);
		break;
	}

#ifdef CONFIG_BCACHEFS_POSIX_ACL
	if (c->opts.acl)
		sb->s_flags	|= SB_POSIXACL;
#endif

	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
	if (IS_ERR(vinode)) {
		bch_err(c, "error mounting: error getting root inode %i",
			(int) PTR_ERR(vinode));
		ret = PTR_ERR(vinode);
		goto err_put_super;
	}

	sb->s_root = d_make_root(vinode);
	if (!sb->s_root) {
		bch_err(c, "error mounting: error allocating root dentry");
		ret = -ENOMEM;
		goto err_put_super;
	}

	sb->s_flags |= SB_ACTIVE;
out:
	return dget(sb->s_root);

err_put_super:
	deactivate_locked_super(sb);
	return ERR_PTR(ret);
}
1626
/* ->kill_sb: tear down the superblock, then free the filesystem */
static void bch2_kill_sb(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	generic_shutdown_super(sb);
	bch2_fs_free(c);
}
1634
/* filesystem type registration */
static struct file_system_type bcache_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "bcachefs",
	.mount		= bch2_mount,
	.kill_sb	= bch2_kill_sb,
	.fs_flags	= FS_REQUIRES_DEV,
};

MODULE_ALIAS_FS("bcachefs");
1644
1645 void bch2_vfs_exit(void)
1646 {
1647         unregister_filesystem(&bcache_fs_type);
1648         if (bch2_inode_cache)
1649                 kmem_cache_destroy(bch2_inode_cache);
1650 }
1651
1652 int __init bch2_vfs_init(void)
1653 {
1654         int ret = -ENOMEM;
1655
1656         bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
1657         if (!bch2_inode_cache)
1658                 goto err;
1659
1660         ret = register_filesystem(&bcache_fs_type);
1661         if (ret)
1662                 goto err;
1663
1664         return 0;
1665 err:
1666         bch2_vfs_exit();
1667         return ret;
1668 }
1669
1670 #endif /* NO_BCACHEFS_FS */