bcachefs: Initial commit
author     Kent Overstreet <kent.overstreet@gmail.com>
           Fri, 17 Mar 2017 06:18:50 +0000 (22:18 -0800)
committer  Kent Overstreet <kent.overstreet@linux.dev>
           Sun, 22 Oct 2023 21:08:07 +0000 (17:08 -0400)
Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write
filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
122 files changed:
fs/Kconfig
fs/Makefile
fs/bcachefs/Kconfig [new file with mode: 0644]
fs/bcachefs/Makefile [new file with mode: 0644]
fs/bcachefs/acl.c [new file with mode: 0644]
fs/bcachefs/acl.h [new file with mode: 0644]
fs/bcachefs/alloc.c [new file with mode: 0644]
fs/bcachefs/alloc.h [new file with mode: 0644]
fs/bcachefs/alloc_types.h [new file with mode: 0644]
fs/bcachefs/bcachefs.h [new file with mode: 0644]
fs/bcachefs/bcachefs_format.h [new file with mode: 0644]
fs/bcachefs/bcachefs_ioctl.h [new file with mode: 0644]
fs/bcachefs/bkey.c [new file with mode: 0644]
fs/bcachefs/bkey.h [new file with mode: 0644]
fs/bcachefs/bkey_methods.c [new file with mode: 0644]
fs/bcachefs/bkey_methods.h [new file with mode: 0644]
fs/bcachefs/bset.c [new file with mode: 0644]
fs/bcachefs/bset.h [new file with mode: 0644]
fs/bcachefs/btree_cache.c [new file with mode: 0644]
fs/bcachefs/btree_cache.h [new file with mode: 0644]
fs/bcachefs/btree_gc.c [new file with mode: 0644]
fs/bcachefs/btree_gc.h [new file with mode: 0644]
fs/bcachefs/btree_io.c [new file with mode: 0644]
fs/bcachefs/btree_io.h [new file with mode: 0644]
fs/bcachefs/btree_iter.c [new file with mode: 0644]
fs/bcachefs/btree_iter.h [new file with mode: 0644]
fs/bcachefs/btree_locking.h [new file with mode: 0644]
fs/bcachefs/btree_types.h [new file with mode: 0644]
fs/bcachefs/btree_update.h [new file with mode: 0644]
fs/bcachefs/btree_update_interior.c [new file with mode: 0644]
fs/bcachefs/btree_update_interior.h [new file with mode: 0644]
fs/bcachefs/btree_update_leaf.c [new file with mode: 0644]
fs/bcachefs/buckets.c [new file with mode: 0644]
fs/bcachefs/buckets.h [new file with mode: 0644]
fs/bcachefs/buckets_types.h [new file with mode: 0644]
fs/bcachefs/chardev.c [new file with mode: 0644]
fs/bcachefs/chardev.h [new file with mode: 0644]
fs/bcachefs/checksum.c [new file with mode: 0644]
fs/bcachefs/checksum.h [new file with mode: 0644]
fs/bcachefs/clock.c [new file with mode: 0644]
fs/bcachefs/clock.h [new file with mode: 0644]
fs/bcachefs/clock_types.h [new file with mode: 0644]
fs/bcachefs/compress.c [new file with mode: 0644]
fs/bcachefs/compress.h [new file with mode: 0644]
fs/bcachefs/debug.c [new file with mode: 0644]
fs/bcachefs/debug.h [new file with mode: 0644]
fs/bcachefs/dirent.c [new file with mode: 0644]
fs/bcachefs/dirent.h [new file with mode: 0644]
fs/bcachefs/disk_groups.c [new file with mode: 0644]
fs/bcachefs/disk_groups.h [new file with mode: 0644]
fs/bcachefs/error.c [new file with mode: 0644]
fs/bcachefs/error.h [new file with mode: 0644]
fs/bcachefs/extents.c [new file with mode: 0644]
fs/bcachefs/extents.h [new file with mode: 0644]
fs/bcachefs/extents_types.h [new file with mode: 0644]
fs/bcachefs/eytzinger.h [new file with mode: 0644]
fs/bcachefs/fifo.h [new file with mode: 0644]
fs/bcachefs/fs-io.c [new file with mode: 0644]
fs/bcachefs/fs-io.h [new file with mode: 0644]
fs/bcachefs/fs-ioctl.c [new file with mode: 0644]
fs/bcachefs/fs-ioctl.h [new file with mode: 0644]
fs/bcachefs/fs.c [new file with mode: 0644]
fs/bcachefs/fs.h [new file with mode: 0644]
fs/bcachefs/fsck.c [new file with mode: 0644]
fs/bcachefs/fsck.h [new file with mode: 0644]
fs/bcachefs/inode.c [new file with mode: 0644]
fs/bcachefs/inode.h [new file with mode: 0644]
fs/bcachefs/io.c [new file with mode: 0644]
fs/bcachefs/io.h [new file with mode: 0644]
fs/bcachefs/io_types.h [new file with mode: 0644]
fs/bcachefs/journal.c [new file with mode: 0644]
fs/bcachefs/journal.h [new file with mode: 0644]
fs/bcachefs/journal_io.c [new file with mode: 0644]
fs/bcachefs/journal_io.h [new file with mode: 0644]
fs/bcachefs/journal_reclaim.c [new file with mode: 0644]
fs/bcachefs/journal_reclaim.h [new file with mode: 0644]
fs/bcachefs/journal_seq_blacklist.c [new file with mode: 0644]
fs/bcachefs/journal_seq_blacklist.h [new file with mode: 0644]
fs/bcachefs/journal_types.h [new file with mode: 0644]
fs/bcachefs/keylist.c [new file with mode: 0644]
fs/bcachefs/keylist.h [new file with mode: 0644]
fs/bcachefs/keylist_types.h [new file with mode: 0644]
fs/bcachefs/migrate.c [new file with mode: 0644]
fs/bcachefs/migrate.h [new file with mode: 0644]
fs/bcachefs/move.c [new file with mode: 0644]
fs/bcachefs/move.h [new file with mode: 0644]
fs/bcachefs/move_types.h [new file with mode: 0644]
fs/bcachefs/movinggc.c [new file with mode: 0644]
fs/bcachefs/movinggc.h [new file with mode: 0644]
fs/bcachefs/opts.c [new file with mode: 0644]
fs/bcachefs/opts.h [new file with mode: 0644]
fs/bcachefs/quota.c [new file with mode: 0644]
fs/bcachefs/quota.h [new file with mode: 0644]
fs/bcachefs/quota_types.h [new file with mode: 0644]
fs/bcachefs/rebalance.c [new file with mode: 0644]
fs/bcachefs/rebalance.h [new file with mode: 0644]
fs/bcachefs/rebalance_types.h [new file with mode: 0644]
fs/bcachefs/recovery.c [new file with mode: 0644]
fs/bcachefs/recovery.h [new file with mode: 0644]
fs/bcachefs/replicas.c [new file with mode: 0644]
fs/bcachefs/replicas.h [new file with mode: 0644]
fs/bcachefs/siphash.c [new file with mode: 0644]
fs/bcachefs/siphash.h [new file with mode: 0644]
fs/bcachefs/six.c [new file with mode: 0644]
fs/bcachefs/six.h [new file with mode: 0644]
fs/bcachefs/str_hash.h [new file with mode: 0644]
fs/bcachefs/super-io.c [new file with mode: 0644]
fs/bcachefs/super-io.h [new file with mode: 0644]
fs/bcachefs/super.c [new file with mode: 0644]
fs/bcachefs/super.h [new file with mode: 0644]
fs/bcachefs/super_types.h [new file with mode: 0644]
fs/bcachefs/sysfs.c [new file with mode: 0644]
fs/bcachefs/sysfs.h [new file with mode: 0644]
fs/bcachefs/tests.c [new file with mode: 0644]
fs/bcachefs/tests.h [new file with mode: 0644]
fs/bcachefs/trace.c [new file with mode: 0644]
fs/bcachefs/trace.h [new file with mode: 0644]
fs/bcachefs/util.c [new file with mode: 0644]
fs/bcachefs/util.h [new file with mode: 0644]
fs/bcachefs/vstructs.h [new file with mode: 0644]
fs/bcachefs/xattr.c [new file with mode: 0644]
fs/bcachefs/xattr.h [new file with mode: 0644]

diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941cb3e6145d95886b99b83b677a69b..0d6cb927872af1afd919134c8df29405f125a810 100644
@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
+source "fs/bcachefs/Kconfig"
 source "fs/zonefs/Kconfig"
 
 endif # BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index f9541f40be4e08fbdee72f39e8c0c7ef856fb3f6..75522f88e763670ca38010401975209c114fc67e 100644
@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS)              += ocfs2/
 obj-$(CONFIG_BTRFS_FS)         += btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_F2FS_FS)          += f2fs/
+obj-$(CONFIG_BCACHEFS_FS)      += bcachefs/
 obj-$(CONFIG_CEPH_FS)          += ceph/
 obj-$(CONFIG_PSTORE)           += pstore/
 obj-$(CONFIG_EFIVAR_FS)                += efivarfs/
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
new file mode 100644
index 0000000..c13f2cf
--- /dev/null
@@ -0,0 +1,52 @@
+
+config BCACHEFS_FS
+       tristate "bcachefs filesystem support"
+       depends on BLOCK
+       select EXPORTFS
+       select CLOSURES
+       select LIBCRC32C
+       select FS_POSIX_ACL
+       select LZ4_COMPRESS
+       select LZ4_DECOMPRESS
+       select ZLIB_DEFLATE
+       select ZLIB_INFLATE
+       select ZSTD_COMPRESS
+       select ZSTD_DECOMPRESS
+       select CRYPTO_SHA256
+       select CRYPTO_CHACHA20
+       select CRYPTO_POLY1305
+       select KEYS
+       help
+       The bcachefs filesystem - a modern, copy-on-write filesystem with
+       support for multiple devices, compression, checksumming, etc.
+
+config BCACHEFS_QUOTA
+       bool "bcachefs quota support"
+       depends on BCACHEFS_FS
+       select QUOTACTL
+
+config BCACHEFS_POSIX_ACL
+       bool "bcachefs POSIX ACL support"
+       depends on BCACHEFS_FS
+       select FS_POSIX_ACL
+
+config BCACHEFS_DEBUG
+       bool "bcachefs debugging"
+       depends on BCACHEFS_FS
+       help
+       Enables many extra debugging checks and assertions.
+
+       The resulting code will be significantly slower than normal; you
+       probably shouldn't select this option unless you're a developer.
+
+config BCACHEFS_TESTS
+       bool "bcachefs unit and performance tests"
+       depends on BCACHEFS_FS
+       help
+       Include some unit and performance tests for the core btree code
+
+config BCACHEFS_NO_LATENCY_ACCT
+       bool "disable latency accounting and time stats"
+       depends on BCACHEFS_FS
+       help
+       This disables device latency tracking and time stats; intended only for performance testing.
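
These are compile-time switches: a bool option like BCACHEFS_DEBUG is consumed
in C through its CONFIG_ symbol, the same way acl.c below is wrapped in
#ifdef CONFIG_BCACHEFS_POSIX_ACL. A minimal sketch of a debug-only assertion
gated this way, using a hypothetical macro name not defined by this commit:

    /* Sketch only; the macro name is hypothetical. */
    #include <linux/bug.h>

    #ifdef CONFIG_BCACHEFS_DEBUG
    /* Expensive invariant checks, compiled in only for debug builds: */
    #define SKETCH_DEBUG_BUG_ON(cond)  BUG_ON(cond)
    #else
    #define SKETCH_DEBUG_BUG_ON(cond)  do {} while (0)
    #endif
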
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
new file mode 100644
index 0000000..13cd6d2
--- /dev/null
@@ -0,0 +1,53 @@
+
+obj-$(CONFIG_BCACHEFS_FS)      += bcachefs.o
+
+bcachefs-y             :=      \
+       acl.o                   \
+       alloc.o                 \
+       bkey.o                  \
+       bkey_methods.o          \
+       bset.o                  \
+       btree_cache.o           \
+       btree_gc.o              \
+       btree_io.o              \
+       btree_iter.o            \
+       btree_update_interior.o \
+       btree_update_leaf.o     \
+       buckets.o               \
+       chardev.o               \
+       checksum.o              \
+       clock.o                 \
+       compress.o              \
+       debug.o                 \
+       dirent.o                \
+       disk_groups.o           \
+       error.o                 \
+       extents.o               \
+       fs.o                    \
+       fs-ioctl.o              \
+       fs-io.o                 \
+       fsck.o                  \
+       inode.o                 \
+       io.o                    \
+       journal.o               \
+       journal_io.o            \
+       journal_reclaim.o       \
+       journal_seq_blacklist.o \
+       keylist.o               \
+       migrate.o               \
+       move.o                  \
+       movinggc.o              \
+       opts.o                  \
+       quota.o                 \
+       rebalance.o             \
+       recovery.o              \
+       replicas.o              \
+       siphash.o               \
+       six.o                   \
+       super.o                 \
+       super-io.o              \
+       sysfs.o                 \
+       tests.o                 \
+       trace.o                 \
+       util.o                  \
+       xattr.o
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
new file mode 100644
index 0000000..eaf5c8e
--- /dev/null
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "bcachefs.h"
+
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "acl.h"
+#include "fs.h"
+#include "xattr.h"
+
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{
+       return sizeof(bch_acl_header) +
+               sizeof(bch_acl_entry_short) * nr_short +
+               sizeof(bch_acl_entry) * nr_long;
+}
+
+static inline int acl_to_xattr_type(int type)
+{
+       switch (type) {
+       case ACL_TYPE_ACCESS:
+               return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+       case ACL_TYPE_DEFAULT:
+               return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
+{
+       const void *p, *end = value + size;
+       struct posix_acl *acl;
+       struct posix_acl_entry *out;
+       unsigned count = 0;
+
+       if (!value)
+               return NULL;
+       if (size < sizeof(bch_acl_header))
+               goto invalid;
+       if (((bch_acl_header *)value)->a_version !=
+           cpu_to_le32(BCH_ACL_VERSION))
+               goto invalid;
+
+       p = value + sizeof(bch_acl_header);
+       while (p < end) {
+               const bch_acl_entry *entry = p;
+
+               if (p + sizeof(bch_acl_entry_short) > end)
+                       goto invalid;
+
+               switch (le16_to_cpu(entry->e_tag)) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       p += sizeof(bch_acl_entry_short);
+                       break;
+               case ACL_USER:
+               case ACL_GROUP:
+                       p += sizeof(bch_acl_entry);
+                       break;
+               default:
+                       goto invalid;
+               }
+
+               count++;
+       }
+
+       if (p > end)
+               goto invalid;
+
+       if (!count)
+               return NULL;
+
+       acl = posix_acl_alloc(count, GFP_KERNEL);
+       if (!acl)
+               return ERR_PTR(-ENOMEM);
+
+       out = acl->a_entries;
+
+       p = value + sizeof(bch_acl_header);
+       while (p < end) {
+               const bch_acl_entry *in = p;
+
+               out->e_tag  = le16_to_cpu(in->e_tag);
+               out->e_perm = le16_to_cpu(in->e_perm);
+
+               switch (out->e_tag) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       p += sizeof(bch_acl_entry_short);
+                       break;
+               case ACL_USER:
+                       out->e_uid = make_kuid(&init_user_ns,
+                                              le32_to_cpu(in->e_id));
+                       p += sizeof(bch_acl_entry);
+                       break;
+               case ACL_GROUP:
+                       out->e_gid = make_kgid(&init_user_ns,
+                                              le32_to_cpu(in->e_id));
+                       p += sizeof(bch_acl_entry);
+                       break;
+               }
+
+               out++;
+       }
+
+       BUG_ON(out != acl->a_entries + acl->a_count);
+
+       return acl;
+invalid:
+       pr_err("invalid acl entry");
+       return ERR_PTR(-EINVAL);
+}
+
+#define acl_for_each_entry(acl, acl_e)                 \
+       for (acl_e = acl->a_entries;                    \
+            acl_e < acl->a_entries + acl->a_count;     \
+            acl_e++)
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+                 const struct posix_acl *acl,
+                 int type)
+{
+       struct bkey_i_xattr *xattr;
+       bch_acl_header *acl_header;
+       const struct posix_acl_entry *acl_e;
+       void *outptr;
+       unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+
+       acl_for_each_entry(acl, acl_e) {
+               switch (acl_e->e_tag) {
+               case ACL_USER:
+               case ACL_GROUP:
+                       nr_long++;
+                       break;
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       nr_short++;
+                       break;
+               default:
+                       return ERR_PTR(-EINVAL);
+               }
+       }
+
+       acl_len = bch2_acl_size(nr_short, nr_long);
+       u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+       if (u64s > U8_MAX)
+               return ERR_PTR(-E2BIG);
+
+       xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+       if (IS_ERR(xattr))
+               return xattr;
+
+       bkey_xattr_init(&xattr->k_i);
+       xattr->k.u64s           = u64s;
+       xattr->v.x_type         = acl_to_xattr_type(type);
+       xattr->v.x_name_len     = 0;
+       xattr->v.x_val_len      = cpu_to_le16(acl_len);
+
+       acl_header = xattr_val(&xattr->v);
+       acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+       outptr = (void *) acl_header + sizeof(*acl_header);
+
+       acl_for_each_entry(acl, acl_e) {
+               bch_acl_entry *entry = outptr;
+
+               entry->e_tag = cpu_to_le16(acl_e->e_tag);
+               entry->e_perm = cpu_to_le16(acl_e->e_perm);
+               switch (acl_e->e_tag) {
+               case ACL_USER:
+                       entry->e_id = cpu_to_le32(
+                               from_kuid(&init_user_ns, acl_e->e_uid));
+                       outptr += sizeof(bch_acl_entry);
+                       break;
+               case ACL_GROUP:
+                       entry->e_id = cpu_to_le32(
+                               from_kgid(&init_user_ns, acl_e->e_gid));
+                       outptr += sizeof(bch_acl_entry);
+                       break;
+
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       outptr += sizeof(bch_acl_entry_short);
+                       break;
+               }
+       }
+
+       BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+
+       return xattr;
+}
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
+                              struct dentry *dentry, int type)
+{
+       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c_xattr xattr;
+       struct posix_acl *acl = NULL;
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+                       &inode->ei_str_hash, inode->v.i_ino,
+                       &X_SEARCH(acl_to_xattr_type(type), "", 0),
+                       0);
+       if (IS_ERR(iter)) {
+               if (PTR_ERR(iter) == -EINTR)
+                       goto retry;
+
+               if (PTR_ERR(iter) != -ENOENT)
+                       acl = ERR_CAST(iter);
+               goto out;
+       }
+
+       xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+       acl = bch2_acl_from_disk(xattr_val(xattr.v),
+                       le16_to_cpu(xattr.v->x_val_len));
+
+       if (!IS_ERR(acl))
+               set_cached_acl(&inode->v, type, acl);
+out:
+       bch2_trans_exit(&trans);
+       return acl;
+}
+
+int bch2_set_acl_trans(struct btree_trans *trans,
+                      struct bch_inode_unpacked *inode_u,
+                      const struct bch_hash_info *hash_info,
+                      struct posix_acl *acl, int type)
+{
+       int ret;
+
+       if (type == ACL_TYPE_DEFAULT &&
+           !S_ISDIR(inode_u->bi_mode))
+               return acl ? -EACCES : 0;
+
+       if (acl) {
+               struct bkey_i_xattr *xattr =
+                       bch2_acl_to_xattr(trans, acl, type);
+               if (IS_ERR(xattr))
+                       return PTR_ERR(xattr);
+
+               ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+                                     inode_u->bi_inum, &xattr->k_i, 0);
+       } else {
+               struct xattr_search_key search =
+                       X_SEARCH(acl_to_xattr_type(type), "", 0);
+
+               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
+                                      inode_u->bi_inum, &search);
+       }
+
+       return ret == -ENOENT ? 0 : ret;
+}
+
+static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
+                                      struct bch_inode_unpacked *bi,
+                                      void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct timespec64 now = current_time(&inode->v);
+       umode_t mode = (unsigned long) p;
+
+       bi->bi_ctime    = timespec_to_bch2_time(c, now);
+       bi->bi_mode     = mode;
+       return 0;
+}
+
+int bch2_set_acl(struct mnt_idmap *idmap,
+                struct dentry *dentry,
+                struct posix_acl *acl, int type)
+{
+       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       umode_t mode = inode->v.i_mode;
+       int ret;
+
+       if (type == ACL_TYPE_ACCESS && acl) {
+               ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
+               if (ret)
+                       return ret;
+       }
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = bch2_set_acl_trans(&trans,
+                                  &inode->ei_inode,
+                                  &inode->ei_str_hash,
+                                  acl, type) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_set_acl_fn,
+                                      (void *)(unsigned long) mode) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err;
+
+       bch2_inode_update_after_write(c, inode, &inode_u,
+                                     ATTR_CTIME|ATTR_MODE);
+
+       set_cached_acl(&inode->v, type, acl);
+err:
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
+int bch2_acl_chmod(struct btree_trans *trans,
+                  struct bch_inode_info *inode,
+                  umode_t mode,
+                  struct posix_acl **new_acl)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c_xattr xattr;
+       struct bkey_i_xattr *new;
+       struct posix_acl *acl;
+       int ret = 0;
+
+       iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+                       &inode->ei_str_hash, inode->v.i_ino,
+                       &X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+                       BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+
+       xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+       acl = bch2_acl_from_disk(xattr_val(xattr.v),
+                       le16_to_cpu(xattr.v->x_val_len));
+       if (IS_ERR_OR_NULL(acl))
+               return PTR_ERR(acl);
+
+       ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+       if (ret)
+               goto err;
+
+       new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+       if (IS_ERR(new)) {
+               ret = PTR_ERR(new);
+               goto err;
+       }
+
+       bch2_trans_update(trans, iter, &new->k_i, 0);
+       *new_acl = acl;
+       acl = NULL;
+err:
+       kfree(acl);
+       return ret;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
new file mode 100644
index 0000000..73739e3
--- /dev/null
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ACL_H
+#define _BCACHEFS_ACL_H
+
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#define BCH_ACL_VERSION        0x0001
+
+typedef struct {
+       __le16          e_tag;
+       __le16          e_perm;
+       __le32          e_id;
+} bch_acl_entry;
+
+typedef struct {
+       __le16          e_tag;
+       __le16          e_perm;
+} bch_acl_entry_short;
+
+typedef struct {
+       __le32          a_version;
+} bch_acl_header;
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+
+int bch2_set_acl_trans(struct btree_trans *,
+                      struct bch_inode_unpacked *,
+                      const struct bch_hash_info *,
+                      struct posix_acl *, int);
+int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+                  umode_t, struct posix_acl **);
+
+#else
+
+static inline int bch2_set_acl_trans(struct btree_trans *trans,
+                                    struct bch_inode_unpacked *inode_u,
+                                    const struct bch_hash_info *hash_info,
+                                    struct posix_acl *acl, int type)
+{
+       return 0;
+}
+
+static inline int bch2_acl_chmod(struct btree_trans *trans,
+                                struct bch_inode_info *inode,
+                                umode_t mode,
+                                struct posix_acl **new_acl)
+{
+       return 0;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+
+#endif /* _BCACHEFS_ACL_H */
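
The on-disk ACL value built by acl.c above is a bch_acl_header followed by a
mix of short entries (ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK, ACL_OTHER) and
long entries (ACL_USER and ACL_GROUP, which also carry an id). A standalone
sketch of the size calculation from bch2_acl_size(), with the structs above
mirrored using fixed-width userspace types in place of __le16/__le32:

    /* Standalone sketch of the on-disk ACL sizing; for illustration only. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
            uint16_t        e_tag;
            uint16_t        e_perm;
            uint32_t        e_id;
    } acl_entry;            /* mirrors bch_acl_entry (ACL_USER / ACL_GROUP) */

    typedef struct {
            uint16_t        e_tag;
            uint16_t        e_perm;
    } acl_entry_short;      /* mirrors bch_acl_entry_short */

    typedef struct {
            uint32_t        a_version;
    } acl_header;           /* mirrors bch_acl_header */

    static size_t acl_size(unsigned nr_short, unsigned nr_long)
    {
            return sizeof(acl_header) +
                    sizeof(acl_entry_short) * nr_short +
                    sizeof(acl_entry) * nr_long;
    }

    int main(void)
    {
            /* USER_OBJ, GROUP_OBJ, MASK, OTHER plus two named users/groups: */
            printf("%zu bytes\n", acl_size(4, 2)); /* 4 + 4*4 + 2*8 = 36 */
            return 0;
    }
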
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
new file mode 100644
index 0000000..e6e506e
--- /dev/null
@@ -0,0 +1,2205 @@
+/*
+ * Primary bucket allocation code
+ *
+ * Copyright 2012 Google, Inc.
+ *
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens that are important
+ * but the code is named as if it's the priorities) are written in an arbitrary
+ * list of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write(), and when prio_write() finishes we pull buckets off the
+ * free_inc list and optionally discard them.
+ *
+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * It's important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyway - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * bch2_bucket_alloc() allocates a single bucket from a specific device.
+ *
+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices
+ * in a given filesystem.
+ *
+ * invalidate_buckets() drives all the processes described above. It's called
+ * from bch2_bucket_alloc() and a few other places that need to make sure free
+ * buckets are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
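
/*
 * A minimal sketch of the gen check described above (illustrative only, not
 * part of this commit; the names are hypothetical). A pointer records the
 * bucket's 8-bit gen at the time it's created; bumping the bucket's gen
 * invalidates every older pointer. The comparison is done in u8 so that
 * wraparound is handled, which is why the code below bounds how far a
 * bucket's gen is allowed to run ahead (see BUCKET_GC_GEN_MAX).
 */
static inline u8 sketch_gen_after(u8 bucket_gen, u8 ptr_gen)
{
        return (u8) (bucket_gen - ptr_gen);     /* 0 means the pointer is current */
}

static inline bool sketch_ptr_stale(u8 bucket_gen, u8 ptr_gen)
{
        return sketch_gen_after(bucket_gen, ptr_gen) != 0;
}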
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+
+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
+
+/* Ratelimiting/PD controllers */
+
+static void pd_controllers_update(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(to_delayed_work(work),
+                                          struct bch_fs,
+                                          pd_controllers_update);
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+
+               u64 free = bucket_to_sector(ca,
+                               __dev_buckets_free(ca, stats)) << 9;
+               /*
+                * Bytes of internal fragmentation, which can be
+                * reclaimed by copy GC
+                */
+               s64 fragmented = (bucket_to_sector(ca,
+                                       stats.buckets[BCH_DATA_USER] +
+                                       stats.buckets[BCH_DATA_CACHED]) -
+                                 (stats.sectors[BCH_DATA_USER] +
+                                  stats.sectors[BCH_DATA_CACHED])) << 9;
+
+               fragmented = max(0LL, fragmented);
+
+               bch2_pd_controller_update(&ca->copygc_pd,
+                                        free, fragmented, -1);
+       }
+
+       schedule_delayed_work(&c->pd_controllers_update,
+                             c->pd_controllers_update_seconds * HZ);
+}
+
+/* Persistent alloc info: */
+
+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
+{
+       unsigned bytes = offsetof(struct bch_alloc, data);
+
+       if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+               bytes += 2;
+       if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+               bytes += 2;
+
+       return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (k.k->p.inode >= c->sb.nr_devices ||
+           !c->devs[k.k->p.inode])
+               return "invalid device";
+
+       switch (k.k->type) {
+       case BCH_ALLOC: {
+               struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+               if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
+                       return "incorrect value size";
+               break;
+       }
+       default:
+               return "invalid type";
+       }
+
+       return NULL;
+}
+
+void bch2_alloc_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
+{
+       buf[0] = '\0';
+
+       switch (k.k->type) {
+       case BCH_ALLOC:
+               break;
+       }
+}
+
+static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
+{
+       unsigned v;
+
+       switch (bytes) {
+       case 1:
+               v = **p;
+               break;
+       case 2:
+               v = le16_to_cpup((void *) *p);
+               break;
+       case 4:
+               v = le32_to_cpup((void *) *p);
+               break;
+       default:
+               BUG();
+       }
+
+       *p += bytes;
+       return v;
+}
+
+static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v)
+{
+       switch (bytes) {
+       case 1:
+               **p = v;
+               break;
+       case 2:
+               *((__le16 *) *p) = cpu_to_le16(v);
+               break;
+       case 4:
+               *((__le32 *) *p) = cpu_to_le32(v);
+               break;
+       default:
+               BUG();
+       }
+
+       *p += bytes;
+}
+
+static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bch_dev *ca;
+       struct bkey_s_c_alloc a;
+       struct bucket_mark new;
+       struct bucket *g;
+       const u8 *d;
+
+       if (k.k->type != BCH_ALLOC)
+               return;
+
+       a = bkey_s_c_to_alloc(k);
+       ca = bch_dev_bkey_exists(c, a.k->p.inode);
+
+       if (a.k->p.offset >= ca->mi.nbuckets)
+               return;
+
+       percpu_down_read(&c->usage_lock);
+
+       g = bucket(ca, a.k->p.offset);
+       bucket_cmpxchg(g, new, ({
+               new.gen = a.v->gen;
+               new.gen_valid = 1;
+       }));
+
+       d = a.v->data;
+       if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+               g->io_time[READ] = get_alloc_field(&d, 2);
+       if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+               g->io_time[WRITE] = get_alloc_field(&d, 2);
+
+       percpu_up_read(&c->usage_lock);
+}
+
+int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+       struct journal_replay *r;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_dev *ca;
+       unsigned i;
+       int ret;
+
+       for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
+               bch2_alloc_read_key(c, k);
+               bch2_btree_iter_cond_resched(&iter);
+       }
+
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               return ret;
+
+       list_for_each_entry(r, journal_replay_list, list) {
+               struct bkey_i *k, *n;
+               struct jset_entry *entry;
+
+               for_each_jset_key(k, n, entry, &r->j)
+                       if (entry->btree_id == BTREE_ID_ALLOC)
+                               bch2_alloc_read_key(c, bkey_i_to_s_c(k));
+       }
+
+       mutex_lock(&c->bucket_clock[READ].lock);
+       for_each_member_device(ca, c, i) {
+               down_read(&ca->bucket_lock);
+               bch2_recalc_oldest_io(c, ca, READ);
+               up_read(&ca->bucket_lock);
+       }
+       mutex_unlock(&c->bucket_clock[READ].lock);
+
+       mutex_lock(&c->bucket_clock[WRITE].lock);
+       for_each_member_device(ca, c, i) {
+               down_read(&ca->bucket_lock);
+               bch2_recalc_oldest_io(c, ca, WRITE);
+               up_read(&ca->bucket_lock);
+       }
+       mutex_unlock(&c->bucket_clock[WRITE].lock);
+
+       return 0;
+}
+
+static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
+                                 size_t b, struct btree_iter *iter,
+                                 u64 *journal_seq, bool nowait)
+{
+       struct bucket_mark m;
+       __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
+       struct bucket *g;
+       struct bkey_i_alloc *a;
+       u8 *d;
+       int ret;
+       unsigned flags = BTREE_INSERT_ATOMIC|
+               BTREE_INSERT_NOFAIL|
+               BTREE_INSERT_USE_RESERVE|
+               BTREE_INSERT_USE_ALLOC_RESERVE;
+
+       if (nowait)
+               flags |= BTREE_INSERT_NOWAIT;
+
+       bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
+
+       do {
+               ret = btree_iter_err(bch2_btree_iter_peek_slot(iter));
+               if (ret)
+                       break;
+
+               percpu_down_read(&c->usage_lock);
+               g = bucket(ca, b);
+
+               /* read mark under btree node lock: */
+               m = READ_ONCE(g->mark);
+               a = bkey_alloc_init(&alloc_key.k);
+               a->k.p          = iter->pos;
+               a->v.fields     = 0;
+               a->v.gen        = m.gen;
+               set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
+
+               d = a->v.data;
+               if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+                       put_alloc_field(&d, 2, g->io_time[READ]);
+               if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+                       put_alloc_field(&d, 2, g->io_time[WRITE]);
+               percpu_up_read(&c->usage_lock);
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags,
+                                          BTREE_INSERT_ENTRY(iter, &a->k_i));
+               bch2_btree_iter_cond_resched(iter);
+       } while (ret == -EINTR);
+
+       return ret;
+}
+
+int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
+{
+       struct bch_dev *ca;
+       struct btree_iter iter;
+       int ret;
+
+       if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+               return 0;
+
+       ca = bch_dev_bkey_exists(c, pos.inode);
+
+       if (pos.offset >= ca->mi.nbuckets)
+               return 0;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+       ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter,
+                                    NULL, false);
+       bch2_btree_iter_unlock(&iter);
+       return ret;
+}
+
+int bch2_alloc_write(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       int ret = 0;
+
+       for_each_rw_member(ca, c, i) {
+               struct btree_iter iter;
+               unsigned long bucket;
+
+               bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+               down_read(&ca->bucket_lock);
+               for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
+                       ret = __bch2_alloc_write_key(c, ca, bucket, &iter,
+                                                    NULL, false);
+                       if (ret)
+                               break;
+
+                       clear_bit(bucket, ca->buckets_dirty);
+               }
+               up_read(&ca->bucket_lock);
+               bch2_btree_iter_unlock(&iter);
+
+               if (ret) {
+                       percpu_ref_put(&ca->io_ref);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+/* Bucket IO clocks: */
+
+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
+{
+       struct bucket_clock *clock = &c->bucket_clock[rw];
+       struct bucket_array *buckets = bucket_array(ca);
+       struct bucket *g;
+       u16 max_last_io = 0;
+       unsigned i;
+
+       lockdep_assert_held(&c->bucket_clock[rw].lock);
+
+       /* Recalculate max_last_io for this device: */
+       for_each_bucket(g, buckets)
+               max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
+
+       ca->max_last_bucket_io[rw] = max_last_io;
+
+       /* Recalculate global max_last_io: */
+       max_last_io = 0;
+
+       for_each_member_device(ca, c, i)
+               max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
+
+       clock->max_last_io = max_last_io;
+}
+
+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
+{
+       struct bucket_clock *clock = &c->bucket_clock[rw];
+       struct bucket_array *buckets;
+       struct bch_dev *ca;
+       struct bucket *g;
+       unsigned i;
+
+       trace_rescale_prios(c);
+
+       for_each_member_device(ca, c, i) {
+               down_read(&ca->bucket_lock);
+               buckets = bucket_array(ca);
+
+               for_each_bucket(g, buckets)
+                       g->io_time[rw] = clock->hand -
+                       bucket_last_io(c, g, rw) / 2;
+
+               bch2_recalc_oldest_io(c, ca, rw);
+
+               up_read(&ca->bucket_lock);
+       }
+}
+
+static void bch2_inc_clock_hand(struct io_timer *timer)
+{
+       struct bucket_clock *clock = container_of(timer,
+                                               struct bucket_clock, rescale);
+       struct bch_fs *c = container_of(clock,
+                                       struct bch_fs, bucket_clock[clock->rw]);
+       struct bch_dev *ca;
+       u64 capacity;
+       unsigned i;
+
+       mutex_lock(&clock->lock);
+
+       /* if clock cannot be advanced more, rescale prio */
+       if (clock->max_last_io >= U16_MAX - 2)
+               bch2_rescale_bucket_io_times(c, clock->rw);
+
+       BUG_ON(clock->max_last_io >= U16_MAX - 2);
+
+       for_each_member_device(ca, c, i)
+               ca->max_last_bucket_io[clock->rw]++;
+       clock->max_last_io++;
+       clock->hand++;
+
+       mutex_unlock(&clock->lock);
+
+       capacity = READ_ONCE(c->capacity);
+
+       if (!capacity)
+               return;
+
+       /*
+        * we only increment when 0.1% of the filesystem capacity has been read
+        * or written to; this determines if it's time
+        *
+        * XXX: we shouldn't really be going off of the capacity of devices in
+        * RW mode (that will be 0 when we're RO, yet we can still service
+        * reads)
+        */
+       timer->expire += capacity >> 10;
+
+       bch2_io_timer_add(&c->io_clock[clock->rw], timer);
+}
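
/*
 * Worked example for the comment above (illustrative, not part of this
 * commit), assuming capacity is in 512-byte sectors as elsewhere in this
 * file: a 1 TB filesystem is roughly 2 * 10^9 sectors, so capacity >> 10 is
 * about 2 million sectors - the clock hand advances once per ~1 GB of IO,
 * i.e. once per ~0.1% of the filesystem's capacity.
 */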
+
+static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
+{
+       struct bucket_clock *clock = &c->bucket_clock[rw];
+
+       clock->hand             = 1;
+       clock->rw               = rw;
+       clock->rescale.fn       = bch2_inc_clock_hand;
+       clock->rescale.expire   = c->capacity >> 10;
+       mutex_init(&clock->lock);
+}
+
+/* Background allocator thread: */
+
+/*
+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
+ * (marking them as invalidated on disk), then optionally issues discard
+ * commands to the newly free buckets, then puts them on the various freelists.
+ */
+
+static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
+                                  size_t bucket)
+{
+       if (expensive_debug_checks(c) &&
+           test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) {
+               size_t iter;
+               long i;
+               unsigned j;
+
+               for (j = 0; j < RESERVE_NR; j++)
+                       fifo_for_each_entry(i, &ca->free[j], iter)
+                               BUG_ON(i == bucket);
+               fifo_for_each_entry(i, &ca->free_inc, iter)
+                       BUG_ON(i == bucket);
+       }
+}
+
+#define BUCKET_GC_GEN_MAX      96U
+
+/**
+ * wait_buckets_available - wait on reclaimable buckets
+ *
+ * If there aren't enough available buckets to fill up free_inc, wait until
+ * there are.
+ */
+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
+{
+       unsigned long gc_count = c->gc_count;
+       int ret = 0;
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop()) {
+                       ret = 1;
+                       break;
+               }
+
+               if (gc_count != c->gc_count)
+                       ca->inc_gen_really_needs_gc = 0;
+
+               if ((ssize_t) (dev_buckets_available(c, ca) -
+                              ca->inc_gen_really_needs_gc) >=
+                   (ssize_t) fifo_free(&ca->free_inc))
+                       break;
+
+               up_read(&c->gc_lock);
+               schedule();
+               try_to_freeze();
+               down_read(&c->gc_lock);
+       }
+
+       __set_current_state(TASK_RUNNING);
+       return ret;
+}
+
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
+                                      size_t bucket,
+                                      struct bucket_mark mark)
+{
+       u8 gc_gen;
+
+       if (!is_available_bucket(mark))
+               return false;
+
+       gc_gen = bucket_gc_gen(ca, bucket);
+
+       if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
+               ca->inc_gen_needs_gc++;
+
+       if (gc_gen >= BUCKET_GC_GEN_MAX)
+               ca->inc_gen_really_needs_gc++;
+
+       return gc_gen < BUCKET_GC_GEN_MAX;
+}
+
+static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                      size_t bucket)
+{
+       struct bucket_mark m;
+
+       percpu_down_read(&c->usage_lock);
+       spin_lock(&c->freelist_lock);
+
+       if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
+               spin_unlock(&c->freelist_lock);
+               percpu_up_read(&c->usage_lock);
+               return;
+       }
+
+       verify_not_on_freelist(c, ca, bucket);
+       BUG_ON(!fifo_push(&ca->free_inc, bucket));
+
+       spin_unlock(&c->freelist_lock);
+       percpu_up_read(&c->usage_lock);
+
+       /* gc lock held: */
+       bucket_io_clock_reset(c, ca, bucket, READ);
+       bucket_io_clock_reset(c, ca, bucket, WRITE);
+
+       if (m.cached_sectors) {
+               ca->allocator_invalidating_data = true;
+       } else if (m.journal_seq_valid) {
+               u64 journal_seq = atomic64_read(&c->journal.seq);
+               u64 bucket_seq  = journal_seq;
+
+               bucket_seq &= ~((u64) U16_MAX);
+               bucket_seq |= m.journal_seq;
+
+               if (bucket_seq > journal_seq)
+                       bucket_seq -= 1 << 16;
+
+               ca->allocator_journal_seq_flush =
+                       max(ca->allocator_journal_seq_flush, bucket_seq);
+       }
+}
+
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_sort_key()
+ * first.
+ *
+ *
+ * - We take into account the read prio of the bucket, which gives us an
+ *   indication of how hot the data is -- we scale the prio so that the prio
+ *   farthest from the clock is worth 1/8th of the closest.
+ *
+ * - The number of sectors of cached data in the bucket, which gives us an
+ *   indication of the cost in cache misses this eviction will cause.
+ *
+ * - If hotness * sectors used compares equal, we pick the bucket with the
+ *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
+ *   number repeatedly forces us to run mark and sweep gc to avoid generation
+ *   number wraparound.
+ */
+
+static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
+                                    size_t b, struct bucket_mark m)
+{
+       unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
+       unsigned max_last_io = ca->max_last_bucket_io[READ];
+
+       /*
+        * Time since last read, scaled to [0, 8) where larger value indicates
+        * more recently read data:
+        */
+       unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
+
+       /* How much we want to keep the data in this bucket: */
+       unsigned long data_wantness =
+               (hotness + 1) * bucket_sectors_used(m);
+
+       unsigned long needs_journal_commit =
+               bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+
+       return  (data_wantness << 9) |
+               (needs_journal_commit << 8) |
+               bucket_gc_gen(ca, b);
+}
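
/*
 * Worked example (illustrative, not part of this commit): with
 * max_last_io = 1000 and last_io = 250, hotness = (1000 - 250) * 7 / 1000 = 5;
 * with 100 cached sectors, data_wantness = (5 + 1) * 100 = 600. The returned
 * key packs data_wantness above the needs-journal-commit bit and the bucket's
 * gc gen, so cold, sparsely used buckets sort smallest and are reused first,
 * with ties broken in favour of buckets whose gens have been bumped the least.
 */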
+
+static inline int bucket_alloc_cmp(alloc_heap *h,
+                                  struct alloc_heap_entry l,
+                                  struct alloc_heap_entry r)
+{
+       return (l.key > r.key) - (l.key < r.key) ?:
+               (l.nr < r.nr)  - (l.nr  > r.nr) ?:
+               (l.bucket > r.bucket) - (l.bucket < r.bucket);
+}
+
+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bucket_array *buckets;
+       struct alloc_heap_entry e = { 0 };
+       size_t b;
+
+       ca->alloc_heap.used = 0;
+
+       mutex_lock(&c->bucket_clock[READ].lock);
+       down_read(&ca->bucket_lock);
+
+       buckets = bucket_array(ca);
+
+       bch2_recalc_oldest_io(c, ca, READ);
+
+       /*
+        * Find buckets with lowest read priority, by building a maxheap sorted
+        * by read priority and repeatedly replacing the maximum element until
+        * all buckets have been visited.
+        */
+       for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
+               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+               unsigned long key = bucket_sort_key(c, ca, b, m);
+
+               if (!bch2_can_invalidate_bucket(ca, b, m))
+                       continue;
+
+               if (e.nr && e.bucket + e.nr == b && e.key == key) {
+                       e.nr++;
+               } else {
+                       if (e.nr)
+                               heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
+                       e = (struct alloc_heap_entry) {
+                               .bucket = b,
+                               .nr     = 1,
+                               .key    = key,
+                       };
+               }
+
+               cond_resched();
+       }
+
+       if (e.nr)
+               heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
+       up_read(&ca->bucket_lock);
+       mutex_unlock(&c->bucket_clock[READ].lock);
+
+       heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
+
+       while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
+               for (b = e.bucket;
+                    b < e.bucket + e.nr;
+                    b++) {
+                       if (fifo_full(&ca->free_inc))
+                               return;
+
+                       bch2_invalidate_one_bucket(c, ca, b);
+               }
+       }
+}
+
+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bucket_array *buckets = bucket_array(ca);
+       struct bucket_mark m;
+       size_t b, checked;
+
+       for (checked = 0;
+            checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc);
+            checked++) {
+               if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
+                   ca->fifo_last_bucket >= ca->mi.nbuckets)
+                       ca->fifo_last_bucket = ca->mi.first_bucket;
+
+               b = ca->fifo_last_bucket++;
+
+               m = READ_ONCE(buckets->b[b].mark);
+
+               if (bch2_can_invalidate_bucket(ca, b, m))
+                       bch2_invalidate_one_bucket(c, ca, b);
+
+               cond_resched();
+       }
+}
+
+static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bucket_array *buckets = bucket_array(ca);
+       struct bucket_mark m;
+       size_t checked;
+
+       for (checked = 0;
+            checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc);
+            checked++) {
+               size_t b = bch2_rand_range(ca->mi.nbuckets -
+                                          ca->mi.first_bucket) +
+                       ca->mi.first_bucket;
+
+               m = READ_ONCE(buckets->b[b].mark);
+
+               if (bch2_can_invalidate_bucket(ca, b, m))
+                       bch2_invalidate_one_bucket(c, ca, b);
+
+               cond_resched();
+       }
+}
+
+static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+       ca->inc_gen_needs_gc                    = 0;
+       ca->inc_gen_really_needs_gc             = 0;
+
+       switch (ca->mi.replacement) {
+       case CACHE_REPLACEMENT_LRU:
+               find_reclaimable_buckets_lru(c, ca);
+               break;
+       case CACHE_REPLACEMENT_FIFO:
+               find_reclaimable_buckets_fifo(c, ca);
+               break;
+       case CACHE_REPLACEMENT_RANDOM:
+               find_reclaimable_buckets_random(c, ca);
+               break;
+       }
+}
+
+static int size_t_cmp(const void *_l, const void *_r)
+{
+       const size_t *l = _l, *r = _r;
+
+       return (*l > *r) - (*l < *r);
+}
+
+static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
+{
+       BUG_ON(ca->free_inc.front);
+
+       spin_lock(&c->freelist_lock);
+       sort(ca->free_inc.data,
+            ca->free_inc.back,
+            sizeof(ca->free_inc.data[0]),
+            size_t_cmp, NULL);
+       spin_unlock(&c->freelist_lock);
+}
+
+static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
+                                   u64 *journal_seq, size_t nr,
+                                   bool nowait)
+{
+       struct btree_iter iter;
+       int ret = 0;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+       /* Only use nowait if we've already invalidated at least one bucket: */
+       while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
+               size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
+
+               ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq,
+                                            nowait && ca->nr_invalidated);
+               if (ret)
+                       break;
+
+               ca->nr_invalidated++;
+       }
+
+       bch2_btree_iter_unlock(&iter);
+
+       /* If we used NOWAIT, don't return the error: */
+       return ca->nr_invalidated ? 0 : ret;
+}
+
+static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
+{
+       unsigned i;
+
+       /*
+        * Don't remove from free_inc until after it's added to
+        * freelist, so gc can find it:
+        */
+       spin_lock(&c->freelist_lock);
+       for (i = 0; i < RESERVE_NR; i++)
+               if (fifo_push(&ca->free[i], bucket)) {
+                       fifo_pop(&ca->free_inc, bucket);
+                       --ca->nr_invalidated;
+                       closure_wake_up(&c->freelist_wait);
+                       spin_unlock(&c->freelist_lock);
+                       return true;
+               }
+       spin_unlock(&c->freelist_lock);
+
+       return false;
+}
+
+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
+{
+       int ret = 0;
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               if (__push_invalidated_bucket(c, ca, bucket))
+                       break;
+
+               if ((current->flags & PF_KTHREAD) &&
+                   kthread_should_stop()) {
+                       ret = 1;
+                       break;
+               }
+
+               schedule();
+               try_to_freeze();
+       }
+
+       __set_current_state(TASK_RUNNING);
+       return ret;
+}
+
+/*
+ * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
+ * then add it to the freelist, waiting until there's room if necessary:
+ */
+static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+       while (ca->nr_invalidated) {
+               size_t bucket = fifo_peek(&ca->free_inc);
+
+               BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated);
+
+               if (ca->mi.discard &&
+                   bdev_max_discard_sectors(ca->disk_sb.bdev))
+                       blkdev_issue_discard(ca->disk_sb.bdev,
+                                            bucket_to_sector(ca, bucket),
+                                            ca->mi.bucket_size, GFP_NOIO);
+
+               if (push_invalidated_bucket(c, ca, bucket))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/**
+ * bch2_allocator_thread - move buckets from free_inc to reserves
+ *
+ * The free_inc FIFO is populated by find_reclaimable_buckets(), and
+ * the reserves are depleted by bucket allocation. When we run out
+ * of free_inc, try to invalidate some buckets and write out
+ * prios and gens.
+ */
+static int bch2_allocator_thread(void *arg)
+{
+       struct bch_dev *ca = arg;
+       struct bch_fs *c = ca->fs;
+       u64 journal_seq;
+       int ret;
+
+       set_freezable();
+
+       while (1) {
+               while (1) {
+                       cond_resched();
+
+                       pr_debug("discarding %zu invalidated buckets",
+                                ca->nr_invalidated);
+
+                       ret = discard_invalidated_buckets(c, ca);
+                       if (ret)
+                               goto stop;
+
+                       if (fifo_empty(&ca->free_inc))
+                               break;
+
+                       pr_debug("invalidating %zu buckets",
+                                fifo_used(&ca->free_inc));
+
+                       journal_seq = 0;
+                       ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
+                                                      SIZE_MAX, true);
+                       if (ret) {
+                               bch_err(ca, "error invalidating buckets: %i", ret);
+                               goto stop;
+                       }
+
+                       if (!ca->nr_invalidated) {
+                               bch_err(ca, "allocator thread unable to make forward progress!");
+                               goto stop;
+                       }
+
+                       if (ca->allocator_invalidating_data)
+                               ret = bch2_journal_flush_seq(&c->journal, journal_seq);
+                       else if (ca->allocator_journal_seq_flush)
+                               ret = bch2_journal_flush_seq(&c->journal,
+                                                      ca->allocator_journal_seq_flush);
+
+                       /*
+                        * journal error - buckets haven't actually been
+                        * invalidated, can't discard them:
+                        */
+                       if (ret) {
+                               bch_err(ca, "journal error: %i", ret);
+                               goto stop;
+                       }
+               }
+
+               pr_debug("free_inc now empty");
+
+               /* Reset front/back so we can easily sort fifo entries later: */
+               ca->free_inc.front = ca->free_inc.back  = 0;
+               ca->allocator_journal_seq_flush         = 0;
+               ca->allocator_invalidating_data         = false;
+
+               down_read(&c->gc_lock);
+               while (1) {
+                       size_t prev = fifo_used(&ca->free_inc);
+
+                       if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+                               up_read(&c->gc_lock);
+                               bch_err(ca, "gc failure");
+                               goto stop;
+                       }
+
+                       /*
+                        * Find some buckets that we can invalidate, either
+                        * they're completely unused, or only contain clean data
+                        * that's been written back to the backing device or
+                        * another cache tier
+                        */
+
+                       pr_debug("scanning for reclaimable buckets");
+
+                       find_reclaimable_buckets(c, ca);
+
+                       pr_debug("found %zu buckets (free_inc %zu/%zu)",
+                                fifo_used(&ca->free_inc) - prev,
+                                fifo_used(&ca->free_inc), ca->free_inc.size);
+
+                       trace_alloc_batch(ca, fifo_used(&ca->free_inc),
+                                         ca->free_inc.size);
+
+                       if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
+                            (!fifo_full(&ca->free_inc) &&
+                             ca->inc_gen_really_needs_gc >=
+                             fifo_free(&ca->free_inc))) &&
+                           c->gc_thread) {
+                               atomic_inc(&c->kick_gc);
+                               wake_up_process(c->gc_thread);
+                       }
+
+                       if (fifo_full(&ca->free_inc))
+                               break;
+
+                       if (!fifo_empty(&ca->free_inc) &&
+                           !fifo_full(&ca->free[RESERVE_MOVINGGC]))
+                               break;
+
+                       /*
+                        * copygc may be waiting until either its reserve fills
+                        * up, or we can't make forward progress:
+                        */
+                       ca->allocator_blocked = true;
+                       closure_wake_up(&c->freelist_wait);
+
+                       ret = wait_buckets_available(c, ca);
+                       if (ret) {
+                               up_read(&c->gc_lock);
+                               goto stop;
+                       }
+               }
+
+               ca->allocator_blocked = false;
+               up_read(&c->gc_lock);
+
+               pr_debug("free_inc now %zu/%zu",
+                        fifo_used(&ca->free_inc),
+                        ca->free_inc.size);
+
+               sort_free_inc(c, ca);
+
+               /*
+                * free_inc is now full of newly-invalidated buckets: next,
+                * write out the new bucket gens:
+                */
+       }
+
+stop:
+       pr_debug("alloc thread stopping (ret %i)", ret);
+       return 0;
+}
+
+/* Allocation */
+
+/*
+ * Open buckets represent a bucket that's currently being allocated from.  They
+ * serve two purposes:
+ *
+ *  - They track buckets that have been partially allocated, allowing for
+ *    sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ *  - They provide a reference to the buckets they own that mark and sweep GC
+ *    can find, until the new allocation has a pointer to it inserted into the
+ *    btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
+
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+       percpu_down_read(&c->usage_lock);
+       spin_lock(&ob->lock);
+
+       bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
+                              false, gc_pos_alloc(c, ob), 0);
+       ob->valid = false;
+
+       spin_unlock(&ob->lock);
+       percpu_up_read(&c->usage_lock);
+
+       spin_lock(&c->freelist_lock);
+       ob->freelist = c->open_buckets_freelist;
+       c->open_buckets_freelist = ob - c->open_buckets;
+       c->open_buckets_nr_free++;
+       spin_unlock(&c->freelist_lock);
+
+       closure_wake_up(&c->open_buckets_wait);
+}
+
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
+{
+       struct open_bucket *ob;
+
+       BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
+
+       ob = c->open_buckets + c->open_buckets_freelist;
+       c->open_buckets_freelist = ob->freelist;
+       atomic_set(&ob->pin, 1);
+
+       c->open_buckets_nr_free--;
+       return ob;
+}
+
+/* _only_ for allocating the journal on a new device: */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
+{
+       struct bucket_array *buckets;
+       ssize_t b;
+
+       rcu_read_lock();
+       buckets = bucket_array(ca);
+
+       for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
+               if (is_available_bucket(buckets->b[b].mark))
+                       goto success;
+       b = -1;
+success:
+       rcu_read_unlock();
+       return b;
+}
+
+static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
+{
+       switch (reserve) {
+       case RESERVE_ALLOC:
+               return 0;
+       case RESERVE_BTREE:
+               return BTREE_NODE_RESERVE / 2;
+       default:
+               return BTREE_NODE_RESERVE;
+       }
+}
+
+/**
+ * bch2_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns the index of the open_bucket on success, or a negative
+ * enum bucket_alloc_ret (FREELIST_EMPTY/OPEN_BUCKETS_EMPTY) on failure
+ */
+int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+                     enum alloc_reserve reserve,
+                     bool may_alloc_partial,
+                     struct closure *cl)
+{
+       struct bucket_array *buckets;
+       struct open_bucket *ob;
+       long bucket;
+
+       spin_lock(&c->freelist_lock);
+
+       if (may_alloc_partial &&
+           ca->open_buckets_partial_nr) {
+               int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+               c->open_buckets[ret].on_partial_list = false;
+               spin_unlock(&c->freelist_lock);
+               return ret;
+       }
+
+       if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
+               if (cl)
+                       closure_wait(&c->open_buckets_wait, cl);
+               spin_unlock(&c->freelist_lock);
+               trace_open_bucket_alloc_fail(ca, reserve);
+               return OPEN_BUCKETS_EMPTY;
+       }
+
+       if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
+               goto out;
+
+       switch (reserve) {
+       case RESERVE_ALLOC:
+               if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
+                       goto out;
+               break;
+       case RESERVE_BTREE:
+               if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
+                   ca->free[RESERVE_BTREE].size &&
+                   fifo_pop(&ca->free[RESERVE_BTREE], bucket))
+                       goto out;
+               break;
+       case RESERVE_MOVINGGC:
+               if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
+                       goto out;
+               break;
+       default:
+               break;
+       }
+
+       if (cl)
+               closure_wait(&c->freelist_wait, cl);
+
+       spin_unlock(&c->freelist_lock);
+
+       trace_bucket_alloc_fail(ca, reserve);
+       return FREELIST_EMPTY;
+out:
+       verify_not_on_freelist(c, ca, bucket);
+
+       ob = bch2_open_bucket_alloc(c);
+
+       spin_lock(&ob->lock);
+       buckets = bucket_array(ca);
+
+       ob->valid       = true;
+       ob->sectors_free = ca->mi.bucket_size;
+       ob->ptr         = (struct bch_extent_ptr) {
+               .gen    = buckets->b[bucket].mark.gen,
+               .offset = bucket_to_sector(ca, bucket),
+               .dev    = ca->dev_idx,
+       };
+
+       bucket_io_clock_reset(c, ca, bucket, READ);
+       bucket_io_clock_reset(c, ca, bucket, WRITE);
+       spin_unlock(&ob->lock);
+
+       spin_unlock(&c->freelist_lock);
+
+       bch2_wake_allocator(ca);
+
+       trace_bucket_alloc(ca, reserve);
+       return ob - c->open_buckets;
+}
+
+static int __dev_alloc_cmp(struct write_point *wp,
+                          unsigned l, unsigned r)
+{
+       return ((wp->next_alloc[l] > wp->next_alloc[r]) -
+               (wp->next_alloc[l] < wp->next_alloc[r]));
+}
+
+#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
+
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
+                                        struct write_point *wp,
+                                        struct bch_devs_mask *devs)
+{
+       struct dev_alloc_list ret = { .nr = 0 };
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device_rcu(ca, c, i, devs)
+               ret.devs[ret.nr++] = i;
+
+       bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
+       return ret;
+}
+
+void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
+                    struct write_point *wp)
+{
+       u64 *v = wp->next_alloc + ca->dev_idx;
+       u64 free_space = dev_buckets_free(c, ca);
+       u64 free_space_inv = free_space
+               ? div64_u64(1ULL << 48, free_space)
+               : 1ULL << 48;
+       u64 scale = *v / 4;
+
+       if (*v + free_space_inv >= *v)
+               *v += free_space_inv;
+       else
+               *v = U64_MAX;
+
+       for (v = wp->next_alloc;
+            v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+               *v = *v < scale ? 0 : *v - scale;
+}
+
+static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
+                                       struct write_point *wp,
+                                       unsigned nr_replicas,
+                                       enum alloc_reserve reserve,
+                                       struct bch_devs_mask *devs,
+                                       struct closure *cl)
+{
+       enum bucket_alloc_ret ret = NO_DEVICES;
+       struct dev_alloc_list devs_sorted;
+       struct bch_dev *ca;
+       unsigned i, nr_ptrs_effective = 0;
+       bool have_cache_dev = false;
+
+       BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
+
+       for (i = wp->first_ptr; i < wp->nr_ptrs; i++) {
+               ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
+
+               nr_ptrs_effective += ca->mi.durability;
+               have_cache_dev |= !ca->mi.durability;
+       }
+
+       if (nr_ptrs_effective >= nr_replicas)
+               return ALLOC_SUCCESS;
+
+       devs_sorted = bch2_wp_alloc_list(c, wp, devs);
+
+       for (i = 0; i < devs_sorted.nr; i++) {
+               int ob;
+
+               ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+               if (!ca)
+                       continue;
+
+               if (!ca->mi.durability &&
+                   (have_cache_dev ||
+                    wp->type != BCH_DATA_USER))
+                       continue;
+
+               ob = bch2_bucket_alloc(c, ca, reserve,
+                                      wp->type == BCH_DATA_USER, cl);
+               if (ob < 0) {
+                       ret = ob;
+                       if (ret == OPEN_BUCKETS_EMPTY)
+                               break;
+                       continue;
+               }
+
+               BUG_ON(ob <= 0 || ob > U8_MAX);
+               BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
+
+               wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
+
+               bch2_wp_rescale(c, ca, wp);
+
+               nr_ptrs_effective += ca->mi.durability;
+               have_cache_dev |= !ca->mi.durability;
+
+               __clear_bit(ca->dev_idx, devs->d);
+
+               if (nr_ptrs_effective >= nr_replicas) {
+                       ret = ALLOC_SUCCESS;
+                       break;
+               }
+       }
+
+       EBUG_ON(reserve == RESERVE_MOVINGGC &&
+               ret != ALLOC_SUCCESS &&
+               ret != OPEN_BUCKETS_EMPTY);
+
+       switch (ret) {
+       case ALLOC_SUCCESS:
+               return 0;
+       case NO_DEVICES:
+               return -EROFS;
+       case FREELIST_EMPTY:
+       case OPEN_BUCKETS_EMPTY:
+               return cl ? -EAGAIN : -ENOSPC;
+       default:
+               BUG();
+       }
+}
+
+/* Sector allocator */
+
+static void writepoint_drop_ptr(struct bch_fs *c,
+                               struct write_point *wp,
+                               unsigned i)
+{
+       struct open_bucket *ob = wp->ptrs[i];
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+       BUG_ON(ca->open_buckets_partial_nr >=
+              ARRAY_SIZE(ca->open_buckets_partial));
+
+       if (wp->type == BCH_DATA_USER) {
+               spin_lock(&c->freelist_lock);
+               ob->on_partial_list = true;
+               ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
+                       ob - c->open_buckets;
+               spin_unlock(&c->freelist_lock);
+
+               closure_wake_up(&c->open_buckets_wait);
+               closure_wake_up(&c->freelist_wait);
+       } else {
+               bch2_open_bucket_put(c, ob);
+       }
+
+       array_remove_item(wp->ptrs, wp->nr_ptrs, i);
+
+       if (i < wp->first_ptr)
+               wp->first_ptr--;
+}
+
+static void writepoint_drop_ptrs(struct bch_fs *c,
+                                struct write_point *wp,
+                                u16 target, bool in_target)
+{
+       int i;
+
+       for (i = wp->first_ptr - 1; i >= 0; --i)
+               if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
+                                      target) == in_target)
+                       writepoint_drop_ptr(c, wp, i);
+}
+
+static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct open_bucket *ob;
+       unsigned i;
+
+       writepoint_for_each_ptr_all(wp, ob, i) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+               BUG_ON(ptr_stale(ca, &ob->ptr));
+       }
+#endif
+}
+
+static int open_bucket_add_buckets(struct bch_fs *c,
+                                  u16 target,
+                                  struct write_point *wp,
+                                  struct bch_devs_list *devs_have,
+                                  unsigned nr_replicas,
+                                  enum alloc_reserve reserve,
+                                  struct closure *cl)
+{
+       struct bch_devs_mask devs = c->rw_devs[wp->type];
+       const struct bch_devs_mask *t;
+       struct open_bucket *ob;
+       unsigned i;
+       int ret;
+
+       percpu_down_read(&c->usage_lock);
+       rcu_read_lock();
+
+       /* Don't allocate from devices we already have pointers to: */
+       for (i = 0; i < devs_have->nr; i++)
+               __clear_bit(devs_have->devs[i], devs.d);
+
+       writepoint_for_each_ptr_all(wp, ob, i)
+               __clear_bit(ob->ptr.dev, devs.d);
+
+       t = bch2_target_to_mask(c, target);
+       if (t)
+               bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+
+       ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
+
+       rcu_read_unlock();
+       percpu_up_read(&c->usage_lock);
+
+       return ret;
+}
+
+static struct write_point *__writepoint_find(struct hlist_head *head,
+                                            unsigned long write_point)
+{
+       struct write_point *wp;
+
+       hlist_for_each_entry_rcu(wp, head, node)
+               if (wp->write_point == write_point)
+                       return wp;
+
+       return NULL;
+}
+
+static struct hlist_head *writepoint_hash(struct bch_fs *c,
+                                         unsigned long write_point)
+{
+       unsigned hash =
+               hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+       return &c->write_points_hash[hash];
+}
+
+static struct write_point *writepoint_find(struct bch_fs *c,
+                                          unsigned long write_point)
+{
+       struct write_point *wp, *oldest;
+       struct hlist_head *head;
+
+       if (!(write_point & 1UL)) {
+               wp = (struct write_point *) write_point;
+               mutex_lock(&wp->lock);
+               return wp;
+       }
+
+       head = writepoint_hash(c, write_point);
+restart_find:
+       wp = __writepoint_find(head, write_point);
+       if (wp) {
+lock_wp:
+               mutex_lock(&wp->lock);
+               if (wp->write_point == write_point)
+                       goto out;
+               mutex_unlock(&wp->lock);
+               goto restart_find;
+       }
+
+       oldest = NULL;
+       for (wp = c->write_points;
+            wp < c->write_points + ARRAY_SIZE(c->write_points);
+            wp++)
+               if (!oldest || time_before64(wp->last_used, oldest->last_used))
+                       oldest = wp;
+
+       mutex_lock(&oldest->lock);
+       mutex_lock(&c->write_points_hash_lock);
+       wp = __writepoint_find(head, write_point);
+       if (wp && wp != oldest) {
+               mutex_unlock(&c->write_points_hash_lock);
+               mutex_unlock(&oldest->lock);
+               goto lock_wp;
+       }
+
+       wp = oldest;
+       hlist_del_rcu(&wp->node);
+       wp->write_point = write_point;
+       hlist_add_head_rcu(&wp->node, head);
+       mutex_unlock(&c->write_points_hash_lock);
+out:
+       wp->last_used = sched_clock();
+       return wp;
+}
+
+/*
+ * Get us an open_bucket we can allocate from, return with it locked:
+ */
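+/*
+ * Expected calling sequence (a sketch; the data write, error handling and any
+ * names not declared in this file are elided or assumed):
+ *
+ *	wp = bch2_alloc_sectors_start(c, target, write_point, devs_have,
+ *				      nr_replicas, nr_replicas_required,
+ *				      reserve, flags, cl);
+ *	if (IS_ERR(wp))
+ *		return PTR_ERR(wp);
+ *	bch2_alloc_sectors_append_ptrs(c, wp, e, sectors);
+ *	bch2_open_bucket_get(c, wp, &ob_nr, ob_refs);	<- pin buckets for the write
+ *	bch2_alloc_sectors_done(c, wp);			<- unlocks the write point
+ *	...submit the write, insert the extent into the btree...
+ *	bch2_open_bucket_put_refs(c, &ob_nr, ob_refs);
+ */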
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+                               unsigned target,
+                               struct write_point_specifier write_point,
+                               struct bch_devs_list *devs_have,
+                               unsigned nr_replicas,
+                               unsigned nr_replicas_required,
+                               enum alloc_reserve reserve,
+                               unsigned flags,
+                               struct closure *cl)
+{
+       struct write_point *wp;
+       struct open_bucket *ob;
+       struct bch_dev *ca;
+       unsigned nr_ptrs_have, nr_ptrs_effective;
+       int ret, i, cache_idx = -1;
+
+       BUG_ON(!nr_replicas || !nr_replicas_required);
+
+       wp = writepoint_find(c, write_point.v);
+
+       wp->first_ptr = 0;
+
+       /* does writepoint have ptrs we can't use? */
+       writepoint_for_each_ptr(wp, ob, i)
+               if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) {
+                       swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
+                       wp->first_ptr++;
+               }
+
+       nr_ptrs_have = wp->first_ptr;
+
+       /* does writepoint have ptrs we don't want to use? */
+       if (target)
+               writepoint_for_each_ptr(wp, ob, i)
+                       if (!bch2_dev_in_target(c, ob->ptr.dev, target)) {
+                               swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
+                               wp->first_ptr++;
+                       }
+
+       if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
+               ret = open_bucket_add_buckets(c, target, wp, devs_have,
+                                             nr_replicas, reserve, cl);
+       } else {
+               ret = open_bucket_add_buckets(c, target, wp, devs_have,
+                                             nr_replicas, reserve, NULL);
+               if (!ret)
+                       goto alloc_done;
+
+               wp->first_ptr = nr_ptrs_have;
+
+               ret = open_bucket_add_buckets(c, 0, wp, devs_have,
+                                             nr_replicas, reserve, cl);
+       }
+
+       if (ret && ret != -EROFS)
+               goto err;
+alloc_done:
+       /* check for more than one cache: */
+       for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) {
+               ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
+
+               if (ca->mi.durability)
+                       continue;
+
+               /*
+                * if we ended up with more than one cache device, prefer the
+                * one in the target we want:
+                */
+               if (cache_idx >= 0) {
+                       if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
+                                               target)) {
+                               writepoint_drop_ptr(c, wp, i);
+                       } else {
+                               writepoint_drop_ptr(c, wp, cache_idx);
+                               cache_idx = i;
+                       }
+               } else {
+                       cache_idx = i;
+               }
+       }
+
+       /* we might have more effective replicas than required: */
+       nr_ptrs_effective = 0;
+       writepoint_for_each_ptr(wp, ob, i) {
+               ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+               nr_ptrs_effective += ca->mi.durability;
+       }
+
+       if (ret == -EROFS &&
+           nr_ptrs_effective >= nr_replicas_required)
+               ret = 0;
+
+       if (ret)
+               goto err;
+
+       if (nr_ptrs_effective > nr_replicas) {
+               writepoint_for_each_ptr(wp, ob, i) {
+                       ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+                       if (ca->mi.durability &&
+                           ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
+                           !bch2_dev_in_target(c, ob->ptr.dev, target)) {
+                               swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
+                               wp->first_ptr++;
+                               nr_ptrs_effective -= ca->mi.durability;
+                       }
+               }
+       }
+
+       if (nr_ptrs_effective > nr_replicas) {
+               writepoint_for_each_ptr(wp, ob, i) {
+                       ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+                       if (ca->mi.durability &&
+                           ca->mi.durability <= nr_ptrs_effective - nr_replicas) {
+                               swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
+                               wp->first_ptr++;
+                               nr_ptrs_effective -= ca->mi.durability;
+                       }
+               }
+       }
+
+       /* Remove pointers we don't want to use: */
+       if (target)
+               writepoint_drop_ptrs(c, wp, target, false);
+
+       BUG_ON(wp->first_ptr >= wp->nr_ptrs);
+       BUG_ON(nr_ptrs_effective < nr_replicas_required);
+
+       wp->sectors_free = UINT_MAX;
+
+       writepoint_for_each_ptr(wp, ob, i)
+               wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+
+       BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+
+       verify_not_stale(c, wp);
+
+       return wp;
+err:
+       mutex_unlock(&wp->lock);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Append pointers for the space we just allocated to @e, and mark @sectors of
+ * that space as allocated out of @wp's open buckets
+ */
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
+                                   struct bkey_i_extent *e, unsigned sectors)
+{
+       struct open_bucket *ob;
+       unsigned i;
+
+       BUG_ON(sectors > wp->sectors_free);
+       wp->sectors_free -= sectors;
+
+       writepoint_for_each_ptr(wp, ob, i) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+               struct bch_extent_ptr tmp = ob->ptr;
+
+               EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
+
+               tmp.cached = bkey_extent_is_cached(&e->k) ||
+                       (!ca->mi.durability && wp->type == BCH_DATA_USER);
+
+               tmp.offset += ca->mi.bucket_size - ob->sectors_free;
+               extent_ptr_append(e, tmp);
+
+               BUG_ON(sectors > ob->sectors_free);
+               ob->sectors_free -= sectors;
+       }
+}
+
+/*
+ * Put back any open buckets that are now completely used up, and unlock the
+ * write point:
+ */
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
+{
+       int i;
+
+       for (i = wp->nr_ptrs - 1; i >= 0; --i) {
+               struct open_bucket *ob = wp->ptrs[i];
+
+               if (!ob->sectors_free) {
+                       array_remove_item(wp->ptrs, wp->nr_ptrs, i);
+                       bch2_open_bucket_put(c, ob);
+               }
+       }
+
+       mutex_unlock(&wp->lock);
+}
+
+/* Startup/shutdown (ro/rw): */
+
+void bch2_recalc_capacity(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       u64 total_capacity, capacity = 0, reserved_sectors = 0;
+       unsigned long ra_pages = 0;
+       unsigned i, j;
+
+       lockdep_assert_held(&c->state_lock);
+
+       for_each_online_member(ca, c, i) {
+               struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
+
+               ra_pages += bdi->ra_pages;
+       }
+
+       bch2_set_ra_pages(c, ra_pages);
+
+       for_each_rw_member(ca, c, i) {
+               size_t reserve = 0;
+
+               /*
+                * We need to reserve buckets (from the number
+                * of currently available buckets) against
+                * foreground writes so that mainly copygc can
+                * make forward progress.
+                *
+                * We need enough to refill the various reserves
+                * from scratch - copygc will use its entire
+                * reserve all at once, then run again when
+                * its reserve is refilled (from the formerly
+                * available buckets).
+                *
+                * This reserve is just used when considering if
+                * allocations for foreground writes must wait -
+                * not -ENOSPC calculations.
+                */
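+               /*
+                * Purely illustrative numbers: with 8 buckets in each of the
+                * RESERVE_BTREE and RESERVE_MOVINGGC freelists, a 16 entry
+                * free_inc fifo and 32 write points, this works out to
+                * 8 + 8 + 16 + 32 + 1 = 65 buckets reserved on this device.
+                */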
+               for (j = 0; j < RESERVE_NONE; j++)
+                       reserve += ca->free[j].size;
+
+               reserve += ca->free_inc.size;
+
+               reserve += ARRAY_SIZE(c->write_points);
+
+               reserve += 1;   /* btree write point */
+
+               reserved_sectors += bucket_to_sector(ca, reserve);
+
+               capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+                                            ca->mi.first_bucket);
+       }
+
+       total_capacity = capacity;
+
+       capacity *= (100 - c->opts.gc_reserve_percent);
+       capacity = div64_u64(capacity, 100);
+
+       BUG_ON(reserved_sectors > total_capacity);
+
+       capacity = min(capacity, total_capacity - reserved_sectors);
+
+       c->capacity = capacity;
+
+       if (c->capacity) {
+               bch2_io_timer_add(&c->io_clock[READ],
+                                &c->bucket_clock[READ].rescale);
+               bch2_io_timer_add(&c->io_clock[WRITE],
+                                &c->bucket_clock[WRITE].rescale);
+       } else {
+               bch2_io_timer_del(&c->io_clock[READ],
+                                &c->bucket_clock[READ].rescale);
+               bch2_io_timer_del(&c->io_clock[WRITE],
+                                &c->bucket_clock[WRITE].rescale);
+       }
+
+       /* Wake up in case someone was waiting for buckets */
+       closure_wake_up(&c->freelist_wait);
+}
+
+static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
+                                 struct write_point *wp)
+{
+       struct bch_devs_mask not_self;
+
+       bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX);
+
+       mutex_lock(&wp->lock);
+       wp->first_ptr = wp->nr_ptrs;
+       writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true);
+       mutex_unlock(&wp->lock);
+}
+
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct open_bucket *ob;
+       bool ret = false;
+
+       for (ob = c->open_buckets;
+            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+            ob++) {
+               spin_lock(&ob->lock);
+               if (ob->valid && !ob->on_partial_list &&
+                   ob->ptr.dev == ca->dev_idx)
+                       ret = true;
+               spin_unlock(&ob->lock);
+       }
+
+       return ret;
+}
+
+/* device goes ro: */
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
+{
+       unsigned i;
+
+       BUG_ON(ca->alloc_thread);
+
+       /* First, remove device from allocation groups: */
+
+       for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+               clear_bit(ca->dev_idx, c->rw_devs[i].d);
+
+       /*
+        * Capacity is calculated based on the devices in the allocation groups:
+        */
+       bch2_recalc_capacity(c);
+
+       /* Next, close write points that point to this device... */
+       for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+               bch2_stop_write_point(c, ca, &c->write_points[i]);
+
+       bch2_stop_write_point(c, ca, &ca->copygc_write_point);
+       bch2_stop_write_point(c, ca, &c->rebalance_write_point);
+       bch2_stop_write_point(c, ca, &c->btree_write_point);
+
+       mutex_lock(&c->btree_reserve_cache_lock);
+       while (c->btree_reserve_cache_nr) {
+               struct btree_alloc *a =
+                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+               bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs);
+       }
+       mutex_unlock(&c->btree_reserve_cache_lock);
+
+       /*
+        * Wake up threads that were blocked on allocation, so they can notice
+        * the device can no longer be removed and the capacity has changed:
+        */
+       closure_wake_up(&c->freelist_wait);
+
+       /*
+        * journal_res_get() can block waiting for free space in the journal -
+        * it needs to notice there may not be devices to allocate from anymore:
+        */
+       wake_up(&c->journal.wait);
+
+       /* Now wait for any in flight writes: */
+
+       closure_wait_event(&c->open_buckets_wait,
+                          !bch2_dev_has_open_write_point(c, ca));
+}
+
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
+{
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+               if (ca->mi.data_allowed & (1 << i))
+                       set_bit(ca->dev_idx, c->rw_devs[i].d);
+}
+
+/* stop allocator thread: */
+void bch2_dev_allocator_stop(struct bch_dev *ca)
+{
+       struct task_struct *p;
+
+       p = rcu_dereference_protected(ca->alloc_thread, 1);
+       ca->alloc_thread = NULL;
+
+       /*
+        * We need an rcu barrier between setting ca->alloc_thread = NULL and
+        * the thread shutting down to avoid bch2_wake_allocator() racing:
+        *
+        * XXX: it would be better to have the rcu barrier be asynchronous
+        * instead of blocking us here
+        */
+       synchronize_rcu();
+
+       if (p) {
+               kthread_stop(p);
+               put_task_struct(p);
+       }
+}
+
+/* start allocator thread: */
+int bch2_dev_allocator_start(struct bch_dev *ca)
+{
+       struct task_struct *p;
+
+       /*
+        * allocator thread already started?
+        */
+       if (ca->alloc_thread)
+               return 0;
+
+       p = kthread_create(bch2_allocator_thread, ca,
+                          "bch_alloc[%s]", ca->name);
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       get_task_struct(p);
+       rcu_assign_pointer(ca->alloc_thread, p);
+       wake_up_process(p);
+       return 0;
+}
+
+static void allocator_start_issue_discards(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned dev_iter;
+       size_t i, bu;
+
+       for_each_rw_member(ca, c, dev_iter) {
+               unsigned done = 0;
+
+               fifo_for_each_entry(bu, &ca->free_inc, i) {
+                       if (done == ca->nr_invalidated)
+                               break;
+
+                       blkdev_issue_discard(ca->disk_sb.bdev,
+                                            bucket_to_sector(ca, bu),
+                                            ca->mi.bucket_size, GFP_NOIO);
+                       done++;
+               }
+       }
+}
+
+static int __bch2_fs_allocator_start(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       size_t bu, i;
+       unsigned dev_iter;
+       u64 journal_seq = 0;
+       bool invalidating_data = false;
+       int ret = 0;
+
+       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+               return -1;
+
+       /* Scan for buckets that are already invalidated: */
+       for_each_rw_member(ca, c, dev_iter) {
+               struct btree_iter iter;
+               struct bucket_mark m;
+               struct bkey_s_c k;
+
+               for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
+                       if (k.k->type != BCH_ALLOC)
+                               continue;
+
+                       bu = k.k->p.offset;
+                       m = READ_ONCE(bucket(ca, bu)->mark);
+
+                       if (!is_available_bucket(m) || m.cached_sectors)
+                               continue;
+
+                       percpu_down_read(&c->usage_lock);
+                       bch2_mark_alloc_bucket(c, ca, bu, true,
+                                       gc_pos_alloc(c, NULL),
+                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
+                       percpu_up_read(&c->usage_lock);
+
+                       fifo_push(&ca->free_inc, bu);
+                       ca->nr_invalidated++;
+
+                       if (fifo_full(&ca->free_inc))
+                               break;
+               }
+               bch2_btree_iter_unlock(&iter);
+       }
+
+       /* did we find enough buckets? */
+       for_each_rw_member(ca, c, dev_iter)
+               if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
+                       percpu_ref_put(&ca->io_ref);
+                       goto not_enough;
+               }
+
+       return 0;
+not_enough:
+       pr_debug("did not find enough empty buckets; issuing discards");
+
+       /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
+       for_each_rw_member(ca, c, dev_iter)
+               discard_invalidated_buckets(c, ca);
+
+       pr_debug("scanning for reclaimable buckets");
+
+       for_each_rw_member(ca, c, dev_iter) {
+               BUG_ON(!fifo_empty(&ca->free_inc));
+               ca->free_inc.front = ca->free_inc.back  = 0;
+
+               find_reclaimable_buckets(c, ca);
+               sort_free_inc(c, ca);
+
+               invalidating_data |= ca->allocator_invalidating_data;
+
+               fifo_for_each_entry(bu, &ca->free_inc, i)
+                       if (!fifo_push(&ca->free[RESERVE_BTREE], bu))
+                               break;
+       }
+
+       pr_debug("done scanning for reclaimable buckets");
+
+       /*
+        * We're moving buckets to freelists _before_ they've been marked as
+        * invalidated on disk - we have to so that we can allocate new btree
+        * nodes to mark them as invalidated on disk.
+        *
+        * However, we can't _write_ to any of these buckets yet - they might
+        * have cached data in them, which is live until they're marked as
+        * invalidated on disk:
+        */
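+       /*
+        * The rest of startup, roughly (a sketch of the steps below):
+        *
+        *  1) hold btree writes if we're invalidating live cached data
+        *  2) bch2_invalidate_free_inc() - persist the new bucket gens
+        *  3) flush the journal, then issue discards
+        *  4) release btree writes and flush dirty btree nodes
+        */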
+       if (invalidating_data) {
+               pr_debug("invalidating existing data");
+               set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+       } else {
+               pr_debug("issuing discards");
+               allocator_start_issue_discards(c);
+       }
+
+       /*
+        * XXX: it's possible for this to deadlock waiting on journal reclaim,
+        * since we're holding btree writes. What then?
+        */
+
+       for_each_rw_member(ca, c, dev_iter) {
+               ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
+                                              ca->free[RESERVE_BTREE].size,
+                                              false);
+               if (ret) {
+                       percpu_ref_put(&ca->io_ref);
+                       return ret;
+               }
+       }
+
+       if (invalidating_data) {
+               pr_debug("flushing journal");
+
+               ret = bch2_journal_flush_seq(&c->journal, journal_seq);
+               if (ret)
+                       return ret;
+
+               pr_debug("issuing discards");
+               allocator_start_issue_discards(c);
+       }
+
+       for_each_rw_member(ca, c, dev_iter)
+               while (ca->nr_invalidated) {
+                       BUG_ON(!fifo_pop(&ca->free_inc, bu));
+                       ca->nr_invalidated--;
+               }
+
+       set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
+
+       /* now flush dirty btree nodes: */
+       if (invalidating_data) {
+               struct bucket_table *tbl;
+               struct rhash_head *pos;
+               struct btree *b;
+               bool flush_updates;
+               size_t nr_pending_updates;
+
+               clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+again:
+               pr_debug("flushing dirty btree nodes");
+               cond_resched();
+
+               flush_updates = false;
+               nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+
+               rcu_read_lock();
+               for_each_cached_btree(b, c, tbl, i, pos)
+                       if (btree_node_dirty(b) && (!b->written || b->level)) {
+                               if (btree_node_may_write(b)) {
+                                       rcu_read_unlock();
+                                       btree_node_lock_type(c, b, SIX_LOCK_read);
+                                       bch2_btree_node_write(c, b, SIX_LOCK_read);
+                                       six_unlock_read(&b->lock);
+                                       goto again;
+                               } else {
+                                       flush_updates = true;
+                               }
+                       }
+               rcu_read_unlock();
+
+               /*
+                * This is ugly, but it's needed to flush btree node writes
+                * without spinning...
+                */
+               if (flush_updates) {
+                       closure_wait_event(&c->btree_interior_update_wait,
+                               bch2_btree_interior_updates_nr_pending(c) <
+                               nr_pending_updates);
+                       goto again;
+               }
+       }
+
+       return 0;
+}
+
+int bch2_fs_allocator_start(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       int ret;
+
+       down_read(&c->gc_lock);
+       ret = __bch2_fs_allocator_start(c);
+       up_read(&c->gc_lock);
+
+       if (ret)
+               return ret;
+
+       for_each_rw_member(ca, c, i) {
+               ret = bch2_dev_allocator_start(ca);
+               if (ret) {
+                       percpu_ref_put(&ca->io_ref);
+                       return ret;
+               }
+       }
+
+       return bch2_alloc_write(c);
+}
+
+void bch2_fs_allocator_init(struct bch_fs *c)
+{
+       struct open_bucket *ob;
+       struct write_point *wp;
+
+       mutex_init(&c->write_points_hash_lock);
+       spin_lock_init(&c->freelist_lock);
+       bch2_bucket_clock_init(c, READ);
+       bch2_bucket_clock_init(c, WRITE);
+
+       /* open bucket 0 is a sentinel NULL: */
+       spin_lock_init(&c->open_buckets[0].lock);
+
+       for (ob = c->open_buckets + 1;
+            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+               spin_lock_init(&ob->lock);
+               c->open_buckets_nr_free++;
+
+               ob->freelist = c->open_buckets_freelist;
+               c->open_buckets_freelist = ob - c->open_buckets;
+       }
+
+       writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
+       writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
+
+       for (wp = c->write_points;
+            wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
+               writepoint_init(wp, BCH_DATA_USER);
+
+               wp->last_used   = sched_clock();
+               wp->write_point = (unsigned long) wp;
+               hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
+       }
+
+       c->pd_controllers_update_seconds = 5;
+       INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
+}
diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h
new file mode 100644 (file)
index 0000000..00d01f4
--- /dev/null
@@ -0,0 +1,141 @@
+#ifndef _BCACHEFS_ALLOC_H
+#define _BCACHEFS_ALLOC_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+
+struct bkey;
+struct bch_dev;
+struct bch_fs;
+struct bch_devs_list;
+
+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_alloc_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_alloc_invalid,           \
+       .val_to_text    = bch2_alloc_to_text,           \
+}
+
+struct dev_alloc_list {
+       unsigned        nr;
+       u8              devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
+                                        struct write_point *,
+                                        struct bch_devs_mask *);
+void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
+                    struct write_point *);
+
+int bch2_alloc_read(struct bch_fs *, struct list_head *);
+int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
+
+enum bucket_alloc_ret {
+       ALLOC_SUCCESS           = 0,
+       OPEN_BUCKETS_EMPTY      = -1,
+       FREELIST_EMPTY          = -2,   /* Allocator thread not keeping up */
+       NO_DEVICES              = -3,   /* -EROFS */
+};
+
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
+int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
+                     struct closure *);
+
+#define __writepoint_for_each_ptr(_wp, _ob, _i, _start)                        \
+       for ((_i) = (_start);                                           \
+            (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);  \
+            (_i)++)
+
+#define writepoint_for_each_ptr_all(_wp, _ob, _i)                      \
+       __writepoint_for_each_ptr(_wp, _ob, _i, 0)
+
+#define writepoint_for_each_ptr(_wp, _ob, _i)                          \
+       __writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr)
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+       if (atomic_dec_and_test(&ob->pin))
+               __bch2_open_bucket_put(c, ob);
+}
+
+static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
+{
+       unsigned i;
+
+       for (i = 0; i < *nr; i++)
+               bch2_open_bucket_put(c, c->open_buckets + refs[i]);
+
+       *nr = 0;
+}
+
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+                                       struct write_point *wp,
+                                       u8 *nr, u8 *refs)
+{
+       struct open_bucket *ob;
+       unsigned i;
+
+       writepoint_for_each_ptr(wp, ob, i) {
+               atomic_inc(&ob->pin);
+               refs[(*nr)++] = ob - c->open_buckets;
+       }
+}
+
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
+                                            unsigned,
+                                            struct write_point_specifier,
+                                            struct bch_devs_list *,
+                                            unsigned, unsigned,
+                                            enum alloc_reserve,
+                                            unsigned,
+                                            struct closure *);
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+                                   struct bkey_i_extent *, unsigned);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+static inline void bch2_wake_allocator(struct bch_dev *ca)
+{
+       struct task_struct *p;
+
+       rcu_read_lock();
+       p = rcu_dereference(ca->alloc_thread);
+       if (p)
+               wake_up_process(p);
+       rcu_read_unlock();
+}
+
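+/*
+ * Hashed write point ids always have the low bit set, distinguishing them from
+ * pointers to struct write_point (which are aligned, so bit 0 is clear) - see
+ * writepoint_find():
+ */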
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+       return (struct write_point_specifier) { .v = v | 1 };
+}
+
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+       return (struct write_point_specifier) { .v = (unsigned long) wp };
+}
+
+void bch2_recalc_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
+void bch2_dev_allocator_stop(struct bch_dev *);
+int bch2_dev_allocator_start(struct bch_dev *);
+
+static inline void writepoint_init(struct write_point *wp,
+                                  enum bch_data_type type)
+{
+       mutex_init(&wp->lock);
+       wp->type = type;
+}
+
+int bch2_alloc_write(struct bch_fs *);
+int bch2_fs_allocator_start(struct bch_fs *);
+void bch2_fs_allocator_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
new file mode 100644 (file)
index 0000000..035c500
--- /dev/null
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_TYPES_H
+#define _BCACHEFS_ALLOC_TYPES_H
+
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include "clock_types.h"
+#include "fifo.h"
+
+/* There are two of these clocks, one for reads and one for writes: */
+struct bucket_clock {
+       /*
+        * "now" in (read/write) IO time - incremented whenever we do X amount
+        * of reads or writes.
+        *
+        * Goes with the bucket read/write prios: when we read or write to a
+        * bucket we reset the bucket's prio to the current hand; thus hand -
+        * prio = time since bucket was last read/written.
+        *
+        * The units are some amount (bytes/sectors) of data read/written, and
+        * the units can change on the fly if we need to rescale to fit
+        * everything in a u16 - your only guarantee is that the units are
+        * consistent.
+        */
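+       /*
+        * e.g. if a bucket's prio was last reset when the hand was at 100 and
+        * the hand is now at 130, that bucket is 30 units old in (read or
+        * write) IO time.
+        */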
+       u16                     hand;
+       u16                     max_last_io;
+
+       int                     rw;
+
+       struct io_timer         rescale;
+       struct mutex            lock;
+};
+
+/*
+ * There is one reserve for btree node allocations, one for the allocator's own
+ * updates (prios and gens), and one for moving GC:
+ */
+enum alloc_reserve {
+       RESERVE_ALLOC           = -1,
+       RESERVE_BTREE           = 0,
+       RESERVE_MOVINGGC        = 1,
+       RESERVE_NONE            = 2,
+       RESERVE_NR              = 3,
+};
+
+typedef FIFO(long)     alloc_fifo;
+
+/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
+#define OPEN_BUCKETS_COUNT     256
+#define WRITE_POINT_COUNT      32
+
+struct open_bucket {
+       spinlock_t              lock;
+       atomic_t                pin;
+       u8                      freelist;
+       bool                    valid;
+       bool                    on_partial_list;
+       unsigned                sectors_free;
+       struct bch_extent_ptr   ptr;
+};
+
+struct write_point {
+       struct hlist_node       node;
+       struct mutex            lock;
+       u64                     last_used;
+       unsigned long           write_point;
+       enum bch_data_type      type;
+
+       u8                      nr_ptrs;
+       u8                      first_ptr;
+
+       /* calculated based on how many pointers we're actually going to use: */
+       unsigned                sectors_free;
+
+       struct open_bucket      *ptrs[BCH_REPLICAS_MAX * 2];
+       u64                     next_alloc[BCH_SB_MEMBERS_MAX];
+};
+
+struct write_point_specifier {
+       unsigned long           v;
+};
+
+struct alloc_heap_entry {
+       size_t                  bucket;
+       size_t                  nr;
+       unsigned long           key;
+};
+
+typedef HEAP(struct alloc_heap_entry) alloc_heap;
+
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
new file mode 100644 (file)
index 0000000..b5e119d
--- /dev/null
@@ -0,0 +1,785 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_H
+#define _BCACHEFS_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
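+ * In code, that validity check is just a gen comparison (a sketch; ptr_stale()
+ * is the real helper):
+ *
+ *	stale = ptr->gen != bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark.gen;
+ *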
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
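+ * For example (illustrative numbers): an 8 sector extent covering sectors
+ * [16, 24) of inode 5 is keyed as (inode 5, offset 24) with size 8; the start
+ * offset is recovered as offset - size.
+ *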
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+
+#include <linux/backing-dev-defs.h>
+#include <linux/bug.h>
+#include <linux/bio.h>
+#include <linux/closure.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rhashtable.h>
+#include <linux/rwsem.h>
+#include <linux/seqlock.h>
+#include <linux/shrinker.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/zstd.h>
+
+#include "bcachefs_format.h"
+#include "fifo.h"
+#include "opts.h"
+#include "util.h"
+
+#define dynamic_fault(...)             0
+#define race_fault(...)                        0
+
+#define bch2_fs_init_fault(name)                                               \
+       dynamic_fault("bcachefs:bch_fs_init:" name)
+#define bch2_meta_read_fault(name)                                     \
+        dynamic_fault("bcachefs:meta:read:" name)
+#define bch2_meta_write_fault(name)                                    \
+        dynamic_fault("bcachefs:meta:write:" name)
+
+#ifdef __KERNEL__
+#define bch2_fmt(_c, fmt)      "bcachefs (%s): " fmt "\n", ((_c)->name)
+#else
+#define bch2_fmt(_c, fmt)      fmt "\n"
+#endif
+
+#define bch_info(c, fmt, ...) \
+       printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_notice(c, fmt, ...) \
+       printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn(c, fmt, ...) \
+       printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err(c, fmt, ...) \
+       printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+
+#define bch_verbose(c, fmt, ...)                                       \
+do {                                                                   \
+       if ((c)->opts.verbose_recovery)                                 \
+               bch_info(c, fmt, ##__VA_ARGS__);                        \
+} while (0)
+
+#define pr_verbose_init(opts, fmt, ...)                                        \
+do {                                                                   \
+       if (opt_get(opts, verbose_init))                                \
+               pr_info(fmt, ##__VA_ARGS__);                            \
+} while (0)
+
+/* Parameters that are useful for debugging, but should always be compiled in: */
+#define BCH_DEBUG_PARAMS_ALWAYS()                                      \
+       BCH_DEBUG_PARAM(key_merging_disabled,                           \
+               "Disables merging of extents")                          \
+       BCH_DEBUG_PARAM(btree_gc_always_rewrite,                        \
+               "Causes mark and sweep to compact and rewrite every "   \
+               "btree node it traverses")                              \
+       BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,                      \
+               "Disables rewriting of btree nodes during mark and sweep")\
+       BCH_DEBUG_PARAM(btree_shrinker_disabled,                        \
+               "Disables the shrinker callback for the btree node cache")
+
+/* Parameters that should only be compiled in when CONFIG_BCACHEFS_DEBUG is enabled: */
+#define BCH_DEBUG_PARAMS_DEBUG()                                       \
+       BCH_DEBUG_PARAM(expensive_debug_checks,                         \
+               "Enables various runtime debugging checks that "        \
+               "significantly affect performance")                     \
+       BCH_DEBUG_PARAM(debug_check_bkeys,                              \
+               "Run bkey_debugcheck (primarily checking GC/allocation "\
+               "information) when iterating over keys")                \
+       BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
+               "Reread btree nodes at various points to verify the "   \
+               "mergesort in the read path against modifications "     \
+               "done in memory")                                       \
+       BCH_DEBUG_PARAM(journal_seq_verify,                             \
+               "Store the journal sequence number in the version "     \
+               "number of every btree key, and verify that btree "     \
+               "update ordering is preserved during recovery")         \
+       BCH_DEBUG_PARAM(inject_invalid_keys,                            \
+               "Inject invalid keys into the btree, to exercise the "  \
+               "error handling and repair paths")                      \
+
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
+#else
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+#endif
+
+#define BCH_TIME_STATS()                       \
+       x(btree_node_mem_alloc)                 \
+       x(btree_gc)                             \
+       x(btree_split)                          \
+       x(btree_sort)                           \
+       x(btree_read)                           \
+       x(btree_lock_contended_read)            \
+       x(btree_lock_contended_intent)          \
+       x(btree_lock_contended_write)           \
+       x(data_write)                           \
+       x(data_read)                            \
+       x(data_promote)                         \
+       x(journal_write)                        \
+       x(journal_delay)                        \
+       x(journal_blocked)                      \
+       x(journal_flush_seq)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+       BCH_TIME_STATS()
+#undef x
+       BCH_TIME_STAT_NR
+};
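+
+/*
+ * i.e. the x-macro above expands to BCH_TIME_btree_node_mem_alloc,
+ * BCH_TIME_btree_gc, ..., BCH_TIME_journal_flush_seq, followed by
+ * BCH_TIME_STAT_NR - these index the times[BCH_TIME_STAT_NR] array of
+ * bch2_time_stats in struct bch_fs.
+ */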
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "clock_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "super_types.h"
+
+/* Number of nodes btree coalesce will try to coalesce at once */
+#define GC_MERGE_NODES         4U
+
+/* Maximum number of nodes we might need to allocate atomically: */
+#define BTREE_RESERVE_MAX      (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
+
+/* Size of the freelist we allocate btree nodes from: */
+#define BTREE_NODE_RESERVE     (BTREE_RESERVE_MAX * 4)
+
+struct btree;
+
+enum gc_phase {
+       GC_PHASE_START,
+       GC_PHASE_SB,
+
+#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
+       DEFINE_BCH_BTREE_IDS()
+#undef DEF_BTREE_ID
+
+       GC_PHASE_PENDING_DELETE,
+       GC_PHASE_ALLOC,
+       GC_PHASE_DONE
+};
+
+struct gc_pos {
+       enum gc_phase           phase;
+       struct bpos             pos;
+       unsigned                level;
+};
+
+struct io_count {
+       u64                     sectors[2][BCH_DATA_NR];
+};
+
+struct bch_dev {
+       struct kobject          kobj;
+       struct percpu_ref       ref;
+       struct completion       ref_completion;
+       struct percpu_ref       io_ref;
+       struct completion       io_ref_completion;
+
+       struct bch_fs           *fs;
+
+       u8                      dev_idx;
+       /*
+        * Cached version of this device's member info from superblock
+        * Committed by bch2_write_super() -> bch_fs_mi_update()
+        */
+       struct bch_member_cpu   mi;
+       __uuid_t                uuid;
+       char                    name[BDEVNAME_SIZE];
+
+       struct bch_sb_handle    disk_sb;
+       int                     sb_write_error;
+
+       struct bch_devs_mask    self;
+
+       /* biosets used in cloned bios for writing multiple replicas */
+       struct bio_set          replica_set;
+
+       /*
+        * Buckets:
+        * Per-bucket arrays are protected by c->usage_lock, bucket_lock and
+        * gc_lock (for device resize) - holding any one of them is sufficient
+        * for access. rcu_read_lock() also suffices, but only for ptr_stale().
+        */
+       struct bucket_array __rcu *buckets;
+       unsigned long           *buckets_dirty;
+       /* most out of date gen in the btree */
+       u8                      *oldest_gens;
+       struct rw_semaphore     bucket_lock;
+
+       struct bch_dev_usage __percpu *usage_percpu;
+       struct bch_dev_usage    usage_cached;
+
+       /* Allocator: */
+       struct task_struct __rcu *alloc_thread;
+
+       /*
+        * free: Buckets that are ready to be used
+        *
+        * free_inc: Incoming buckets - these are buckets that currently have
+        * cached data in them, and we can't reuse them until after we write
+        * their new gen to disk. After prio_write() finishes writing the new
+        * gens/prios, they'll be moved to the free list (and possibly discarded
+        * in the process)
+        */
+       alloc_fifo              free[RESERVE_NR];
+       alloc_fifo              free_inc;
+       spinlock_t              freelist_lock;
+       size_t                  nr_invalidated;
+
+       u8                      open_buckets_partial[OPEN_BUCKETS_COUNT];
+       unsigned                open_buckets_partial_nr;
+
+       size_t                  fifo_last_bucket;
+
+       /* last calculated max bucket last_io, per read and write */
+       u16                     max_last_bucket_io[2];
+
+       atomic_long_t           saturated_count;
+       size_t                  inc_gen_needs_gc;
+       size_t                  inc_gen_really_needs_gc;
+       u64                     allocator_journal_seq_flush;
+       bool                    allocator_invalidating_data;
+       bool                    allocator_blocked;
+
+       alloc_heap              alloc_heap;
+
+       /* Copying GC: */
+       struct task_struct      *copygc_thread;
+       copygc_heap             copygc_heap;
+       struct bch_pd_controller copygc_pd;
+       struct write_point      copygc_write_point;
+
+       atomic64_t              rebalance_work;
+
+       struct journal_device   journal;
+
+       struct work_struct      io_error_work;
+
+       /* The rest of this all shows up in sysfs */
+       atomic64_t              cur_latency[2];
+       struct bch2_time_stats  io_latency[2];
+
+#define CONGESTED_MAX          1024
+       atomic_t                congested;
+       u64                     congested_last;
+
+       struct io_count __percpu *io_done;
+};
+
+/*
+ * Flag bits for what phase of startup/shutdown the filesystem is at, how we're
+ * shutting down, etc.:
+ */
+enum {
+       /* startup: */
+       BCH_FS_ALLOC_READ_DONE,
+       BCH_FS_ALLOCATOR_STARTED,
+       BCH_FS_INITIAL_GC_DONE,
+       BCH_FS_FSCK_DONE,
+       BCH_FS_STARTED,
+
+       /* shutdown: */
+       BCH_FS_EMERGENCY_RO,
+       BCH_FS_WRITE_DISABLE_COMPLETE,
+
+       /* errors: */
+       BCH_FS_ERROR,
+       BCH_FS_GC_FAILURE,
+
+       /* misc: */
+       BCH_FS_BDEV_MOUNTED,
+       BCH_FS_FSCK_FIXED_ERRORS,
+       BCH_FS_FIXED_GENS,
+       BCH_FS_REBUILD_REPLICAS,
+       BCH_FS_HOLD_BTREE_WRITES,
+};
+
+struct btree_debug {
+       unsigned                id;
+       struct dentry           *btree;
+       struct dentry           *btree_format;
+       struct dentry           *failed;
+};
+
+enum bch_fs_state {
+       BCH_FS_STARTING         = 0,
+       BCH_FS_STOPPING,
+       BCH_FS_RO,
+       BCH_FS_RW,
+};
+
+struct bch_fs {
+       struct closure          cl;
+
+       struct list_head        list;
+       struct kobject          kobj;
+       struct kobject          internal;
+       struct kobject          opts_dir;
+       struct kobject          time_stats;
+       unsigned long           flags;
+
+       int                     minor;
+       struct device           *chardev;
+       struct super_block      *vfs_sb;
+       char                    name[40];
+
+       /* ro/rw, add/remove devices: */
+       struct mutex            state_lock;
+       enum bch_fs_state       state;
+
+       /* Counts outstanding writes, for clean transition to read-only */
+       struct percpu_ref       writes;
+       struct work_struct      read_only_work;
+
+       struct bch_dev __rcu    *devs[BCH_SB_MEMBERS_MAX];
+
+       struct bch_replicas_cpu __rcu *replicas;
+       struct bch_replicas_cpu __rcu *replicas_gc;
+       struct mutex            replicas_gc_lock;
+
+       struct bch_disk_groups_cpu __rcu *disk_groups;
+
+       struct bch_opts         opts;
+
+       /* Updated by bch2_sb_update():*/
+       struct {
+               __uuid_t        uuid;
+               __uuid_t        user_uuid;
+
+               u16             encoded_extent_max;
+
+               u8              nr_devices;
+               u8              clean;
+
+               u8              encryption_type;
+
+               u64             time_base_lo;
+               u32             time_base_hi;
+               u32             time_precision;
+               u64             features;
+       }                       sb;
+
+       struct bch_sb_handle    disk_sb;
+
+       unsigned short          block_bits;     /* ilog2(block_size) */
+
+       u16                     btree_foreground_merge_threshold;
+
+       struct closure          sb_write;
+       struct mutex            sb_lock;
+
+       /* BTREE CACHE */
+       struct bio_set          btree_bio;
+
+       struct btree_root       btree_roots[BTREE_ID_NR];
+       bool                    btree_roots_dirty;
+       struct mutex            btree_root_lock;
+
+       struct btree_cache      btree_cache;
+
+       mempool_t               btree_reserve_pool;
+
+       /*
+        * Cache of allocated btree nodes - if we allocate a btree node and
+        * don't end up using it, we can't just free it: the space can't be
+        * reused until it's gone _all_ the way back through the allocator
+        * (which exposes us to a livelock when allocating a btree reserve fails
+        * halfway through) - so instead, we stick unused nodes here:
+        */
+       struct btree_alloc      btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+       unsigned                btree_reserve_cache_nr;
+       struct mutex            btree_reserve_cache_lock;
+
+       mempool_t               btree_interior_update_pool;
+       struct list_head        btree_interior_update_list;
+       struct mutex            btree_interior_update_lock;
+       struct closure_waitlist btree_interior_update_wait;
+
+       struct workqueue_struct *wq;
+       /* copygc needs its own workqueue for index updates.. */
+       struct workqueue_struct *copygc_wq;
+
+       /* ALLOCATION */
+       struct delayed_work     pd_controllers_update;
+       unsigned                pd_controllers_update_seconds;
+
+       struct bch_devs_mask    rw_devs[BCH_DATA_NR];
+
+       u64                     capacity; /* sectors */
+
+       /*
+        * When capacity _decreases_ (due to a disk being removed), we
+        * increment capacity_gen - this invalidates outstanding reservations
+        * and forces them to be revalidated
+        */
+       u32                     capacity_gen;
+
+       atomic64_t              sectors_available;
+
+       struct bch_fs_usage __percpu *usage_percpu;
+       struct bch_fs_usage     usage_cached;
+       struct percpu_rw_semaphore usage_lock;
+
+       struct closure_waitlist freelist_wait;
+
+       /*
+        * When we invalidate buckets, we use both the priority and the amount
+        * of good data to determine which buckets to reuse first - to weight
+        * those together consistently we keep track of the smallest nonzero
+        * priority of any bucket.
+        */
+       struct bucket_clock     bucket_clock[2];
+
+       struct io_clock         io_clock[2];
+
+       /* ALLOCATOR */
+       spinlock_t              freelist_lock;
+       u8                      open_buckets_freelist;
+       u8                      open_buckets_nr_free;
+       struct closure_waitlist open_buckets_wait;
+       struct open_bucket      open_buckets[OPEN_BUCKETS_COUNT];
+
+       struct write_point      btree_write_point;
+       struct write_point      rebalance_write_point;
+
+       struct write_point      write_points[WRITE_POINT_COUNT];
+       struct hlist_head       write_points_hash[WRITE_POINT_COUNT];
+       struct mutex            write_points_hash_lock;
+
+       /* GARBAGE COLLECTION */
+       struct task_struct      *gc_thread;
+       atomic_t                kick_gc;
+       unsigned long           gc_count;
+
+       /*
+        * Tracks GC's progress - everything in the range [POS_MIN..gc_pos] has
+        * been marked by GC.
+        *
+        * gc_pos.phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
+        *
+        * gc_pos.phase == GC_PHASE_DONE indicates that gc is finished/not
+        * currently running, and gc marks are currently valid.
+        *
+        * Protected by gc_pos_lock. Only written to by the GC thread, so the GC
+        * thread can read it without taking the lock.
+        */
+       seqcount_t              gc_pos_lock;
+       struct gc_pos           gc_pos;
+
+       /*
+        * The allocation code needs gc_mark in struct bucket to be correct, but
+        * it's not while a gc is in progress.
+        */
+       struct rw_semaphore     gc_lock;
+
+       /* IO PATH */
+       struct bio_set          bio_read;
+       struct bio_set          bio_read_split;
+       struct bio_set          bio_write;
+       struct mutex            bio_bounce_pages_lock;
+       mempool_t               bio_bounce_pages;
+       struct rhashtable       promote_table;
+
+       mempool_t               compression_bounce[2];
+       mempool_t               compress_workspace[BCH_COMPRESSION_NR];
+       mempool_t               decompress_workspace;
+       ZSTD_parameters         zstd_params;
+
+       struct crypto_shash     *sha256;
+       struct crypto_sync_skcipher *chacha20;
+       struct crypto_shash     *poly1305;
+
+       atomic64_t              key_version;
+
+       /* REBALANCE */
+       struct bch_fs_rebalance rebalance;
+
+       /* VFS IO PATH - fs-io.c */
+       struct bio_set          writepage_bioset;
+       struct bio_set          dio_write_bioset;
+       struct bio_set          dio_read_bioset;
+
+       struct bio_list         btree_write_error_list;
+       struct work_struct      btree_write_error_work;
+       spinlock_t              btree_write_error_lock;
+
+       /* ERRORS */
+       struct list_head        fsck_errors;
+       struct mutex            fsck_error_lock;
+       bool                    fsck_alloc_err;
+
+       /* FILESYSTEM */
+       atomic_long_t           nr_inodes;
+
+       /* QUOTAS */
+       struct bch_memquota_type quotas[QTYP_NR];
+
+       /* DEBUG JUNK */
+       struct dentry           *debug;
+       struct btree_debug      btree_debug[BTREE_ID_NR];
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct btree            *verify_data;
+       struct btree_node       *verify_ondisk;
+       struct mutex            verify_lock;
+#endif
+
+       u64                     unused_inode_hint;
+
+       /*
+        * A btree node on disk could have too many bsets for an iterator to fit
+        * on the stack - have to dynamically allocate them
+        */
+       mempool_t               fill_iter;
+
+       mempool_t               btree_bounce_pool;
+
+       struct journal          journal;
+
+       unsigned                bucket_journal_seq;
+
+       /* The rest of this all shows up in sysfs */
+       atomic_long_t           read_realloc_races;
+       atomic_long_t           extent_migrate_done;
+       atomic_long_t           extent_migrate_raced;
+
+       unsigned                btree_gc_periodic:1;
+       unsigned                copy_gc_enabled:1;
+       bool                    promote_whole_extents;
+
+#define BCH_DEBUG_PARAM(name, description) bool name;
+       BCH_DEBUG_PARAMS_ALL()
+#undef BCH_DEBUG_PARAM
+
+       struct bch2_time_stats  times[BCH_TIME_STAT_NR];
+};
+
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+       if (c->vfs_sb)
+               c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
+static inline bool bch2_fs_running(struct bch_fs *c)
+{
+       return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+}
+
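+/* bucket_size and block_size are in units of 512-byte sectors, hence << 9: */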
+static inline unsigned bucket_bytes(const struct bch_dev *ca)
+{
+       return ca->mi.bucket_size << 9;
+}
+
+static inline unsigned block_bytes(const struct bch_fs *c)
+{
+       return c->opts.block_size << 9;
+}
+
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
+{
+       return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+{
+       s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
+
+       if (c->sb.time_precision == 1)
+               return ns;
+
+       return div_s64(ns, c->sb.time_precision);
+}
+
+static inline s64 bch2_current_time(struct bch_fs *c)
+{
+       struct timespec64 now;
+
+       ktime_get_real_ts64(&now);
+       return timespec_to_bch2_time(c, now);
+}
+
+#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
new file mode 100644 (file)
index 0000000..eb14dba
--- /dev/null
@@ -0,0 +1,1498 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FORMAT_H
+#define _BCACHEFS_FORMAT_H
+
+/*
+ * bcachefs on disk data structures
+ *
+ * OVERVIEW:
+ *
+ * There are three main types of on disk data structures in bcachefs (this is
+ * reduced from 5 in bcache)
+ *
+ *  - superblock
+ *  - journal
+ *  - btree
+ *
+ * The btree is the primary structure; most metadata exists as keys in the
+ * various btrees. There are only a small number of btrees, they're not
+ * sharded - we have one btree for extents, another for inodes, et cetera.
+ *
+ * SUPERBLOCK:
+ *
+ * The superblock contains the location of the journal, the list of devices in
+ * the filesystem, and in general any metadata we need in order to decide
+ * whether we can start a filesystem or prior to reading the journal/btree
+ * roots.
+ *
+ * The superblock is extensible, and most of the contents of the superblock are
+ * in variable length, type tagged fields; see struct bch_sb_field.
+ *
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
+ * not have a fixed size. To locate backup superblocks we have struct
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
+ * before the first superblock.
+ *
+ * JOURNAL:
+ *
+ * The journal primarily records btree updates in the order they occurred;
+ * journal replay consists of just iterating over all the keys in the open
+ * journal entries and re-inserting them into the btrees.
+ *
+ * The journal also contains entry types for the btree roots, and blacklisted
+ * journal sequence numbers (see journal_seq_blacklist.c).
+ *
+ * BTREE:
+ *
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
+ * entry in a given node (offset 0), and struct btree_node_entry for all
+ * subsequent writes.
+ *
+ * After the header, btree node entries contain a list of keys in sorted order.
+ * Values are stored inline with the keys; since values are variable length (and
+ * keys effectively are variable length too, due to packing) we can't do random
+ * access without building up additional in memory tables in the btree node read
+ * path.
+ *
+ * BTREE KEYS (struct bkey):
+ *
+ * The various btrees share a common format for the key - so as to avoid
+ * switching in fastpath lookup/comparison code - but define their own
+ * structures for the key values.
+ *
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
+ * size is just under 2k. The common part also contains a type tag for the
+ * value, and a format field indicating whether the key is packed or not (and
+ * also meant to allow adding new key fields in the future, if desired).
+ *
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
+ * be generous with field sizes in the common part of the key format (64 bit
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
+ */
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+#include <linux/uuid.h>
+
+#ifdef __KERNEL__
+typedef uuid_t __uuid_t;
+#endif
+
+#define LE_BITMASK(_bits, name, type, field, offset, end)              \
+static const unsigned  name##_OFFSET = offset;                         \
+static const unsigned  name##_BITS = (end - offset);                   \
+static const __u##_bits        name##_MAX = (1ULL << (end - offset)) - 1;      \
+                                                                       \
+static inline __u64 name(const type *k)                                        \
+{                                                                      \
+       return (__le##_bits##_to_cpu(k->field) >> offset) &             \
+               ~(~0ULL << (end - offset));                             \
+}                                                                      \
+                                                                       \
+static inline void SET_##name(type *k, __u64 v)                                \
+{                                                                      \
+       __u##_bits new = __le##_bits##_to_cpu(k->field);                \
+                                                                       \
+       new &= ~(~(~0ULL << (end - offset)) << offset);                 \
+       new |= (v & ~(~0ULL << (end - offset))) << offset;              \
+       k->field = __cpu_to_le##_bits(new);                             \
+}
+
+#define LE16_BITMASK(n, t, f, o, e)    LE_BITMASK(16, n, t, f, o, e)
+#define LE32_BITMASK(n, t, f, o, e)    LE_BITMASK(32, n, t, f, o, e)
+#define LE64_BITMASK(n, t, f, o, e)    LE_BITMASK(64, n, t, f, o, e)
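+
+/*
+ * For example, LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
+ * (below) generates BCH_MEMBER_STATE(const struct bch_member *) and
+ * SET_BCH_MEMBER_STATE(struct bch_member *, __u64), which read/write bits 0-3
+ * of the little endian flags[0] field, plus BCH_MEMBER_STATE_OFFSET/_BITS/_MAX
+ * constants.
+ */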
+
+struct bkey_format {
+       __u8            key_u64s;
+       __u8            nr_fields;
+       /* One unused slot for now: */
+       __u8            bits_per_field[6];
+       __le64          field_offset[6];
+};
+
+/* Btree keys - all units are in sectors */
+
+struct bpos {
+       /*
+        * Word order matches machine byte order - btree code treats a bpos as a
+        * single large integer, for search/comparison purposes
+        *
+        * Note that wherever a bpos is embedded in another on disk data
+        * structure, it has to be byte swabbed when reading in metadata that
+        * wasn't written in native endian order:
+        */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+       __u32           snapshot;
+       __u64           offset;
+       __u64           inode;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+       __u64           inode;
+       __u64           offset;         /* Points to end of extent - sectors */
+       __u32           snapshot;
+#else
+#error edit for your odd byteorder.
+#endif
+} __attribute__((packed, aligned(4)));
+
+#define KEY_INODE_MAX                  ((__u64)~0ULL)
+#define KEY_OFFSET_MAX                 ((__u64)~0ULL)
+#define KEY_SNAPSHOT_MAX               ((__u32)~0U)
+#define KEY_SIZE_MAX                   ((__u32)~0U)
+
+static inline struct bpos POS(__u64 inode, __u64 offset)
+{
+       struct bpos ret;
+
+       ret.inode       = inode;
+       ret.offset      = offset;
+       ret.snapshot    = 0;
+
+       return ret;
+}
+
+#define POS_MIN                                POS(0, 0)
+#define POS_MAX                                POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
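+
+/*
+ * Since a bpos is compared as a single large integer with inode as the most
+ * significant field, then offset, then snapshot, e.g. POS(1, 100) sorts before
+ * POS(2, 0) - so all the keys for a given inode are adjacent in the btree.
+ */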
+
+/* Empty placeholder struct, for container_of() */
+struct bch_val {
+       __u64           __nothing[0];
+};
+
+struct bversion {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+       __u64           lo;
+       __u32           hi;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+       __u32           hi;
+       __u64           lo;
+#endif
+} __attribute__((packed, aligned(4)));
+
+struct bkey {
+       /* Size of combined key and value, in u64s */
+       __u8            u64s;
+
+       /* Format of key (0 for format local to btree node) */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8            format:7,
+                       needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u8            needs_whiteout:1,
+                       format:7;
+#else
+#error edit for your odd byteorder.
+#endif
+
+       /* Type of the value */
+       __u8            type;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+       __u8            pad[1];
+
+       struct bversion version;
+       __u32           size;           /* extent size, in sectors */
+       struct bpos     p;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+       struct bpos     p;
+       __u32           size;           /* extent size, in sectors */
+       struct bversion version;
+
+       __u8            pad[1];
+#endif
+} __attribute__((packed, aligned(8)));
+
+struct bkey_packed {
+       __u64           _data[0];
+
+       /* Size of combined key and value, in u64s */
+       __u8            u64s;
+
+       /* Format of key (0 for format local to btree node) */
+
+       /*
+        * XXX: next incompat on disk format change, switch format and
+        * needs_whiteout - bkey_packed() will be cheaper if format is the high
+        * bits of the bitfield
+        */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8            format:7,
+                       needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u8            needs_whiteout:1,
+                       format:7;
+#endif
+
+       /* Type of the value */
+       __u8            type;
+       __u8            key_start[0];
+
+       /*
+        * We copy bkeys with struct assignment in various places, and while
+        * that shouldn't be done with packed bkeys we can't disallow it in C,
+        * and it's legal to cast a bkey to a bkey_packed  - so padding it out
+        * to the same size as struct bkey should hopefully be safest.
+        */
+       __u8            pad[sizeof(struct bkey) - 3];
+} __attribute__((packed, aligned(8)));
+
+#define BKEY_U64s                      (sizeof(struct bkey) / sizeof(__u64))
+#define KEY_PACKED_BITS_START          24
+
+#define KEY_FORMAT_LOCAL_BTREE         0
+#define KEY_FORMAT_CURRENT             1
+
+enum bch_bkey_fields {
+       BKEY_FIELD_INODE,
+       BKEY_FIELD_OFFSET,
+       BKEY_FIELD_SNAPSHOT,
+       BKEY_FIELD_SIZE,
+       BKEY_FIELD_VERSION_HI,
+       BKEY_FIELD_VERSION_LO,
+       BKEY_NR_FIELDS,
+};
+
+#define bkey_format_field(name, field)                                 \
+       [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
+
+#define BKEY_FORMAT_CURRENT                                            \
+((struct bkey_format) {                                                        \
+       .key_u64s       = BKEY_U64s,                                    \
+       .nr_fields      = BKEY_NR_FIELDS,                               \
+       .bits_per_field = {                                             \
+               bkey_format_field(INODE,        p.inode),               \
+               bkey_format_field(OFFSET,       p.offset),              \
+               bkey_format_field(SNAPSHOT,     p.snapshot),            \
+               bkey_format_field(SIZE,         size),                  \
+               bkey_format_field(VERSION_HI,   version.hi),            \
+               bkey_format_field(VERSION_LO,   version.lo),            \
+       },                                                              \
+})
+
+/* bkey with inline value */
+struct bkey_i {
+       __u64                   _data[0];
+
+       union {
+       struct {
+               /* Size of combined key and value, in u64s */
+               __u8            u64s;
+       };
+       struct {
+               struct bkey     k;
+               struct bch_val  v;
+       };
+       };
+};
+
+#define KEY(_inode, _offset, _size)                                    \
+((struct bkey) {                                                       \
+       .u64s           = BKEY_U64s,                                    \
+       .format         = KEY_FORMAT_CURRENT,                           \
+       .p              = POS(_inode, _offset),                         \
+       .size           = _size,                                        \
+})
+
+static inline void bkey_init(struct bkey *k)
+{
+       *k = KEY(0, 0, 0);
+}
+
+#define bkey_bytes(_k)         ((_k)->u64s * sizeof(__u64))
+
+#define __BKEY_PADDED(key, pad)                                        \
+       struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+
+#define BKEY_VAL_TYPE(name, nr)                                                \
+struct bkey_i_##name {                                                 \
+       union {                                                         \
+               struct bkey             k;                              \
+               struct bkey_i           k_i;                            \
+       };                                                              \
+       struct bch_##name               v;                              \
+}
+
+/*
+ * - DELETED keys are used internally to mark keys that should be ignored but
+ *   override keys in composition order.  Their version number is ignored.
+ *
+ * - DISCARDED keys indicate that the data is all 0s because it has been
+ *   discarded. DISCARDs may have a version; if the version is nonzero the key
+ *   will be persistent, otherwise the key will be dropped whenever the btree
+ *   node is rewritten (like DELETED keys).
+ *
+ * - ERROR: any read of the data returns a read error, as the data was lost due
+ *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
+ *   by new writes or cluster-wide GC. Node repair can also overwrite them with
+ *   the same or a more recent version number, but not with an older version
+ *   number.
+ */
+#define KEY_TYPE_DELETED               0
+#define KEY_TYPE_DISCARD               1
+#define KEY_TYPE_ERROR                 2
+#define KEY_TYPE_COOKIE                        3
+#define KEY_TYPE_PERSISTENT_DISCARD    4
+#define KEY_TYPE_GENERIC_NR            128
+
+struct bch_cookie {
+       struct bch_val          v;
+       __le64                  cookie;
+};
+BKEY_VAL_TYPE(cookie,          KEY_TYPE_COOKIE);
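+
+/*
+ * i.e. the line above defines
+ *
+ *     struct bkey_i_cookie {
+ *             union {
+ *                     struct bkey     k;
+ *                     struct bkey_i   k_i;
+ *             };
+ *             struct bch_cookie       v;
+ *     };
+ *
+ * a bkey with an inline bch_cookie value, usable either as a bare bkey or as a
+ * bkey_i.
+ */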
+
+/* Extents */
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_ptr      - 0b1
+ * bch_extent_crc32    - 0b10
+ * bch_extent_crc64    - 0b100
+ * bch_extent_crc128   - 0b1000
+ *
+ * (i.e. each entry's tag is 1 << its BCH_EXTENT_ENTRY_* enum value, below.)
+ *
+ * We do it this way because bch_extent_ptr and bch_extent_crc32 are _very_
+ * constrained on bits (and bch_extent_crc128 is the least constrained), so the
+ * tighter formats get the shorter type tags.
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
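+
+/*
+ * A conceptual sketch of walking an extent's entries under this scheme (the
+ * helper names here are illustrative, not the actual interfaces):
+ *
+ *     union bch_extent_entry *entry;
+ *
+ *     for (entry = extent->start; entry < end; entry = entry_next(entry)) {
+ *             switch (__ffs(entry->type)) {   // position of first set bit
+ *             case BCH_EXTENT_ENTRY_crc32:
+ *             case BCH_EXTENT_ENTRY_crc64:
+ *             case BCH_EXTENT_ENTRY_crc128:
+ *                     // checksum/compression info for the following ptrs
+ *                     break;
+ *             case BCH_EXTENT_ENTRY_ptr:
+ *                     // an actual data pointer
+ *                     break;
+ *             }
+ *     }
+ */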
+
+/* 128 bits, sufficient for cryptographic MACs: */
+struct bch_csum {
+       __le64                  lo;
+       __le64                  hi;
+} __attribute__((packed, aligned(8)));
+
+enum bch_csum_type {
+       BCH_CSUM_NONE                   = 0,
+       BCH_CSUM_CRC32C_NONZERO         = 1,
+       BCH_CSUM_CRC64_NONZERO          = 2,
+       BCH_CSUM_CHACHA20_POLY1305_80   = 3,
+       BCH_CSUM_CHACHA20_POLY1305_128  = 4,
+       BCH_CSUM_CRC32C                 = 5,
+       BCH_CSUM_CRC64                  = 6,
+       BCH_CSUM_NR                     = 7,
+};
+
+static const unsigned bch_crc_bytes[] = {
+       [BCH_CSUM_NONE]                         = 0,
+       [BCH_CSUM_CRC32C_NONZERO]               = 4,
+       [BCH_CSUM_CRC32C]                       = 4,
+       [BCH_CSUM_CRC64_NONZERO]                = 8,
+       [BCH_CSUM_CRC64]                        = 8,
+       [BCH_CSUM_CHACHA20_POLY1305_80]         = 10,
+       [BCH_CSUM_CHACHA20_POLY1305_128]        = 16,
+};
+
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+       switch (type) {
+       case BCH_CSUM_CHACHA20_POLY1305_80:
+       case BCH_CSUM_CHACHA20_POLY1305_128:
+               return true;
+       default:
+               return false;
+       }
+}
+
+enum bch_compression_type {
+       BCH_COMPRESSION_NONE            = 0,
+       BCH_COMPRESSION_LZ4_OLD         = 1,
+       BCH_COMPRESSION_GZIP            = 2,
+       BCH_COMPRESSION_LZ4             = 3,
+       BCH_COMPRESSION_ZSTD            = 4,
+       BCH_COMPRESSION_NR              = 5,
+};
+
+enum bch_extent_entry_type {
+       BCH_EXTENT_ENTRY_ptr            = 0,
+       BCH_EXTENT_ENTRY_crc32          = 1,
+       BCH_EXTENT_ENTRY_crc64          = 2,
+       BCH_EXTENT_ENTRY_crc128         = 3,
+};
+
+#define BCH_EXTENT_ENTRY_MAX           4
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u32                   type:2,
+                               _compressed_size:7,
+                               _uncompressed_size:7,
+                               offset:7,
+                               _unused:1,
+                               csum_type:4,
+                               compression_type:4;
+       __u32                   csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u32                   csum;
+       __u32                   compression_type:4,
+                               csum_type:4,
+                               _unused:1,
+                               offset:7,
+                               _uncompressed_size:7,
+                               _compressed_size:7,
+                               type:2;
+#endif
+} __attribute__((packed, aligned(8)));
+
+#define CRC32_SIZE_MAX         (1U << 7)
+#define CRC32_NONCE_MAX                0
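+
+/*
+ * With the +1 bias, a stored _compressed_size/_uncompressed_size of 0 means 1
+ * sector, so the 7 bit fields above cover sizes from 1 up to CRC32_SIZE_MAX
+ * (128) sectors.
+ */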
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:3,
+                               _compressed_size:9,
+                               _uncompressed_size:9,
+                               offset:9,
+                               nonce:10,
+                               csum_type:4,
+                               compression_type:4,
+                               csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   csum_hi:16,
+                               compression_type:4,
+                               csum_type:4,
+                               nonce:10,
+                               offset:9,
+                               _uncompressed_size:9,
+                               _compressed_size:9,
+                               type:3;
+#endif
+       __u64                   csum_lo;
+} __attribute__((packed, aligned(8)));
+
+#define CRC64_SIZE_MAX         (1U << 9)
+#define CRC64_NONCE_MAX                ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:4,
+                               _compressed_size:13,
+                               _uncompressed_size:13,
+                               offset:13,
+                               nonce:13,
+                               csum_type:4,
+                               compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   compression_type:4,
+                               csum_type:4,
+                               nonce:13,
+                               offset:13,
+                               _uncompressed_size:13,
+                               _compressed_size:13,
+                               type:4;
+#endif
+       struct bch_csum         csum;
+} __attribute__((packed, aligned(8)));
+
+#define CRC128_SIZE_MAX                (1U << 13)
+#define CRC128_NONCE_MAX       ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:1,
+                               cached:1,
+                               erasure_coded:1,
+                               reservation:1,
+                               offset:44, /* 8 petabytes */
+                               dev:8,
+                               gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   gen:8,
+                               dev:8,
+                               offset:44,
+                               reservation:1,
+                               erasure_coded:1,
+                               cached:1,
+                               type:1;
+#endif
+} __attribute__((packed, aligned(8)));
+
+struct bch_extent_reservation {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:5,
+                               unused:23,
+                               replicas:4,
+                               generation:32;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   generation:32,
+                               replicas:4,
+                               unused:23,
+                               type:5;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
+       unsigned long                   type;
+#elif __BITS_PER_LONG == 32
+       struct {
+               unsigned long           pad;
+               unsigned long           type;
+       };
+#else
+#error edit for your odd byteorder.
+#endif
+       struct bch_extent_crc32         crc32;
+       struct bch_extent_crc64         crc64;
+       struct bch_extent_crc128        crc128;
+       struct bch_extent_ptr           ptr;
+};
+
+enum {
+       BCH_EXTENT              = 128,
+
+       /*
+        * This is kind of a hack, we're overloading the type for a boolean that
+        * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED
+        * have the same value type:
+        */
+       BCH_EXTENT_CACHED       = 129,
+
+       /*
+        * Persistent reservation:
+        */
+       BCH_RESERVATION         = 130,
+};
+
+struct bch_extent {
+       struct bch_val          v;
+
+       __u64                   _data[0];
+       union bch_extent_entry  start[];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(extent,          BCH_EXTENT);
+
+struct bch_reservation {
+       struct bch_val          v;
+
+       __le32                  generation;
+       __u8                    nr_replicas;
+       __u8                    pad[3];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(reservation,     BCH_RESERVATION);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+       ((sizeof(struct bch_extent_crc128) +                    \
+         sizeof(struct bch_extent_ptr)) / sizeof(u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX                               \
+       (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+#define BKEY_PADDED(key)       __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX           (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX                            \
+       ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX)
+#define BKEY_BTREE_PTR_U64s_MAX                                        \
+       (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+/* Inodes */
+
+#define BLOCKDEV_INODE_MAX     4096
+
+#define BCACHEFS_ROOT_INO      4096
+
+enum bch_inode_types {
+       BCH_INODE_FS            = 128,
+       BCH_INODE_BLOCKDEV      = 129,
+       BCH_INODE_GENERATION    = 130,
+};
+
+struct bch_inode {
+       struct bch_val          v;
+
+       __le64                  bi_hash_seed;
+       __le32                  bi_flags;
+       __le16                  bi_mode;
+       __u8                    fields[0];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(inode,           BCH_INODE_FS);
+
+struct bch_inode_generation {
+       struct bch_val          v;
+
+       __le32                  bi_generation;
+       __le32                  pad;
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(inode_generation,        BCH_INODE_GENERATION);
+
+#define BCH_INODE_FIELDS()                                     \
+       BCH_INODE_FIELD(bi_atime,                       64)     \
+       BCH_INODE_FIELD(bi_ctime,                       64)     \
+       BCH_INODE_FIELD(bi_mtime,                       64)     \
+       BCH_INODE_FIELD(bi_otime,                       64)     \
+       BCH_INODE_FIELD(bi_size,                        64)     \
+       BCH_INODE_FIELD(bi_sectors,                     64)     \
+       BCH_INODE_FIELD(bi_uid,                         32)     \
+       BCH_INODE_FIELD(bi_gid,                         32)     \
+       BCH_INODE_FIELD(bi_nlink,                       32)     \
+       BCH_INODE_FIELD(bi_generation,                  32)     \
+       BCH_INODE_FIELD(bi_dev,                         32)     \
+       BCH_INODE_FIELD(bi_data_checksum,               8)      \
+       BCH_INODE_FIELD(bi_compression,                 8)      \
+       BCH_INODE_FIELD(bi_project,                     32)     \
+       BCH_INODE_FIELD(bi_background_compression,      8)      \
+       BCH_INODE_FIELD(bi_data_replicas,               8)      \
+       BCH_INODE_FIELD(bi_promote_target,              16)     \
+       BCH_INODE_FIELD(bi_foreground_target,           16)     \
+       BCH_INODE_FIELD(bi_background_target,           16)
+
+#define BCH_INODE_FIELDS_INHERIT()                             \
+       BCH_INODE_FIELD(bi_data_checksum)                       \
+       BCH_INODE_FIELD(bi_compression)                         \
+       BCH_INODE_FIELD(bi_project)                             \
+       BCH_INODE_FIELD(bi_background_compression)              \
+       BCH_INODE_FIELD(bi_data_replicas)                       \
+       BCH_INODE_FIELD(bi_promote_target)                      \
+       BCH_INODE_FIELD(bi_foreground_target)                   \
+       BCH_INODE_FIELD(bi_background_target)
+
+enum {
+       /*
+        * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
+        * flags)
+        */
+       __BCH_INODE_SYNC        = 0,
+       __BCH_INODE_IMMUTABLE   = 1,
+       __BCH_INODE_APPEND      = 2,
+       __BCH_INODE_NODUMP      = 3,
+       __BCH_INODE_NOATIME     = 4,
+
+       __BCH_INODE_I_SIZE_DIRTY= 5,
+       __BCH_INODE_I_SECTORS_DIRTY= 6,
+       __BCH_INODE_UNLINKED    = 7,
+
+       /* bits 20+ reserved for packed fields below: */
+};
+
+#define BCH_INODE_SYNC         (1 << __BCH_INODE_SYNC)
+#define BCH_INODE_IMMUTABLE    (1 << __BCH_INODE_IMMUTABLE)
+#define BCH_INODE_APPEND       (1 << __BCH_INODE_APPEND)
+#define BCH_INODE_NODUMP       (1 << __BCH_INODE_NODUMP)
+#define BCH_INODE_NOATIME      (1 << __BCH_INODE_NOATIME)
+#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
+#define BCH_INODE_UNLINKED     (1 << __BCH_INODE_UNLINKED)
+
+LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 32);
+
+struct bch_inode_blockdev {
+       struct bch_val          v;
+
+       __le64                  i_size;
+       __le64                  i_flags;
+
+       /* Seconds: */
+       __le64                  i_ctime;
+       __le64                  i_mtime;
+
+       __uuid_t                i_uuid;
+       __u8                    i_label[32];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(inode_blockdev,  BCH_INODE_BLOCKDEV);
+
+/* Thin provisioned volume, or cache for another block device? */
+LE64_BITMASK(CACHED_DEV,       struct bch_inode_blockdev, i_flags, 0,  1)
+
+/* Dirents */
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
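+
+/*
+ * A rough sketch of a lookup under this scheme (illustrative pseudocode, not
+ * the actual helpers):
+ *
+ *     k = lookup in the dirents btree at POS(dir_inum, hash(name));
+ *     loop over successive keys starting at k:
+ *             BCH_DIRENT with matching name     -> found
+ *             BCH_DIRENT_WHITEOUT or collision  -> keep probing (next slot)
+ *             empty slot                        -> not found
+ */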
+
+enum {
+       BCH_DIRENT              = 128,
+       BCH_DIRENT_WHITEOUT     = 129,
+};
+
+struct bch_dirent {
+       struct bch_val          v;
+
+       /* Target inode number: */
+       __le64                  d_inum;
+
+       /*
+        * Copy of mode bits 12-15 from the target inode - so userspace can get
+        * the filetype without having to do a stat()
+        */
+       __u8                    d_type;
+
+       __u8                    d_name[];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(dirent,          BCH_DIRENT);
+
+#define BCH_NAME_MAX   (U8_MAX * sizeof(u64) -                         \
+                        sizeof(struct bkey) -                          \
+                        offsetof(struct bch_dirent, d_name))
+
+
+/* Xattrs */
+
+enum {
+       BCH_XATTR               = 128,
+       BCH_XATTR_WHITEOUT      = 129,
+};
+
+#define BCH_XATTR_INDEX_USER                   0
+#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS       1
+#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT      2
+#define BCH_XATTR_INDEX_TRUSTED                        3
+#define BCH_XATTR_INDEX_SECURITY               4
+
+struct bch_xattr {
+       struct bch_val          v;
+       __u8                    x_type;
+       __u8                    x_name_len;
+       __le16                  x_val_len;
+       __u8                    x_name[];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(xattr,           BCH_XATTR);
+
+/* Bucket/allocation information: */
+
+enum {
+       BCH_ALLOC               = 128,
+};
+
+enum {
+       BCH_ALLOC_FIELD_READ_TIME       = 0,
+       BCH_ALLOC_FIELD_WRITE_TIME      = 1,
+};
+
+struct bch_alloc {
+       struct bch_val          v;
+       __u8                    fields;
+       __u8                    gen;
+       __u8                    data[];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(alloc,   BCH_ALLOC);
+
+/* Quotas: */
+
+enum {
+       BCH_QUOTA               = 128,
+};
+
+enum quota_types {
+       QTYP_USR                = 0,
+       QTYP_GRP                = 1,
+       QTYP_PRJ                = 2,
+       QTYP_NR                 = 3,
+};
+
+enum quota_counters {
+       Q_SPC                   = 0,
+       Q_INO                   = 1,
+       Q_COUNTERS              = 2,
+};
+
+struct bch_quota_counter {
+       __le64                  hardlimit;
+       __le64                  softlimit;
+};
+
+struct bch_quota {
+       struct bch_val          v;
+       struct bch_quota_counter c[Q_COUNTERS];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(quota,   BCH_QUOTA);
+
+/* Optional/variable size superblock sections: */
+
+struct bch_sb_field {
+       __u64                   _data[0];
+       __le32                  u64s;
+       __le32                  type;
+};
+
+#define BCH_SB_FIELDS()                \
+       x(journal,      0)      \
+       x(members,      1)      \
+       x(crypt,        2)      \
+       x(replicas,     3)      \
+       x(quota,        4)      \
+       x(disk_groups,  5)      \
+       x(clean,        6)
+
+enum bch_sb_field_type {
+#define x(f, nr)       BCH_SB_FIELD_##f = nr,
+       BCH_SB_FIELDS()
+#undef x
+       BCH_SB_FIELD_NR
+};
+
+/* BCH_SB_FIELD_journal: */
+
+struct bch_sb_field_journal {
+       struct bch_sb_field     field;
+       __le64                  buckets[0];
+};
+
+/* BCH_SB_FIELD_members: */
+
+struct bch_member {
+       __uuid_t                uuid;
+       __le64                  nbuckets;       /* device size */
+       __le16                  first_bucket;   /* index of first bucket used */
+       __le16                  bucket_size;    /* sectors */
+       __le32                  pad;
+       __le64                  last_mount;     /* time_t */
+
+       __le64                  flags[2];
+};
+
+LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags[0],  0,  4)
+/* 4-10 unused, was TIER, HAS_(META)DATA */
+LE64_BITMASK(BCH_MEMBER_REPLACEMENT,   struct bch_member, flags[0], 10, 14)
+LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags[0], 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY,    struct bch_member, flags[0], 28, 30)
+
+#define BCH_TIER_MAX                   4U
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+enum bch_member_state {
+       BCH_MEMBER_STATE_RW             = 0,
+       BCH_MEMBER_STATE_RO             = 1,
+       BCH_MEMBER_STATE_FAILED         = 2,
+       BCH_MEMBER_STATE_SPARE          = 3,
+       BCH_MEMBER_STATE_NR             = 4,
+};
+
+enum cache_replacement {
+       CACHE_REPLACEMENT_LRU           = 0,
+       CACHE_REPLACEMENT_FIFO          = 1,
+       CACHE_REPLACEMENT_RANDOM        = 2,
+       CACHE_REPLACEMENT_NR            = 3,
+};
+
+struct bch_sb_field_members {
+       struct bch_sb_field     field;
+       struct bch_member       members[0];
+};
+
+/* BCH_SB_FIELD_crypt: */
+
+struct nonce {
+       __le32                  d[4];
+};
+
+struct bch_key {
+       __le64                  key[4];
+};
+
+#define BCH_KEY_MAGIC                                  \
+       (((u64) 'b' <<  0)|((u64) 'c' <<  8)|           \
+        ((u64) 'h' << 16)|((u64) '*' << 24)|           \
+        ((u64) '*' << 32)|((u64) 'k' << 40)|           \
+        ((u64) 'e' << 48)|((u64) 'y' << 56))
+
+struct bch_encrypted_key {
+       __le64                  magic;
+       struct bch_key          key;
+};
+
+/*
+ * If this field is present in the superblock, it stores an encryption key which
+ * is used to encrypt all other data/metadata. The key will normally be encrypted
+ * with the key userspace provides, but if encryption has been turned off we'll
+ * just store the master key unencrypted in the superblock so we can access the
+ * previously encrypted data.
+ */
+struct bch_sb_field_crypt {
+       struct bch_sb_field     field;
+
+       __le64                  flags;
+       __le64                  kdf_flags;
+       struct bch_encrypted_key key;
+};
+
+LE64_BITMASK(BCH_CRYPT_KDF_TYPE,       struct bch_sb_field_crypt, flags, 0, 4);
+
+enum bch_kdf_types {
+       BCH_KDF_SCRYPT          = 0,
+       BCH_KDF_NR              = 1,
+};
+
+/* stored as base 2 log of scrypt params: */
+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags,  0, 16);
+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
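+
+#if 0
+/*
+ * Illustrative sketch (not built): since the scrypt parameters are stored as
+ * base 2 logs, recovering the actual N cost parameter looks like this. The
+ * helper name is hypothetical.
+ */
+static __u64 crypt_scrypt_N_example(struct bch_sb_field_crypt *crypt)
+{
+       return 1ULL << BCH_KDF_SCRYPT_N(crypt);
+}
+#endif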
+
+/* BCH_SB_FIELD_replicas: */
+
+enum bch_data_type {
+       BCH_DATA_NONE           = 0,
+       BCH_DATA_SB             = 1,
+       BCH_DATA_JOURNAL        = 2,
+       BCH_DATA_BTREE          = 3,
+       BCH_DATA_USER           = 4,
+       BCH_DATA_CACHED         = 5,
+       BCH_DATA_NR             = 6,
+};
+
+struct bch_replicas_entry {
+       u8                      data_type;
+       u8                      nr;
+       u8                      devs[];
+};
+
+struct bch_sb_field_replicas {
+       struct bch_sb_field     field;
+       struct bch_replicas_entry entries[];
+};
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+       __le32                          timelimit;
+       __le32                          warnlimit;
+};
+
+struct bch_sb_quota_type {
+       __le64                          flags;
+       struct bch_sb_quota_counter     c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+       struct bch_sb_field             field;
+       struct bch_sb_quota_type        q[QTYP_NR];
+} __attribute__((packed, aligned(8)));
+
+/* BCH_SB_FIELD_disk_groups: */
+
+#define BCH_SB_LABEL_SIZE              32
+
+struct bch_disk_group {
+       __u8                    label[BCH_SB_LABEL_SIZE];
+       __le64                  flags[2];
+};
+
+LE64_BITMASK(BCH_GROUP_DELETED,                struct bch_disk_group, flags[0], 0,  1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,   struct bch_disk_group, flags[0], 1,  6)
+LE64_BITMASK(BCH_GROUP_PARENT,         struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+       struct bch_sb_field     field;
+       struct bch_disk_group   entries[0];
+};
+
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+       __le16                  u64s;
+       __u8                    btree_id;
+       __u8                    level;
+       __u8                    type; /* designates what this jset holds */
+       __u8                    pad[3];
+
+       union {
+               struct bkey_i   start[0];
+               __u64           _data[0];
+       };
+};
+
+struct bch_sb_field_clean {
+       struct bch_sb_field     field;
+
+       __le32                  flags;
+       __le16                  read_clock;
+       __le16                  write_clock;
+       __le64                  journal_seq;
+
+       union {
+               struct jset_entry start[0];
+               __u64           _data[0];
+       };
+};
+
+/* Superblock: */
+
+/*
+ * Version 8:  BCH_SB_ENCODED_EXTENT_MAX_BITS
+ *             BCH_MEMBER_DATA_ALLOWED
+ * Version 9:  incompatible extent nonce change
+ */
+
+#define BCH_SB_VERSION_MIN             7
+#define BCH_SB_VERSION_EXTENT_MAX      8
+#define BCH_SB_VERSION_EXTENT_NONCE_V1 9
+#define BCH_SB_VERSION_MAX             9
+
+#define BCH_SB_SECTOR                  8
+#define BCH_SB_MEMBERS_MAX             64 /* XXX kill */
+
+struct bch_sb_layout {
+       __uuid_t                magic;  /* bcachefs superblock UUID */
+       __u8                    layout_type;
+       __u8                    sb_max_size_bits; /* log2 of size, in 512 byte sectors */
+       __u8                    nr_superblocks;
+       __u8                    pad[5];
+       __le64                  sb_offset[61];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_SB_LAYOUT_SECTOR   7
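+
+#if 0
+/*
+ * Illustrative sketch (not built): sb_max_size_bits is the base 2 log of the
+ * superblock size in 512 byte sectors, so the maximum size in bytes of each
+ * superblock slot works out as below. The helper name is hypothetical.
+ */
+static __u64 sb_max_size_bytes_example(const struct bch_sb_layout *l)
+{
+       return 512ULL << l->sb_max_size_bits;
+}
+#endif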
+
+/*
+ * @offset     - sector where this sb was written
+ * @version    - on disk format version
+ * @magic      - identifies as a bcachefs superblock (BCACHE_MAGIC)
+ * @seq                - identifies most recent superblock, incremented each time
+ *               superblock is written
+ * @uuid       - used for generating various magic numbers and identifying
+ *                member devices, never changes
+ * @user_uuid  - user visible UUID, may be changed
+ * @label      - filesystem label
+ * @features   - enabled incompatible features
+ */
+struct bch_sb {
+       struct bch_csum         csum;
+       __le16                  version;
+       __le16                  version_min;
+       __le16                  pad[2];
+       __uuid_t                magic;
+       __uuid_t                uuid;
+       __uuid_t                user_uuid;
+       __u8                    label[BCH_SB_LABEL_SIZE];
+       __le64                  offset;
+       __le64                  seq;
+
+       __le16                  block_size;
+       __u8                    dev_idx;
+       __u8                    nr_devices;
+       __le32                  u64s;
+
+       __le64                  time_base_lo;
+       __le32                  time_base_hi;
+       __le32                  time_precision;
+
+       __le64                  flags[8];
+       __le64                  features[2];
+       __le64                  compat[2];
+
+       struct bch_sb_layout    layout;
+
+       union {
+               struct bch_sb_field start[0];
+               __le64          _data[0];
+       };
+} __attribute__((packed, aligned(8)));
+
+/*
+ * Flags:
+ * BCH_SB_INITIALIZED  - set on first mount
+ * BCH_SB_CLEAN                - did we shut down cleanly? Just a hint, doesn't affect
+ *                       behaviour of mount/recovery path
+ * BCH_SB_INODE_32BIT  - limit inode numbers to 32 bits
+ * BCH_SB_128_BIT_MACS - 128 bit MACs instead of 80
+ * BCH_SB_ENCRYPTION_TYPE - if nonzero, encryption is enabled; overrides
+ *                        DATA/META_CSUM_TYPE. Also indicates encryption
+ *                        algorithm in use, if/when we get more than one
+ */
+
+LE16_BITMASK(BCH_SB_BLOCK_SIZE,                struct bch_sb, block_size, 0, 16);
+
+LE64_BITMASK(BCH_SB_INITIALIZED,       struct bch_sb, flags[0],  0,  1);
+LE64_BITMASK(BCH_SB_CLEAN,             struct bch_sb, flags[0],  1,  2);
+LE64_BITMASK(BCH_SB_CSUM_TYPE,         struct bch_sb, flags[0],  2,  8);
+LE64_BITMASK(BCH_SB_ERROR_ACTION,      struct bch_sb, flags[0],  8, 12);
+
+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,   struct bch_sb, flags[0], 12, 28);
+
+LE64_BITMASK(BCH_SB_GC_RESERVE,                struct bch_sb, flags[0], 28, 33);
+LE64_BITMASK(BCH_SB_ROOT_RESERVE,      struct bch_sb, flags[0], 33, 40);
+
+LE64_BITMASK(BCH_SB_META_CSUM_TYPE,    struct bch_sb, flags[0], 40, 44);
+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,    struct bch_sb, flags[0], 44, 48);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,        struct bch_sb, flags[0], 48, 52);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,        struct bch_sb, flags[0], 52, 56);
+
+LE64_BITMASK(BCH_SB_POSIX_ACL,         struct bch_sb, flags[0], 56, 57);
+LE64_BITMASK(BCH_SB_USRQUOTA,          struct bch_sb, flags[0], 57, 58);
+LE64_BITMASK(BCH_SB_GRPQUOTA,          struct bch_sb, flags[0], 58, 59);
+LE64_BITMASK(BCH_SB_PRJQUOTA,          struct bch_sb, flags[0], 59, 60);
+
+/* 60-64 unused */
+
+LE64_BITMASK(BCH_SB_STR_HASH_TYPE,     struct bch_sb, flags[1],  0,  4);
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE,  struct bch_sb, flags[1],  4,  8);
+LE64_BITMASK(BCH_SB_INODE_32BIT,       struct bch_sb, flags[1],  8,  9);
+
+LE64_BITMASK(BCH_SB_128_BIT_MACS,      struct bch_sb, flags[1],  9, 10);
+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,   struct bch_sb, flags[1], 10, 14);
+
+/*
+ * Max size of an extent that may require bouncing to read or write
+ * (checksummed, compressed): 64k
+ */
+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
+                                       struct bch_sb, flags[1], 14, 20);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
+
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET,    struct bch_sb, flags[1], 28, 40);
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
+
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
+                                       struct bch_sb, flags[2],  0,  4);
+
+/* Features: */
+enum bch_sb_features {
+       BCH_FEATURE_LZ4                 = 0,
+       BCH_FEATURE_GZIP                = 1,
+       BCH_FEATURE_ZSTD                = 2,
+       BCH_FEATURE_ATOMIC_NLINK        = 3,
+};
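+
+#if 0
+/*
+ * Illustrative sketch (not built): feature bits live in sb->features[];
+ * testing one looks roughly like this, assuming the first 64 features map to
+ * features[0] and so on. The helper name is hypothetical.
+ */
+static int sb_has_feature_example(const struct bch_sb *sb, unsigned f)
+{
+       return (__le64_to_cpu(sb->features[f / 64]) >> (f % 64)) & 1;
+}
+#endif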
+
+/* options: */
+
+#define BCH_REPLICAS_MAX               4U
+
+enum bch_error_actions {
+       BCH_ON_ERROR_CONTINUE           = 0,
+       BCH_ON_ERROR_RO                 = 1,
+       BCH_ON_ERROR_PANIC              = 2,
+       BCH_NR_ERROR_ACTIONS            = 3,
+};
+
+enum bch_csum_opts {
+       BCH_CSUM_OPT_NONE               = 0,
+       BCH_CSUM_OPT_CRC32C             = 1,
+       BCH_CSUM_OPT_CRC64              = 2,
+       BCH_CSUM_OPT_NR                 = 3,
+};
+
+enum bch_str_hash_opts {
+       BCH_STR_HASH_CRC32C             = 0,
+       BCH_STR_HASH_CRC64              = 1,
+       BCH_STR_HASH_SIPHASH            = 2,
+       BCH_STR_HASH_NR                 = 3,
+};
+
+#define BCH_COMPRESSION_TYPES()                \
+       x(NONE)                         \
+       x(LZ4)                          \
+       x(GZIP)                         \
+       x(ZSTD)
+
+enum bch_compression_opts {
+#define x(t) BCH_COMPRESSION_OPT_##t,
+       BCH_COMPRESSION_TYPES()
+#undef x
+       BCH_COMPRESSION_OPT_NR
+};
+
+/*
+ * Magic numbers
+ *
+ * The various other data structures have their own magic numbers, which are
+ * xored with the first 64 bits of the filesystem's internal UUID
+ */
+
+#define BCACHE_MAGIC                                                   \
+       UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,                           \
+                 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCHFS_MAGIC                                                    \
+       UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,                           \
+                 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
+
+#define BCACHEFS_STATFS_MAGIC          0xca451a4e
+
+#define JSET_MAGIC             __cpu_to_le64(0x245235c1a3625032ULL)
+#define BSET_MAGIC             __cpu_to_le64(0x90135c78b99e07f5ULL)
+
+static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
+{
+       __le64 ret;
+       memcpy(&ret, &sb->uuid, sizeof(ret));
+       return ret;
+}
+
+static inline __u64 __jset_magic(struct bch_sb *sb)
+{
+       return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
+}
+
+static inline __u64 __bset_magic(struct bch_sb *sb)
+{
+       return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
+}
+
+/* Journal */
+
+#define BCACHE_JSET_VERSION_UUIDv1     1
+#define BCACHE_JSET_VERSION_UUID       1       /* Always latest UUID format */
+#define BCACHE_JSET_VERSION_JKEYS      2
+#define BCACHE_JSET_VERSION            2
+
+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
+
+#define BCH_JSET_ENTRY_TYPES()                 \
+       x(btree_keys,           0)              \
+       x(btree_root,           1)              \
+       x(prio_ptrs,            2)              \
+       x(blacklist,            3)              \
+       x(blacklist_v2,         4)
+
+enum {
+#define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
+       BCH_JSET_ENTRY_TYPES()
+#undef x
+       BCH_JSET_ENTRY_NR
+};
+
+/*
+ * Journal sequence numbers can be blacklisted: bsets record the max sequence
+ * number of all the journal entries they contain updates for, so that on
+ * recovery we can ignore those bsets that contain index updates newer than what
+ * made it into the journal.
+ *
+ * This means that we can't reuse that journal_seq - we have to skip it, and
+ * then record that we skipped it so that the next time we crash and recover we
+ * don't think there was a missing journal entry.
+ */
+struct jset_entry_blacklist {
+       struct jset_entry       entry;
+       __le64                  seq;
+};
+
+struct jset_entry_blacklist_v2 {
+       struct jset_entry       entry;
+       __le64                  start;
+       __le64                  end;
+};
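+
+#if 0
+/*
+ * Illustrative sketch (not built): on recovery, a bset whose journal_seq is
+ * newer than the newest journal entry actually found is ignored; blacklisting
+ * the skipped sequence number then records that the resulting gap in journal
+ * sequence numbers was deliberate, not a lost entry. Names are hypothetical.
+ */
+static int bset_ignored_on_recovery_example(__u64 bset_journal_seq,
+                                           __u64 newest_journal_seq)
+{
+       return bset_journal_seq > newest_journal_seq;
+}
+#endif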
+
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+struct jset {
+       struct bch_csum         csum;
+
+       __le64                  magic;
+       __le64                  seq;
+       __le32                  version;
+       __le32                  flags;
+
+       __le32                  u64s; /* size of d[] in u64s */
+
+       __u8                    encrypted_start[0];
+
+       __le16                  read_clock;
+       __le16                  write_clock;
+
+       /* Sequence number of oldest dirty journal entry */
+       __le64                  last_seq;
+
+
+       union {
+               struct jset_entry start[0];
+               __u64           _data[0];
+       };
+} __attribute__((packed, aligned(8)));
+
+LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
+LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
+
+#define BCH_JOURNAL_BUCKETS_MIN                20
+
+/* Btree: */
+
+#define DEFINE_BCH_BTREE_IDS()                                 \
+       DEF_BTREE_ID(EXTENTS,   0, "extents")                   \
+       DEF_BTREE_ID(INODES,    1, "inodes")                    \
+       DEF_BTREE_ID(DIRENTS,   2, "dirents")                   \
+       DEF_BTREE_ID(XATTRS,    3, "xattrs")                    \
+       DEF_BTREE_ID(ALLOC,     4, "alloc")                     \
+       DEF_BTREE_ID(QUOTAS,    5, "quotas")
+
+#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
+
+enum btree_id {
+       DEFINE_BCH_BTREE_IDS()
+       BTREE_ID_NR
+};
+
+#undef DEF_BTREE_ID
+
+#define BTREE_MAX_DEPTH                4U
+
+/* Btree nodes */
+
+/* Version 1: Seed pointer into btree node checksum */
+#define BCACHE_BSET_CSUM               1
+#define BCACHE_BSET_KEY_v1             2
+#define BCACHE_BSET_JOURNAL_SEQ                3
+#define BCACHE_BSET_VERSION            3
+
+/*
+ * Btree nodes
+ *
+ * On disk a btree node is a list/log of these; within each set the keys are
+ * sorted
+ */
+struct bset {
+       __le64                  seq;
+
+       /*
+        * Highest journal entry this bset contains keys for.
+        * If on recovery we don't see that journal entry, this bset is ignored:
+        * this allows us to preserve the order of all index updates after a
+        * crash, since the journal records a total order of all index updates
+        * and anything that didn't make it to the journal doesn't get used.
+        */
+       __le64                  journal_seq;
+
+       __le32                  flags;
+       __le16                  version;
+       __le16                  u64s; /* count of d[] in u64s */
+
+       union {
+               struct bkey_packed start[0];
+               __u64           _data[0];
+       };
+} __attribute__((packed, aligned(8)));
+
+LE32_BITMASK(BSET_CSUM_TYPE,   struct bset, flags, 0, 4);
+
+LE32_BITMASK(BSET_BIG_ENDIAN,  struct bset, flags, 4, 5);
+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
+                               struct bset, flags, 5, 6);
+
+struct btree_node {
+       struct bch_csum         csum;
+       __le64                  magic;
+
+       /* this flags field is encrypted, unlike bset->flags: */
+       __le64                  flags;
+
+       /* Closed interval: */
+       struct bpos             min_key;
+       struct bpos             max_key;
+       struct bch_extent_ptr   ptr;
+       struct bkey_format      format;
+
+       union {
+       struct bset             keys;
+       struct {
+               __u8            pad[22];
+               __le16          u64s;
+               __u64           _data[0];
+
+       };
+       };
+} __attribute__((packed, aligned(8)));
+
+LE64_BITMASK(BTREE_NODE_ID,    struct btree_node, flags,  0,  4);
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags,  4,  8);
+/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ,   struct btree_node, flags, 32, 64);
+
+struct btree_node_entry {
+       struct bch_csum         csum;
+
+       union {
+       struct bset             keys;
+       struct {
+               __u8            pad[22];
+               __le16          u64s;
+               __u64           _data[0];
+
+       };
+       };
+} __attribute__((packed, aligned(8)));
+
+#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
new file mode 100644 (file)
index 0000000..c65104e
--- /dev/null
@@ -0,0 +1,310 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
+
+#include <linux/uuid.h>
+#include <asm/ioctl.h>
+#include "bcachefs_format.h"
+
+/*
+ * Flags common to multiple ioctls:
+ */
+#define BCH_FORCE_IF_DATA_LOST         (1 << 0)
+#define BCH_FORCE_IF_METADATA_LOST     (1 << 1)
+#define BCH_FORCE_IF_DATA_DEGRADED     (1 << 2)
+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
+
+#define BCH_FORCE_IF_DEGRADED                  \
+       (BCH_FORCE_IF_DATA_DEGRADED|            \
+        BCH_FORCE_IF_METADATA_DEGRADED)
+
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
+#define BCH_BY_INDEX                   (1 << 4)
+
+/*
+ * For BCH_IOCTL_READ_SUPER: get the superblock of a specific device, not the
+ * filesystem-wide superblock:
+ */
+#define BCH_READ_DEV                   (1 << 5)
+
+/* global control dev: */
+
+/* These are currently broken, and probably unnecessary: */
+#if 0
+#define BCH_IOCTL_ASSEMBLE     _IOW(0xbc, 1, struct bch_ioctl_assemble)
+#define BCH_IOCTL_INCREMENTAL  _IOW(0xbc, 2, struct bch_ioctl_incremental)
+
+struct bch_ioctl_assemble {
+       __u32                   flags;
+       __u32                   nr_devs;
+       __u64                   pad;
+       __u64                   devs[];
+};
+
+struct bch_ioctl_incremental {
+       __u32                   flags;
+       __u64                   pad;
+       __u64                   dev;
+};
+#endif
+
+/* filesystem ioctls: */
+
+#define BCH_IOCTL_QUERY_UUID   _IOR(0xbc,      1,  struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
+#define BCH_IOCTL_START                _IOW(0xbc,      2,  struct bch_ioctl_start)
+#define BCH_IOCTL_STOP         _IO(0xbc,       3)
+#endif
+
+#define BCH_IOCTL_DISK_ADD     _IOW(0xbc,      4,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE  _IOW(0xbc,      5,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE  _IOW(0xbc,      6,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc,      7,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,    8,  struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DATA         _IOW(0xbc,      10, struct bch_ioctl_data)
+#define BCH_IOCTL_USAGE                _IOWR(0xbc,     11, struct bch_ioctl_usage)
+#define BCH_IOCTL_READ_SUPER   _IOW(0xbc,      12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc,      13, struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE  _IOW(0xbc,      14, struct bch_ioctl_disk_resize)
+
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
+struct bch_ioctl_query_uuid {
+       __uuid_t                uuid;
+};
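+
+#if 0
+/*
+ * Illustrative sketch (not built): userspace usage, needing <sys/ioctl.h>.
+ * Assumes @fs_fd is a file descriptor for the filesystem's control device,
+ * however that is obtained; the returned UUID is the user visible one that
+ * names the filesystem's sysfs directory.
+ */
+static int query_uuid_example(int fs_fd, __uuid_t *uuid)
+{
+       struct bch_ioctl_query_uuid u;
+       int ret = ioctl(fs_fd, BCH_IOCTL_QUERY_UUID, &u);
+
+       if (!ret)
+               *uuid = u.uuid;
+       return ret;
+}
+#endif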
+
+#if 0
+struct bch_ioctl_start {
+       __u32                   flags;
+       __u32                   pad;
+};
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either offline or online.
+ *
+ * Will fail if removing @dev would leave us with insufficient read/write
+ * devices or degraded/unavailable data, unless the appropriate
+ * BCH_FORCE_IF_* flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * All existing data on @dev will be available once the device is online,
+ * exactly as if @dev had been present when the filesystem was first mounted
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read/write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+
+struct bch_ioctl_disk {
+       __u32                   flags;
+       __u32                   pad;
+       __u64                   dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state          - one of the bch_member_state states (rw, ro, failed,
+ *                       spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+       __u32                   flags;
+       __u8                    new_state;
+       __u8                    pad[3];
+       __u64                   dev;
+};
+
+enum bch_data_ops {
+       BCH_DATA_OP_SCRUB       = 0,
+       BCH_DATA_OP_REREPLICATE = 1,
+       BCH_DATA_OP_MIGRATE     = 2,
+       BCH_DATA_OP_NR          = 3,
+};
+
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
+struct bch_ioctl_data {
+       __u32                   op;
+       __u32                   flags;
+
+       struct bpos             start;
+       struct bpos             end;
+
+       union {
+       struct {
+               __u32           dev;
+               __u32           pad;
+       }                       migrate;
+       struct {
+               __u64           pad[8];
+       };
+       };
+} __attribute__((packed, aligned(8)));
+
+enum bch_data_event {
+       BCH_DATA_EVENT_PROGRESS = 0,
+       /* XXX: add an event for reporting errors */
+       BCH_DATA_EVENT_NR       = 1,
+};
+
+struct bch_ioctl_data_progress {
+       __u8                    data_type;
+       __u8                    btree_id;
+       __u8                    pad[2];
+       struct bpos             pos;
+
+       __u64                   sectors_done;
+       __u64                   sectors_total;
+} __attribute__((packed, aligned(8)));
+
+struct bch_ioctl_data_event {
+       __u8                    type;
+       __u8                    pad[7];
+       union {
+       struct bch_ioctl_data_progress p;
+       __u64                   pad2[15];
+       };
+} __attribute__((packed, aligned(8)));
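+
+#if 0
+/*
+ * Illustrative sketch (not built): userspace usage, needing <sys/ioctl.h> and
+ * <unistd.h>. Kicks off a rereplicate job over the whole keyspace and polls
+ * its progress. @fs_fd is assumed to be a file descriptor for the
+ * filesystem's control device; POS_MIN/POS_MAX come from bcachefs_format.h
+ * and stand for the smallest/largest key positions.
+ */
+static void rereplicate_example(int fs_fd)
+{
+       struct bch_ioctl_data arg = { .op = BCH_DATA_OP_REREPLICATE };
+       struct bch_ioctl_data_event e;
+       int job_fd;
+
+       arg.start       = POS_MIN;
+       arg.end         = POS_MAX;
+
+       job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &arg);
+       if (job_fd < 0)
+               return;
+
+       while (read(job_fd, &e, sizeof(e)) == sizeof(e) &&
+              e.type == BCH_DATA_EVENT_PROGRESS &&
+              e.p.sectors_done < e.p.sectors_total)
+               ;
+
+       /* closing the job fd stops the job */
+       close(job_fd);
+}
+#endif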
+
+struct bch_ioctl_dev_usage {
+       __u8                    state;
+       __u8                    alive;
+       __u8                    pad[6];
+       __u32                   dev;
+
+       __u32                   bucket_size;
+       __u64                   nr_buckets;
+
+       __u64                   buckets[BCH_DATA_NR];
+       __u64                   sectors[BCH_DATA_NR];
+};
+
+struct bch_ioctl_fs_usage {
+       __u64                   capacity;
+       __u64                   used;
+       __u64                   online_reserved;
+       __u64                   persistent_reserved[BCH_REPLICAS_MAX];
+       __u64                   sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
+};
+
+/*
+ * BCH_IOCTL_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @nr_devices - number of devices userspace allocated space for in @devs
+ *
+ * On success, @fs and @devs will be filled out appropriately and devs[i].alive
+ * will indicate if a device was present in that slot
+ *
+ * Returns -ERANGE if @nr_devices was too small
+ */
+struct bch_ioctl_usage {
+       __u16                   nr_devices;
+       __u16                   pad[3];
+
+       struct bch_ioctl_fs_usage fs;
+       struct bch_ioctl_dev_usage devs[0];
+};
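+
+#if 0
+/*
+ * Illustrative sketch (not built): userspace usage, needing <sys/ioctl.h> and
+ * <stdlib.h>. The devs[] flexible array means the caller sizes the buffer by
+ * hand; @fs_fd is assumed to be a file descriptor for the filesystem's
+ * control device.
+ */
+static struct bch_ioctl_usage *usage_example(int fs_fd, unsigned nr)
+{
+       struct bch_ioctl_usage *u =
+               calloc(1, sizeof(*u) + nr * sizeof(u->devs[0]));
+
+       if (!u)
+               return NULL;
+
+       u->nr_devices = nr;
+       if (ioctl(fs_fd, BCH_IOCTL_USAGE, u)) {
+               /* errno == ERANGE means nr was too small */
+               free(u);
+               return NULL;
+       }
+       return u;
+}
+#endif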
+
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb         - buffer to read into
+ * @size       - size of userspace allocated buffer
+ * @dev                - device to read superblock for, if BCH_READ_DEV flag is
+ *               specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
+struct bch_ioctl_read_super {
+       __u32                   flags;
+       __u32                   pad;
+       __u64                   dev;
+       __u64                   size;
+       __u64                   sb;
+};
+
+/*
+ * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query the filesystem
+ * to determine if that disk is an (online) member - if so, returns the device's
+ * index
+ *
+ * Returns -ENOENT if not found
+ */
+struct bch_ioctl_disk_get_idx {
+       __u64                   dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev                - member to resize
+ * @nbuckets   - new number of buckets
+ */
+struct bch_ioctl_disk_resize {
+       __u32                   flags;
+       __u32                   pad;
+       __u64                   dev;
+       __u64                   nbuckets;
+};
+
+#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
new file mode 100644 (file)
index 0000000..c0e86ad
--- /dev/null
@@ -0,0 +1,1164 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "bset.h"
+#include "util.h"
+
+#undef EBUG_ON
+
+#ifdef DEBUG_BKEYS
+#define EBUG_ON(cond)          BUG_ON(cond)
+#else
+#define EBUG_ON(cond)
+#endif
+
+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
+                             const struct bkey_packed *);
+
+void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits)
+{
+       unsigned bit = high_bit_offset, done = 0;
+
+       while (1) {
+               while (bit < 64) {
+                       if (done && !(done % 8))
+                               *out++ = ' ';
+                       *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
+                       bit++;
+                       done++;
+                       if (done == nr_bits) {
+                               *out++ = '\0';
+                               return;
+                       }
+               }
+
+               p = next_word(p);
+               bit = 0;
+       }
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+                                const struct bkey *unpacked,
+                                const struct bkey_format *format)
+{
+       struct bkey tmp;
+
+       BUG_ON(bkeyp_val_u64s(format, packed) !=
+              bkey_val_u64s(unpacked));
+
+       BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
+
+       tmp = __bch2_bkey_unpack_key(format, packed);
+
+       if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
+               char buf1[160], buf2[160];
+               char buf3[160], buf4[160];
+
+               bch2_bkey_to_text(buf1, sizeof(buf1), unpacked);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &tmp);
+               bch2_to_binary(buf3, (void *) unpacked, 80);
+               bch2_to_binary(buf4, high_word(format, packed), 80);
+
+               panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+                     format->key_u64s,
+                     format->bits_per_field[0],
+                     format->bits_per_field[1],
+                     format->bits_per_field[2],
+                     format->bits_per_field[3],
+                     format->bits_per_field[4],
+                     buf1, buf2, buf3, buf4);
+       }
+}
+
+#else
+static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+                                       const struct bkey *unpacked,
+                                       const struct bkey_format *format) {}
+#endif
+
+struct pack_state {
+       const struct bkey_format *format;
+       unsigned                bits;   /* bits remaining in current word */
+       u64                     w;      /* current word */
+       u64                     *p;     /* pointer to next word */
+};
+
+__always_inline
+static struct pack_state pack_state_init(const struct bkey_format *format,
+                                        struct bkey_packed *k)
+{
+       u64 *p = high_word(format, k);
+
+       return (struct pack_state) {
+               .format = format,
+               .bits   = 64 - high_bit_offset,
+               .w      = 0,
+               .p      = p,
+       };
+}
+
+__always_inline
+static void pack_state_finish(struct pack_state *state,
+                             struct bkey_packed *k)
+{
+       EBUG_ON(state->p <  k->_data);
+       EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+
+       *state->p = state->w;
+}
+
+struct unpack_state {
+       const struct bkey_format *format;
+       unsigned                bits;   /* bits remaining in current word */
+       u64                     w;      /* current word */
+       const u64               *p;     /* pointer to next word */
+};
+
+__always_inline
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
+                                            const struct bkey_packed *k)
+{
+       const u64 *p = high_word(format, k);
+
+       return (struct unpack_state) {
+               .format = format,
+               .bits   = 64 - high_bit_offset,
+               .w      = *p << high_bit_offset,
+               .p      = p,
+       };
+}
+
+__always_inline
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
+{
+       unsigned bits = state->format->bits_per_field[field];
+       u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
+
+       if (bits >= state->bits) {
+               v = state->w >> (64 - bits);
+               bits -= state->bits;
+
+               state->p = next_word(state->p);
+               state->w = *state->p;
+               state->bits = 64;
+       }
+
+       /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+       v |= (state->w >> 1) >> (63 - bits);
+       state->w <<= bits;
+       state->bits -= bits;
+
+       return v + offset;
+}
+
+__always_inline
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+       unsigned bits = state->format->bits_per_field[field];
+       u64 offset = le64_to_cpu(state->format->field_offset[field]);
+
+       if (v < offset)
+               return false;
+
+       v -= offset;
+
+       if (fls64(v) > bits)
+               return false;
+
+       if (bits > state->bits) {
+               bits -= state->bits;
+               /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+               state->w |= (v >> 1) >> (bits - 1);
+
+               *state->p = state->w;
+               state->p = next_word(state->p);
+               state->w = 0;
+               state->bits = 64;
+       }
+
+       state->bits -= bits;
+       state->w |= v << state->bits;
+
+       return true;
+}
+
+/*
+ * Note: does NOT set out->format (we don't know what it should be here!)
+ *
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
+ * if k is packed bkey_start_pos(k) will successfully pack
+ */
+static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
+                                  struct bkey_packed *out,
+                                  const struct bkey_format *in_f,
+                                  const struct bkey_packed *in)
+{
+       struct pack_state out_s = pack_state_init(out_f, out);
+       struct unpack_state in_s = unpack_state_init(in_f, in);
+       u64 *w = out->_data;
+       unsigned i;
+
+       *w = 0;
+
+       for (i = 0; i < BKEY_NR_FIELDS; i++)
+               if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
+                       return false;
+
+       /* Can't happen because the val would be too big to unpack: */
+       EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
+       pack_state_finish(&out_s, out);
+       out->u64s       = out_f->key_u64s + in->u64s - in_f->key_u64s;
+       out->needs_whiteout = in->needs_whiteout;
+       out->type       = in->type;
+
+       return true;
+}
+
+bool bch2_bkey_transform(const struct bkey_format *out_f,
+                       struct bkey_packed *out,
+                       const struct bkey_format *in_f,
+                       const struct bkey_packed *in)
+{
+       if (!bch2_bkey_transform_key(out_f, out, in_f, in))
+               return false;
+
+       memcpy_u64s((u64 *) out + out_f->key_u64s,
+                   (u64 *) in + in_f->key_u64s,
+                   (in->u64s - in_f->key_u64s));
+       return true;
+}
+
+#define bkey_fields()                                                  \
+       x(BKEY_FIELD_INODE,             p.inode)                        \
+       x(BKEY_FIELD_OFFSET,            p.offset)                       \
+       x(BKEY_FIELD_SNAPSHOT,          p.snapshot)                     \
+       x(BKEY_FIELD_SIZE,              size)                           \
+       x(BKEY_FIELD_VERSION_HI,        version.hi)                     \
+       x(BKEY_FIELD_VERSION_LO,        version.lo)
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
+                             const struct bkey_packed *in)
+{
+       struct unpack_state state = unpack_state_init(format, in);
+       struct bkey out;
+
+       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+       EBUG_ON(in->u64s < format->key_u64s);
+       EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+       EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+
+       out.u64s        = BKEY_U64s + in->u64s - format->key_u64s;
+       out.format      = KEY_FORMAT_CURRENT;
+       out.needs_whiteout = in->needs_whiteout;
+       out.type        = in->type;
+       out.pad[0]      = 0;
+
+#define x(id, field)   out.field = get_inc_field(&state, id);
+       bkey_fields()
+#undef x
+
+       return out;
+}
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
+                                    const struct bkey_packed *in)
+{
+       struct unpack_state state = unpack_state_init(format, in);
+       struct bpos out;
+
+       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+       EBUG_ON(in->u64s < format->key_u64s);
+       EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+       out.inode       = get_inc_field(&state, BKEY_FIELD_INODE);
+       out.offset      = get_inc_field(&state, BKEY_FIELD_OFFSET);
+       out.snapshot    = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+
+       return out;
+}
+#endif
+
+/**
+ * bch2_bkey_pack_key -- pack just the key, not the value
+ */
+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+                  const struct bkey_format *format)
+{
+       struct pack_state state = pack_state_init(format, out);
+       u64 *w = out->_data;
+
+       EBUG_ON((void *) in == (void *) out);
+       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+       EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+       *w = 0;
+
+#define x(id, field)   if (!set_inc_field(&state, id, in->field)) return false;
+       bkey_fields()
+#undef x
+
+       /*
+        * Extents - we have to guarantee that if an extent is packed, a trimmed
+        * version will also pack:
+        */
+       if (bkey_start_offset(in) <
+           le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
+               return false;
+
+       pack_state_finish(&state, out);
+       out->u64s       = format->key_u64s + in->u64s - BKEY_U64s;
+       out->format     = KEY_FORMAT_LOCAL_BTREE;
+       out->needs_whiteout = in->needs_whiteout;
+       out->type       = in->type;
+
+       bch2_bkey_pack_verify(out, in, format);
+       return true;
+}
+
+/**
+ * bch2_bkey_unpack -- unpack the key and the value
+ */
+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
+                const struct bkey_packed *src)
+{
+       dst->k = bkey_unpack_key(b, src);
+
+       memcpy_u64s(&dst->v,
+                   bkeyp_val(&b->format, src),
+                   bkeyp_val_u64s(&b->format, src));
+}
+
+/**
+ * bch2_bkey_pack -- pack the key and the value
+ */
+bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
+              const struct bkey_format *format)
+{
+       struct bkey_packed tmp;
+
+       if (!bch2_bkey_pack_key(&tmp, &in->k, format))
+               return false;
+
+       memmove_u64s((u64 *) out + format->key_u64s,
+                    &in->v,
+                    bkey_val_u64s(&in->k));
+       memcpy_u64s(out, &tmp, format->key_u64s);
+
+       return true;
+}
+
+__always_inline
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
+{
+       unsigned bits = state->format->bits_per_field[field];
+       u64 offset = le64_to_cpu(state->format->field_offset[field]);
+       bool ret = true;
+
+       EBUG_ON(v < offset);
+       v -= offset;
+
+       if (fls64(v) > bits) {
+               v = ~(~0ULL << bits);
+               ret = false;
+       }
+
+       if (bits > state->bits) {
+               bits -= state->bits;
+               state->w |= (v >> 1) >> (bits - 1);
+
+               *state->p = state->w;
+               state->p = next_word(state->p);
+               state->w = 0;
+               state->bits = 64;
+       }
+
+       state->bits -= bits;
+       state->w |= v << state->bits;
+
+       return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool bkey_packed_successor(struct bkey_packed *out,
+                                 const struct btree *b,
+                                 struct bkey_packed k)
+{
+       const struct bkey_format *f = &b->format;
+       unsigned nr_key_bits = b->nr_key_bits;
+       unsigned first_bit, offset;
+       u64 *p;
+
+       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+       if (!nr_key_bits)
+               return false;
+
+       *out = k;
+
+       first_bit = high_bit_offset + nr_key_bits - 1;
+       p = nth_word(high_word(f, out), first_bit >> 6);
+       offset = 63 - (first_bit & 63);
+
+       while (nr_key_bits) {
+               unsigned bits = min(64 - offset, nr_key_bits);
+               u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+               if ((*p & mask) != mask) {
+                       *p += 1ULL << offset;
+                       EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+                       return true;
+               }
+
+               *p &= ~mask;
+               p = prev_word(p);
+               nr_key_bits -= bits;
+               offset = 0;
+       }
+
+       return false;
+}
+#endif
+
+/*
+ * Returns a packed key that compares <= in
+ *
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
+ * able to compare against the keys in the auxiliary search tree - and it's
+ * legal to use a packed pos that isn't equivalent to the original pos,
+ * _provided_ it compares <= to the original pos.
+ */
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
+                                          struct bpos in,
+                                          const struct btree *b)
+{
+       const struct bkey_format *f = &b->format;
+       struct pack_state state = pack_state_init(f, out);
+       u64 *w = out->_data;
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct bpos orig = in;
+#endif
+       bool exact = true;
+
+       *w = 0;
+
+       if (unlikely(in.snapshot <
+                    le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
+               if (!in.offset-- &&
+                   !in.inode--)
+                       return BKEY_PACK_POS_FAIL;
+               in.snapshot     = KEY_SNAPSHOT_MAX;
+               exact = false;
+       }
+
+       if (unlikely(in.offset <
+                    le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
+               if (!in.inode--)
+                       return BKEY_PACK_POS_FAIL;
+               in.offset       = KEY_OFFSET_MAX;
+               in.snapshot     = KEY_SNAPSHOT_MAX;
+               exact = false;
+       }
+
+       if (unlikely(in.inode <
+                    le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
+               return BKEY_PACK_POS_FAIL;
+
+       if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
+               in.offset       = KEY_OFFSET_MAX;
+               in.snapshot     = KEY_SNAPSHOT_MAX;
+               exact = false;
+       }
+
+       if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
+               in.snapshot     = KEY_SNAPSHOT_MAX;
+               exact = false;
+       }
+
+       if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
+               exact = false;
+
+       pack_state_finish(&state, out);
+       out->u64s       = f->key_u64s;
+       out->format     = KEY_FORMAT_LOCAL_BTREE;
+       out->type       = KEY_TYPE_DELETED;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       if (exact) {
+               BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+       } else {
+               struct bkey_packed successor;
+
+               BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+               BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+                      bkey_cmp_left_packed(b, &successor, &orig) < 0);
+       }
+#endif
+
+       return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
+}
+
+void bch2_bkey_format_init(struct bkey_format_state *s)
+{
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
+               s->field_min[i] = U64_MAX;
+
+       for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
+               s->field_max[i] = 0;
+
+       /* Make sure we can store a size of 0: */
+       s->field_min[BKEY_FIELD_SIZE] = 0;
+}
+
+static void __bkey_format_add(struct bkey_format_state *s,
+                             unsigned field, u64 v)
+{
+       s->field_min[field] = min(s->field_min[field], v);
+       s->field_max[field] = max(s->field_max[field], v);
+}
+
+/*
+ * Changes @format so that @k can be successfully packed with @format
+ */
+void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+#define x(id, field) __bkey_format_add(s, id, k->field);
+       bkey_fields()
+#undef x
+       __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
+{
+       unsigned field = 0;
+
+       __bkey_format_add(s, field++, p.inode);
+       __bkey_format_add(s, field++, p.offset);
+       __bkey_format_add(s, field++, p.snapshot);
+}
+
+/*
+ * We don't want it to be possible for the packed format to represent fields
+ * bigger than a u64... that will cause confusion and issues (like with
+ * bkey_packed_successor())
+ */
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
+                            unsigned bits, u64 offset)
+{
+       offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
+
+       f->bits_per_field[i]    = bits;
+       f->field_offset[i]      = cpu_to_le64(offset);
+}
+
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
+{
+       unsigned i, bits = KEY_PACKED_BITS_START;
+       struct bkey_format ret = {
+               .nr_fields = BKEY_NR_FIELDS,
+       };
+
+       for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
+               s->field_min[i] = min(s->field_min[i], s->field_max[i]);
+
+               set_format_field(&ret, i,
+                                fls64(s->field_max[i] - s->field_min[i]),
+                                s->field_min[i]);
+
+               bits += ret.bits_per_field[i];
+       }
+
+       /* allow for extent merging: */
+       if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
+               ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
+               bits += 4;
+       }
+
+       ret.key_u64s = DIV_ROUND_UP(bits, 64);
+
+       /* if we have enough spare bits, round fields up to nearest byte */
+       bits = ret.key_u64s * 64 - bits;
+
+       for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
+               unsigned r = round_up(ret.bits_per_field[i], 8) -
+                       ret.bits_per_field[i];
+
+               if (r <= bits) {
+                       set_format_field(&ret, i,
+                                        ret.bits_per_field[i] + r,
+                                        le64_to_cpu(ret.field_offset[i]));
+                       bits -= r;
+               }
+       }
+
+       EBUG_ON(bch2_bkey_format_validate(&ret));
+       return ret;
+}
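+
+#if 0
+/*
+ * Illustrative sketch (not built): roughly how a packed key format for a
+ * btree node might be constructed - from the node's min/max bounds plus the
+ * keys it will contain, so that every key in that range is guaranteed to
+ * pack. The function name and calling convention here are hypothetical.
+ */
+static struct bkey_format format_for_keys_example(const struct bkey *keys,
+                                                 unsigned nr_keys,
+                                                 struct bpos min_key,
+                                                 struct bpos max_key)
+{
+       struct bkey_format_state s;
+       unsigned i;
+
+       bch2_bkey_format_init(&s);
+       bch2_bkey_format_add_pos(&s, min_key);
+       bch2_bkey_format_add_pos(&s, max_key);
+
+       for (i = 0; i < nr_keys; i++)
+               bch2_bkey_format_add_key(&s, &keys[i]);
+
+       return bch2_bkey_format_done(&s);
+}
+#endif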
+
+const char *bch2_bkey_format_validate(struct bkey_format *f)
+{
+       unsigned i, bits = KEY_PACKED_BITS_START;
+
+       if (f->nr_fields != BKEY_NR_FIELDS)
+               return "incorrect number of fields";
+
+       for (i = 0; i < f->nr_fields; i++) {
+               u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+               if (f->bits_per_field[i] > 64)
+                       return "field too large";
+
+               if (field_offset &&
+                   (f->bits_per_field[i] == 64 ||
+                   (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
+                    field_offset)))
+                       return "offset + bits overflow";
+
+               bits += f->bits_per_field[i];
+       }
+
+       if (f->key_u64s != DIV_ROUND_UP(bits, 64))
+               return "incorrect key_u64s";
+
+       return NULL;
+}
+
+/*
+ * Most significant differing bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
+                                         const struct bkey_packed *l_k,
+                                         const struct bkey_packed *r_k)
+{
+       const u64 *l = high_word(&b->format, l_k);
+       const u64 *r = high_word(&b->format, r_k);
+       unsigned nr_key_bits = b->nr_key_bits;
+       unsigned word_bits = 64 - high_bit_offset;
+       u64 l_v, r_v;
+
+       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+       /* for big endian, skip past header */
+       l_v = *l & (~0ULL >> high_bit_offset);
+       r_v = *r & (~0ULL >> high_bit_offset);
+
+       while (nr_key_bits) {
+               if (nr_key_bits < word_bits) {
+                       l_v >>= word_bits - nr_key_bits;
+                       r_v >>= word_bits - nr_key_bits;
+                       nr_key_bits = 0;
+               } else {
+                       nr_key_bits -= word_bits;
+               }
+
+               if (l_v != r_v)
+                       return fls64(l_v ^ r_v) - 1 + nr_key_bits;
+
+               l = next_word(l);
+               r = next_word(r);
+
+               l_v = *l;
+               r_v = *r;
+               word_bits = 64;
+       }
+
+       return 0;
+}
+
+/*
+ * First set bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
+{
+       const u64 *p = high_word(&b->format, k);
+       unsigned nr_key_bits = b->nr_key_bits;
+       unsigned ret = 0, offset;
+
+       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+       offset = nr_key_bits;
+       while (offset > 64) {
+               p = next_word(p);
+               offset -= 64;
+       }
+
+       offset = 64 - offset;
+
+       while (nr_key_bits) {
+               unsigned bits = nr_key_bits + offset < 64
+                       ? nr_key_bits
+                       : 64 - offset;
+
+               u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+               if (*p & mask)
+                       return ret + __ffs64(*p & mask) - offset;
+
+               p = prev_word(p);
+               nr_key_bits -= bits;
+               ret += bits;
+               offset = 0;
+       }
+
+       return 0;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+                                 unsigned nr_key_bits)
+{
+       long d0, d1, d2, d3;
+       int cmp;
+
+       /* we shouldn't need asm for this, but gcc generates poor code for it: */
+
+       asm(".intel_syntax noprefix;"
+           "xor eax, eax;"
+           "xor edx, edx;"
+           "1:;"
+           "mov r8, [rdi];"
+           "mov r9, [rsi];"
+           "sub ecx, 64;"
+           "jl 2f;"
+
+           "cmp r8, r9;"
+           "jnz 3f;"
+
+           "lea rdi, [rdi - 8];"
+           "lea rsi, [rsi - 8];"
+           "jmp 1b;"
+
+           "2:;"
+           "not ecx;"
+           "shr r8, 1;"
+           "shr r9, 1;"
+           "shr r8, cl;"
+           "shr r9, cl;"
+           "cmp r8, r9;"
+
+           "3:\n"
+           "seta al;"
+           "setb dl;"
+           "sub eax, edx;"
+           ".att_syntax prefix;"
+           : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+           : "0" (l), "1" (r), "3" (nr_key_bits)
+           : "r8", "r9", "cc", "memory");
+
+       return cmp;
+}
+
+#define I(_x)                  (*(out)++ = (_x))
+#define I1(i0)                                         I(i0)
+#define I2(i0, i1)             (I1(i0),                I(i1))
+#define I3(i0, i1, i2)         (I2(i0, i1),            I(i2))
+#define I4(i0, i1, i2, i3)     (I3(i0, i1, i2),        I(i3))
+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3),    I(i4))
+
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
+                             enum bch_bkey_fields field,
+                             unsigned dst_offset, unsigned dst_size,
+                             bool *eax_zeroed)
+{
+       unsigned bits = format->bits_per_field[field];
+       u64 offset = le64_to_cpu(format->field_offset[field]);
+       unsigned i, byte, bit_offset, align, shl, shr;
+
+       if (!bits && !offset) {
+               if (!*eax_zeroed) {
+                       /* xor eax, eax */
+                       I2(0x31, 0xc0);
+               }
+
+               *eax_zeroed = true;
+               goto set_field;
+       }
+
+       if (!bits) {
+               /* just return offset: */
+
+               switch (dst_size) {
+               case 8:
+                       if (offset > S32_MAX) {
+                               /* mov [rdi + dst_offset], offset */
+                               I3(0xc7, 0x47, dst_offset);
+                               memcpy(out, &offset, 4);
+                               out += 4;
+
+                               I3(0xc7, 0x47, dst_offset + 4);
+                               memcpy(out, (void *) &offset + 4, 4);
+                               out += 4;
+                       } else {
+                               /* mov [rdi + dst_offset], offset */
+                               /* sign extended */
+                               I4(0x48, 0xc7, 0x47, dst_offset);
+                               memcpy(out, &offset, 4);
+                               out += 4;
+                       }
+                       break;
+               case 4:
+                       /* mov [rdi + dst_offset], offset */
+                       I3(0xc7, 0x47, dst_offset);
+                       memcpy(out, &offset, 4);
+                       out += 4;
+                       break;
+               default:
+                       BUG();
+               }
+
+               return out;
+       }
+
+       bit_offset = format->key_u64s * 64;
+       for (i = 0; i <= field; i++)
+               bit_offset -= format->bits_per_field[i];
+
+       byte = bit_offset / 8;
+       bit_offset -= byte * 8;
+
+       *eax_zeroed = false;
+
+       if (bit_offset == 0 && bits == 8) {
+               /* movzx eax, BYTE PTR [rsi + imm8] */
+               I4(0x0f, 0xb6, 0x46, byte);
+       } else if (bit_offset == 0 && bits == 16) {
+               /* movzx eax, WORD PTR [rsi + imm8] */
+               I4(0x0f, 0xb7, 0x46, byte);
+       } else if (bit_offset + bits <= 32) {
+               align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+               byte -= align;
+               bit_offset += align * 8;
+
+               BUG_ON(bit_offset + bits > 32);
+
+               /* mov eax, [rsi + imm8] */
+               I3(0x8b, 0x46, byte);
+
+               if (bit_offset) {
+                       /* shr eax, imm8 */
+                       I3(0xc1, 0xe8, bit_offset);
+               }
+
+               if (bit_offset + bits < 32) {
+                       unsigned mask = ~0U >> (32 - bits);
+
+                       /* and eax, imm32 */
+                       I1(0x25);
+                       memcpy(out, &mask, 4);
+                       out += 4;
+               }
+       } else if (bit_offset + bits <= 64) {
+               align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
+               byte -= align;
+               bit_offset += align * 8;
+
+               BUG_ON(bit_offset + bits > 64);
+
+               /* mov rax, [rsi + imm8] */
+               I4(0x48, 0x8b, 0x46, byte);
+
+               shl = 64 - bit_offset - bits;
+               shr = bit_offset + shl;
+
+               if (shl) {
+                       /* shl rax, imm8 */
+                       I4(0x48, 0xc1, 0xe0, shl);
+               }
+
+               if (shr) {
+                       /* shr rax, imm8 */
+                       I4(0x48, 0xc1, 0xe8, shr);
+               }
+       } else {
+               align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+               byte -= align;
+               bit_offset += align * 8;
+
+               BUG_ON(bit_offset + bits > 96);
+
+               /* mov rax, [rsi + byte] */
+               I4(0x48, 0x8b, 0x46, byte);
+
+               /* mov edx, [rsi + byte + 8] */
+               I3(0x8b, 0x56, byte + 8);
+
+               /* bits from next word: */
+               shr = bit_offset + bits - 64;
+               BUG_ON(shr > bit_offset);
+
+               /* shr rax, bit_offset */
+               I4(0x48, 0xc1, 0xe8, shr);
+
+               /* shl rdx, imm8 */
+               I4(0x48, 0xc1, 0xe2, 64 - shr);
+
+               /* or rax, rdx */
+               I3(0x48, 0x09, 0xd0);
+
+               shr = bit_offset - shr;
+
+               if (shr) {
+                       /* shr rax, imm8 */
+                       I4(0x48, 0xc1, 0xe8, shr);
+               }
+       }
+
+       /* rax += offset: */
+       if (offset > S32_MAX) {
+               /* mov rdx, imm64 */
+               I2(0x48, 0xba);
+               memcpy(out, &offset, 8);
+               out += 8;
+               /* add %rdx, %rax */
+               I3(0x48, 0x01, 0xd0);
+       } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
+               /* add rax, imm32 */
+               I2(0x48, 0x05);
+               memcpy(out, &offset, 4);
+               out += 4;
+       } else if (offset) {
+               /* add eax, imm32 */
+               I1(0x05);
+               memcpy(out, &offset, 4);
+               out += 4;
+       }
+set_field:
+       switch (dst_size) {
+       case 8:
+               /* mov [rdi + dst_offset], rax */
+               I4(0x48, 0x89, 0x47, dst_offset);
+               break;
+       case 4:
+               /* mov [rdi + dst_offset], eax */
+               I3(0x89, 0x47, dst_offset);
+               break;
+       default:
+               BUG();
+       }
+
+       return out;
+}
+
+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
+{
+       bool eax_zeroed = false;
+       u8 *out = _out;
+
+       /*
+        * rdi: dst - unpacked key
+        * rsi: src - packed key
+        */
+
+       /* k->u64s, k->format, k->type */
+
+       /* mov eax, [rsi] */
+       I2(0x8b, 0x06);
+
+       /*
+        * add eax, imm32 - the immediate's bytes are (BKEY_U64s -
+        * format->key_u64s, KEY_FORMAT_CURRENT, 0, 0), fixing up k->u64s and
+        * k->format in a single add:
+        */
+       I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
+
+       /* and eax, imm32: mask out k->pad: */
+       I5(0x25, 0xff, 0xff, 0xff, 0);
+
+       /* mov [rdi], eax */
+       I2(0x89, 0x07);
+
+#define x(id, field)                                                   \
+       out = compile_bkey_field(format, out, id,                       \
+                                offsetof(struct bkey, field),          \
+                                sizeof(((struct bkey *) NULL)->field), \
+                                &eax_zeroed);
+       bkey_fields()
+#undef x
+
+       /* retq */
+       I1(0xc3);
+
+       return (void *) out - _out;
+}
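+
+#if 0
+/*
+ * Sketch of how the generated code is meant to be invoked - it follows the
+ * calling convention documented above (dst in rdi, src in rsi).  The typedef
+ * and buffer name here are illustrative assumptions, not the actual
+ * interface used by the btree code:
+ */
+typedef void (*example_unpack_fn)(struct bkey *dst,
+                                  const struct bkey_packed *src);
+
+static void example_compiled_unpack(void *jit_buf, struct bkey *dst,
+                                    const struct bkey_packed *src)
+{
+       ((example_unpack_fn) jit_buf)(dst, src);
+}
+#endif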
+
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+                                 unsigned nr_key_bits)
+{
+       u64 l_v, r_v;
+
+       if (!nr_key_bits)
+               return 0;
+
+       /* for big endian, skip past header */
+       nr_key_bits += high_bit_offset;
+       l_v = *l & (~0ULL >> high_bit_offset);
+       r_v = *r & (~0ULL >> high_bit_offset);
+
+       while (1) {
+               if (nr_key_bits < 64) {
+                       l_v >>= 64 - nr_key_bits;
+                       r_v >>= 64 - nr_key_bits;
+                       nr_key_bits = 0;
+               } else {
+                       nr_key_bits -= 64;
+               }
+
+               if (l_v != r_v)
+                       return l_v < r_v ? -1 : 1;
+
+               if (!nr_key_bits)
+                       return 0;
+
+               l = next_word(l);
+               r = next_word(r);
+
+               l_v = *l;
+               r_v = *r;
+       }
+}
+#endif
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
+                                         const struct bkey_packed *r,
+                                         const struct btree *b)
+{
+       const struct bkey_format *f = &b->format;
+       int ret;
+
+       EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+       ret = __bkey_cmp_bits(high_word(f, l),
+                             high_word(f, r),
+                             b->nr_key_bits);
+
+       EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
+                               bkey_unpack_pos(b, r)));
+       return ret;
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
+                                              const struct bkey_packed *l,
+                                              const struct bpos *r)
+{
+       return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
+                          const struct bkey_packed *r,
+                          const struct btree *b)
+{
+       int packed = bkey_lr_packed(l, r);
+
+       if (likely(packed == BKEY_PACKED_BOTH))
+               return __bch2_bkey_cmp_packed_format_checked(l, r, b);
+
+       switch (packed) {
+       case BKEY_PACKED_NONE:
+               return bkey_cmp(((struct bkey *) l)->p,
+                               ((struct bkey *) r)->p);
+       case BKEY_PACKED_LEFT:
+               return __bch2_bkey_cmp_left_packed_format_checked(b,
+                                 (struct bkey_packed *) l,
+                                 &((struct bkey *) r)->p);
+       case BKEY_PACKED_RIGHT:
+               return -__bch2_bkey_cmp_left_packed_format_checked(b,
+                                 (struct bkey_packed *) r,
+                                 &((struct bkey *) l)->p);
+       default:
+               unreachable();
+       }
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed(const struct btree *b,
+                               const struct bkey_packed *l,
+                               const struct bpos *r)
+{
+       const struct bkey *l_unpacked;
+
+       return unlikely(l_unpacked = packed_to_bkey_c(l))
+               ? bkey_cmp(l_unpacked->p, *r)
+               : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+void bch2_bpos_swab(struct bpos *p)
+{
+       u8 *l = (u8 *) p;
+       u8 *h = ((u8 *) &p[1]) - 1;
+
+       while (l < h) {
+               swap(*l, *h);
+               l++;
+               --h;
+       }
+}
+
+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
+{
+       const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
+       u8 *l = k->key_start;
+       u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+
+       while (l < h) {
+               swap(*l, *h);
+               l++;
+               --h;
+       }
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void)
+{
+       struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
+       struct bkey_packed p;
+
+       struct bkey_format test_format = {
+               .key_u64s       = 2,
+               .nr_fields      = BKEY_NR_FIELDS,
+               .bits_per_field = {
+                       13,
+                       64,
+               },
+       };
+
+       struct unpack_state in_s =
+               unpack_state_init(&bch2_bkey_format_current, (void *) &t);
+       struct pack_state out_s = pack_state_init(&test_format, &p);
+       unsigned i;
+
+       for (i = 0; i < out_s.format->nr_fields; i++) {
+               u64 a, v = get_inc_field(&in_s, i);
+
+               switch (i) {
+#define x(id, field)   case id: a = t.field; break;
+       bkey_fields()
+#undef x
+               default:
+                       BUG();
+               }
+
+               if (a != v)
+                       panic("got %llu actual %llu i %u\n", v, a, i);
+
+               if (!set_inc_field(&out_s, i, v))
+                       panic("failed at %u\n", i);
+       }
+
+       BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
+}
+#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
new file mode 100644
index 0000000..9a0286d
--- /dev/null
@@ -0,0 +1,627 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_H
+#define _BCACHEFS_BKEY_H
+
+#include <linux/bug.h>
+#include "bcachefs_format.h"
+
+#include "util.h"
+#include "vstructs.h"
+
+#if 0
+
+/*
+ * compiled unpack functions are disabled, pending a new interface for
+ * dynamically allocating executable memory:
+ */
+
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHEFS_COMPILED_UNPACK  1
+#endif
+#endif
+
+void bch2_to_binary(char *, const u64 *, unsigned);
+
+/* bkey with split value, const */
+struct bkey_s_c {
+       const struct bkey       *k;
+       const struct bch_val    *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+       union {
+       struct {
+               struct bkey     *k;
+               struct bch_val  *v;
+       };
+       struct bkey_s_c         s_c;
+       };
+};
+
+#define bkey_next(_k)          vstruct_next(_k)
+
+static inline unsigned bkey_val_u64s(const struct bkey *k)
+{
+       return k->u64s - BKEY_U64s;
+}
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+       return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+       k->u64s = BKEY_U64s + val_u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+       k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+#define bkey_deleted(_k)       ((_k)->type == KEY_TYPE_DELETED)
+
+#define bkey_whiteout(_k)                              \
+       ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
+
+#define bkey_packed_typecheck(_k)                                      \
+({                                                                     \
+       BUILD_BUG_ON(!type_is(_k, struct bkey *) &&                     \
+                    !type_is(_k, struct bkey_packed *));               \
+       type_is(_k, struct bkey_packed *);                              \
+})
+
+enum bkey_lr_packed {
+       BKEY_PACKED_BOTH,
+       BKEY_PACKED_RIGHT,
+       BKEY_PACKED_LEFT,
+       BKEY_PACKED_NONE,
+};
+
+#define bkey_lr_packed_typecheck(_l, _r)                               \
+       (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
+
+#define bkey_lr_packed(_l, _r)                                         \
+       ((_l)->format + ((_r)->format << 1))
+
+#define bkey_copy(_dst, _src)                                  \
+do {                                                           \
+       BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) &&         \
+                    !type_is(_dst, struct bkey_packed *));     \
+       BUILD_BUG_ON(!type_is(_src, struct bkey_i *) &&         \
+                    !type_is(_src, struct bkey_packed *));     \
+       EBUG_ON((u64 *) (_dst) > (u64 *) (_src) &&              \
+               (u64 *) (_dst) < (u64 *) (_src) +               \
+               ((struct bkey *) (_src))->u64s);                \
+                                                               \
+       __memmove_u64s_down((_dst), (_src),                     \
+                           ((struct bkey *) (_src))->u64s);    \
+} while (0)
+
+struct btree;
+
+struct bkey_format_state {
+       u64 field_min[BKEY_NR_FIELDS];
+       u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+const char *bch2_bkey_format_validate(struct bkey_format *);
+
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
+                                         const struct bkey_packed *,
+                                         const struct bkey_packed *);
+__pure
+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
+                                    const struct bkey_packed *,
+                                    const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
+                                         const struct bkey_packed *,
+                                         const struct bpos *);
+
+__pure
+int __bch2_bkey_cmp_packed(const struct bkey_packed *,
+                          const struct bkey_packed *,
+                          const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed(const struct btree *,
+                               const struct bkey_packed *,
+                               const struct bpos *);
+
+static inline __pure
+int bkey_cmp_left_packed(const struct btree *b,
+                        const struct bkey_packed *l, const struct bpos *r)
+{
+       return __bch2_bkey_cmp_left_packed(b, l, r);
+}
+
+/*
+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to
+ * pass it by val... as much as I hate c++, const ref would be nice here:
+ */
+__pure __flatten
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
+                                            const struct bkey_packed *l,
+                                            struct bpos r)
+{
+       return bkey_cmp_left_packed(b, l, &r);
+}
+
+/*
+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
+ * skip dispatching on k->format:
+ */
+#define bkey_cmp_packed(_b, _l, _r)                                    \
+({                                                                     \
+       int _cmp;                                                       \
+                                                                       \
+       switch (bkey_lr_packed_typecheck(_l, _r)) {                     \
+       case BKEY_PACKED_NONE:                                          \
+               _cmp = bkey_cmp(((struct bkey *) (_l))->p,              \
+                               ((struct bkey *) (_r))->p);             \
+               break;                                                  \
+       case BKEY_PACKED_LEFT:                                          \
+               _cmp = bkey_cmp_left_packed((_b),                       \
+                                 (struct bkey_packed *) (_l),          \
+                                 &((struct bkey *) (_r))->p);          \
+               break;                                                  \
+       case BKEY_PACKED_RIGHT:                                         \
+               _cmp = -bkey_cmp_left_packed((_b),                      \
+                                 (struct bkey_packed *) (_r),          \
+                                 &((struct bkey *) (_l))->p);          \
+               break;                                                  \
+       case BKEY_PACKED_BOTH:                                          \
+               _cmp = __bch2_bkey_cmp_packed((void *) (_l),            \
+                                        (void *) (_r), (_b));          \
+               break;                                                  \
+       }                                                               \
+       _cmp;                                                           \
+})
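+
+#if 0
+/*
+ * Usage sketch, illustrative only ('example_insert_goes_before' is not a
+ * real helper): because 'insert' is a struct bkey *,
+ * bkey_lr_packed_typecheck() resolves to BKEY_PACKED_RIGHT at compile time,
+ * so the unpacked key's position is used directly and only the packed key
+ * is compared through the node's format:
+ */
+static bool example_insert_goes_before(struct btree *b, struct bkey *insert,
+                                       struct bkey_packed *k)
+{
+       return bkey_cmp_packed(b, insert, k) < 0;
+}
+#endif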
+
+#if 1
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{
+       if (l.inode != r.inode)
+               return l.inode < r.inode ? -1 : 1;
+       if (l.offset != r.offset)
+               return l.offset < r.offset ? -1 : 1;
+       if (l.snapshot != r.snapshot)
+               return l.snapshot < r.snapshot ? -1 : 1;
+       return 0;
+}
+#else
+int bkey_cmp(struct bpos l, struct bpos r);
+#endif
+
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{
+       return bkey_cmp(l, r) < 0 ? l : r;
+}
+
+void bch2_bpos_swab(struct bpos *);
+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
+{
+       return  (l.hi > r.hi) - (l.hi < r.hi) ?:
+               (l.lo > r.lo) - (l.lo < r.lo);
+}
+
+#define ZERO_VERSION   ((struct bversion) { .hi = 0, .lo = 0 })
+#define MAX_VERSION    ((struct bversion) { .hi = ~0, .lo = ~0ULL })
+
+static __always_inline int bversion_zero(struct bversion v)
+{
+       return !bversion_cmp(v, ZERO_VERSION);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/* statement expressions confusing unlikely()? */
+#define bkey_packed(_k)                                                        \
+       ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT);                  \
+        (_k)->format != KEY_FORMAT_CURRENT; })
+#else
+#define bkey_packed(_k)                ((_k)->format != KEY_FORMAT_CURRENT)
+#endif
+
+/*
+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse
+ */
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
+{
+       return (struct bkey_packed *) k;
+}
+
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
+{
+       return (const struct bkey_packed *) k;
+}
+
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
+{
+       return bkey_packed(k) ? NULL : (struct bkey_i *) k;
+}
+
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
+{
+       return bkey_packed(k) ? NULL : (const struct bkey *) k;
+}
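+
+#if 0
+/*
+ * Illustrative sketch (not a real helper): going the other way requires an
+ * actual unpack, so a caller can try the cheap cast first and fall back to
+ * __bch2_bkey_unpack_key() (declared further down) when the key really is
+ * packed:
+ */
+static struct bkey example_unpack(const struct bkey_format *f,
+                                  const struct bkey_packed *k)
+{
+       const struct bkey *u = packed_to_bkey_c(k);
+
+       return u ? *u : __bch2_bkey_unpack_key(f, k);
+}
+#endif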
+
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
+{
+       return format->bits_per_field[BKEY_FIELD_INODE] +
+               format->bits_per_field[BKEY_FIELD_OFFSET] +
+               format->bits_per_field[BKEY_FIELD_SNAPSHOT];
+}
+
+static inline struct bpos bkey_successor(struct bpos p)
+{
+       struct bpos ret = p;
+
+       if (!++ret.offset)
+               BUG_ON(!++ret.inode);
+
+       return ret;
+}
+
+static inline struct bpos bkey_predecessor(struct bpos p)
+{
+       struct bpos ret = p;
+
+       if (!ret.offset--)
+               BUG_ON(!ret.inode--);
+
+       return ret;
+}
+
+static inline u64 bkey_start_offset(const struct bkey *k)
+{
+       return k->p.offset - k->size;
+}
+
+static inline struct bpos bkey_start_pos(const struct bkey *k)
+{
+       return (struct bpos) {
+               .inode          = k->p.inode,
+               .offset         = bkey_start_offset(k),
+               .snapshot       = k->p.snapshot,
+       };
+}
+
+/* Packed helpers */
+
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
+                                     const struct bkey_packed *k)
+{
+       unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+
+       EBUG_ON(k->u64s < ret);
+       return ret;
+}
+
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
+                                      const struct bkey_packed *k)
+{
+       return bkeyp_key_u64s(format, k) * sizeof(u64);
+}
+
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
+                                     const struct bkey_packed *k)
+{
+       return k->u64s - bkeyp_key_u64s(format, k);
+}
+
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
+                                    const struct bkey_packed *k)
+{
+       return bkeyp_val_u64s(format, k) * sizeof(u64);
+}
+
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+                                     struct bkey_packed *k, unsigned val_u64s)
+{
+       k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
+}
+
+#define bkeyp_val(_format, _k)                                         \
+        ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+
+extern const struct bkey_format bch2_bkey_format_current;
+
+bool bch2_bkey_transform(const struct bkey_format *,
+                        struct bkey_packed *,
+                        const struct bkey_format *,
+                        const struct bkey_packed *);
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
+                                  const struct bkey_packed *);
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
+                             const struct bkey_packed *);
+#endif
+
+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
+                  const struct bkey_format *);
+
+enum bkey_pack_pos_ret {
+       BKEY_PACK_POS_EXACT,
+       BKEY_PACK_POS_SMALLER,
+       BKEY_PACK_POS_FAIL,
+};
+
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
+                                          const struct btree *);
+
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
+                                const struct btree *b)
+{
+       return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
+}
+
+void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
+                const struct bkey_packed *);
+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
+              const struct bkey_format *);
+
+static inline u64 bkey_field_max(const struct bkey_format *f,
+                                enum bch_bkey_fields nr)
+{
+       return f->bits_per_field[nr] < 64
+               ? (le64_to_cpu(f->field_offset[nr]) +
+                  ~(~0ULL << f->bits_per_field[nr]))
+               : U64_MAX;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+int bch2_compile_bkey_format(const struct bkey_format *, void *);
+
+#else
+
+static inline int bch2_compile_bkey_format(const struct bkey_format *format,
+                                         void *out) { return 0; }
+
+#endif
+
+static inline void bkey_reassemble(struct bkey_i *dst,
+                                  struct bkey_s_c src)
+{
+       BUG_ON(bkey_packed(src.k));
+       dst->k = *src.k;
+       memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+}
+
+#define bkey_s_null            ((struct bkey_s)   { .k = NULL })
+#define bkey_s_c_null          ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err)                ((struct bkey_s)   { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err)      ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+       return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+       return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+       return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+       return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define __BKEY_VAL_ACCESSORS(name, nr, _assert)                                \
+struct bkey_s_c_##name {                                               \
+       union {                                                         \
+       struct {                                                        \
+               const struct bkey       *k;                             \
+               const struct bch_##name *v;                             \
+       };                                                              \
+       struct bkey_s_c                 s_c;                            \
+       };                                                              \
+};                                                                     \
+                                                                       \
+struct bkey_s_##name {                                                 \
+       union {                                                         \
+       struct {                                                        \
+               struct bkey             *k;                             \
+               struct bch_##name       *v;                             \
+       };                                                              \
+       struct bkey_s_c_##name          c;                              \
+       struct bkey_s                   s;                              \
+       struct bkey_s_c                 s_c;                            \
+       };                                                              \
+};                                                                     \
+                                                                       \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{                                                                      \
+       _assert(k->k.type, nr);                                         \
+       return container_of(&k->k, struct bkey_i_##name, k);            \
+}                                                                      \
+                                                                       \
+static inline const struct bkey_i_##name *                             \
+bkey_i_to_##name##_c(const struct bkey_i *k)                           \
+{                                                                      \
+       _assert(k->k.type, nr);                                         \
+       return container_of(&k->k, struct bkey_i_##name, k);            \
+}                                                                      \
+                                                                       \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)   \
+{                                                                      \
+       _assert(k.k->type, nr);                                         \
+       return (struct bkey_s_##name) {                                 \
+               .k = k.k,                                               \
+               .v = container_of(k.v, struct bch_##name, v),           \
+       };                                                              \
+}                                                                      \
+                                                                       \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{                                                                      \
+       _assert(k.k->type, nr);                                         \
+       return (struct bkey_s_c_##name) {                               \
+               .k = k.k,                                               \
+               .v = container_of(k.v, struct bch_##name, v),           \
+       };                                                              \
+}                                                                      \
+                                                                       \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{                                                                      \
+       return (struct bkey_s_##name) {                                 \
+               .k = &k->k,                                             \
+               .v = &k->v,                                             \
+       };                                                              \
+}                                                                      \
+                                                                       \
+static inline struct bkey_s_c_##name                                   \
+name##_i_to_s_c(const struct bkey_i_##name *k)                         \
+{                                                                      \
+       return (struct bkey_s_c_##name) {                               \
+               .k = &k->k,                                             \
+               .v = &k->v,                                             \
+       };                                                              \
+}                                                                      \
+                                                                       \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)        \
+{                                                                      \
+       _assert(k->k.type, nr);                                         \
+       return (struct bkey_s_##name) {                                 \
+               .k = &k->k,                                             \
+               .v = container_of(&k->v, struct bch_##name, v),         \
+       };                                                              \
+}                                                                      \
+                                                                       \
+static inline struct bkey_s_c_##name                                   \
+bkey_i_to_s_c_##name(const struct bkey_i *k)                           \
+{                                                                      \
+       _assert(k->k.type, nr);                                         \
+       return (struct bkey_s_c_##name) {                               \
+               .k = &k->k,                                             \
+               .v = container_of(&k->v, struct bch_##name, v),         \
+       };                                                              \
+}                                                                      \
+                                                                       \
+static inline struct bch_##name *                                      \
+bkey_p_##name##_val(const struct bkey_format *f,                       \
+                   struct bkey_packed *k)                              \
+{                                                                      \
+       return container_of(bkeyp_val(f, k), struct bch_##name, v);     \
+}                                                                      \
+                                                                       \
+static inline const struct bch_##name *                                        \
+bkey_p_c_##name##_val(const struct bkey_format *f,                     \
+                     const struct bkey_packed *k)                      \
+{                                                                      \
+       return container_of(bkeyp_val(f, k), struct bch_##name, v);     \
+}                                                                      \
+                                                                       \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{                                                                      \
+       struct bkey_i_##name *k =                                       \
+               container_of(&_k->k, struct bkey_i_##name, k);          \
+                                                                       \
+       bkey_init(&k->k);                                               \
+       memset(&k->v, 0, sizeof(k->v));                                 \
+       k->k.type = nr;                                                 \
+       set_bkey_val_bytes(&k->k, sizeof(k->v));                        \
+                                                                       \
+       return k;                                                       \
+}
+
+#define __BKEY_VAL_ASSERT(_type, _nr)  EBUG_ON(_type != _nr)
+
+#define BKEY_VAL_ACCESSORS(name, _nr)                                  \
+       static inline void __bch_##name##_assert(u8 type, u8 nr)        \
+       {                                                               \
+               EBUG_ON(type != _nr);                                   \
+       }                                                               \
+                                                                       \
+       __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
+
+BKEY_VAL_ACCESSORS(cookie,             KEY_TYPE_COOKIE);
+
+static inline void __bch2_extent_assert(u8 type, u8 nr)
+{
+       EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
+}
+
+__BKEY_VAL_ACCESSORS(extent,           BCH_EXTENT, __bch2_extent_assert);
+BKEY_VAL_ACCESSORS(reservation,                BCH_RESERVATION);
+
+BKEY_VAL_ACCESSORS(inode,              BCH_INODE_FS);
+BKEY_VAL_ACCESSORS(inode_blockdev,     BCH_INODE_BLOCKDEV);
+BKEY_VAL_ACCESSORS(inode_generation,   BCH_INODE_GENERATION);
+
+BKEY_VAL_ACCESSORS(dirent,             BCH_DIRENT);
+
+BKEY_VAL_ACCESSORS(xattr,              BCH_XATTR);
+
+BKEY_VAL_ACCESSORS(alloc,              BCH_ALLOC);
+
+BKEY_VAL_ACCESSORS(quota,              BCH_QUOTA);
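+
+#if 0
+/*
+ * Illustrative sketch of the generated helpers, using the dirent accessors
+ * declared above ('k' is a hypothetical key already known to be a
+ * BCH_DIRENT):
+ */
+static void example_dirent_accessors(struct bkey_i *k)
+{
+       struct bkey_i_dirent *d = bkey_i_to_dirent(k);   /* asserts k->k.type */
+       struct bkey_s_c_dirent d_c = dirent_i_to_s_c(d); /* split, const view */
+       struct bkey_s_c generic = d_c.s_c; /* upcast via the anonymous union */
+
+       (void) generic;
+}
+#endif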
+
+/* byte order helpers */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+       return f->key_u64s - 1;
+}
+
+#define high_bit_offset                0
+#define nth_word(p, n)         ((p) - (n))
+
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+       return 0;
+}
+
+#define high_bit_offset                KEY_PACKED_BITS_START
+#define nth_word(p, n)         ((p) + (n))
+
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define high_word(f, k)                ((k)->_data + high_word_offset(f))
+#define next_word(p)           nth_word(p, 1)
+#define prev_word(p)           nth_word(p, -1)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void);
+#else
+static inline void bch2_bkey_pack_test(void) {}
+#endif
+
+#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
new file mode 100644
index 0000000..017425a
--- /dev/null
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "alloc.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "quota.h"
+#include "xattr.h"
+
+const struct bkey_ops bch2_bkey_ops[] = {
+       [BKEY_TYPE_EXTENTS]     = bch2_bkey_extent_ops,
+       [BKEY_TYPE_INODES]      = bch2_bkey_inode_ops,
+       [BKEY_TYPE_DIRENTS]     = bch2_bkey_dirent_ops,
+       [BKEY_TYPE_XATTRS]      = bch2_bkey_xattr_ops,
+       [BKEY_TYPE_ALLOC]       = bch2_bkey_alloc_ops,
+       [BKEY_TYPE_QUOTAS]      = bch2_bkey_quota_ops,
+       [BKEY_TYPE_BTREE]       = bch2_bkey_btree_ops,
+};
+
+const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
+                                 struct bkey_s_c k)
+{
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
+
+       switch (k.k->type) {
+       case KEY_TYPE_DELETED:
+       case KEY_TYPE_DISCARD:
+               return NULL;
+
+       case KEY_TYPE_ERROR:
+               return bkey_val_bytes(k.k) != 0
+                       ? "value size should be zero"
+                       : NULL;
+
+       case KEY_TYPE_COOKIE:
+               return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
+                       ? "incorrect value size"
+                       : NULL;
+
+       default:
+               if (k.k->type < KEY_TYPE_GENERIC_NR)
+                       return "invalid type";
+
+               return ops->key_invalid(c, k);
+       }
+}
+
+const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+                             struct bkey_s_c k)
+{
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
+
+       if (k.k->u64s < BKEY_U64s)
+               return "u64s too small";
+
+       if (!ops->is_extents) {
+               if (k.k->size)
+                       return "nonzero size field";
+       } else {
+               if ((k.k->size == 0) != bkey_deleted(k.k))
+                       return "bad size field";
+       }
+
+       if (ops->is_extents &&
+           !k.k->size &&
+           !bkey_deleted(k.k))
+               return "zero size field";
+
+       if (k.k->p.snapshot)
+               return "nonzero snapshot";
+
+       if (type != BKEY_TYPE_BTREE &&
+           !bkey_cmp(k.k->p, POS_MAX))
+               return "POS_MAX key";
+
+       return NULL;
+}
+
+const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+                             struct bkey_s_c k)
+{
+       return __bch2_bkey_invalid(c, type, k) ?:
+               bch2_bkey_val_invalid(c, type, k);
+}
+
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
+{
+       if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+               return "key before start of btree node";
+
+       if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+               return "key past end of btree node";
+
+       return NULL;
+}
+
+void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
+{
+       enum bkey_type type = btree_node_type(b);
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
+       const char *invalid;
+
+       BUG_ON(!k.k->u64s);
+
+       invalid = bch2_bkey_invalid(c, type, k) ?:
+               bch2_bkey_in_btree_node(b, k);
+       if (invalid) {
+               char buf[160];
+
+               bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
+               bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
+               return;
+       }
+
+       if (k.k->type >= KEY_TYPE_GENERIC_NR &&
+           ops->key_debugcheck)
+               ops->key_debugcheck(c, b, k);
+}
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
+{
+       char *out = buf, *end = buf + size;
+
+       p("u64s %u type %u ", k->u64s, k->type);
+
+       if (bkey_cmp(k->p, POS_MAX))
+               p("%llu:%llu", k->p.inode, k->p.offset);
+       else
+               p("POS_MAX");
+
+       p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+
+       return out - buf;
+}
+
+int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+                    char *buf, size_t size, struct bkey_s_c k)
+{
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
+       char *out = buf, *end = buf + size;
+
+       switch (k.k->type) {
+       case KEY_TYPE_DELETED:
+               p(" deleted");
+               break;
+       case KEY_TYPE_DISCARD:
+               p(" discard");
+               break;
+       case KEY_TYPE_ERROR:
+               p(" error");
+               break;
+       case KEY_TYPE_COOKIE:
+               p(" cookie");
+               break;
+       default:
+               if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
+                       ops->val_to_text(c, buf, size, k);
+               break;
+       }
+
+       return out - buf;
+}
+
+int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+                         char *buf, size_t size, struct bkey_s_c k)
+{
+       char *out = buf, *end = buf + size;
+
+       out += bch2_bkey_to_text(out, end - out, k.k);
+       out += scnprintf(out, end - out, ": ");
+       out += bch2_val_to_text(c, type, out, end - out, k);
+
+       return out - buf;
+}
+
+void bch2_bkey_swab(enum bkey_type type,
+                  const struct bkey_format *f,
+                  struct bkey_packed *k)
+{
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
+
+       bch2_bkey_swab_key(f, k);
+
+       if (ops->swab)
+               ops->swab(f, k);
+}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
new file mode 100644
index 0000000..04c80f3
--- /dev/null
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_METHODS_H
+#define _BCACHEFS_BKEY_METHODS_H
+
+#include "bkey.h"
+
+#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
+
+enum bkey_type {
+       DEFINE_BCH_BTREE_IDS()
+       BKEY_TYPE_BTREE,
+};
+
+#undef DEF_BTREE_ID
+
+/* Type of a key in btree @id at level @level: */
+static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
+{
+       return level ? BKEY_TYPE_BTREE : (enum bkey_type) id;
+}
+
+static inline bool btree_type_has_ptrs(enum bkey_type type)
+{
+       switch (type) {
+       case BKEY_TYPE_BTREE:
+       case BKEY_TYPE_EXTENTS:
+               return true;
+       default:
+               return false;
+       }
+}
+
+struct bch_fs;
+struct btree;
+struct bkey;
+
+enum merge_result {
+       BCH_MERGE_NOMERGE,
+
+       /*
+        * The keys were mergeable, but would have overflowed size - so instead
+        * l was changed to the maximum size, and both keys were modified:
+        */
+       BCH_MERGE_PARTIAL,
+       BCH_MERGE_MERGE,
+};
+
+typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
+                             struct bkey_s);
+typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
+                                         struct btree *,
+                                         struct bkey_i *, struct bkey_i *);
+
+struct bkey_ops {
+       /* Returns reason for being invalid if invalid, else NULL: */
+       const char *    (*key_invalid)(const struct bch_fs *,
+                                      struct bkey_s_c);
+       void            (*key_debugcheck)(struct bch_fs *, struct btree *,
+                                         struct bkey_s_c);
+       void            (*val_to_text)(struct bch_fs *, char *,
+                                      size_t, struct bkey_s_c);
+       void            (*swab)(const struct bkey_format *, struct bkey_packed *);
+       key_filter_fn   key_normalize;
+       key_merge_fn    key_merge;
+       bool            is_extents;
+};
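+
+#if 0
+/*
+ * Illustrative only - a minimal bkey_ops table for a hypothetical btree
+ * whose keys carry no value and aren't extents (the real per-btree tables
+ * are the bch2_bkey_*_ops referenced from bkey_methods.c):
+ */
+static const char *example_key_invalid(const struct bch_fs *c,
+                                       struct bkey_s_c k)
+{
+       return bkey_val_bytes(k.k) ? "nonzero value size" : NULL;
+}
+
+static const struct bkey_ops example_bkey_ops = {
+       .key_invalid    = example_key_invalid,
+       .is_extents     = false,
+};
+#endif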
+
+const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
+                                 struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
+const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
+
+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+
+int bch2_bkey_to_text(char *, size_t, const struct bkey *);
+int bch2_val_to_text(struct bch_fs *, enum bkey_type,
+                    char *, size_t, struct bkey_s_c);
+int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+                         char *, size_t, struct bkey_s_c);
+
+void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
+                   struct bkey_packed *);
+
+extern const struct bkey_ops bch2_bkey_ops[];
+
+#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
new file mode 100644
index 0000000..faf58b4
--- /dev/null
@@ -0,0 +1,1849 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "bset.h"
+#include "eytzinger.h"
+#include "trace.h"
+#include "util.h"
+
+#include <asm/unaligned.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+       struct bset_tree *t;
+
+       for_each_bset(b, t)
+               if (k >= btree_bkey_first(b, t) &&
+                   k < btree_bkey_last(b, t))
+                       return t;
+
+       BUG();
+}
+
+/*
+ * There are never duplicate live keys in the btree - but including keys that
+ * have been flagged as deleted (and will be cleaned up later) we _will_ see
+ * duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
+ */
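+
+#if 0
+/*
+ * Illustrative only (not a comparator actually used here): the order
+ * described above, expressed as code - position first, and for equal
+ * positions deleted keys sort before the (at most one) live key:
+ */
+static int example_insert_order_cmp(const struct bkey *l, const struct bkey *r)
+{
+       return bkey_cmp(l->p, r->p) ?:
+               (int) bkey_deleted(r) - (int) bkey_deleted(l);
+}
+#endif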
+
+void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
+{
+       struct bkey_packed *_k, *_n;
+       struct bkey k, n;
+       char buf[120];
+
+       if (!i->u64s)
+               return;
+
+       for (_k = i->start, k = bkey_unpack_key(b, _k);
+            _k < vstruct_last(i);
+            _k = _n, k = n) {
+               _n = bkey_next(_k);
+
+               bch2_bkey_to_text(buf, sizeof(buf), &k);
+               printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
+                      _k->_data - i->_data, i->u64s, buf);
+
+               if (_n == vstruct_last(i))
+                       continue;
+
+               n = bkey_unpack_key(b, _n);
+
+               if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
+                       printk(KERN_ERR "Key skipped backwards\n");
+                       continue;
+               }
+
+               /*
+                * Weird check for duplicate non-extent keys: extents are
+                * deleted iff they have 0 size, so if a key has zero size and
+                * isn't deleted, these aren't extents:
+                */
+               if (((!k.size && !bkey_deleted(&k)) ||
+                    (!n.size && !bkey_deleted(&n))) &&
+                   !bkey_deleted(&k) &&
+                   !bkey_cmp(n.p, k.p))
+                       printk(KERN_ERR "Duplicate keys\n");
+       }
+}
+
+void bch2_dump_btree_node(struct btree *b)
+{
+       struct bset_tree *t;
+
+       console_lock();
+       for_each_bset(b, t)
+               bch2_dump_bset(b, bset(b, t), t - b->set);
+       console_unlock();
+}
+
+void bch2_dump_btree_node_iter(struct btree *b,
+                             struct btree_node_iter *iter)
+{
+       struct btree_node_iter_set *set;
+
+       printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
+
+       btree_node_iter_for_each(iter, set) {
+               struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+               struct bset_tree *t = bch2_bkey_to_bset(b, k);
+               struct bkey uk = bkey_unpack_key(b, k);
+               char buf[100];
+
+               bch2_bkey_to_text(buf, sizeof(buf), &uk);
+               printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
+                      k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
+       }
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static bool keys_out_of_order(struct btree *b,
+                             const struct bkey_packed *prev,
+                             const struct bkey_packed *next,
+                             bool is_extents)
+{
+       struct bkey nextu = bkey_unpack_key(b, next);
+
+       return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
+               ((is_extents
+                 ? !bkey_deleted(next)
+                 : !bkey_deleted(prev)) &&
+                !bkey_cmp_packed(b, prev, next));
+}
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+       struct bset_tree *t;
+       struct bkey_packed *k;
+       struct btree_nr_keys nr = { 0 };
+
+       for_each_bset(b, t)
+               for (k = btree_bkey_first(b, t);
+                    k != btree_bkey_last(b, t);
+                    k = bkey_next(k))
+                       if (!bkey_whiteout(k))
+                               btree_keys_account_key_add(&nr, t - b->set, k);
+
+       BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
+}
+
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
+                                          struct btree *b,
+                                          struct bkey_packed *k)
+{
+       const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b);
+
+       bkey_unpack_key(b, k);
+
+       if (n &&
+           keys_out_of_order(b, k, n, iter->is_extents)) {
+               struct bkey ku = bkey_unpack_key(b, k);
+               struct bkey nu = bkey_unpack_key(b, n);
+               char buf1[80], buf2[80];
+
+               bch2_dump_btree_node(b);
+               bch2_bkey_to_text(buf1, sizeof(buf1), &ku);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &nu);
+               panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
+       }
+}
+
+void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+                               struct btree *b)
+{
+       struct btree_node_iter_set *set, *prev = NULL;
+       struct bset_tree *t;
+       struct bkey_packed *k, *first;
+
+       if (bch2_btree_node_iter_end(iter))
+               return;
+
+       btree_node_iter_for_each(iter, set) {
+               k = __btree_node_offset_to_key(b, set->k);
+               t = bch2_bkey_to_bset(b, k);
+
+               BUG_ON(__btree_node_offset_to_key(b, set->end) !=
+                      btree_bkey_last(b, t));
+
+               BUG_ON(prev &&
+                      btree_node_iter_cmp(iter, b, *prev, *set) > 0);
+
+               prev = set;
+       }
+
+       first = __btree_node_offset_to_key(b, iter->data[0].k);
+
+       for_each_bset(b, t)
+               if (bch2_btree_node_iter_bset_pos(iter, b, t) ==
+                   btree_bkey_last(b, t) &&
+                   (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))))
+                       BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
+                                                    k, first) > 0);
+}
+
+void bch2_verify_key_order(struct btree *b,
+                         struct btree_node_iter *iter,
+                         struct bkey_packed *where)
+{
+       struct bset_tree *t = bch2_bkey_to_bset(b, where);
+       struct bkey_packed *k, *prev;
+       struct bkey uk, uw = bkey_unpack_key(b, where);
+
+       k = bch2_bkey_prev_all(b, t, where);
+       if (k &&
+           keys_out_of_order(b, k, where, iter->is_extents)) {
+               char buf1[100], buf2[100];
+
+               bch2_dump_btree_node(b);
+               uk = bkey_unpack_key(b, k);
+               bch2_bkey_to_text(buf1, sizeof(buf1), &uk);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &uw);
+               panic("out of order with prev:\n%s\n%s\n",
+                     buf1, buf2);
+       }
+
+       k = bkey_next(where);
+       BUG_ON(k != btree_bkey_last(b, t) &&
+              keys_out_of_order(b, where, k, iter->is_extents));
+
+       for_each_bset(b, t) {
+               if (where >= btree_bkey_first(b, t) ||
+                   where < btree_bkey_last(b, t))
+                       continue;
+
+               k = bch2_btree_node_iter_bset_pos(iter, b, t);
+
+               if (k == btree_bkey_last(b, t))
+                       k = bch2_bkey_prev_all(b, t, k);
+
+               while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
+                      (prev = bch2_bkey_prev_all(b, t, k)))
+                       k = prev;
+
+               for (;
+                    k != btree_bkey_last(b, t);
+                    k = bkey_next(k)) {
+                       uk = bkey_unpack_key(b, k);
+
+                       if (iter->is_extents) {
+                               BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
+                                        bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
+                       } else {
+                               BUG_ON(!bkey_cmp(uw.p, uk.p) &&
+                                      !bkey_deleted(&uk));
+                       }
+
+                       if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
+                               break;
+               }
+       }
+}
+
+#else
+
+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
+                                                  struct btree *b,
+                                                  struct bkey_packed *k) {}
+
+#endif
+
+/* Auxiliary search trees */
+
+#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0)
+#define BFLOAT_FAILED_PREV     (U8_MAX - 1)
+#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2)
+#define BFLOAT_FAILED          (U8_MAX - 2)
+
+#define KEY_WORDS              BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+
+struct bkey_float {
+       u8              exponent;
+       u8              key_offset;
+       union {
+               u32     mantissa32;
+       struct {
+               u16     mantissa16;
+               u16     _pad;
+       };
+       };
+} __packed;
+
+#define BFLOAT_32BIT_NR                32U
+
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+       int d = (idx - BFLOAT_32BIT_NR) << 1;
+
+       d &= ~(d >> 31);
+
+       return idx * 6 - d;
+}
+
+struct ro_aux_tree {
+       struct bkey_float       _d[0];
+};
+
+struct rw_aux_tree {
+       u16             offset;
+       struct bpos     k;
+};
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bkey_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE         128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree *b)
+{
+       return PAGE_SIZE << b->page_order;
+}
+
+static inline size_t btree_keys_cachelines(struct btree *b)
+{
+       return btree_keys_bytes(b) / BSET_CACHELINE;
+}
+
+static inline size_t btree_aux_data_bytes(struct btree *b)
+{
+       return btree_keys_cachelines(b) * 8;
+}
+
+static inline size_t btree_aux_data_u64s(struct btree *b)
+{
+       return btree_aux_data_bytes(b) / sizeof(u64);
+}
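+
+/*
+ * Worked example (illustrative, assuming 4k pages): a one page btree node has
+ * 4096 / BSET_CACHELINE = 32 cachelines, so btree_aux_data_bytes() comes to
+ * 32 * 8 = 256 bytes - 32 u64s of auxiliary search tree space, shared between
+ * the compiled unpack function and the per-bset lookup tables below.
+ */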
+
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{
+       BUG_ON(t->aux_data_offset == U16_MAX);
+
+       switch (bset_aux_tree_type(t)) {
+       case BSET_NO_AUX_TREE:
+               return t->aux_data_offset;
+       case BSET_RO_AUX_TREE:
+               return t->aux_data_offset +
+                       DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
+                                    sizeof(u8) * t->size, 8);
+       case BSET_RW_AUX_TREE:
+               return t->aux_data_offset +
+                       DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
+       default:
+               BUG();
+       }
+}
+
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+                                       const struct bset_tree *t)
+{
+       return t == b->set
+               ? DIV_ROUND_UP(b->unpack_fn_len, 8)
+               : bset_aux_tree_buf_end(t - 1);
+}
+
+static void *__aux_tree_base(const struct btree *b,
+                            const struct bset_tree *t)
+{
+       return b->aux_data + t->aux_data_offset * 8;
+}
+
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
+                                           const struct bset_tree *t)
+{
+       EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+       return __aux_tree_base(b, t);
+}
+
+static u8 *ro_aux_tree_prev(const struct btree *b,
+                           const struct bset_tree *t)
+{
+       EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+       return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
+}
+
+static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
+                                        unsigned idx)
+{
+       return (void *) b + bkey_float_byte_offset(idx);
+}
+
+static struct bkey_float *bkey_float(const struct btree *b,
+                                    const struct bset_tree *t,
+                                    unsigned idx)
+{
+       return bkey_float_get(ro_aux_tree_base(b, t), idx);
+}
+
+static void bset_aux_tree_verify(struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct bset_tree *t;
+
+       for_each_bset(b, t) {
+               if (t->aux_data_offset == U16_MAX)
+                       continue;
+
+               BUG_ON(t != b->set &&
+                      t[-1].aux_data_offset == U16_MAX);
+
+               BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
+               BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
+               BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
+       }
+#endif
+}
+
+/* Memory allocation */
+
+void bch2_btree_keys_free(struct btree *b)
+{
+       kvfree(b->aux_data);
+       b->aux_data = NULL;
+}
+
+int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
+{
+       b->page_order   = page_order;
+       b->aux_data     = kvmalloc(btree_aux_data_bytes(b), gfp);
+       if (!b->aux_data)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+{
+       unsigned i;
+
+       b->nsets                = 0;
+       memset(&b->nr, 0, sizeof(b->nr));
+#ifdef CONFIG_BCACHEFS_DEBUG
+       b->expensive_debug_checks = expensive_debug_checks;
+#endif
+       for (i = 0; i < MAX_BSETS; i++)
+               b->set[i].data_offset = U16_MAX;
+
+       bch2_bset_set_no_aux_tree(b, b->set);
+}
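+
+/*
+ * Sketch of the expected lifecycle (illustrative only - the gfp flags and the
+ * source of the debug checks pointer are assumptions, not taken from this
+ * patch):
+ *
+ *     if (bch2_btree_keys_alloc(b, page_order, GFP_KERNEL))
+ *             return -ENOMEM;
+ *     bch2_btree_keys_init(b, expensive_debug_checks);
+ *     ...
+ *     bch2_btree_keys_free(b);
+ */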
+
+/* Binary tree stuff for auxiliary search trees */
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
+ * then bkey_float->key_offset gives us the offset within that cacheline, in units of 8
+ * bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. The prev array (see
+ * ro_aux_tree_prev()) stores the size of the previous key so we can walk
+ * backwards to it from the key a tree node points to.
+ */
+
+static inline void *bset_cacheline(const struct btree *b,
+                                  const struct bset_tree *t,
+                                  unsigned cacheline)
+{
+       return (void *) round_down((unsigned long) btree_bkey_first(b, t),
+                                  L1_CACHE_BYTES) +
+               cacheline * BSET_CACHELINE;
+}
+
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
+                                            const struct bset_tree *t,
+                                            unsigned cacheline,
+                                            unsigned offset)
+{
+       return bset_cacheline(b, t, cacheline) + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(const struct btree *b,
+                                 const struct bset_tree *t,
+                                 const struct bkey_packed *k)
+{
+       return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
+}
+
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
+                                         const struct bset_tree *t,
+                                         unsigned cacheline,
+                                         const struct bkey_packed *k)
+{
+       return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
+}
+
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
+                                        const struct bset_tree *t,
+                                        unsigned cacheline,
+                                        const struct bkey_packed *k)
+{
+       size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
+
+       EBUG_ON(m > U8_MAX);
+       return m;
+}
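+
+/*
+ * Illustrative sketch (not called anywhere): the helpers above are inverses of
+ * each other - given a key we can compute its (cacheline, offset) pair and get
+ * the same pointer back:
+ */
+static inline void bkey_cacheline_roundtrip_check(const struct btree *b,
+                                                  const struct bset_tree *t,
+                                                  struct bkey_packed *k)
+{
+       unsigned cacheline = bkey_to_cacheline(b, t, k);
+       unsigned offset = bkey_to_cacheline_offset(b, t, cacheline, k);
+
+       EBUG_ON(cacheline_to_bkey(b, t, cacheline, offset) != k);
+}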
+
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
+                                              const struct bset_tree *t,
+                                              unsigned j)
+{
+       return cacheline_to_bkey(b, t,
+                       __eytzinger1_to_inorder(j, t->size, t->extra),
+                       bkey_float(b, t, j)->key_offset);
+}
+
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
+                                            const struct bset_tree *t,
+                                            unsigned j)
+{
+       unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
+
+       return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
+}
+
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
+                                      const struct bset_tree *t)
+{
+       EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+       return __aux_tree_base(b, t);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table (the rw aux
+ * tree) with roughly one entry per cacheline of keys.
+ */
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
+                                         struct bset_tree *t,
+                                         unsigned j)
+{
+       return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
+}
+
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
+                           unsigned j, struct bkey_packed *k)
+{
+       EBUG_ON(k >= btree_bkey_last(b, t));
+
+       rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
+               .offset = __btree_node_key_to_offset(b, k),
+               .k      = bkey_unpack_pos(b, k),
+       };
+}
+
+static void bch2_bset_verify_rw_aux_tree(struct btree *b,
+                                       struct bset_tree *t)
+{
+       struct bkey_packed *k = btree_bkey_first(b, t);
+       unsigned j = 0;
+
+       if (!btree_keys_expensive_checks(b))
+               return;
+
+       BUG_ON(bset_has_ro_aux_tree(t));
+
+       if (!bset_has_rw_aux_tree(t))
+               return;
+
+       BUG_ON(t->size < 1);
+       BUG_ON(rw_aux_to_bkey(b, t, j) != k);
+
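+       /* entry 0 was checked above - enter the loop at the ++j: */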
+       goto start;
+       while (1) {
+               if (rw_aux_to_bkey(b, t, j) == k) {
+                       BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+                                       bkey_unpack_pos(b, k)));
+start:
+                       if (++j == t->size)
+                               break;
+
+                       BUG_ON(rw_aux_tree(b, t)[j].offset <=
+                              rw_aux_tree(b, t)[j - 1].offset);
+               }
+
+               k = bkey_next(k);
+               BUG_ON(k >= btree_bkey_last(b, t));
+       }
+}
+
+/* returns idx of first entry >= offset: */
+static unsigned rw_aux_tree_bsearch(struct btree *b,
+                                   struct bset_tree *t,
+                                   unsigned offset)
+{
+       unsigned l = 0, r = t->size;
+
+       EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+       while (l < r) {
+               unsigned m = (l + r) >> 1;
+
+               if (rw_aux_tree(b, t)[m].offset < offset)
+                       l = m + 1;
+               else
+                       r = m;
+       }
+
+       EBUG_ON(l < t->size &&
+               rw_aux_tree(b, t)[l].offset < offset);
+       EBUG_ON(l &&
+               rw_aux_tree(b, t)[l - 1].offset >= offset);
+
+       EBUG_ON(l > r);
+       EBUG_ON(l > t->size);
+
+       return l;
+}
+
+static inline unsigned bfloat_mantissa(const struct bkey_float *f,
+                                      unsigned idx)
+{
+       return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
+}
+
+static inline void bfloat_mantissa_set(struct bkey_float *f,
+                                      unsigned idx, unsigned mantissa)
+{
+       if (idx < BFLOAT_32BIT_NR)
+               f->mantissa32 = mantissa;
+       else
+               f->mantissa16 = mantissa;
+}
+
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
+                                    const struct bkey_float *f,
+                                    unsigned idx)
+{
+       u64 v;
+
+       EBUG_ON(!bkey_packed(k));
+
+       v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
+
+       /*
+        * In little endian, we're shifting off low bits (and then the bits we
+        * want are at the low end), in big endian we're shifting off high bits
+        * (and then the bits we want are at the high end, so we shift them
+        * back down):
+        */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+       v >>= f->exponent & 7;
+#else
+       v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
+#endif
+       return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
+}
+
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+                       unsigned j,
+                       struct bkey_packed *min_key,
+                       struct bkey_packed *max_key)
+{
+       struct bkey_float *f = bkey_float(b, t, j);
+       struct bkey_packed *m = tree_to_bkey(b, t, j);
+       struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
+       struct bkey_packed *l, *r;
+       unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
+       unsigned mantissa;
+       int shift, exponent, high_bit;
+
+       EBUG_ON(bkey_next(p) != m);
+
+       if (is_power_of_2(j)) {
+               l = min_key;
+
+               if (!l->u64s) {
+                       if (!bkey_pack_pos(l, b->data->min_key, b)) {
+                               struct bkey_i tmp;
+
+                               bkey_init(&tmp.k);
+                               tmp.k.p = b->data->min_key;
+                               bkey_copy(l, &tmp);
+                       }
+               }
+       } else {
+               l = tree_to_prev_bkey(b, t, j >> ffs(j));
+
+               EBUG_ON(m < l);
+       }
+
+       if (is_power_of_2(j + 1)) {
+               r = max_key;
+
+               if (!r->u64s) {
+                       if (!bkey_pack_pos(r, t->max_key, b)) {
+                               struct bkey_i tmp;
+
+                               bkey_init(&tmp.k);
+                               tmp.k.p = t->max_key;
+                               bkey_copy(r, &tmp);
+                       }
+               }
+       } else {
+               r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+               EBUG_ON(m > r);
+       }
+
+       /*
+        * for failed bfloats, the lookup code falls back to comparing against
+        * the original key.
+        */
+
+       if (!bkey_packed(l) || !bkey_packed(r) ||
+           !bkey_packed(p) || !bkey_packed(m) ||
+           !b->nr_key_bits) {
+               f->exponent = BFLOAT_FAILED_UNPACKED;
+               return;
+       }
+
+       /*
+        * The greatest differing bit of l and r is the first bit we must
+        * include in the bfloat mantissa we're creating in order to do
+        * comparisons - that bit always becomes the high bit of
+        * bfloat->mantissa, and thus the exponent we're calculating here is
+        * the position of what will become the low bit in bfloat->mantissa:
+        *
+        * Note that this may be negative - we may be running off the low end
+        * of the key: we handle this later:
+        */
+       high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
+                      min_t(unsigned, bits, b->nr_key_bits) - 1);
+       exponent = high_bit - (bits - 1);
+
+       /*
+        * Then we calculate the actual shift value, from the start of the key
+        * (k->_data), to get the key bits starting at exponent:
+        */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+       shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
+
+       EBUG_ON(shift + bits > b->format.key_u64s * 64);
+#else
+       shift = high_bit_offset +
+               b->nr_key_bits -
+               exponent -
+               bits;
+
+       EBUG_ON(shift < KEY_PACKED_BITS_START);
+#endif
+       EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+
+       f->exponent = shift;
+       mantissa = bkey_mantissa(m, f, j);
+
+       /*
+        * If we've got garbage bits, set them to all 1s - it's legal for the
+        * bfloat to compare larger than the original key, but not smaller:
+        */
+       if (exponent < 0)
+               mantissa |= ~(~0U << -exponent);
+
+       bfloat_mantissa_set(f, j, mantissa);
+
+       /*
+        * The bfloat must be able to tell its key apart from the previous key -
+        * if its key and the previous key don't differ in the required bits,
+        * flag as failed - unless the keys are actually equal, in which case
+        * we aren't required to return a specific one:
+        */
+       if (exponent > 0 &&
+           bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
+           bkey_cmp_packed(b, p, m)) {
+               f->exponent = BFLOAT_FAILED_PREV;
+               return;
+       }
+
+       /*
+        * f->mantissa must compare >= the original key - for transitivity with
+        * the comparison in bset_search_tree. If we're dropping set bits,
+        * increment it:
+        */
+       if (exponent > (int) bch2_bkey_ffs(b, m)) {
+               if (j < BFLOAT_32BIT_NR
+                   ? f->mantissa32 == U32_MAX
+                   : f->mantissa16 == U16_MAX)
+                       f->exponent = BFLOAT_FAILED_OVERFLOW;
+
+               if (j < BFLOAT_32BIT_NR)
+                       f->mantissa32++;
+               else
+                       f->mantissa16++;
+       }
+}
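+
+/*
+ * Illustrative sketch (not called anywhere) of the invariant make_bfloat()
+ * establishes: for a node that wasn't flagged as failed, the stored mantissa
+ * never compares less than the mantissa of the key it summarizes, so the
+ * lookup in bset_search_tree() never descends right past a key that is >= the
+ * search key:
+ */
+static inline void bfloat_check_invariant(struct btree *b, struct bset_tree *t,
+                                          unsigned j)
+{
+       struct bkey_float *f = bkey_float(b, t, j);
+       struct bkey_packed *m = tree_to_bkey(b, t, j);
+
+       if (f->exponent < BFLOAT_FAILED)
+               EBUG_ON(bfloat_mantissa(f, j) < bkey_mantissa(m, f, j));
+}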
+
+/* bytes remaining - only valid for last bset: */
+static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+       bset_aux_tree_verify(b);
+
+       return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
+}
+
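+/*
+ * Each ro aux tree node costs one struct bkey_float plus one byte of prev-key
+ * size table: 6 + 1 = 7 bytes for the first BFLOAT_32BIT_NR nodes, 4 + 1 = 5
+ * bytes for the rest - hence the constants below:
+ */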
+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+       unsigned bytes = __bset_tree_capacity(b, t);
+
+       if (bytes < 7 * BFLOAT_32BIT_NR)
+               return bytes / 7;
+
+       bytes -= 7 * BFLOAT_32BIT_NR;
+
+       return BFLOAT_32BIT_NR + bytes / 5;
+}
+
+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+       return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
+}
+
+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+{
+       struct bkey_packed *k;
+
+       t->size = 1;
+       t->extra = BSET_RW_AUX_TREE_VAL;
+       rw_aux_tree(b, t)[0].offset =
+               __btree_node_key_to_offset(b, btree_bkey_first(b, t));
+
+       for (k = btree_bkey_first(b, t);
+            k != btree_bkey_last(b, t);
+            k = bkey_next(k)) {
+               if (t->size == bset_rw_tree_capacity(b, t))
+                       break;
+
+               if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
+                   L1_CACHE_BYTES)
+                       rw_aux_tree_set(b, t, t->size++, k);
+       }
+}
+
+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+{
+       struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+       struct bkey_packed min_key, max_key;
+       unsigned j, cacheline = 1;
+
+       /* signal to make_bfloat() that they're uninitialized: */
+       min_key.u64s = max_key.u64s = 0;
+
+       t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
+                     bset_ro_tree_capacity(b, t));
+retry:
+       if (t->size < 2) {
+               t->size = 0;
+               t->extra = BSET_NO_AUX_TREE_VAL;
+               return;
+       }
+
+       t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+       /* First we figure out where the first key in each cacheline is */
+       eytzinger1_for_each(j, t->size) {
+               while (bkey_to_cacheline(b, t, k) < cacheline)
+                       prev = k, k = bkey_next(k);
+
+               if (k >= btree_bkey_last(b, t)) {
+                       /* XXX: this path sucks */
+                       t->size--;
+                       goto retry;
+               }
+
+               ro_aux_tree_prev(b, t)[j] = prev->u64s;
+               bkey_float(b, t, j)->key_offset =
+                       bkey_to_cacheline_offset(b, t, cacheline++, k);
+
+               EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+               EBUG_ON(tree_to_bkey(b, t, j) != k);
+       }
+
+       while (bkey_next(k) != btree_bkey_last(b, t))
+               k = bkey_next(k);
+
+       t->max_key = bkey_unpack_pos(b, k);
+
+       /* Then we build the tree */
+       eytzinger1_for_each(j, t->size)
+               make_bfloat(b, t, j, &min_key, &max_key);
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+       struct bset_tree *i;
+
+       for (i = b->set; i != t; i++)
+               BUG_ON(bset_has_rw_aux_tree(i));
+
+       bch2_bset_set_no_aux_tree(b, t);
+
+       /* round up to next cacheline: */
+       t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
+                                     SMP_CACHE_BYTES / sizeof(u64));
+
+       bset_aux_tree_verify(b);
+}
+
+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
+                            bool writeable)
+{
+       if (writeable
+           ? bset_has_rw_aux_tree(t)
+           : bset_has_ro_aux_tree(t))
+               return;
+
+       bset_alloc_tree(b, t);
+
+       if (!__bset_tree_capacity(b, t))
+               return;
+
+       if (writeable)
+               __build_rw_aux_tree(b, t);
+       else
+               __build_ro_aux_tree(b, t);
+
+       bset_aux_tree_verify(b);
+}
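+
+/*
+ * Sketch of intended usage (call sites are illustrative assumptions, not taken
+ * from this patch): the bset currently being inserted into gets the cheap rw
+ * lookup table, a bset that's no longer modified gets the full ro search tree:
+ *
+ *     bch2_bset_build_aux_tree(b, bset_tree_last(b), true);
+ *     bch2_bset_build_aux_tree(b, t, false);
+ */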
+
+void bch2_bset_init_first(struct btree *b, struct bset *i)
+{
+       struct bset_tree *t;
+
+       BUG_ON(b->nsets);
+
+       memset(i, 0, sizeof(*i));
+       get_random_bytes(&i->seq, sizeof(i->seq));
+       SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+       t = &b->set[b->nsets++];
+       set_btree_bset(b, t, i);
+}
+
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
+                        struct btree_node_entry *bne)
+{
+       struct bset *i = &bne->keys;
+       struct bset_tree *t;
+
+       BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+       BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
+       BUG_ON(b->nsets >= MAX_BSETS);
+
+       memset(i, 0, sizeof(*i));
+       i->seq = btree_bset_first(b)->seq;
+       SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+       t = &b->set[b->nsets++];
+       set_btree_bset(b, t, i);
+}
+
+/*
+ * find _some_ key in the same bset as @k that precedes @k - not necessarily the
+ * immediate predecessor:
+ */
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
+                                      struct bkey_packed *k)
+{
+       struct bkey_packed *p;
+       unsigned offset;
+       int j;
+
+       EBUG_ON(k < btree_bkey_first(b, t) ||
+               k > btree_bkey_last(b, t));
+
+       if (k == btree_bkey_first(b, t))
+               return NULL;
+
+       switch (bset_aux_tree_type(t)) {
+       case BSET_NO_AUX_TREE:
+               p = btree_bkey_first(b, t);
+               break;
+       case BSET_RO_AUX_TREE:
+               j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
+
+               do {
+                       p = j ? tree_to_bkey(b, t,
+                                       __inorder_to_eytzinger1(j--,
+                                                       t->size, t->extra))
+                             : btree_bkey_first(b, t);
+               } while (p >= k);
+               break;
+       case BSET_RW_AUX_TREE:
+               offset = __btree_node_key_to_offset(b, k);
+               j = rw_aux_tree_bsearch(b, t, offset);
+               p = j ? rw_aux_to_bkey(b, t, j - 1)
+                     : btree_bkey_first(b, t);
+               break;
+       }
+
+       return p;
+}
+
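+/*
+ * Return the last key before @k whose type is >= @min_key_type (pass 0 for the
+ * immediate predecessor regardless of type), or NULL if there is none:
+ */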
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
+                                         struct bset_tree *t,
+                                         struct bkey_packed *k,
+                                         unsigned min_key_type)
+{
+       struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
+
+       while ((p = __bkey_prev(b, t, k)) && !ret) {
+               for (i = p; i != k; i = bkey_next(i))
+                       if (i->type >= min_key_type)
+                               ret = i;
+
+               k = p;
+       }
+
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+               BUG_ON(ret >= orig_k);
+
+               for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
+                    i != orig_k;
+                    i = bkey_next(i))
+                       BUG_ON(i->type >= min_key_type);
+       }
+
+       return ret;
+}
+
+/* Insert */
+
+static void rw_aux_tree_fix_invalidated_key(struct btree *b,
+                                           struct bset_tree *t,
+                                           struct bkey_packed *k)
+{
+       unsigned offset = __btree_node_key_to_offset(b, k);
+       unsigned j = rw_aux_tree_bsearch(b, t, offset);
+
+       if (j < t->size &&
+           rw_aux_tree(b, t)[j].offset == offset)
+               rw_aux_tree_set(b, t, j, k);
+
+       bch2_bset_verify_rw_aux_tree(b, t);
+}
+
+static void ro_aux_tree_fix_invalidated_key(struct btree *b,
+                                           struct bset_tree *t,
+                                           struct bkey_packed *k)
+{
+       struct bkey_packed min_key, max_key;
+       unsigned inorder, j;
+
+       EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+       /* signal to make_bfloat() that they're uninitialized: */
+       min_key.u64s = max_key.u64s = 0;
+
+       if (bkey_next(k) == btree_bkey_last(b, t)) {
+               t->max_key = bkey_unpack_pos(b, k);
+
+               for (j = 1; j < t->size; j = j * 2 + 1)
+                       make_bfloat(b, t, j, &min_key, &max_key);
+       }
+
+       inorder = bkey_to_cacheline(b, t, k);
+
+       if (inorder &&
+           inorder < t->size) {
+               j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
+
+               if (k == tree_to_bkey(b, t, j)) {
+                       /* Fix the node this key corresponds to */
+                       make_bfloat(b, t, j, &min_key, &max_key);
+
+                       /* Children for which this key is the right boundary */
+                       for (j = eytzinger1_left_child(j);
+                            j < t->size;
+                            j = eytzinger1_right_child(j))
+                               make_bfloat(b, t, j, &min_key, &max_key);
+               }
+       }
+
+       if (inorder + 1 < t->size) {
+               j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
+
+               if (k == tree_to_prev_bkey(b, t, j)) {
+                       make_bfloat(b, t, j, &min_key, &max_key);
+
+                       /* Children for which this key is the left boundary */
+                       for (j = eytzinger1_right_child(j);
+                            j < t->size;
+                            j = eytzinger1_left_child(j))
+                               make_bfloat(b, t, j, &min_key, &max_key);
+               }
+       }
+}
+
+/**
+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
+ * modified, fix any auxiliary search tree by remaking all the nodes in the
+ * auxiliary search tree that @k corresponds to
+ */
+void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
+                                  struct bkey_packed *k)
+{
+       switch (bset_aux_tree_type(t)) {
+       case BSET_NO_AUX_TREE:
+               break;
+       case BSET_RO_AUX_TREE:
+               ro_aux_tree_fix_invalidated_key(b, t, k);
+               break;
+       case BSET_RW_AUX_TREE:
+               rw_aux_tree_fix_invalidated_key(b, t, k);
+               break;
+       }
+}
+
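+/*
+ * A key was just inserted or deleted at @_where, replacing @clobber_u64s worth
+ * of key space with @new_u64s: drop rw lookup table entries that pointed into
+ * the clobbered range, shift the offsets of the entries after it, and add a
+ * new entry if a gap between neighbouring entries grew past a cacheline:
+ */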
+static void bch2_bset_fix_lookup_table(struct btree *b,
+                                      struct bset_tree *t,
+                                      struct bkey_packed *_where,
+                                      unsigned clobber_u64s,
+                                      unsigned new_u64s)
+{
+       int shift = new_u64s - clobber_u64s;
+       unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+
+       EBUG_ON(bset_has_ro_aux_tree(t));
+
+       if (!bset_has_rw_aux_tree(t))
+               return;
+
+       l = rw_aux_tree_bsearch(b, t, where);
+
+       /* l is the first entry >= @where */
+
+       EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
+       EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
+
+       if (!l) /* never delete first entry */
+               l++;
+       else if (l < t->size &&
+                where < t->end_offset &&
+                rw_aux_tree(b, t)[l].offset == where)
+               rw_aux_tree_set(b, t, l++, _where);
+
+       /* l now > where */
+
+       for (j = l;
+            j < t->size &&
+            rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
+            j++)
+               ;
+
+       if (j < t->size &&
+           rw_aux_tree(b, t)[j].offset + shift ==
+           rw_aux_tree(b, t)[l - 1].offset)
+               j++;
+
+       memmove(&rw_aux_tree(b, t)[l],
+               &rw_aux_tree(b, t)[j],
+               (void *) &rw_aux_tree(b, t)[t->size] -
+               (void *) &rw_aux_tree(b, t)[j]);
+       t->size -= j - l;
+
+       for (j = l; j < t->size; j++)
+              rw_aux_tree(b, t)[j].offset += shift;
+
+       EBUG_ON(l < t->size &&
+               rw_aux_tree(b, t)[l].offset ==
+               rw_aux_tree(b, t)[l - 1].offset);
+
+       if (t->size < bset_rw_tree_capacity(b, t) &&
+           (l < t->size
+            ? rw_aux_tree(b, t)[l].offset
+            : t->end_offset) -
+           rw_aux_tree(b, t)[l - 1].offset >
+           L1_CACHE_BYTES / sizeof(u64)) {
+               struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
+               struct bkey_packed *end = l < t->size
+                       ? rw_aux_to_bkey(b, t, l)
+                       : btree_bkey_last(b, t);
+               struct bkey_packed *k = start;
+
+               while (1) {
+                       k = bkey_next(k);
+                       if (k == end)
+                               break;
+
+                       if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+                               memmove(&rw_aux_tree(b, t)[l + 1],
+                                       &rw_aux_tree(b, t)[l],
+                                       (void *) &rw_aux_tree(b, t)[t->size] -
+                                       (void *) &rw_aux_tree(b, t)[l]);
+                               t->size++;
+                               rw_aux_tree_set(b, t, l, k);
+                               break;
+                       }
+               }
+       }
+
+       bch2_bset_verify_rw_aux_tree(b, t);
+       bset_aux_tree_verify(b);
+}
+
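+/*
+ * Insert @insert into the last bset at @where, overwriting @clobber_u64s worth
+ * of existing key space (0 if nothing is being overwritten), and fix up the rw
+ * lookup table:
+ */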
+void bch2_bset_insert(struct btree *b,
+                     struct btree_node_iter *iter,
+                     struct bkey_packed *where,
+                     struct bkey_i *insert,
+                     unsigned clobber_u64s)
+{
+       struct bkey_format *f = &b->format;
+       struct bset_tree *t = bset_tree_last(b);
+       struct bkey_packed packed, *src = bkey_to_packed(insert);
+
+       bch2_bset_verify_rw_aux_tree(b, t);
+
+       if (bch2_bkey_pack_key(&packed, &insert->k, f))
+               src = &packed;
+
+       if (!bkey_whiteout(&insert->k))
+               btree_keys_account_key_add(&b->nr, t - b->set, src);
+
+       if (src->u64s != clobber_u64s) {
+               u64 *src_p = where->_data + clobber_u64s;
+               u64 *dst_p = where->_data + src->u64s;
+
+               EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+                       (int) clobber_u64s - src->u64s);
+
+               memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+               le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
+               set_btree_bset_end(b, t);
+       }
+
+       memcpy_u64s(where, src,
+                   bkeyp_key_u64s(f, src));
+       memcpy_u64s(bkeyp_val(f, where), &insert->v,
+                   bkeyp_val_u64s(f, src));
+
+       bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+
+       bch2_verify_key_order(b, iter, where);
+       bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_bset_delete(struct btree *b,
+                     struct bkey_packed *where,
+                     unsigned clobber_u64s)
+{
+       struct bset_tree *t = bset_tree_last(b);
+       u64 *src_p = where->_data + clobber_u64s;
+       u64 *dst_p = where->_data;
+
+       bch2_bset_verify_rw_aux_tree(b, t);
+
+       EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+
+       memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+       le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
+       set_btree_bset_end(b, t);
+
+       bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
+}
+
+/* Lookup */
+
+__flatten
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
+                               struct bset_tree *t,
+                               struct bpos search,
+                               const struct bkey_packed *packed_search)
+{
+       unsigned l = 0, r = t->size;
+
+       while (l + 1 != r) {
+               unsigned m = (l + r) >> 1;
+
+               if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
+                       l = m;
+               else
+                       r = m;
+       }
+
+       return rw_aux_to_bkey(b, t, l);
+}
+
+noinline
+static int bset_search_tree_slowpath(const struct btree *b,
+                               struct bset_tree *t, struct bpos *search,
+                               const struct bkey_packed *packed_search,
+                               unsigned n)
+{
+       return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
+                                packed_search, search) < 0;
+}
+
+__flatten
+static struct bkey_packed *bset_search_tree(const struct btree *b,
+                               struct bset_tree *t,
+                               struct bpos search,
+                               const struct bkey_packed *packed_search)
+{
+       struct ro_aux_tree *base = ro_aux_tree_base(b, t);
+       struct bkey_float *f = bkey_float_get(base, 1);
+       void *p;
+       unsigned inorder, n = 1;
+
+       do {
+               if (likely(n << 4 < t->size)) {
+                       p = bkey_float_get(base, n << 4);
+                       prefetch(p);
+               } else if (n << 3 < t->size) {
+                       inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
+                       p = bset_cacheline(b, t, inorder);
+#ifdef CONFIG_X86_64
+                       asm(".intel_syntax noprefix;"
+                           "prefetcht0 [%0 - 127 + 64 * 0];"
+                           "prefetcht0 [%0 - 127 + 64 * 1];"
+                           "prefetcht0 [%0 - 127 + 64 * 2];"
+                           "prefetcht0 [%0 - 127 + 64 * 3];"
+                           ".att_syntax prefix;"
+                           :
+                           : "r" (p + 127));
+#else
+                       prefetch(p + L1_CACHE_BYTES * 0);
+                       prefetch(p + L1_CACHE_BYTES * 1);
+                       prefetch(p + L1_CACHE_BYTES * 2);
+                       prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+               } else if (n >= t->size)
+                       break;
+
+               f = bkey_float_get(base, n);
+
+               if (packed_search &&
+                   likely(f->exponent < BFLOAT_FAILED))
+                       n = n * 2 + (bfloat_mantissa(f, n) <
+                                    bkey_mantissa(packed_search, f, n));
+               else
+                       n = n * 2 + bset_search_tree_slowpath(b, t,
+                                               &search, packed_search, n);
+       } while (n < t->size);
+
+       inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
+
+       /*
+        * n would have been the node we recursed to - the low bit tells us if
+        * we recursed left or recursed right.
+        */
+       if (n & 1) {
+               return cacheline_to_bkey(b, t, inorder, f->key_offset);
+       } else {
+               if (--inorder) {
+                       n = eytzinger1_prev(n >> 1, t->size);
+                       f = bkey_float_get(base, n);
+                       return cacheline_to_bkey(b, t, inorder, f->key_offset);
+               } else
+                       return btree_bkey_first(b, t);
+       }
+}
+
+/*
+ * Returns the first key greater than or equal to @search
+ */
+__always_inline __flatten
+static struct bkey_packed *bch2_bset_search(struct btree *b,
+                               struct bset_tree *t,
+                               struct bpos search,
+                               struct bkey_packed *packed_search,
+                               const struct bkey_packed *lossy_packed_search,
+                               bool strictly_greater)
+{
+       struct bkey_packed *m;
+
+       /*
+        * First we search for a cacheline, then we do a linear search within
+        * that cacheline.
+        *
+        * To search for the cacheline, there are three different possibilities:
+        *  * The set is too small to have a search tree, so we just do a linear
+        *    search over the whole set.
+        *  * The set is the one we're currently inserting into; keeping a full
+        *    auxiliary search tree up to date would be too expensive, so we
+        *    use a much simpler lookup table to do a binary search -
+        *    bset_search_write_set().
+        *  * Or we use the auxiliary search tree we constructed earlier -
+        *    bset_search_tree()
+        */
+
+       switch (bset_aux_tree_type(t)) {
+       case BSET_NO_AUX_TREE:
+               m = btree_bkey_first(b, t);
+               break;
+       case BSET_RW_AUX_TREE:
+               m = bset_search_write_set(b, t, search, lossy_packed_search);
+               break;
+       case BSET_RO_AUX_TREE:
+               /*
+                * Each node in the auxiliary search tree covers a certain range
+                * of bits, and keys above and below the set it covers might
+                * differ outside those bits - so we have to special case the
+                * start and end - handle that here:
+                */
+
+               if (bkey_cmp(search, t->max_key) > 0)
+                       return btree_bkey_last(b, t);
+
+               m = bset_search_tree(b, t, search, lossy_packed_search);
+               break;
+       }
+
+       if (lossy_packed_search)
+               while (m != btree_bkey_last(b, t) &&
+                      !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
+                                                   m, strictly_greater))
+                       m = bkey_next(m);
+
+       if (!packed_search)
+               while (m != btree_bkey_last(b, t) &&
+                      !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
+                       m = bkey_next(m);
+
+       if (btree_keys_expensive_checks(b)) {
+               struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
+
+               BUG_ON(prev &&
+                      btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
+                                                  prev, strictly_greater));
+       }
+
+       return m;
+}
+
+/* Btree node iterator */
+
+void bch2_btree_node_iter_push(struct btree_node_iter *iter,
+                              struct btree *b,
+                              const struct bkey_packed *k,
+                              const struct bkey_packed *end)
+{
+       __bch2_btree_node_iter_push(iter, b, k, end);
+       bch2_btree_node_iter_sort(iter, b);
+}
+
+noinline __flatten __attribute__((cold))
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
+                             struct btree *b, struct bpos search,
+                             bool strictly_greater, bool is_extents)
+{
+       struct bset_tree *t;
+
+       trace_bkey_pack_pos_fail(&search);
+
+       for_each_bset(b, t)
+               __bch2_btree_node_iter_push(iter, b,
+                       bch2_bset_search(b, t, search, NULL, NULL,
+                                       strictly_greater),
+                       btree_bkey_last(b, t));
+
+       bch2_btree_node_iter_sort(iter, b);
+}
+
+/**
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * given position
+ *
+ * Main entry point to the lookup code for individual btree nodes:
+ *
+ * NOTE:
+ *
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
+ * keys. This doesn't matter for most code, but it does matter for lookups.
+ *
+ * Consider some adjacent keys that include a run of equal keys:
+ *     i j k k k k l m
+ *
+ * If you search for k, the lookup code isn't guaranteed to return you any
+ * specific k. The lookup code is conceptually doing a binary search and
+ * iterating backwards is very expensive so if the pivot happens to land at the
+ * last k that's what you'll get.
+ *
+ * This works out ok, but it's something to be aware of:
+ *
+ *  - For non extents, we guarantee that the live key comes last - see
+ *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
+ *    see will only be deleted keys you don't care about.
+ *
+ *  - For extents, deleted keys sort last (see the comment at the top of this
+ *    file). But when you're searching for extents, you actually want the first
+ *    key strictly greater than your search key - an extent that compares equal
+ *    to the search key is going to have 0 sectors after the search key.
+ *
+ *    But this does mean that we can't just search for
+ *    bkey_successor(start_of_range) to get the first extent that overlaps with
+ *    the range we want - if we're unlucky and there's an extent that ends
+ *    exactly where we searched, then there could be a deleted key at the same
+ *    position and we'd get that when we search instead of the preceding extent
+ *    we needed.
+ *
+ *    So we've got to search for start_of_range, then after the lookup iterate
+ *    past any extents that compare equal to the position we searched for.
+ */
+void bch2_btree_node_iter_init(struct btree_node_iter *iter,
+                              struct btree *b, struct bpos search,
+                              bool strictly_greater, bool is_extents)
+{
+       struct bset_tree *t;
+       struct bkey_packed p, *packed_search = NULL;
+
+       EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
+       bset_aux_tree_verify(b);
+
+       __bch2_btree_node_iter_init(iter, is_extents);
+
+       switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
+       case BKEY_PACK_POS_EXACT:
+               packed_search = &p;
+               break;
+       case BKEY_PACK_POS_SMALLER:
+               packed_search = NULL;
+               break;
+       case BKEY_PACK_POS_FAIL:
+               btree_node_iter_init_pack_failed(iter, b, search,
+                                       strictly_greater, is_extents);
+               return;
+       }
+
+       for_each_bset(b, t)
+               __bch2_btree_node_iter_push(iter, b,
+                                          bch2_bset_search(b, t, search,
+                                                          packed_search, &p,
+                                                          strictly_greater),
+                                          btree_bkey_last(b, t));
+
+       bch2_btree_node_iter_sort(iter, b);
+}
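+
+/*
+ * Sketch of typical iteration (illustrative only; @pos and @is_extents stand
+ * in for whatever the caller has):
+ *
+ *     struct btree_node_iter iter;
+ *     struct bkey_packed *k;
+ *
+ *     bch2_btree_node_iter_init(&iter, b, pos, false, is_extents);
+ *     while ((k = bch2_btree_node_iter_peek(&iter, b))) {
+ *             ... use k ...
+ *             bch2_btree_node_iter_advance(&iter, b);
+ *     }
+ */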
+
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
+                                         struct btree *b,
+                                         bool is_extents)
+{
+       struct bset_tree *t;
+
+       __bch2_btree_node_iter_init(iter, is_extents);
+
+       for_each_bset(b, t)
+               __bch2_btree_node_iter_push(iter, b,
+                                          btree_bkey_first(b, t),
+                                          btree_bkey_last(b, t));
+       bch2_btree_node_iter_sort(iter, b);
+}
+
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
+                                                 struct btree *b,
+                                                 struct bset_tree *t)
+{
+       struct btree_node_iter_set *set;
+
+       btree_node_iter_for_each(iter, set)
+               if (set->end == t->end_offset)
+                       return __btree_node_offset_to_key(b, set->k);
+
+       return btree_bkey_last(b, t);
+}
+
+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
+                                           struct btree *b,
+                                           unsigned first)
+{
+       bool ret;
+
+       if ((ret = (btree_node_iter_cmp(iter, b,
+                                       iter->data[first],
+                                       iter->data[first + 1]) > 0)))
+               swap(iter->data[first], iter->data[first + 1]);
+       return ret;
+}
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
+                              struct btree *b)
+{
+       /* unrolled bubble sort: */
+
+       if (!__btree_node_iter_set_end(iter, 2)) {
+               btree_node_iter_sort_two(iter, b, 0);
+               btree_node_iter_sort_two(iter, b, 1);
+       }
+
+       if (!__btree_node_iter_set_end(iter, 1))
+               btree_node_iter_sort_two(iter, b, 0);
+}
+
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
+                                  struct btree_node_iter_set *set)
+{
+       struct btree_node_iter_set *last =
+               iter->data + ARRAY_SIZE(iter->data) - 1;
+
+       memmove(&set[0], &set[1], (void *) last - (void *) set);
+       *last = (struct btree_node_iter_set) { 0, 0 };
+}
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+                                                 struct btree *b)
+{
+       iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
+
+       EBUG_ON(iter->data->k > iter->data->end);
+
+       if (unlikely(__btree_node_iter_set_end(iter, 0))) {
+               bch2_btree_node_iter_set_drop(iter, iter->data);
+               return;
+       }
+
+       if (__btree_node_iter_set_end(iter, 1))
+               return;
+
+       if (!btree_node_iter_sort_two(iter, b, 0))
+               return;
+
+       if (__btree_node_iter_set_end(iter, 2))
+               return;
+
+       btree_node_iter_sort_two(iter, b, 1);
+}
+
+/**
+ * bch2_btree_node_iter_advance - advance @iter by one key
+ *
+ * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a
+ * bset might momentarily have out of order extents.
+ */
+void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+                                 struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
+
+       __bch2_btree_node_iter_advance(iter, b);
+       bch2_btree_node_iter_next_check(iter, b, k);
+#else
+       __bch2_btree_node_iter_advance(iter, b);
+#endif
+}
+
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+       unsigned n = ARRAY_SIZE(iter->data);
+
+       while (n && __btree_node_iter_set_end(iter, n - 1))
+               --n;
+
+       return n;
+}
+
+/*
+ * Expensive:
+ */
+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
+                                                    struct btree *b,
+                                                    unsigned min_key_type)
+{
+       struct bkey_packed *k, *prev = NULL;
+       struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b);
+       struct btree_node_iter_set *set;
+       struct bset_tree *t;
+       unsigned end;
+
+       bch2_btree_node_iter_verify(iter, b);
+
+       for_each_bset(b, t) {
+               k = bch2_bkey_prev_filter(b, t,
+                       bch2_btree_node_iter_bset_pos(iter, b, t),
+                       min_key_type);
+               if (k &&
+                   (!prev || __btree_node_iter_cmp(iter->is_extents, b,
+                                                   k, prev) > 0)) {
+                       prev = k;
+                       end = t->end_offset;
+               }
+       }
+
+       if (!prev)
+               goto out;
+
+       /*
+        * We're manually memmoving instead of just calling sort() to ensure the
+        * prev we picked ends up in slot 0 - sort won't necessarily put it
+        * there because of duplicate deleted keys:
+        */
+       btree_node_iter_for_each(iter, set)
+               if (set->end == end)
+                       goto found;
+
+       BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
+found:
+       BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
+
+       memmove(&iter->data[1],
+               &iter->data[0],
+               (void *) set - (void *) &iter->data[0]);
+
+       iter->data[0].k = __btree_node_key_to_offset(b, prev);
+       iter->data[0].end = end;
+out:
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+               struct btree_node_iter iter2 = *iter;
+
+               if (prev)
+                       bch2_btree_node_iter_advance(&iter2, b);
+
+               while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
+                       BUG_ON(k->type >= min_key_type);
+                       bch2_btree_node_iter_advance(&iter2, b);
+               }
+       }
+
+       return prev;
+}
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
+                                                struct btree *b,
+                                                struct bkey *u)
+{
+       struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
+
+       return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
+}
+
+/* Mergesort */
+
+void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
+{
+       struct bset_tree *t;
+
+       for_each_bset(b, t) {
+               enum bset_aux_tree_type type = bset_aux_tree_type(t);
+               size_t j;
+
+               stats->sets[type].nr++;
+               stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
+                       sizeof(u64);
+
+               if (bset_has_ro_aux_tree(t)) {
+                       stats->floats += t->size - 1;
+
+                       for (j = 1; j < t->size; j++)
+                               switch (bkey_float(b, t, j)->exponent) {
+                               case BFLOAT_FAILED_UNPACKED:
+                                       stats->failed_unpacked++;
+                                       break;
+                               case BFLOAT_FAILED_PREV:
+                                       stats->failed_prev++;
+                                       break;
+                               case BFLOAT_FAILED_OVERFLOW:
+                                       stats->failed_overflow++;
+                                       break;
+                               }
+               }
+       }
+}
+
+int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
+                          char *buf, size_t size)
+{
+       struct bset_tree *t = bch2_bkey_to_bset(b, k);
+       struct bkey_packed *l, *r, *p;
+       struct bkey uk, up;
+       char buf1[200], buf2[200];
+       unsigned j;
+
+       if (!size)
+               return 0;
+
+       if (!bset_has_ro_aux_tree(t))
+               goto out;
+
+       j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra);
+       if (j &&
+           j < t->size &&
+           k == tree_to_bkey(b, t, j))
+               switch (bkey_float(b, t, j)->exponent) {
+               case BFLOAT_FAILED_UNPACKED:
+                       uk = bkey_unpack_key(b, k);
+                       return scnprintf(buf, size,
+                                        "    failed unpacked at depth %u\n"
+                                        "\t%llu:%llu\n",
+                                        ilog2(j),
+                                        uk.p.inode, uk.p.offset);
+               case BFLOAT_FAILED_PREV:
+                       p = tree_to_prev_bkey(b, t, j);
+                       l = is_power_of_2(j)
+                               ? btree_bkey_first(b, t)
+                               : tree_to_prev_bkey(b, t, j >> ffs(j));
+                       r = is_power_of_2(j + 1)
+                               ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
+                               : tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+                       up = bkey_unpack_key(b, p);
+                       uk = bkey_unpack_key(b, k);
+                       bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
+                       bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
+
+                       return scnprintf(buf, size,
+                                        "    failed prev at depth %u\n"
+                                        "\tkey starts at bit %u but first differing bit at %u\n"
+                                        "\t%llu:%llu\n"
+                                        "\t%llu:%llu\n"
+                                        "\t%s\n"
+                                        "\t%s\n",
+                                        ilog2(j),
+                                        bch2_bkey_greatest_differing_bit(b, l, r),
+                                        bch2_bkey_greatest_differing_bit(b, p, k),
+                                        uk.p.inode, uk.p.offset,
+                                        up.p.inode, up.p.offset,
+                                        buf1, buf2);
+               case BFLOAT_FAILED_OVERFLOW:
+                       uk = bkey_unpack_key(b, k);
+                       return scnprintf(buf, size,
+                                        "    failed overflow at depth %u\n"
+                                        "\t%llu:%llu\n",
+                                        ilog2(j),
+                                        uk.p.inode, uk.p.offset);
+               }
+out:
+       *buf = '\0';
+       return 0;
+}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
new file mode 100644
index 0000000..2fa71d7
--- /dev/null
+++ b/fs/bcachefs/bset.h
@@ -0,0 +1,668 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BSET_H
+#define _BCACHEFS_BSET_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "bcachefs_format.h"
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "util.h" /* for time_stats */
+#include "vstructs.h"
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bkey_invalid() and
+ * bkey_deleted().
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ above bit 50, then we don't need to check anything higher than
+ * bit 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking.  Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n.  The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
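+ *
+ * For example, if p = 0b1100 and n = 0b1110, the highest bit that differs is
+ * bit 1: every key in [p, n) agrees with n above bit 1, so comparing a search
+ * key in that range against n only requires bit 1 and below.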
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differ in the bits we need them to. If they don't, we flag
+ * that node, and when doing lookups we fall back to comparing against the real
+ * key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
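+ *
+ * (That is, 7 + 3 + 22 = 32 bits, i.e. 4 bytes per node; 4 bytes of auxiliary
+ * search tree per 128 bytes of keys works out to roughly 3% overhead.)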
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i-th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
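+ *
+ * A lookup against the unwritten set is then roughly (a sketch, where
+ * rw_aux_lookup() stands in for the search over the flat array and search is a
+ * struct bpos):
+ *
+ *	struct bkey_packed *k = rw_aux_lookup(b, t, &search);
+ *
+ *	while (bkey_cmp_left_packed(b, k, &search) < 0)
+ *		k = bkey_next(k);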
+ */
+
+extern bool bch2_expensive_debug_checks;
+
+static inline bool btree_keys_expensive_checks(const struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       return bch2_expensive_debug_checks || *b->expensive_debug_checks;
+#else
+       return false;
+#endif
+}
+
+enum bset_aux_tree_type {
+       BSET_NO_AUX_TREE,
+       BSET_RO_AUX_TREE,
+       BSET_RW_AUX_TREE,
+};
+
+#define BSET_TREE_NR_TYPES     3
+
+#define BSET_NO_AUX_TREE_VAL   (U16_MAX)
+#define BSET_RW_AUX_TREE_VAL   (U16_MAX - 1)
+
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+       switch (t->extra) {
+       case BSET_NO_AUX_TREE_VAL:
+               EBUG_ON(t->size);
+               return BSET_NO_AUX_TREE;
+       case BSET_RW_AUX_TREE_VAL:
+               EBUG_ON(!t->size);
+               return BSET_RW_AUX_TREE;
+       default:
+               EBUG_ON(!t->size);
+               return BSET_RO_AUX_TREE;
+       }
+}
+
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+                              struct bkey *dst,
+                              const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+       {
+               compiled_unpack_fn unpack_fn = b->aux_data;
+               unpack_fn(dst, src);
+
+               if (btree_keys_expensive_checks(b)) {
+                       struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+                       /*
+                        * hack around a harmless race when compacting whiteouts
+                        * for a write:
+                        */
+                       dst2.needs_whiteout = dst->needs_whiteout;
+
+                       BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+               }
+       }
+#else
+       *dst = __bch2_bkey_unpack_key(&b->format, src);
+#endif
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+                              const struct bkey_packed *src)
+{
+       struct bkey dst;
+
+       __bkey_unpack_key_format_checked(b, &dst, src);
+       return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+                                    struct bkey *dst,
+                                    const struct bkey_packed *src)
+{
+       if (likely(bkey_packed(src)))
+               __bkey_unpack_key_format_checked(b, dst, src);
+       else
+               *dst = *packed_to_bkey_c(src);
+}
+
+/**
+ * bkey_unpack_key - unpack just the key, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+                                         const struct bkey_packed *src)
+{
+       return likely(bkey_packed(src))
+               ? bkey_unpack_key_format_checked(b, src)
+               : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+                              const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+       return bkey_unpack_key_format_checked(b, src).p;
+#else
+       return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+                                         const struct bkey_packed *src)
+{
+       return likely(bkey_packed(src))
+               ? bkey_unpack_pos_format_checked(b, src)
+               : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(struct btree *b,
+                                              const struct bkey_packed *k,
+                                              struct bkey *u)
+{
+       __bkey_unpack_key(b, u, k);
+
+       return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(struct btree *b,
+                                              struct bkey_packed *k,
+                                              struct bkey *u)
+{
+       __bkey_unpack_key(b, u, k);
+
+       return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
+#define for_each_bset(_b, _t)                                  \
+       for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
+{
+       return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
+}
+
+static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
+{
+       return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
+}
+
+static inline void bch2_bset_set_no_aux_tree(struct btree *b,
+                                           struct bset_tree *t)
+{
+       BUG_ON(t < b->set);
+
+       for (; t < b->set + ARRAY_SIZE(b->set); t++) {
+               t->size = 0;
+               t->extra = BSET_NO_AUX_TREE_VAL;
+               t->aux_data_offset = U16_MAX;
+       }
+}
+
+static inline void btree_node_set_format(struct btree *b,
+                                        struct bkey_format f)
+{
+       int len;
+
+       b->format       = f;
+       b->nr_key_bits  = bkey_format_key_bits(&f);
+
+       len = bch2_compile_bkey_format(&b->format, b->aux_data);
+       BUG_ON(len < 0 || len > U8_MAX);
+
+       b->unpack_fn_len = len;
+
+       bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+static inline struct bset *bset_next_set(struct btree *b,
+                                        unsigned block_bytes)
+{
+       struct bset *i = btree_bset_last(b);
+
+       EBUG_ON(!is_power_of_2(block_bytes));
+
+       return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
+}
+
+void bch2_btree_keys_free(struct btree *);
+int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
+void bch2_btree_keys_init(struct btree *, bool *);
+
+void bch2_bset_init_first(struct btree *, struct bset *);
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
+                        struct btree_node_entry *);
+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
+void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
+                                 struct bkey_packed *);
+
+void bch2_bset_insert(struct btree *, struct btree_node_iter *,
+                    struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
+
+/* Bkey utility code */
+
+/* packed or unpacked */
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
+                                   const struct bkey_packed *l,
+                                   const struct bkey_packed *r_packed,
+                                   struct bpos *r)
+{
+       EBUG_ON(r_packed && !bkey_packed(r_packed));
+
+       if (unlikely(!bkey_packed(l)))
+               return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+
+       if (likely(r_packed))
+               return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
+
+       return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+/* Returns true if @k is after iterator position @pos */
+static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
+                                            struct bpos *pos,
+                                            const struct bkey_packed *k,
+                                            bool strictly_greater)
+{
+       int cmp = bkey_cmp_left_packed(b, k, pos);
+
+       return cmp > 0 ||
+               (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
+                                       struct bpos pos,
+                                       const struct bkey_packed *pos_packed,
+                                       const struct bkey_packed *k,
+                                       bool strictly_greater)
+{
+       int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
+
+       return cmp > 0 ||
+               (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
+                                         struct bkey_packed *, unsigned);
+
+static inline struct bkey_packed *
+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+       return bch2_bkey_prev_filter(b, t, k, 0);
+}
+
+static inline struct bkey_packed *
+bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+       return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1);
+}
+
+enum bch_extent_overlap {
+       BCH_EXTENT_OVERLAP_ALL          = 0,
+       BCH_EXTENT_OVERLAP_BACK         = 1,
+       BCH_EXTENT_OVERLAP_FRONT        = 2,
+       BCH_EXTENT_OVERLAP_MIDDLE       = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+                                                        const struct bkey *m)
+{
+       int cmp1 = bkey_cmp(k->p, m->p) < 0;
+       int cmp2 = bkey_cmp(bkey_start_pos(k),
+                           bkey_start_pos(m)) > 0;
+
+       return (cmp1 << 1) + cmp2;
+}
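+
+/*
+ * Illustration (both extents in the same inode; sector ranges written as
+ * [start, end)), with m = [16, 32):
+ *
+ *   k = [16, 32) (or wider)  -> BCH_EXTENT_OVERLAP_ALL
+ *   k = [24, 40)             -> BCH_EXTENT_OVERLAP_BACK
+ *   k = [ 8, 24)             -> BCH_EXTENT_OVERLAP_FRONT
+ *   k = [20, 28)             -> BCH_EXTENT_OVERLAP_MIDDLE
+ */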
+
+/* Btree key iteration */
+
+static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
+                                             bool is_extents)
+{
+       iter->is_extents = is_extents;
+       memset(iter->data, 0, sizeof(iter->data));
+}
+
+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
+                             const struct bkey_packed *,
+                             const struct bkey_packed *);
+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
+                             struct bpos, bool, bool);
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
+                                        struct btree *, bool);
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
+                                                struct btree *,
+                                                struct bset_tree *);
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
+                                  struct btree_node_iter_set *);
+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
+
+#define btree_node_iter_for_each(_iter, _set)                          \
+       for (_set = (_iter)->data;                                      \
+            _set < (_iter)->data + ARRAY_SIZE((_iter)->data) &&        \
+            (_set)->k != (_set)->end;                                  \
+            _set++)
+
+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
+                                            unsigned i)
+{
+       return iter->data[i].k == iter->data[i].end;
+}
+
+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
+{
+       return __btree_node_iter_set_end(iter, 0);
+}
+
+static inline int __btree_node_iter_cmp(bool is_extents,
+                                       struct btree *b,
+                                       struct bkey_packed *l,
+                                       struct bkey_packed *r)
+{
+       /*
+        * For non extents, when keys compare equal the deleted keys have to
+        * come first - so that bch2_btree_node_iter_next_check() can detect
+        * duplicate nondeleted keys (and possibly other reasons?)
+        *
+        * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
+        * deleted keys have to sort last.
+        */
+       return bkey_cmp_packed(b, l, r)
+               ?: (is_extents
+                   ? (int) bkey_deleted(l) - (int) bkey_deleted(r)
+                   : (int) bkey_deleted(r) - (int) bkey_deleted(l))
+               ?: (l > r) - (l < r);
+}
+
+static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
+                                     struct btree *b,
+                                     struct btree_node_iter_set l,
+                                     struct btree_node_iter_set r)
+{
+       return __btree_node_iter_cmp(iter->is_extents, b,
+                       __btree_node_offset_to_key(b, l.k),
+                       __btree_node_offset_to_key(b, r.k));
+}
+
+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
+                             struct btree *b,
+                             const struct bkey_packed *k,
+                             const struct bkey_packed *end)
+{
+       if (k != end) {
+               struct btree_node_iter_set *pos;
+
+               btree_node_iter_for_each(iter, pos)
+                       ;
+
+               BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
+               *pos = (struct btree_node_iter_set) {
+                       __btree_node_key_to_offset(b, k),
+                       __btree_node_key_to_offset(b, end)
+               };
+       }
+}
+
+static inline struct bkey_packed *
+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
+                               struct btree *b)
+{
+       return __btree_node_offset_to_key(b, iter->data->k);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
+                                struct btree *b,
+                                unsigned min_key_type)
+{
+       while (!bch2_btree_node_iter_end(iter)) {
+               struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b);
+
+               if (k->type >= min_key_type)
+                       return k;
+
+               bch2_btree_node_iter_advance(iter, b);
+       }
+
+       return NULL;
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
+                             struct btree *b)
+{
+       return bch2_btree_node_iter_peek_filter(iter, b, 0);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
+{
+       return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
+{
+       struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
+
+       if (ret)
+               bch2_btree_node_iter_advance(iter, b);
+
+       return ret;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
+                                                    struct btree *, unsigned);
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
+{
+       return bch2_btree_node_iter_prev_filter(iter, b, 0);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
+{
+       return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
+}
+
+/*
+ * Iterates over all _live_ keys - skipping deleted (and potentially
+ * overlapping) keys
+ */
+#define for_each_btree_node_key(b, k, iter, _is_extents)               \
+       for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+            ((k) = bch2_btree_node_iter_peek(iter, b));                        \
+            bch2_btree_node_iter_advance(iter, b))
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
+                                               struct btree *,
+                                               struct bkey *);
+
+#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
+       for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+            (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
+            bch2_btree_node_iter_advance(iter, b))
+
+/* Accounting: */
+
+static inline void btree_keys_account_key(struct btree_nr_keys *n,
+                                         unsigned bset,
+                                         struct bkey_packed *k,
+                                         int sign)
+{
+       n->live_u64s            += k->u64s * sign;
+       n->bset_u64s[bset]      += k->u64s * sign;
+
+       if (bkey_packed(k))
+               n->packed_keys  += sign;
+       else
+               n->unpacked_keys += sign;
+}
+
+#define btree_keys_account_key_add(_nr, _bset_idx, _k)         \
+       btree_keys_account_key(_nr, _bset_idx, _k, 1)
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k)        \
+       btree_keys_account_key(_nr, _bset_idx, _k, -1)
+
+struct bset_stats {
+       struct {
+               size_t nr, bytes;
+       } sets[BSET_TREE_NR_TYPES];
+
+       size_t floats;
+       size_t failed_unpacked;
+       size_t failed_prev;
+       size_t failed_overflow;
+};
+
+void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
+int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *,
+                         char *, size_t);
+
+/* Debug stuff */
+
+void bch2_dump_bset(struct btree *, struct bset *, unsigned);
+void bch2_dump_btree_node(struct btree *);
+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *);
+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
+                         struct bkey_packed *);
+
+#else
+
+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+                                             struct btree *b) {}
+static inline void bch2_verify_key_order(struct btree *b,
+                                       struct btree_node_iter *iter,
+                                       struct bkey_packed *where) {}
+#endif
+
+static inline void bch2_verify_btree_nr_keys(struct btree *b)
+{
+       if (btree_keys_expensive_checks(b))
+               __bch2_verify_btree_nr_keys(b);
+}
+
+#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
new file mode 100644 (file)
index 0000000..f9afae6
--- /dev/null
@@ -0,0 +1,941 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+#include "trace.h"
+
+#include <linux/prefetch.h>
+
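+/*
+ * Expand the DEFINE_BCH_BTREE_IDS() x-macro into an array of btree id names,
+ * indexed by btree id:
+ */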
+#define DEF_BTREE_ID(kwd, val, name) name,
+
+const char * const bch2_btree_ids[] = {
+       DEFINE_BCH_BTREE_IDS()
+       NULL
+};
+
+#undef DEF_BTREE_ID
+
+void bch2_recalc_btree_reserve(struct bch_fs *c)
+{
+       unsigned i, reserve = 16;
+
+       if (!c->btree_roots[0].b)
+               reserve += 8;
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               if (c->btree_roots[i].b)
+                       reserve += min_t(unsigned, 1,
+                                        c->btree_roots[i].b->level) * 8;
+
+       c->btree_cache.reserve = reserve;
+}
+
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+{
+       return max_t(int, 0, bc->used - bc->reserve);
+}
+
+static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
+{
+       EBUG_ON(btree_node_write_in_flight(b));
+
+       kvpfree(b->data, btree_bytes(c));
+       b->data = NULL;
+       bch2_btree_keys_free(b);
+}
+
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+{
+       struct btree_cache *bc = &c->btree_cache;
+
+       __btree_node_data_free(c, b);
+       bc->used--;
+       list_move(&b->list, &bc->freed);
+}
+
+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+                                  const void *obj)
+{
+       const struct btree *b = obj;
+       const u64 *v = arg->key;
+
+       return PTR_HASH(&b->key) == *v ? 0 : 1;
+}
+
+static const struct rhashtable_params bch_btree_cache_params = {
+       .head_offset    = offsetof(struct btree, hash),
+       .key_offset     = offsetof(struct btree, key.v),
+       .key_len        = sizeof(struct bch_extent_ptr),
+       .obj_cmpfn      = bch2_btree_cache_cmp_fn,
+};
+
+static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+{
+       struct btree_cache *bc = &c->btree_cache;
+
+       b->data = kvpmalloc(btree_bytes(c), gfp);
+       if (!b->data)
+               goto err;
+
+       if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
+               goto err;
+
+       bc->used++;
+       list_move(&b->list, &bc->freeable);
+       return;
+err:
+       kvpfree(b->data, btree_bytes(c));
+       b->data = NULL;
+       list_move(&b->list, &bc->freed);
+}
+
+static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
+{
+       struct btree *b = kzalloc(sizeof(struct btree), gfp);
+       if (!b)
+               return NULL;
+
+       bkey_extent_init(&b->key);
+       six_lock_init(&b->lock);
+       lockdep_set_novalidate_class(&b->lock);
+       INIT_LIST_HEAD(&b->list);
+       INIT_LIST_HEAD(&b->write_blocked);
+
+       btree_node_data_alloc(c, b, gfp);
+       return b->data ? b : NULL;
+}
+
+/* Btree in memory cache - hash table */
+
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+       rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+
+       /* Cause future lookups for this node to fail: */
+       bkey_i_to_extent(&b->key)->v._data[0] = 0;
+}
+
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
+{
+       return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+                                            bch_btree_cache_params);
+}
+
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
+                               unsigned level, enum btree_id id)
+{
+       int ret;
+
+       b->level        = level;
+       b->btree_id     = id;
+
+       mutex_lock(&bc->lock);
+       ret = __bch2_btree_node_hash_insert(bc, b);
+       if (!ret)
+               list_add(&b->list, &bc->live);
+       mutex_unlock(&bc->lock);
+
+       return ret;
+}
+
+__flatten
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
+                                    const struct bkey_i *k)
+{
+       return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
+                                     bch_btree_cache_params);
+}
+
+/*
+ * this version is for btree nodes that have already been freed (we're not
+ * reaping a real btree node)
+ */
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       int ret = 0;
+
+       lockdep_assert_held(&bc->lock);
+
+       if (!six_trylock_intent(&b->lock))
+               return -ENOMEM;
+
+       if (!six_trylock_write(&b->lock))
+               goto out_unlock_intent;
+
+       if (btree_node_noevict(b))
+               goto out_unlock;
+
+       if (!btree_node_may_write(b))
+               goto out_unlock;
+
+       if (btree_node_dirty(b) ||
+           btree_node_write_in_flight(b) ||
+           btree_node_read_in_flight(b)) {
+               if (!flush)
+                       goto out_unlock;
+
+               wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+                              TASK_UNINTERRUPTIBLE);
+
+               /*
+                * Using the underscore version because we don't want to compact
+                * bsets after the write, since this node is about to be evicted
+                * - unless btree verify mode is enabled, since the verify code
+                * runs from the post-write cleanup:
+                */
+               if (verify_btree_ondisk(c))
+                       bch2_btree_node_write(c, b, SIX_LOCK_intent);
+               else
+                       __bch2_btree_node_write(c, b, SIX_LOCK_read);
+
+               /* wait for any in flight btree write */
+               btree_node_wait_on_io(b);
+       }
+out:
+       if (PTR_HASH(&b->key) && !ret)
+               trace_btree_node_reap(c, b);
+       return ret;
+out_unlock:
+       six_unlock_write(&b->lock);
+out_unlock_intent:
+       six_unlock_intent(&b->lock);
+       ret = -ENOMEM;
+       goto out;
+}
+
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+{
+       return __btree_node_reclaim(c, b, false);
+}
+
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
+{
+       return __btree_node_reclaim(c, b, true);
+}
+
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
+                                          struct shrink_control *sc)
+{
+       struct bch_fs *c = container_of(shrink, struct bch_fs,
+                                       btree_cache.shrink);
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b, *t;
+       unsigned long nr = sc->nr_to_scan;
+       unsigned long can_free;
+       unsigned long touched = 0;
+       unsigned long freed = 0;
+       unsigned i;
+
+       if (btree_shrinker_disabled(c))
+               return SHRINK_STOP;
+
+       /* Return -1 if we can't do anything right now */
+       if (sc->gfp_mask & __GFP_IO)
+               mutex_lock(&bc->lock);
+       else if (!mutex_trylock(&bc->lock))
+               return -1;
+
+       /*
+        * It's _really_ critical that we don't free too many btree nodes - we
+        * have to always leave ourselves a reserve. The reserve is how we
+        * guarantee that allocating memory for a new btree node can always
+        * succeed, so that inserting keys into the btree can always succeed and
+        * IO can always make forward progress:
+        */
+       nr /= btree_pages(c);
+       can_free = btree_cache_can_free(bc);
+       nr = min_t(unsigned long, nr, can_free);
+
+       i = 0;
+       list_for_each_entry_safe(b, t, &bc->freeable, list) {
+               touched++;
+
+               if (freed >= nr)
+                       break;
+
+               if (++i > 3 &&
+                   !btree_node_reclaim(c, b)) {
+                       btree_node_data_free(c, b);
+                       six_unlock_write(&b->lock);
+                       six_unlock_intent(&b->lock);
+                       freed++;
+               }
+       }
+restart:
+       list_for_each_entry_safe(b, t, &bc->live, list) {
+               touched++;
+
+               if (freed >= nr) {
+                       /* Save position */
+                       if (&t->list != &bc->live)
+                               list_move_tail(&bc->live, &t->list);
+                       break;
+               }
+
+               if (!btree_node_accessed(b) &&
+                   !btree_node_reclaim(c, b)) {
+                       /* can't call bch2_btree_node_hash_remove under lock  */
+                       freed++;
+                       if (&t->list != &bc->live)
+                               list_move_tail(&bc->live, &t->list);
+
+                       btree_node_data_free(c, b);
+                       mutex_unlock(&bc->lock);
+
+                       bch2_btree_node_hash_remove(bc, b);
+                       six_unlock_write(&b->lock);
+                       six_unlock_intent(&b->lock);
+
+                       if (freed >= nr)
+                               goto out;
+
+                       if (sc->gfp_mask & __GFP_IO)
+                               mutex_lock(&bc->lock);
+                       else if (!mutex_trylock(&bc->lock))
+                               goto out;
+                       goto restart;
+               } else
+                       clear_btree_node_accessed(b);
+       }
+
+       mutex_unlock(&bc->lock);
+out:
+       return (unsigned long) freed * btree_pages(c);
+}
+
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
+                                           struct shrink_control *sc)
+{
+       struct bch_fs *c = container_of(shrink, struct bch_fs,
+                                       btree_cache.shrink);
+       struct btree_cache *bc = &c->btree_cache;
+
+       if (btree_shrinker_disabled(c))
+               return 0;
+
+       return btree_cache_can_free(bc) * btree_pages(c);
+}
+
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+       unsigned i;
+
+       if (bc->shrink.list.next)
+               unregister_shrinker(&bc->shrink);
+
+       mutex_lock(&bc->lock);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       if (c->verify_data)
+               list_move(&c->verify_data->list, &bc->live);
+
+       kvpfree(c->verify_ondisk, btree_bytes(c));
+#endif
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               if (c->btree_roots[i].b)
+                       list_add(&c->btree_roots[i].b->list, &bc->live);
+
+       list_splice(&bc->freeable, &bc->live);
+
+       while (!list_empty(&bc->live)) {
+               b = list_first_entry(&bc->live, struct btree, list);
+
+               BUG_ON(btree_node_read_in_flight(b) ||
+                      btree_node_write_in_flight(b));
+
+               if (btree_node_dirty(b))
+                       bch2_btree_complete_write(c, b, btree_current_write(b));
+               clear_btree_node_dirty(b);
+
+               btree_node_data_free(c, b);
+       }
+
+       while (!list_empty(&bc->freed)) {
+               b = list_first_entry(&bc->freed, struct btree, list);
+               list_del(&b->list);
+               kfree(b);
+       }
+
+       mutex_unlock(&bc->lock);
+
+       if (bc->table_init_done)
+               rhashtable_destroy(&bc->table);
+}
+
+int bch2_fs_btree_cache_init(struct bch_fs *c)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       unsigned i;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
+       ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
+       if (ret)
+               goto out;
+
+       bc->table_init_done = true;
+
+       bch2_recalc_btree_reserve(c);
+
+       for (i = 0; i < bc->reserve; i++)
+               if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+       list_splice_init(&bc->live, &bc->freeable);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       mutex_init(&c->verify_lock);
+
+       c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+       if (!c->verify_ondisk) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
+       if (!c->verify_data) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       list_del_init(&c->verify_data->list);
+#endif
+
+       bc->shrink.count_objects        = bch2_btree_cache_count;
+       bc->shrink.scan_objects         = bch2_btree_cache_scan;
+       bc->shrink.seeks                = 4;
+       bc->shrink.batch                = btree_pages(c) * 2;
+       register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
+}
+
+void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
+{
+       mutex_init(&bc->lock);
+       INIT_LIST_HEAD(&bc->live);
+       INIT_LIST_HEAD(&bc->freeable);
+       INIT_LIST_HEAD(&bc->freed);
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, taken by
+ * bch2_btree_cache_cannibalize_lock(). This means every time we unlock the
+ * root of the btree, we need to release this lock if we have it held.
+ */
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+{
+       struct btree_cache *bc = &c->btree_cache;
+
+       if (bc->alloc_lock == current) {
+               trace_btree_node_cannibalize_unlock(c);
+               bc->alloc_lock = NULL;
+               closure_wake_up(&bc->alloc_wait);
+       }
+}
+
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct task_struct *old;
+
+       old = cmpxchg(&bc->alloc_lock, NULL, current);
+       if (old == NULL || old == current)
+               goto success;
+
+       if (!cl) {
+               trace_btree_node_cannibalize_lock_fail(c);
+               return -ENOMEM;
+       }
+
+       closure_wait(&bc->alloc_wait, cl);
+
+       /* Try again, after adding ourselves to waitlist */
+       old = cmpxchg(&bc->alloc_lock, NULL, current);
+       if (old == NULL || old == current) {
+               /* We raced */
+               closure_wake_up(&bc->alloc_wait);
+               goto success;
+       }
+
+       trace_btree_node_cannibalize_lock_fail(c);
+       return -EAGAIN;
+
+success:
+       trace_btree_node_cannibalize_lock(c);
+       return 0;
+}
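+
+/*
+ * A rough usage sketch (illustrative only; @cl is a caller provided closure,
+ * and error handling/waiting on the closure is elided):
+ *
+ *	ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ *	if (!ret) {
+ *		b = bch2_btree_node_mem_alloc(c);
+ *		bch2_btree_cache_cannibalize_unlock(c);
+ *	}
+ */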
+
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+
+       list_for_each_entry_reverse(b, &bc->live, list)
+               if (!btree_node_reclaim(c, b))
+                       return b;
+
+       while (1) {
+               list_for_each_entry_reverse(b, &bc->live, list)
+                       if (!btree_node_write_and_reclaim(c, b))
+                               return b;
+
+               /*
+                * Rare case: all nodes were intent-locked.
+                * Just busy-wait.
+                */
+               WARN_ONCE(1, "btree cache cannibalize failed\n");
+               cond_resched();
+       }
+}
+
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+       u64 start_time = local_clock();
+
+       mutex_lock(&bc->lock);
+
+       /*
+        * btree_free() doesn't free memory; it sticks the node on the end of
+        * the list. Check if there's any freed nodes there:
+        */
+       list_for_each_entry(b, &bc->freeable, list)
+               if (!btree_node_reclaim(c, b))
+                       goto out_unlock;
+
+       /*
+        * We never free struct btree itself, just the memory that holds the on
+        * disk node. Check the freed list before allocating a new one:
+        */
+       list_for_each_entry(b, &bc->freed, list)
+               if (!btree_node_reclaim(c, b)) {
+                       btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
+                       if (b->data)
+                               goto out_unlock;
+
+                       six_unlock_write(&b->lock);
+                       six_unlock_intent(&b->lock);
+                       goto err;
+               }
+
+       b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
+       if (!b)
+               goto err;
+
+       BUG_ON(!six_trylock_intent(&b->lock));
+       BUG_ON(!six_trylock_write(&b->lock));
+out_unlock:
+       BUG_ON(btree_node_hashed(b));
+       BUG_ON(btree_node_write_in_flight(b));
+
+       list_del_init(&b->list);
+       mutex_unlock(&bc->lock);
+out:
+       b->flags                = 0;
+       b->written              = 0;
+       b->nsets                = 0;
+       b->sib_u64s[0]          = 0;
+       b->sib_u64s[1]          = 0;
+       b->whiteout_u64s        = 0;
+       b->uncompacted_whiteout_u64s = 0;
+       bch2_btree_keys_init(b, &c->expensive_debug_checks);
+
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+                              start_time);
+
+       return b;
+err:
+       /* Try to cannibalize another cached btree node: */
+       if (bc->alloc_lock == current) {
+               b = btree_node_cannibalize(c);
+               list_del_init(&b->list);
+               mutex_unlock(&bc->lock);
+
+               bch2_btree_node_hash_remove(bc, b);
+
+               trace_btree_node_cannibalize(c);
+               goto out;
+       }
+
+       mutex_unlock(&bc->lock);
+       return ERR_PTR(-ENOMEM);
+}
+
+/* Slowpath, don't want it inlined into btree_iter_traverse() */
+static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
+                               struct btree_iter *iter,
+                               const struct bkey_i *k,
+                               unsigned level,
+                               enum six_lock_type lock_type,
+                               bool sync)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+
+       /*
+        * Parent node must be locked, else we could read in a btree node that's
+        * been freed:
+        */
+       BUG_ON(!btree_node_locked(iter, level + 1));
+       BUG_ON(level >= BTREE_MAX_DEPTH);
+
+       b = bch2_btree_node_mem_alloc(c);
+       if (IS_ERR(b))
+               return b;
+
+       bkey_copy(&b->key, k);
+       if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+               /* raced with another fill: */
+
+               /* mark as unhashed... */
+               bkey_i_to_extent(&b->key)->v._data[0] = 0;
+
+               mutex_lock(&bc->lock);
+               list_add(&b->list, &bc->freeable);
+               mutex_unlock(&bc->lock);
+
+               six_unlock_write(&b->lock);
+               six_unlock_intent(&b->lock);
+               return NULL;
+       }
+
+       /*
+        * If the btree node wasn't cached, we can't drop our lock on
+        * the parent until after it's added to the cache - because
+        * otherwise we could race with a btree_split() freeing the node
+        * we're trying to lock.
+        *
+        * But the deadlock described below doesn't exist in this case,
+        * so it's safe to not drop the parent lock until here:
+        */
+       if (btree_node_read_locked(iter, level + 1))
+               btree_node_unlock(iter, level + 1);
+
+       bch2_btree_node_read(c, b, sync);
+
+       six_unlock_write(&b->lock);
+
+       if (!sync) {
+               six_unlock_intent(&b->lock);
+               return NULL;
+       }
+
+       if (lock_type == SIX_LOCK_read)
+               six_lock_downgrade(&b->lock);
+
+       return b;
+}
+
+/**
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ *
+ * The btree node will be returned with a lock of type @lock_type held.
+ */
+struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
+                                 const struct bkey_i *k, unsigned level,
+                                 enum six_lock_type lock_type,
+                                 bool may_drop_locks)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+       struct bset_tree *t;
+
+       /*
+        * XXX: locking optimization
+        *
+        * we can make the locking looser here - caller can drop lock on parent
+        * node before locking child node (and potentially blocking): we just
+        * have to have bch2_btree_node_fill() call relock on the parent and
+        * return -EINTR if that fails
+        */
+       EBUG_ON(!btree_node_locked(iter, level + 1));
+       EBUG_ON(level >= BTREE_MAX_DEPTH);
+retry:
+       rcu_read_lock();
+       b = btree_cache_find(bc, k);
+       rcu_read_unlock();
+
+       if (unlikely(!b)) {
+               /*
+                * We must have the parent locked to call bch2_btree_node_fill(),
+                * else we could read in a btree node from disk that's been
+                * freed:
+                */
+               b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+
+               /* We raced and found the btree node in the cache */
+               if (!b)
+                       goto retry;
+
+               if (IS_ERR(b))
+                       return b;
+       } else {
+               /*
+                * There's a potential deadlock with splits and insertions into
+                * interior nodes we have to avoid:
+                *
+                * The other thread might be holding an intent lock on the node
+                * we want, and they want to update its parent node so they're
+                * going to upgrade their intent lock on the parent node to a
+                * write lock.
+                *
+                * But if we're holding a read lock on the parent, and we're
+                * trying to get the intent lock they're holding, we deadlock.
+                *
+                * So to avoid this we drop the read locks on parent nodes when
+                * we're starting to take intent locks - and handle the race.
+                *
+                * The race is that they might be about to free the node we
+                * want, and dropping our read lock on the parent node lets them
+                * update the parent marking the node we want as freed, and then
+                * free it:
+                *
+                * To guard against this, btree nodes are evicted from the cache
+                * when they're freed - and PTR_HASH() is zeroed out, which we
+                * check for after we lock the node.
+                *
+                * Then, bch2_btree_node_relock() on the parent will fail - because
+                * the parent was modified, when the pointer to the node we want
+                * was removed - and we'll bail out:
+                */
+               if (btree_node_read_locked(iter, level + 1))
+                       btree_node_unlock(iter, level + 1);
+
+               if (!btree_node_lock(b, k->k.p, level, iter,
+                                    lock_type, may_drop_locks))
+                       return ERR_PTR(-EINTR);
+
+               if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
+                            b->level != level ||
+                            race_fault())) {
+                       six_unlock_type(&b->lock, lock_type);
+                       if (bch2_btree_node_relock(iter, level + 1))
+                               goto retry;
+
+                       return ERR_PTR(-EINTR);
+               }
+       }
+
+       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+
+       prefetch(b->aux_data);
+
+       for_each_bset(b, t) {
+               void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+               prefetch(p + L1_CACHE_BYTES * 0);
+               prefetch(p + L1_CACHE_BYTES * 1);
+               prefetch(p + L1_CACHE_BYTES * 2);
+       }
+
+       /* avoid atomic set bit if it's not needed: */
+       if (!btree_node_accessed(b))
+               set_btree_node_accessed(b);
+
+       if (unlikely(btree_node_read_error(b))) {
+               six_unlock_type(&b->lock, lock_type);
+               return ERR_PTR(-EIO);
+       }
+
+       EBUG_ON(b->btree_id != iter->btree_id ||
+               BTREE_NODE_LEVEL(b->data) != level ||
+               bkey_cmp(b->data->max_key, k->k.p));
+
+       return b;
+}
+
+struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
+                                         struct btree_iter *iter,
+                                         struct btree *b,
+                                         bool may_drop_locks,
+                                         enum btree_node_sibling sib)
+{
+       struct btree *parent;
+       struct btree_node_iter node_iter;
+       struct bkey_packed *k;
+       BKEY_PADDED(k) tmp;
+       struct btree *ret = NULL;
+       unsigned level = b->level;
+
+       parent = btree_iter_node(iter, level + 1);
+       if (!parent)
+               return NULL;
+
+       if (!bch2_btree_node_relock(iter, level + 1))
+               goto out_upgrade;
+
+       node_iter = iter->l[parent->level].iter;
+
+       k = bch2_btree_node_iter_peek_all(&node_iter, parent);
+       BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
+
+       k = sib == btree_prev_sib
+               ? bch2_btree_node_iter_prev(&node_iter, parent)
+               : (bch2_btree_node_iter_advance(&node_iter, parent),
+                  bch2_btree_node_iter_peek(&node_iter, parent));
+       if (!k)
+               goto out;
+
+       bch2_bkey_unpack(parent, &tmp.k, k);
+
+       ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+                                 SIX_LOCK_intent, may_drop_locks);
+
+       if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
+               struct btree_iter *linked;
+
+               if (!bch2_btree_node_relock(iter, level + 1))
+                       goto out_upgrade;
+
+               /*
+                * We might have got -EINTR because trylock failed, and we're
+                * holding other locks that would cause us to deadlock:
+                */
+               for_each_linked_btree_iter(iter, linked)
+                       if (btree_iter_cmp(iter, linked) < 0)
+                               __bch2_btree_iter_unlock(linked);
+
+               if (sib == btree_prev_sib)
+                       btree_node_unlock(iter, level);
+
+               ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+                                         SIX_LOCK_intent, may_drop_locks);
+
+               /*
+                * before btree_iter_relock() calls btree_iter_verify_locks():
+                */
+               if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+                       btree_node_unlock(iter, level + 1);
+
+               if (!bch2_btree_node_relock(iter, level)) {
+                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+                       if (!IS_ERR(ret)) {
+                               six_unlock_intent(&ret->lock);
+                               ret = ERR_PTR(-EINTR);
+                       }
+               }
+
+               bch2_btree_iter_relock(iter);
+       }
+out:
+       if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+               btree_node_unlock(iter, level + 1);
+
+       bch2_btree_iter_verify_locks(iter);
+
+       BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
+              (iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
+               !btree_node_locked(iter, level)));
+
+       if (!IS_ERR_OR_NULL(ret)) {
+               struct btree *n1 = ret, *n2 = b;
+
+               if (sib != btree_prev_sib)
+                       swap(n1, n2);
+
+               BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
+                                                    n1->key.k.p),
+                               n2->data->min_key));
+       }
+
+       return ret;
+out_upgrade:
+       if (may_drop_locks)
+               bch2_btree_iter_upgrade(iter, level + 2, true);
+       ret = ERR_PTR(-EINTR);
+       goto out;
+}
+
+void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
+                             const struct bkey_i *k, unsigned level)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+
+       BUG_ON(!btree_node_locked(iter, level + 1));
+       BUG_ON(level >= BTREE_MAX_DEPTH);
+
+       rcu_read_lock();
+       b = btree_cache_find(bc, k);
+       rcu_read_unlock();
+
+       if (b)
+               return;
+
+       bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+}
+
+int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
+                         char *buf, size_t len)
+{
+       const struct bkey_format *f = &b->format;
+       struct bset_stats stats;
+       char ptrs[100];
+
+       memset(&stats, 0, sizeof(stats));
+
+       bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
+                       bkey_i_to_s_c(&b->key));
+       bch2_btree_keys_stats(b, &stats);
+
+       return scnprintf(buf, len,
+                        "l %u %llu:%llu - %llu:%llu:\n"
+                        "    ptrs: %s\n"
+                        "    format: u64s %u fields %u %u %u %u %u\n"
+                        "    unpack fn len: %u\n"
+                        "    bytes used %zu/%zu (%zu%% full)\n"
+                        "    sib u64s: %u, %u (merge threshold %zu)\n"
+                        "    nr packed keys %u\n"
+                        "    nr unpacked keys %u\n"
+                        "    floats %zu\n"
+                        "    failed unpacked %zu\n"
+                        "    failed prev %zu\n"
+                        "    failed overflow %zu\n",
+                        b->level,
+                        b->data->min_key.inode,
+                        b->data->min_key.offset,
+                        b->data->max_key.inode,
+                        b->data->max_key.offset,
+                        ptrs,
+                        f->key_u64s,
+                        f->bits_per_field[0],
+                        f->bits_per_field[1],
+                        f->bits_per_field[2],
+                        f->bits_per_field[3],
+                        f->bits_per_field[4],
+                        b->unpack_fn_len,
+                        b->nr.live_u64s * sizeof(u64),
+                        btree_bytes(c) - sizeof(struct btree_node),
+                        b->nr.live_u64s * 100 / btree_max_u64s(c),
+                        b->sib_u64s[0],
+                        b->sib_u64s[1],
+                        BTREE_FOREGROUND_MERGE_THRESHOLD(c),
+                        b->nr.packed_keys,
+                        b->nr.unpacked_keys,
+                        stats.floats,
+                        stats.failed_unpacked,
+                        stats.failed_prev,
+                        stats.failed_overflow);
+}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
new file mode 100644 (file)
index 0000000..f7b9bcf
--- /dev/null
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_CACHE_H
+#define _BCACHEFS_BTREE_CACHE_H
+
+#include "bcachefs.h"
+#include "btree_types.h"
+#include "extents.h"
+
+struct btree_iter;
+
+extern const char * const bch2_btree_ids[];
+
+void bch2_recalc_btree_reserve(struct bch_fs *);
+
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
+                               unsigned, enum btree_id);
+
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+
+struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
+                                 const struct bkey_i *, unsigned,
+                                 enum six_lock_type, bool);
+
+struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
+                                         struct btree *, bool,
+                                         enum btree_node_sibling);
+
+void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
+                             const struct bkey_i *, unsigned);
+
+void bch2_fs_btree_cache_exit(struct bch_fs *);
+int bch2_fs_btree_cache_init(struct bch_fs *);
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
+
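+/*
+ * Hash key for the btree node hash table: the first u64 of the node key's
+ * extent value (zeroed on hash removal so that future lookups fail):
+ */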
+#define PTR_HASH(_k)   (bkey_i_to_extent_c(_k)->v._data[0])
+
+/* is btree node in hash table? */
+static inline bool btree_node_hashed(struct btree *b)
+{
+       return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
+}
+
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)               \
+       for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl,  \
+                                         &(_c)->btree_cache.table),    \
+            _iter = 0; _iter < (_tbl)->size; _iter++)                  \
+               rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
+
+static inline size_t btree_bytes(struct bch_fs *c)
+{
+       return c->opts.btree_node_size << 9;
+}
+
+static inline size_t btree_max_u64s(struct bch_fs *c)
+{
+       return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+}
+
+static inline size_t btree_page_order(struct bch_fs *c)
+{
+       return get_order(btree_bytes(c));
+}
+
+static inline size_t btree_pages(struct bch_fs *c)
+{
+       return 1 << btree_page_order(c);
+}
+
+static inline unsigned btree_blocks(struct bch_fs *c)
+{
+       return c->opts.btree_node_size >> c->block_bits;
+}
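+
+/*
+ * Worked example with hypothetical values (not defaults): if
+ * opts.btree_node_size == 512 sectors and pages and blocks are both 4k
+ * (block_bits == 3), then btree_bytes() = 512 << 9 = 256k,
+ * btree_page_order() = 6, btree_pages() = 64, btree_blocks() = 512 >> 3 = 64.
+ */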
+
+#define BTREE_SPLIT_THRESHOLD(c)               (btree_blocks(c) * 3 / 4)
+
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c)    (btree_max_u64s(c) * 1 / 3)
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)                   \
+       (BTREE_FOREGROUND_MERGE_THRESHOLD(c) +                  \
+        (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
+
+#define btree_node_root(_c, _b)        ((_c)->btree_roots[(_b)->btree_id].b)
+
+int bch2_print_btree_node(struct bch_fs *, struct btree *,
+                        char *, size_t);
+
+#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
new file mode 100644 (file)
index 0000000..155e690
--- /dev/null
@@ -0,0 +1,1099 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright (C) 2014 Datera Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+
+struct range_checks {
+       struct range_level {
+               struct bpos     min;
+               struct bpos     max;
+       }                       l[BTREE_MAX_DEPTH];
+       unsigned                depth;
+};
+
+static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
+{
+       unsigned i;
+
+       for (i = 0; i < BTREE_MAX_DEPTH; i++)
+               r->l[i].min = r->l[i].max = POS_MIN;
+       r->depth = depth;
+}
+
+static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
+                                   struct range_checks *r)
+{
+       struct range_level *l = &r->l[b->level];
+
+       struct bpos expected_min = bkey_cmp(l->min, l->max)
+               ? btree_type_successor(b->btree_id, l->max)
+               : l->max;
+
+       bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
+               "btree node has incorrect min key: %llu:%llu != %llu:%llu",
+               b->data->min_key.inode,
+               b->data->min_key.offset,
+               expected_min.inode,
+               expected_min.offset);
+
+       l->max = b->data->max_key;
+
+       if (b->level > r->depth) {
+               l = &r->l[b->level - 1];
+
+               bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
+                       "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
+                       b->data->min_key.inode,
+                       b->data->min_key.offset,
+                       l->min.inode,
+                       l->min.offset);
+
+               bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
+                       "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
+                       b->data->max_key.inode,
+                       b->data->max_key.offset,
+                       l->max.inode,
+                       l->max.offset);
+
+               if (bkey_cmp(b->data->max_key, POS_MAX))
+                       l->min = l->max =
+                               btree_type_successor(b->btree_id,
+                                                    b->data->max_key);
+       }
+}
+
+u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
+{
+       const struct bch_extent_ptr *ptr;
+       u8 max_stale = 0;
+
+       if (bkey_extent_is_data(k.k)) {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+               extent_for_each_ptr(e, ptr) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+                       size_t b = PTR_BUCKET_NR(ca, ptr);
+
+                       if (gen_after(ca->oldest_gens[b], ptr->gen))
+                               ca->oldest_gens[b] = ptr->gen;
+
+                       max_stale = max(max_stale, ptr_stale(ca, ptr));
+               }
+       }
+
+       return max_stale;
+}
+
+/*
+ * For runtime mark and sweep:
+ */
+static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
+                          struct bkey_s_c k, unsigned flags)
+{
+       struct gc_pos pos = { 0 };
+       u8 ret = 0;
+
+       switch (type) {
+       case BKEY_TYPE_BTREE:
+               bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL,
+                             0, flags|
+                             BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                             BCH_BUCKET_MARK_GC_LOCK_HELD);
+               break;
+       case BKEY_TYPE_EXTENTS:
+               bch2_mark_key(c, k, k.k->size, false, pos, NULL,
+                             0, flags|
+                             BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                             BCH_BUCKET_MARK_GC_LOCK_HELD);
+               ret = bch2_btree_key_recalc_oldest_gen(c, k);
+               break;
+       default:
+               BUG();
+       }
+
+       return ret;
+}
+
+int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
+                               struct bkey_s_c k)
+{
+       enum bch_data_type data_type = type == BKEY_TYPE_BTREE
+               ? BCH_DATA_BTREE : BCH_DATA_USER;
+       int ret = 0;
+
+       BUG_ON(journal_seq_verify(c) &&
+              k.k->version.lo > journal_cur_seq(&c->journal));
+
+       if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+           fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
+                       "superblock not marked as containing replicas (type %u)",
+                       data_type)) {
+               ret = bch2_mark_bkey_replicas(c, data_type, k);
+               if (ret)
+                       return ret;
+       }
+
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const struct bch_extent_ptr *ptr;
+
+               extent_for_each_ptr(e, ptr) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+                       size_t b = PTR_BUCKET_NR(ca, ptr);
+                       struct bucket *g = PTR_BUCKET(ca, ptr);
+
+                       if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
+                                       "found ptr with missing gen in alloc btree,\n"
+                                       "type %s gen %u",
+                                       bch2_data_types[data_type],
+                                       ptr->gen)) {
+                               g->_mark.gen = ptr->gen;
+                               g->_mark.gen_valid = 1;
+                               set_bit(b, ca->buckets_dirty);
+                       }
+
+                       if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
+                                       "%s ptr gen in the future: %u > %u",
+                                       bch2_data_types[data_type],
+                                       ptr->gen, g->mark.gen)) {
+                               g->_mark.gen = ptr->gen;
+                               g->_mark.gen_valid = 1;
+                               set_bit(b, ca->buckets_dirty);
+                               set_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       }
+               }
+               break;
+       }
+       }
+
+       atomic64_set(&c->key_version,
+                    max_t(u64, k.k->version.lo,
+                          atomic64_read(&c->key_version)));
+
+       bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
+fsck_err:
+       return ret;
+}
+
+static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
+{
+       enum bkey_type type = btree_node_type(b);
+       struct btree_node_iter iter;
+       struct bkey unpacked;
+       struct bkey_s_c k;
+       u8 stale = 0;
+
+       if (btree_node_has_ptrs(b))
+               for_each_btree_node_key_unpack(b, k, &iter,
+                                              btree_node_is_extents(b),
+                                              &unpacked) {
+                       bch2_bkey_debugcheck(c, b, k);
+                       stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
+               }
+
+       return stale;
+}
+
+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+       preempt_disable();
+       write_seqcount_begin(&c->gc_pos_lock);
+       c->gc_pos = new_pos;
+       write_seqcount_end(&c->gc_pos_lock);
+       preempt_enable();
+}
+
+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+       BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+       __gc_pos_set(c, new_pos);
+}
+
+static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
+{
+       struct btree_iter iter;
+       struct btree *b;
+       struct range_checks r;
+       unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
+       unsigned max_stale;
+       int ret = 0;
+
+       gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
+
+       if (!c->btree_roots[btree_id].b)
+               return 0;
+
+       /*
+        * if expensive_debug_checks is on, run range_checks on all leaf nodes:
+        */
+       if (expensive_debug_checks(c))
+               depth = 0;
+
+       btree_node_range_checks_init(&r, depth);
+
+       __for_each_btree_node(&iter, c, btree_id, POS_MIN,
+                             0, depth, BTREE_ITER_PREFETCH, b) {
+               btree_node_range_checks(c, b, &r);
+
+               bch2_verify_btree_nr_keys(b);
+
+               max_stale = btree_gc_mark_node(c, b);
+
+               gc_pos_set(c, gc_pos_btree_node(b));
+
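+               /*
+                * max_stale is how far the most stale pointer's gen lags
+                * behind its bucket's current gen: very stale nodes get
+                * rewritten immediately, dipping into the btree reserve if
+                * need be; moderately stale nodes are only rewritten
+                * opportunistically (unless rewrites are disabled or forced
+                * via the debug knobs):
+                */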
+               if (max_stale > 64)
+                       bch2_btree_node_rewrite(c, &iter,
+                                       b->data->keys.seq,
+                                       BTREE_INSERT_USE_RESERVE|
+                                       BTREE_INSERT_NOWAIT|
+                                       BTREE_INSERT_GC_LOCK_HELD);
+               else if (!btree_gc_rewrite_disabled(c) &&
+                        (btree_gc_always_rewrite(c) || max_stale > 16))
+                       bch2_btree_node_rewrite(c, &iter,
+                                       b->data->keys.seq,
+                                       BTREE_INSERT_NOWAIT|
+                                       BTREE_INSERT_GC_LOCK_HELD);
+
+               bch2_btree_iter_cond_resched(&iter);
+       }
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               return ret;
+
+       mutex_lock(&c->btree_root_lock);
+
+       b = c->btree_roots[btree_id].b;
+       if (!btree_node_fake(b))
+               bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
+       gc_pos_set(c, gc_pos_btree_root(b->btree_id));
+
+       mutex_unlock(&c->btree_root_lock);
+       return 0;
+}
+
+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
+                                 u64 start, u64 end,
+                                 enum bch_data_type type,
+                                 unsigned flags)
+{
+       u64 b = sector_to_bucket(ca, start);
+
+       do {
+               unsigned sectors =
+                       min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+               bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+                                         gc_phase(GC_PHASE_SB), flags);
+               b++;
+               start += sectors;
+       } while (start < end);
+}
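+
+/*
+ * Worked example for the helper above, with an illustrative 1024-sector
+ * bucket size: marking sectors [8, 3000) touches three buckets - bucket 0
+ * gets 1016 sectors, bucket 1 gets 1024 and bucket 2 gets 952, i.e. partial
+ * first/last buckets are only charged for the sectors actually covered.
+ */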
+
+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+                             unsigned flags)
+{
+       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+       unsigned i;
+       u64 b;
+
+       if (c) {
+               lockdep_assert_held(&c->sb_lock);
+               percpu_down_read(&c->usage_lock);
+       }
+
+       for (i = 0; i < layout->nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+               if (offset == BCH_SB_SECTOR)
+                       mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
+                                             BCH_DATA_SB, flags);
+
+               mark_metadata_sectors(c, ca, offset,
+                                     offset + (1 << layout->sb_max_size_bits),
+                                     BCH_DATA_SB, flags);
+       }
+
+       if (c)
+               spin_lock(&c->journal.lock);
+
+       for (i = 0; i < ca->journal.nr; i++) {
+               b = ca->journal.buckets[i];
+               bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
+                                         ca->mi.bucket_size,
+                                         gc_phase(GC_PHASE_SB), flags);
+       }
+
+       if (c) {
+               spin_unlock(&c->journal.lock);
+               percpu_up_read(&c->usage_lock);
+       }
+}
+
+static void bch2_mark_superblocks(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       mutex_lock(&c->sb_lock);
+       gc_pos_set(c, gc_phase(GC_PHASE_SB));
+
+       for_each_online_member(ca, c, i)
+               bch2_mark_dev_superblock(c, ca,
+                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
+       mutex_unlock(&c->sb_lock);
+}
+
+/* Also see bch2_pending_btree_node_free_insert_done() */
+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
+{
+       struct gc_pos pos = { 0 };
+       struct bch_fs_usage stats = { 0 };
+       struct btree_update *as;
+       struct pending_btree_node_free *d;
+
+       mutex_lock(&c->btree_interior_update_lock);
+       gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
+
+       for_each_pending_btree_node_free(c, as, d)
+               if (d->index_update_done)
+                       bch2_mark_key(c, bkey_i_to_s_c(&d->key),
+                                     c->opts.btree_node_size, true, pos,
+                                     &stats, 0,
+                                     BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                                     BCH_BUCKET_MARK_GC_LOCK_HELD);
+       /*
+        * Don't apply the stats we just accumulated - pending deletes aren't
+        * tracked in bch_fs_usage:
+        */
+
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void bch2_mark_allocator_buckets(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       struct open_bucket *ob;
+       size_t i, j, iter;
+       unsigned ci;
+
+       percpu_down_read(&c->usage_lock);
+
+       spin_lock(&c->freelist_lock);
+       gc_pos_set(c, gc_pos_alloc(c, NULL));
+
+       for_each_member_device(ca, c, ci) {
+               fifo_for_each_entry(i, &ca->free_inc, iter)
+                       bch2_mark_alloc_bucket(c, ca, i, true,
+                                              gc_pos_alloc(c, NULL),
+                                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                                              BCH_BUCKET_MARK_GC_LOCK_HELD);
+
+               for (j = 0; j < RESERVE_NR; j++)
+                       fifo_for_each_entry(i, &ca->free[j], iter)
+                               bch2_mark_alloc_bucket(c, ca, i, true,
+                                                      gc_pos_alloc(c, NULL),
+                                                      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                                                      BCH_BUCKET_MARK_GC_LOCK_HELD);
+       }
+
+       spin_unlock(&c->freelist_lock);
+
+       for (ob = c->open_buckets;
+            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+            ob++) {
+               spin_lock(&ob->lock);
+               if (ob->valid) {
+                       gc_pos_set(c, gc_pos_alloc(c, ob));
+                       ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+                       bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
+                                              gc_pos_alloc(c, ob),
+                                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+                                              BCH_BUCKET_MARK_GC_LOCK_HELD);
+               }
+               spin_unlock(&ob->lock);
+       }
+
+       percpu_up_read(&c->usage_lock);
+}
+
+static void bch2_gc_start(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       struct bucket_array *buckets;
+       struct bucket_mark new;
+       unsigned i;
+       size_t b;
+       int cpu;
+
+       percpu_down_write(&c->usage_lock);
+
+       /*
+        * Indicates to buckets code that gc is now in progress - done under
+        * usage_lock to avoid racing with bch2_mark_key():
+        */
+       __gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+       /* Save a copy of the existing bucket stats while we recompute them: */
+       for_each_member_device(ca, c, i) {
+               ca->usage_cached = __bch2_dev_usage_read(ca);
+               for_each_possible_cpu(cpu) {
+                       struct bch_dev_usage *p =
+                               per_cpu_ptr(ca->usage_percpu, cpu);
+                       memset(p, 0, sizeof(*p));
+               }
+       }
+
+       c->usage_cached = __bch2_fs_usage_read(c);
+       for_each_possible_cpu(cpu) {
+               struct bch_fs_usage *p =
+                       per_cpu_ptr(c->usage_percpu, cpu);
+
+               memset(p->s, 0, sizeof(p->s));
+       }
+
+       percpu_up_write(&c->usage_lock);
+
+       /* Clear bucket marks: */
+       for_each_member_device(ca, c, i) {
+               down_read(&ca->bucket_lock);
+               buckets = bucket_array(ca);
+
+               for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
+                       bucket_cmpxchg(buckets->b + b, new, ({
+                               new.owned_by_allocator  = 0;
+                               new.data_type           = 0;
+                               new.cached_sectors      = 0;
+                               new.dirty_sectors       = 0;
+                       }));
+                       ca->oldest_gens[b] = new.gen;
+               }
+               up_read(&ca->bucket_lock);
+       }
+}
+
+/**
+ * bch2_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
+ */
+void bch2_gc(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       u64 start_time = local_clock();
+       unsigned i;
+
+       /*
+        * Walk _all_ references to buckets, and recompute them:
+        *
+        * Order matters here:
+        *  - Concurrent GC relies on the fact that we have a total ordering for
+        *    everything that GC walks - see  gc_will_visit_node(),
+        *    everything that GC walks - see gc_will_visit_node(),
+        *
+        *  - also, references move around in the course of index updates and
+        *    various other crap: everything needs to agree on the ordering
+        *    references are allowed to move around in - e.g., we're allowed to
+        *    start with a reference owned by an open_bucket (the allocator) and
+        *    move it to the btree, but not the reverse.
+        *
+        *    This is necessary to ensure that gc doesn't miss references that
+        *    move around - if references move backwards in the ordering GC
+        *    uses, GC could skip past them.
+        */
+       trace_gc_start(c);
+
+       /*
+        * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
+        * gc_lock if sectors_available goes to 0:
+        */
+       bch2_recalc_sectors_available(c);
+
+       down_write(&c->gc_lock);
+       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+               goto out;
+
+       bch2_gc_start(c);
+
+       bch2_mark_superblocks(c);
+
+       /* Walk btree: */
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               int ret = bch2_gc_btree(c, i);
+               if (ret) {
+                       bch_err(c, "btree gc failed: %d", ret);
+                       set_bit(BCH_FS_GC_FAILURE, &c->flags);
+                       goto out;
+               }
+       }
+
+       bch2_mark_pending_btree_node_frees(c);
+       bch2_mark_allocator_buckets(c);
+
+       for_each_member_device(ca, c, i)
+               atomic_long_set(&ca->saturated_count, 0);
+
+       /* Indicates that gc is no longer in progress: */
+       gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+       c->gc_count++;
+out:
+       up_write(&c->gc_lock);
+       trace_gc_end(c);
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+
+       /*
+        * Wake up the allocator in case it was waiting for buckets
+        * because it wasn't able to increment gens:
+        */
+       for_each_member_device(ca, c, i)
+               bch2_wake_allocator(ca);
+
+       /*
+        * At startup, allocations can happen directly instead of via the
+        * allocator thread - issue wakeup in case they blocked on gc_lock:
+        */
+       closure_wake_up(&c->freelist_wait);
+}
+
+/* Btree coalescing */
+
+static void recalc_packed_keys(struct btree *b)
+{
+       struct bkey_packed *k;
+
+       memset(&b->nr, 0, sizeof(b->nr));
+
+       BUG_ON(b->nsets != 1);
+
+       for (k = btree_bkey_first(b, b->set);
+            k != btree_bkey_last(b, b->set);
+            k = bkey_next(k))
+               btree_keys_account_key_add(&b->nr, 0, k);
+}
+
+static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
+                               struct btree *old_nodes[GC_MERGE_NODES])
+{
+       struct btree *parent = btree_node_parent(iter, old_nodes[0]);
+       unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
+       unsigned blocks = btree_blocks(c) * 2 / 3;
+       struct btree *new_nodes[GC_MERGE_NODES];
+       struct btree_update *as;
+       struct keylist keylist;
+       struct bkey_format_state format_state;
+       struct bkey_format new_format;
+
+       memset(new_nodes, 0, sizeof(new_nodes));
+       bch2_keylist_init(&keylist, NULL);
+
+       /* Count keys that are not deleted */
+       for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
+               u64s += old_nodes[i]->nr.live_u64s;
+
+       nr_old_nodes = nr_new_nodes = i;
+
+       /* Check if all keys in @old_nodes could fit in one fewer node */
+       if (nr_old_nodes <= 1 ||
+           __vstruct_blocks(struct btree_node, c->block_bits,
+                            DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
+               return;
+
+       /* Find a format that all keys in @old_nodes can pack into */
+       bch2_bkey_format_init(&format_state);
+
+       for (i = 0; i < nr_old_nodes; i++)
+               __bch2_btree_calc_format(&format_state, old_nodes[i]);
+
+       new_format = bch2_bkey_format_done(&format_state);
+
+       /* Check if repacking would make any nodes too big to fit */
+       for (i = 0; i < nr_old_nodes; i++)
+               if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
+                       trace_btree_gc_coalesce_fail(c,
+                                       BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
+                       return;
+               }
+
+       if (bch2_keylist_realloc(&keylist, NULL, 0,
+                       (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+               trace_btree_gc_coalesce_fail(c,
+                               BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
+               return;
+       }
+
+       as = bch2_btree_update_start(c, iter->btree_id,
+                       btree_update_reserve_required(c, parent) + nr_old_nodes,
+                       BTREE_INSERT_NOFAIL|
+                       BTREE_INSERT_USE_RESERVE,
+                       NULL);
+       if (IS_ERR(as)) {
+               trace_btree_gc_coalesce_fail(c,
+                               BTREE_GC_COALESCE_FAIL_RESERVE_GET);
+               bch2_keylist_free(&keylist, NULL);
+               return;
+       }
+
+       trace_btree_gc_coalesce(c, old_nodes[0]);
+
+       for (i = 0; i < nr_old_nodes; i++)
+               bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
+
+       /* Repack everything with @new_format and sort down to one bset */
+       for (i = 0; i < nr_old_nodes; i++)
+               new_nodes[i] =
+                       __bch2_btree_node_alloc_replacement(as, old_nodes[i],
+                                                           new_format);
+
+       /*
+        * Conceptually we concatenate the nodes together and slice them
+        * up at different boundaries.
+        */
+       for (i = nr_new_nodes - 1; i > 0; --i) {
+               struct btree *n1 = new_nodes[i];
+               struct btree *n2 = new_nodes[i - 1];
+
+               struct bset *s1 = btree_bset_first(n1);
+               struct bset *s2 = btree_bset_first(n2);
+               struct bkey_packed *k, *last = NULL;
+
+               /* Calculate how many keys from @n2 we could fit inside @n1 */
+               u64s = 0;
+
+               for (k = s2->start;
+                    k < vstruct_last(s2) &&
+                    vstruct_blocks_plus(n1->data, c->block_bits,
+                                        u64s + k->u64s) <= blocks;
+                    k = bkey_next(k)) {
+                       last = k;
+                       u64s += k->u64s;
+               }
+
+               if (u64s == le16_to_cpu(s2->u64s)) {
+                       /* n2 fits entirely in n1 */
+                       n1->key.k.p = n1->data->max_key = n2->data->max_key;
+
+                       memcpy_u64s(vstruct_last(s1),
+                                   s2->start,
+                                   le16_to_cpu(s2->u64s));
+                       le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
+
+                       set_btree_bset_end(n1, n1->set);
+
+                       six_unlock_write(&n2->lock);
+                       bch2_btree_node_free_never_inserted(c, n2);
+                       six_unlock_intent(&n2->lock);
+
+                       memmove(new_nodes + i - 1,
+                               new_nodes + i,
+                               sizeof(new_nodes[0]) * (nr_new_nodes - i));
+                       new_nodes[--nr_new_nodes] = NULL;
+               } else if (u64s) {
+                       /* move part of n2 into n1 */
+                       n1->key.k.p = n1->data->max_key =
+                               bkey_unpack_pos(n1, last);
+
+                       n2->data->min_key =
+                               btree_type_successor(iter->btree_id,
+                                                    n1->data->max_key);
+
+                       memcpy_u64s(vstruct_last(s1),
+                                   s2->start, u64s);
+                       le16_add_cpu(&s1->u64s, u64s);
+
+                       memmove(s2->start,
+                               vstruct_idx(s2, u64s),
+                               (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
+                       s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
+
+                       set_btree_bset_end(n1, n1->set);
+                       set_btree_bset_end(n2, n2->set);
+               }
+       }
+
+       for (i = 0; i < nr_new_nodes; i++) {
+               struct btree *n = new_nodes[i];
+
+               recalc_packed_keys(n);
+               btree_node_reset_sib_u64s(n);
+
+               bch2_btree_build_aux_trees(n);
+               six_unlock_write(&n->lock);
+
+               bch2_btree_node_write(c, n, SIX_LOCK_intent);
+       }
+
+       /*
+        * The keys for the old nodes get deleted. We don't insert keys that
+        * compare equal to the keys for the new nodes we'll also be inserting:
+        * we can't, because keys on a keylist must be strictly greater than
+        * the previous key, and we don't need to, because the key for the new
+        * node serves the same purpose (it overwrites the key for the old
+        * node).
+        */
+       for (i = 0; i < nr_old_nodes; i++) {
+               struct bkey_i delete;
+               unsigned j;
+
+               for (j = 0; j < nr_new_nodes; j++)
+                       if (!bkey_cmp(old_nodes[i]->key.k.p,
+                                     new_nodes[j]->key.k.p))
+                               goto next;
+
+               bkey_init(&delete.k);
+               delete.k.p = old_nodes[i]->key.k.p;
+               bch2_keylist_add_in_order(&keylist, &delete);
+next:
+               i = i;  /* a label must be followed by a statement */
+       }
+
+       /*
+        * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
+        * does the lookup once and thus expects the keys to be in sorted order
+        * so we have to make sure the new keys are correctly ordered with
+        * respect to the deleted keys added in the previous loop
+        */
+       for (i = 0; i < nr_new_nodes; i++)
+               bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
+
+       /* Insert the newly coalesced nodes */
+       bch2_btree_insert_node(as, parent, iter, &keylist, 0);
+
+       BUG_ON(!bch2_keylist_empty(&keylist));
+
+       BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
+
+       bch2_btree_iter_node_replace(iter, new_nodes[0]);
+
+       for (i = 0; i < nr_new_nodes; i++)
+               bch2_btree_open_bucket_put(c, new_nodes[i]);
+
+       /* Free the old nodes and update our sliding window */
+       for (i = 0; i < nr_old_nodes; i++) {
+               bch2_btree_node_free_inmem(c, old_nodes[i], iter);
+               six_unlock_intent(&old_nodes[i]->lock);
+
+               /*
+                * the index update might have triggered a split, in which case
+                * the nodes we coalesced - the new nodes we just created -
+                * might not be sibling nodes anymore - don't add them to the
+                * sliding window (except the first):
+                */
+               if (!i) {
+                       old_nodes[i] = new_nodes[i];
+               } else {
+                       old_nodes[i] = NULL;
+                       if (new_nodes[i])
+                               six_unlock_intent(&new_nodes[i]->lock);
+               }
+       }
+
+       bch2_btree_update_done(as);
+       bch2_keylist_free(&keylist, NULL);
+}
+
+static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
+{
+       struct btree_iter iter;
+       struct btree *b;
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
+       unsigned i;
+
+       /* Sliding window of adjacent btree nodes */
+       struct btree *merge[GC_MERGE_NODES];
+       u32 lock_seq[GC_MERGE_NODES];
+
+       /*
+        * XXX: We don't have a good way of positively matching on sibling nodes
+        * that have the same parent - this code works by handling the cases
+        * where they might not have the same parent, and is thus fragile. Ugh.
+        *
+        * Perhaps redo this to use multiple linked iterators?
+        */
+       memset(merge, 0, sizeof(merge));
+
+       __for_each_btree_node(&iter, c, btree_id, POS_MIN,
+                             BTREE_MAX_DEPTH, 0,
+                             BTREE_ITER_PREFETCH, b) {
+               memmove(merge + 1, merge,
+                       sizeof(merge) - sizeof(merge[0]));
+               memmove(lock_seq + 1, lock_seq,
+                       sizeof(lock_seq) - sizeof(lock_seq[0]));
+
+               merge[0] = b;
+
+               for (i = 1; i < GC_MERGE_NODES; i++) {
+                       if (!merge[i] ||
+                           !six_relock_intent(&merge[i]->lock, lock_seq[i]))
+                               break;
+
+                       if (merge[i]->level != merge[0]->level) {
+                               six_unlock_intent(&merge[i]->lock);
+                               break;
+                       }
+               }
+               memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
+
+               bch2_coalesce_nodes(c, &iter, merge);
+
+               for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
+                       lock_seq[i] = merge[i]->lock.state.seq;
+                       six_unlock_intent(&merge[i]->lock);
+               }
+
+               lock_seq[0] = merge[0]->lock.state.seq;
+
+               if (kthread && kthread_should_stop()) {
+                       bch2_btree_iter_unlock(&iter);
+                       return -ESHUTDOWN;
+               }
+
+               bch2_btree_iter_cond_resched(&iter);
+
+               /*
+                * If the parent node wasn't relocked, it might have been split
+                * and the nodes in our sliding window might not have the same
+                * parent anymore - blow away the sliding window:
+                */
+               if (btree_iter_node(&iter, iter.level + 1) &&
+                   !btree_node_intent_locked(&iter, iter.level + 1))
+                       memset(merge + 1, 0,
+                              (GC_MERGE_NODES - 1) * sizeof(merge[0]));
+       }
+       return bch2_btree_iter_unlock(&iter);
+}
+
+/**
+ * bch2_coalesce - coalesce adjacent nodes with low occupancy
+ */
+void bch2_coalesce(struct bch_fs *c)
+{
+       enum btree_id id;
+
+       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+               return;
+
+       down_read(&c->gc_lock);
+       trace_gc_coalesce_start(c);
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               int ret = c->btree_roots[id].b
+                       ? bch2_coalesce_btree(c, id)
+                       : 0;
+
+               if (ret) {
+                       if (ret != -ESHUTDOWN)
+                               bch_err(c, "btree coalescing failed: %d", ret);
+                       set_bit(BCH_FS_GC_FAILURE, &c->flags);
+                       return;
+               }
+       }
+
+       trace_gc_coalesce_end(c);
+       up_read(&c->gc_lock);
+}
+
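+/*
+ * GC thread: when periodic gc is enabled (c->btree_gc_periodic), kick off a
+ * mark and sweep pass once the write io clock has advanced by roughly 1/16th
+ * of the filesystem capacity since the last pass; a pass can also be forced
+ * at any time by bumping c->kick_gc.
+ */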
+static int bch2_gc_thread(void *arg)
+{
+       struct bch_fs *c = arg;
+       struct io_clock *clock = &c->io_clock[WRITE];
+       unsigned long last = atomic_long_read(&clock->now);
+       unsigned last_kick = atomic_read(&c->kick_gc);
+
+       set_freezable();
+
+       while (1) {
+               while (1) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+
+                       if (kthread_should_stop()) {
+                               __set_current_state(TASK_RUNNING);
+                               return 0;
+                       }
+
+                       if (atomic_read(&c->kick_gc) != last_kick)
+                               break;
+
+                       if (c->btree_gc_periodic) {
+                               unsigned long next = last + c->capacity / 16;
+
+                               if (atomic_long_read(&clock->now) >= next)
+                                       break;
+
+                               bch2_io_clock_schedule_timeout(clock, next);
+                       } else {
+                               schedule();
+                       }
+
+                       try_to_freeze();
+               }
+               __set_current_state(TASK_RUNNING);
+
+               last = atomic_long_read(&clock->now);
+               last_kick = atomic_read(&c->kick_gc);
+
+               bch2_gc(c);
+
+               debug_check_no_locks_held();
+       }
+
+       return 0;
+}
+
+void bch2_gc_thread_stop(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       p = c->gc_thread;
+       c->gc_thread = NULL;
+
+       if (p) {
+               kthread_stop(p);
+               put_task_struct(p);
+       }
+}
+
+int bch2_gc_thread_start(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       BUG_ON(c->gc_thread);
+
+       p = kthread_create(bch2_gc_thread, c, "bch_gc");
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       get_task_struct(p);
+       c->gc_thread = p;
+       wake_up_process(p);
+       return 0;
+}
+
+/* Initial GC computes bucket marks during startup */
+
+static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
+{
+       struct btree_iter iter;
+       struct btree *b;
+       struct range_checks r;
+       int ret = 0;
+
+       btree_node_range_checks_init(&r, 0);
+
+       gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0));
+
+       if (!c->btree_roots[id].b)
+               return 0;
+
+       b = c->btree_roots[id].b;
+       if (!btree_node_fake(b))
+               ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
+                                                 bkey_i_to_s_c(&b->key));
+       if (ret)
+               return ret;
+
+       /*
+        * We have to hit every btree node before starting journal replay, in
+        * order for the journal seq blacklist machinery to work:
+        */
+       for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+               btree_node_range_checks(c, b, &r);
+
+               if (btree_node_has_ptrs(b)) {
+                       struct btree_node_iter node_iter;
+                       struct bkey unpacked;
+                       struct bkey_s_c k;
+
+                       for_each_btree_node_key_unpack(b, k, &node_iter,
+                                                      btree_node_is_extents(b),
+                                                      &unpacked) {
+                               ret = bch2_btree_mark_key_initial(c,
+                                                       btree_node_type(b), k);
+                               if (ret)
+                                       goto err;
+                       }
+               }
+
+               bch2_btree_iter_cond_resched(&iter);
+       }
+err:
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
+{
+       unsigned iter = 0;
+       enum btree_id id;
+       int ret = 0;
+
+       down_write(&c->gc_lock);
+again:
+       bch2_gc_start(c);
+
+       bch2_mark_superblocks(c);
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               ret = bch2_initial_gc_btree(c, id);
+               if (ret)
+                       goto err;
+       }
+
+       ret = bch2_journal_mark(c, journal);
+       if (ret)
+               goto err;
+
+       if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
+               if (iter++ > 2) {
+                       bch_info(c, "Unable to fix bucket gens, looping");
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               bch_info(c, "Fixed gens, restarting initial mark and sweep:");
+               clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+               goto again;
+       }
+
+       /*
+        * Skip past versions that might have been used (as nonces) but that
+        * never had their pointers written:
+        */
+       if (c->sb.encryption_type)
+               atomic64_add(1 << 16, &c->key_version);
+
+       gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+err:
+       up_write(&c->gc_lock);
+       return ret;
+}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
new file mode 100644 (file)
index 0000000..9d2b9d5
--- /dev/null
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_H
+#define _BCACHEFS_BTREE_GC_H
+
+#include "btree_types.h"
+
+enum bkey_type;
+
+void bch2_coalesce(struct bch_fs *);
+void bch2_gc(struct bch_fs *);
+void bch2_gc_thread_stop(struct bch_fs *);
+int bch2_gc_thread_start(struct bch_fs *);
+int bch2_initial_gc(struct bch_fs *, struct list_head *);
+u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
+int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
+                               struct bkey_s_c);
+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
+
+/*
+ * For concurrent mark and sweep (with other index updates), we define a total
+ * ordering of _all_ references GC walks:
+ *
+ * Note that some references will have the same GC position as others - e.g.
+ * everything within the same btree node; in those cases we're relying on
+ * whatever locking exists for where those references live, i.e. the write lock
+ * on a btree node.
+ *
+ * That locking is also required to ensure GC doesn't pass the updater in
+ * between the updater adding/removing the reference and updating the GC marks;
+ * without that, we would at best double count sometimes.
+ *
+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
+ * be held that prevents GC from passing the position the updater is at.
+ *
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
+ * position inside its cmpxchg loop, so crap magically works).
+ */
+
+/* Position of (the start of) a gc phase: */
+static inline struct gc_pos gc_phase(enum gc_phase phase)
+{
+       return (struct gc_pos) {
+               .phase  = phase,
+               .pos    = POS_MIN,
+               .level  = 0,
+       };
+}
+
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+       if (l.phase != r.phase)
+               return l.phase < r.phase ? -1 : 1;
+       if (bkey_cmp(l.pos, r.pos))
+               return bkey_cmp(l.pos, r.pos);
+       if (l.level != r.level)
+               return l.level < r.level ? -1 : 1;
+       return 0;
+}
+
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
+                                        struct bpos pos, unsigned level)
+{
+       return (struct gc_pos) {
+               .phase  = GC_PHASE_BTREE_EXTENTS + id,
+               .pos    = pos,
+               .level  = level,
+       };
+}
+
+/*
+ * GC position of the pointers within a btree node: note, _not_ for &b->key
+ * itself, which lives in the parent node:
+ */
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+{
+       return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
+}
+
+/*
+ * GC position of the pointer to a btree root: we don't use
+ * gc_pos_btree_node() here to avoid a potential race with
+ * btree_split() increasing the tree depth - the new root will have level > the
+ * old root and thus have a greater gc position than the old root, but that
+ * would be incorrect since once gc has marked the root it's not coming back.
+ */
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+{
+       return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
+}
+
+static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
+{
+       return (struct gc_pos) {
+               .phase  = GC_PHASE_ALLOC,
+               .pos    = POS(ob ? ob - c->open_buckets : 0, 0),
+       };
+}
+
+static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
+{
+       unsigned seq;
+       bool ret;
+
+       do {
+               seq = read_seqcount_begin(&c->gc_pos_lock);
+               ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+       } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+       return ret;
+}
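+
+/*
+ * A rough sketch of how an index update path can use this (illustrative
+ * only, not a real caller): gc_will_visit() returns true if the in-progress
+ * mark and sweep pass hasn't reached @pos yet and so will account for
+ * references there itself:
+ *
+ *	if (gc_will_visit(c, gc_pos_btree_node(b)))
+ *		return;		// gc will recount this reference anyway
+ *
+ *	// otherwise, the updater has to apply the mark itself
+ */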
+
+#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
new file mode 100644 (file)
index 0000000..2d00494
--- /dev/null
@@ -0,0 +1,2095 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+#include "trace.h"
+
+/* btree_node_iter_large: */
+
+#define btree_node_iter_cmp_heap(h, _l, _r)                            \
+       __btree_node_iter_cmp((iter)->is_extents, b,                    \
+                              __btree_node_offset_to_key(b, (_l).k),   \
+                              __btree_node_offset_to_key(b, (_r).k))
+
+void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
+                                    struct btree *b,
+                                    const struct bkey_packed *k,
+                                    const struct bkey_packed *end)
+{
+       if (k != end) {
+               struct btree_node_iter_set n =
+                       ((struct btree_node_iter_set) {
+                                __btree_node_key_to_offset(b, k),
+                                __btree_node_key_to_offset(b, end)
+                        });
+
+               __heap_add(iter, n, btree_node_iter_cmp_heap);
+       }
+}
+
+void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
+                                       struct btree *b)
+{
+       iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
+
+       EBUG_ON(!iter->used);
+       EBUG_ON(iter->data->k > iter->data->end);
+
+       if (iter->data->k == iter->data->end)
+               heap_del(iter, 0, btree_node_iter_cmp_heap);
+       else
+               heap_sift_down(iter, 0, btree_node_iter_cmp_heap);
+}
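+
+/*
+ * A minimal sketch of how the two helpers above get used to merge sort the
+ * bsets read off disk (illustrative; it peeks at iter->data[0] directly,
+ * just as _advance() does):
+ *
+ *	for each bset i in the node:
+ *		bch2_btree_node_iter_large_push(iter, b, i->start,
+ *						vstruct_last(i));
+ *
+ *	while (iter->used) {
+ *		k = __btree_node_offset_to_key(b, iter->data->k);
+ *		// k is the next key in sort order - consume it
+ *		bch2_btree_node_iter_large_advance(iter, b);
+ *	}
+ */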
+
+static void verify_no_dups(struct btree *b,
+                          struct bkey_packed *start,
+                          struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct bkey_packed *k;
+
+       for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
+               struct bkey l = bkey_unpack_key(b, k);
+               struct bkey r = bkey_unpack_key(b, bkey_next(k));
+
+               BUG_ON(btree_node_is_extents(b)
+                      ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
+                      : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+               //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+       }
+#endif
+}
+
+static void clear_needs_whiteout(struct bset *i)
+{
+       struct bkey_packed *k;
+
+       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+               k->needs_whiteout = false;
+}
+
+static void set_needs_whiteout(struct bset *i)
+{
+       struct bkey_packed *k;
+
+       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+               k->needs_whiteout = true;
+}
+
+static void btree_bounce_free(struct bch_fs *c, unsigned order,
+                             bool used_mempool, void *p)
+{
+       if (used_mempool)
+               mempool_free(p, &c->btree_bounce_pool);
+       else
+               vpfree(p, PAGE_SIZE << order);
+}
+
+static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
+                               bool *used_mempool)
+{
+       void *p;
+
+       BUG_ON(order > btree_page_order(c));
+
+       *used_mempool = false;
+       p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
+       if (p)
+               return p;
+
+       *used_mempool = true;
+       return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
+}
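+
+/*
+ * Typical bounce buffer usage in this file (sketch, with bytes standing for
+ * whatever scratch size is needed): size the buffer by page order, let
+ * btree_bounce_alloc() try a plain page allocation and fall back to the
+ * preallocated mempool, then free through the matching path:
+ *
+ *	order = get_order(bytes);
+ *	p = btree_bounce_alloc(c, order, &used_mempool);
+ *	// ... use p as scratch space ...
+ *	btree_bounce_free(c, order, used_mempool, p);
+ */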
+
+typedef int (*sort_cmp_fn)(struct btree *,
+                          struct bkey_packed *,
+                          struct bkey_packed *);
+
+struct sort_iter {
+       struct btree    *b;
+       unsigned                used;
+
+       struct sort_iter_set {
+               struct bkey_packed *k, *end;
+       } data[MAX_BSETS + 1];
+};
+
+static void sort_iter_init(struct sort_iter *iter, struct btree *b)
+{
+       memset(iter, 0, sizeof(*iter));
+       iter->b = b;
+}
+
+static inline void __sort_iter_sift(struct sort_iter *iter,
+                                   unsigned from,
+                                   sort_cmp_fn cmp)
+{
+       unsigned i;
+
+       for (i = from;
+            i + 1 < iter->used &&
+            cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+            i++)
+               swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+       __sort_iter_sift(iter, 0, cmp);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+       unsigned i = iter->used;
+
+       while (i--)
+               __sort_iter_sift(iter, i, cmp);
+}
+
+static void sort_iter_add(struct sort_iter *iter,
+                         struct bkey_packed *k,
+                         struct bkey_packed *end)
+{
+       BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
+
+       if (k != end)
+               iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+       return iter->used ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+       iter->data->k = bkey_next(iter->data->k);
+
+       BUG_ON(iter->data->k > iter->data->end);
+
+       if (iter->data->k == iter->data->end)
+               array_remove_item(iter->data, iter->used, 0);
+       else
+               sort_iter_sift(iter, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+                                                sort_cmp_fn cmp)
+{
+       struct bkey_packed *ret = sort_iter_peek(iter);
+
+       if (ret)
+               sort_iter_advance(iter, cmp);
+
+       return ret;
+}
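+
+/*
+ * The sort_*() routines below all follow the same pattern (sketch): add one
+ * already-sorted range per bset, do an initial insertion sort of the range
+ * heads, then repeatedly pull the next key in sort order:
+ *
+ *	sort_iter_init(&iter, b);
+ *	for_each_bset(b, t)
+ *		sort_iter_add(&iter, btree_bkey_first(b, t),
+ *			      btree_bkey_last(b, t));
+ *	sort_iter_sort(&iter, cmp);
+ *	while ((k = sort_iter_next(&iter, cmp)))
+ *		// emit k
+ */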
+
+static inline int sort_key_whiteouts_cmp(struct btree *b,
+                                        struct bkey_packed *l,
+                                        struct bkey_packed *r)
+{
+       return bkey_cmp_packed(b, l, r);
+}
+
+static unsigned sort_key_whiteouts(struct bkey_packed *dst,
+                                  struct sort_iter *iter)
+{
+       struct bkey_packed *in, *out = dst;
+
+       sort_iter_sort(iter, sort_key_whiteouts_cmp);
+
+       while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
+               bkey_copy(out, in);
+               out = bkey_next(out);
+       }
+
+       return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_extent_whiteouts_cmp(struct btree *b,
+                                           struct bkey_packed *l,
+                                           struct bkey_packed *r)
+{
+       struct bkey ul = bkey_unpack_key(b, l);
+       struct bkey ur = bkey_unpack_key(b, r);
+
+       return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
+}
+
+static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
+                                     struct sort_iter *iter)
+{
+       const struct bkey_format *f = &iter->b->format;
+       struct bkey_packed *in, *out = dst;
+       struct bkey_i l, r;
+       bool prev = false, l_packed = false;
+       u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
+       u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
+       u64 new_size;
+
+       max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
+
+       sort_iter_sort(iter, sort_extent_whiteouts_cmp);
+
+       while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+               EBUG_ON(bkeyp_val_u64s(f, in));
+               EBUG_ON(in->type != KEY_TYPE_DISCARD);
+
+               r.k = bkey_unpack_key(iter->b, in);
+
+               if (prev &&
+                   bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
+                       if (bkey_cmp(l.k.p, r.k.p) >= 0)
+                               continue;
+
+                       new_size = l_packed
+                               ? min(max_packed_size, max_packed_offset -
+                                     bkey_start_offset(&l.k))
+                               : KEY_SIZE_MAX;
+
+                       new_size = min(new_size, r.k.p.offset -
+                                      bkey_start_offset(&l.k));
+
+                       BUG_ON(new_size < l.k.size);
+
+                       bch2_key_resize(&l.k, new_size);
+
+                       if (bkey_cmp(l.k.p, r.k.p) >= 0)
+                               continue;
+
+                       bch2_cut_front(l.k.p, &r);
+               }
+
+               if (prev) {
+                       if (!bch2_bkey_pack(out, &l, f)) {
+                               BUG_ON(l_packed);
+                               bkey_copy(out, &l);
+                       }
+                       out = bkey_next(out);
+               }
+
+               l = r;
+               prev = true;
+               l_packed = bkey_packed(in);
+       }
+
+       if (prev) {
+               if (!bch2_bkey_pack(out, &l, f)) {
+                       BUG_ON(l_packed);
+                       bkey_copy(out, &l);
+               }
+               out = bkey_next(out);
+       }
+
+       return (u64 *) out - (u64 *) dst;
+}
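+
+/*
+ * Worked example for the merging above (hypothetical offsets): two extent
+ * whiteouts covering [10,20) and [15,30) sort by start position; the first
+ * is resized to cover [10,30) (subject to the packed size/offset limits) and
+ * the second, now entirely redundant, is dropped - so overlapping or
+ * adjacent whiteouts collapse into one.
+ */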
+
+static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
+                                   bool compacting,
+                                   enum compact_mode mode)
+{
+       unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
+       unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+
+       if (mode == COMPACT_LAZY) {
+               if (should_compact_bset_lazy(b, t) ||
+                   (compacting && bset_unwritten(b, bset(b, t))))
+                       return dead_u64s;
+       } else {
+               if (bset_written(b, bset(b, t)))
+                       return dead_u64s;
+       }
+
+       return 0;
+}
+
+bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
+                            enum compact_mode mode)
+{
+       const struct bkey_format *f = &b->format;
+       struct bset_tree *t;
+       struct bkey_packed *whiteouts = NULL;
+       struct bkey_packed *u_start, *u_pos;
+       struct sort_iter sort_iter;
+       unsigned order, whiteout_u64s = 0, u64s;
+       bool used_mempool, compacting = false;
+
+       for_each_bset(b, t)
+               whiteout_u64s += should_compact_bset(b, t,
+                                       whiteout_u64s != 0, mode);
+
+       if (!whiteout_u64s)
+               return false;
+
+       sort_iter_init(&sort_iter, b);
+
+       whiteout_u64s += b->whiteout_u64s;
+       order = get_order(whiteout_u64s * sizeof(u64));
+
+       whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+       u_start = u_pos = whiteouts;
+
+       memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
+                   b->whiteout_u64s);
+       u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
+
+       sort_iter_add(&sort_iter, u_start, u_pos);
+
+       for_each_bset(b, t) {
+               struct bset *i = bset(b, t);
+               struct bkey_packed *k, *n, *out, *start, *end;
+               struct btree_node_entry *src = NULL, *dst = NULL;
+
+               if (t != b->set && bset_unwritten(b, i)) {
+                       src = container_of(i, struct btree_node_entry, keys);
+                       dst = max(write_block(b),
+                                 (void *) btree_bkey_last(b, t - 1));
+               }
+
+               if (!should_compact_bset(b, t, compacting, mode)) {
+                       if (src != dst) {
+                               memmove(dst, src, sizeof(*src) +
+                                       le16_to_cpu(src->keys.u64s) *
+                                       sizeof(u64));
+                               i = &dst->keys;
+                               set_btree_bset(b, t, i);
+                       }
+                       continue;
+               }
+
+               compacting = true;
+               u_start = u_pos;
+               start = i->start;
+               end = vstruct_last(i);
+
+               if (src != dst) {
+                       memmove(dst, src, sizeof(*src));
+                       i = &dst->keys;
+                       set_btree_bset(b, t, i);
+               }
+
+               out = i->start;
+
+               for (k = start; k != end; k = n) {
+                       n = bkey_next(k);
+
+                       if (bkey_deleted(k) && btree_node_is_extents(b))
+                               continue;
+
+                       if (bkey_whiteout(k) && !k->needs_whiteout)
+                               continue;
+
+                       if (bkey_whiteout(k)) {
+                               unreserve_whiteout(b, t, k);
+                               memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
+                               set_bkeyp_val_u64s(f, u_pos, 0);
+                               u_pos = bkey_next(u_pos);
+                       } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+                               bkey_copy(out, k);
+                               out = bkey_next(out);
+                       }
+               }
+
+               sort_iter_add(&sort_iter, u_start, u_pos);
+
+               if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+                       i->u64s = cpu_to_le16((u64 *) out - i->_data);
+                       set_btree_bset_end(b, t);
+                       bch2_bset_set_no_aux_tree(b, t);
+               }
+       }
+
+       b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
+
+       BUG_ON((void *) unwritten_whiteouts_start(c, b) <
+              (void *) btree_bkey_last(b, bset_tree_last(b)));
+
+       u64s = btree_node_is_extents(b)
+               ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
+                                       &sort_iter)
+               : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
+                                    &sort_iter);
+
+       BUG_ON(u64s > b->whiteout_u64s);
+       BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
+       BUG_ON(u_pos != whiteouts && !u64s);
+
+       if (u64s != b->whiteout_u64s) {
+               void *src = unwritten_whiteouts_start(c, b);
+
+               b->whiteout_u64s = u64s;
+               memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
+       }
+
+       verify_no_dups(b,
+                      unwritten_whiteouts_start(c, b),
+                      unwritten_whiteouts_end(c, b));
+
+       btree_bounce_free(c, order, used_mempool, whiteouts);
+
+       if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
+               bch2_btree_build_aux_trees(b);
+
+       bch_btree_keys_u64s_remaining(c, b);
+       bch2_verify_btree_nr_keys(b);
+
+       return true;
+}
+
+static bool bch2_drop_whiteouts(struct btree *b)
+{
+       struct bset_tree *t;
+       bool ret = false;
+
+       for_each_bset(b, t) {
+               struct bset *i = bset(b, t);
+               struct bkey_packed *k, *n, *out, *start, *end;
+
+               if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
+                       continue;
+
+               start   = btree_bkey_first(b, t);
+               end     = btree_bkey_last(b, t);
+
+               if (bset_unwritten(b, i) &&
+                   t != b->set) {
+                       struct bset *dst =
+                              max_t(struct bset *, write_block(b),
+                                    (void *) btree_bkey_last(b, t - 1));
+
+                       memmove(dst, i, sizeof(struct bset));
+                       i = dst;
+                       set_btree_bset(b, t, i);
+               }
+
+               out = i->start;
+
+               for (k = start; k != end; k = n) {
+                       n = bkey_next(k);
+
+                       if (!bkey_whiteout(k)) {
+                               bkey_copy(out, k);
+                               out = bkey_next(out);
+                       }
+               }
+
+               i->u64s = cpu_to_le16((u64 *) out - i->_data);
+               bch2_bset_set_no_aux_tree(b, t);
+               ret = true;
+       }
+
+       bch2_verify_btree_nr_keys(b);
+
+       return ret;
+}
+
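+/*
+ * Sort order for non-extent keys: primarily by packed key position; at the
+ * same position whiteouts sort before live keys, and keys with needs_whiteout
+ * clear sort before those with it set, so that sort_keys() below sees keys at
+ * the same position adjacent and can merge/drop redundant whiteouts.
+ */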
+static inline int sort_keys_cmp(struct btree *b,
+                               struct bkey_packed *l,
+                               struct bkey_packed *r)
+{
+       return bkey_cmp_packed(b, l, r) ?:
+               (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+               (int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+static unsigned sort_keys(struct bkey_packed *dst,
+                         struct sort_iter *iter,
+                         bool filter_whiteouts)
+{
+       const struct bkey_format *f = &iter->b->format;
+       struct bkey_packed *in, *next, *out = dst;
+
+       sort_iter_sort(iter, sort_keys_cmp);
+
+       while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+               if (bkey_whiteout(in) &&
+                   (filter_whiteouts || !in->needs_whiteout))
+                       continue;
+
+               if (bkey_whiteout(in) &&
+                   (next = sort_iter_peek(iter)) &&
+                   !bkey_cmp_packed(iter->b, in, next)) {
+                       BUG_ON(in->needs_whiteout &&
+                              next->needs_whiteout);
+                       /*
+                        * XXX racy, called with read lock from write path
+                        *
+                        * leads to spurious BUG_ON() in bkey_unpack_key() in
+                        * debug mode
+                        */
+                       next->needs_whiteout |= in->needs_whiteout;
+                       continue;
+               }
+
+               if (bkey_whiteout(in)) {
+                       memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
+                       set_bkeyp_val_u64s(f, out, 0);
+               } else {
+                       bkey_copy(out, in);
+               }
+               out = bkey_next(out);
+       }
+
+       return (u64 *) out - (u64 *) dst;
+}
+
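+/*
+ * Sort order for extent keys: by position, with deleted keys sorting after
+ * live keys at the same position so that sort_extents() can simply skip them.
+ */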
+static inline int sort_extents_cmp(struct btree *b,
+                                  struct bkey_packed *l,
+                                  struct bkey_packed *r)
+{
+       return bkey_cmp_packed(b, l, r) ?:
+               (int) bkey_deleted(l) - (int) bkey_deleted(r);
+}
+
+static unsigned sort_extents(struct bkey_packed *dst,
+                            struct sort_iter *iter,
+                            bool filter_whiteouts)
+{
+       struct bkey_packed *in, *out = dst;
+
+       sort_iter_sort(iter, sort_extents_cmp);
+
+       while ((in = sort_iter_next(iter, sort_extents_cmp))) {
+               if (bkey_deleted(in))
+                       continue;
+
+               if (bkey_whiteout(in) &&
+                   (filter_whiteouts || !in->needs_whiteout))
+                       continue;
+
+               bkey_copy(out, in);
+               out = bkey_next(out);
+       }
+
+       return (u64 *) out - (u64 *) dst;
+}
+
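+/*
+ * Merge the bsets in [start_idx, end_idx) into a single sorted bset: keys are
+ * sorted into a bounce buffer, and if we sorted the entire node we can simply
+ * swap buffers with the node instead of copying the result back.
+ */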
+static void btree_node_sort(struct bch_fs *c, struct btree *b,
+                           struct btree_iter *iter,
+                           unsigned start_idx,
+                           unsigned end_idx,
+                           bool filter_whiteouts)
+{
+       struct btree_node *out;
+       struct sort_iter sort_iter;
+       struct bset_tree *t;
+       struct bset *start_bset = bset(b, &b->set[start_idx]);
+       bool used_mempool = false;
+       u64 start_time, seq = 0;
+       unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
+       bool sorting_entire_node = start_idx == 0 &&
+               end_idx == b->nsets;
+
+       sort_iter_init(&sort_iter, b);
+
+       for (t = b->set + start_idx;
+            t < b->set + end_idx;
+            t++) {
+               u64s += le16_to_cpu(bset(b, t)->u64s);
+               sort_iter_add(&sort_iter,
+                             btree_bkey_first(b, t),
+                             btree_bkey_last(b, t));
+       }
+
+       order = sorting_entire_node
+               ? btree_page_order(c)
+               : get_order(__vstruct_bytes(struct btree_node, u64s));
+
+       out = btree_bounce_alloc(c, order, &used_mempool);
+
+       start_time = local_clock();
+
+       if (btree_node_is_extents(b))
+               filter_whiteouts = bset_written(b, start_bset);
+
+       u64s = btree_node_is_extents(b)
+               ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
+               : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+
+       out->keys.u64s = cpu_to_le16(u64s);
+
+       BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
+
+       if (sorting_entire_node)
+               bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
+                                      start_time);
+
+       /* Make sure we preserve bset journal_seq: */
+       for (t = b->set + start_idx; t < b->set + end_idx; t++)
+               seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+       start_bset->journal_seq = cpu_to_le64(seq);
+
+       if (sorting_entire_node) {
+               unsigned u64s = le16_to_cpu(out->keys.u64s);
+
+               BUG_ON(order != btree_page_order(c));
+
+               /*
+                * Our temporary buffer is the same size as the btree node's
+                * buffer, we can just swap buffers instead of doing a big
+                * memcpy()
+                */
+               *out = *b->data;
+               out->keys.u64s = cpu_to_le16(u64s);
+               swap(out, b->data);
+               set_btree_bset(b, b->set, &b->data->keys);
+       } else {
+               start_bset->u64s = out->keys.u64s;
+               memcpy_u64s(start_bset->start,
+                           out->keys.start,
+                           le16_to_cpu(out->keys.u64s));
+       }
+
+       for (i = start_idx + 1; i < end_idx; i++)
+               b->nr.bset_u64s[start_idx] +=
+                       b->nr.bset_u64s[i];
+
+       b->nsets -= shift;
+
+       for (i = start_idx + 1; i < b->nsets; i++) {
+               b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
+               b->set[i]               = b->set[i + shift];
+       }
+
+       for (i = b->nsets; i < MAX_BSETS; i++)
+               b->nr.bset_u64s[i] = 0;
+
+       set_btree_bset_end(b, &b->set[start_idx]);
+       bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
+
+       btree_bounce_free(c, order, used_mempool, out);
+
+       bch2_verify_btree_nr_keys(b);
+}
+
+/* Sort + repack in a new format: */
+static struct btree_nr_keys sort_repack(struct bset *dst,
+                                       struct btree *src,
+                                       struct btree_node_iter *src_iter,
+                                       struct bkey_format *out_f,
+                                       bool filter_whiteouts)
+{
+       struct bkey_format *in_f = &src->format;
+       struct bkey_packed *in, *out = vstruct_last(dst);
+       struct btree_nr_keys nr;
+
+       memset(&nr, 0, sizeof(nr));
+
+       while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
+               if (filter_whiteouts && bkey_whiteout(in))
+                       continue;
+
+               if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+                                      ? in_f : &bch2_bkey_format_current, in))
+                       out->format = KEY_FORMAT_LOCAL_BTREE;
+               else
+                       bch2_bkey_unpack(src, (void *) out, in);
+
+               btree_keys_account_key_add(&nr, 0, out);
+               out = bkey_next(out);
+       }
+
+       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+       return nr;
+}
+
+/* Sort, repack, and merge: */
+static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
+                                             struct bset *dst,
+                                             struct btree *src,
+                                             struct btree_node_iter *iter,
+                                             struct bkey_format *out_f,
+                                             bool filter_whiteouts,
+                                             key_filter_fn filter,
+                                             key_merge_fn merge)
+{
+       struct bkey_packed *k, *prev = NULL, *out;
+       struct btree_nr_keys nr;
+       BKEY_PADDED(k) tmp;
+
+       memset(&nr, 0, sizeof(nr));
+
+       while ((k = bch2_btree_node_iter_next_all(iter, src))) {
+               if (filter_whiteouts && bkey_whiteout(k))
+                       continue;
+
+               /*
+                * The filter might modify pointers, so we have to unpack the
+                * key and values to &tmp.k:
+                */
+               bch2_bkey_unpack(src, &tmp.k, k);
+
+               if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
+                       continue;
+
+               /* prev is always unpacked, for key merging: */
+
+               if (prev &&
+                   merge &&
+                   merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
+                       continue;
+
+               /*
+                * the current key becomes the new prev: advance prev, then
+                * copy the current key - but first pack prev (in place):
+                */
+               if (prev) {
+                       bch2_bkey_pack(prev, (void *) prev, out_f);
+
+                       btree_keys_account_key_add(&nr, 0, prev);
+                       prev = bkey_next(prev);
+               } else {
+                       prev = vstruct_last(dst);
+               }
+
+               bkey_copy(prev, &tmp.k);
+       }
+
+       if (prev) {
+               bch2_bkey_pack(prev, (void *) prev, out_f);
+               btree_keys_account_key_add(&nr, 0, prev);
+               out = bkey_next(prev);
+       } else {
+               out = vstruct_last(dst);
+       }
+
+       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+       return nr;
+}
+
+void bch2_btree_sort_into(struct bch_fs *c,
+                        struct btree *dst,
+                        struct btree *src)
+{
+       struct btree_nr_keys nr;
+       struct btree_node_iter src_iter;
+       u64 start_time = local_clock();
+
+       BUG_ON(dst->nsets != 1);
+
+       bch2_bset_set_no_aux_tree(dst, dst->set);
+
+       bch2_btree_node_iter_init_from_start(&src_iter, src,
+                                           btree_node_is_extents(src));
+
+       if (btree_node_ops(src)->key_normalize ||
+           btree_node_ops(src)->key_merge)
+               nr = sort_repack_merge(c, btree_bset_first(dst),
+                               src, &src_iter,
+                               &dst->format,
+                               true,
+                               btree_node_ops(src)->key_normalize,
+                               btree_node_ops(src)->key_merge);
+       else
+               nr = sort_repack(btree_bset_first(dst),
+                               src, &src_iter,
+                               &dst->format,
+                               true);
+
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
+
+       set_btree_bset_end(dst, dst->set);
+
+       dst->nr.live_u64s       += nr.live_u64s;
+       dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
+       dst->nr.packed_keys     += nr.packed_keys;
+       dst->nr.unpacked_keys   += nr.unpacked_keys;
+
+       bch2_verify_btree_nr_keys(dst);
+}
+
+#define SORT_CRIT      (4096 / sizeof(u64))
+
+/*
+ * We're about to add another bset to the btree node, so if there are
+ * currently too many bsets - sort some of them together:
+ */
+static bool btree_node_compact(struct bch_fs *c, struct btree *b,
+                              struct btree_iter *iter)
+{
+       unsigned unwritten_idx;
+       bool ret = false;
+
+       for (unwritten_idx = 0;
+            unwritten_idx < b->nsets;
+            unwritten_idx++)
+               if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+                       break;
+
+       if (b->nsets - unwritten_idx > 1) {
+               btree_node_sort(c, b, iter, unwritten_idx,
+                               b->nsets, false);
+               ret = true;
+       }
+
+       if (unwritten_idx > 1) {
+               btree_node_sort(c, b, iter, 0, unwritten_idx, false);
+               ret = true;
+       }
+
+       return ret;
+}
+
+void bch2_btree_build_aux_trees(struct btree *b)
+{
+       struct bset_tree *t;
+
+       for_each_bset(b, t)
+               bch2_bset_build_aux_tree(b, t,
+                               bset_unwritten(b, bset(b, t)) &&
+                               t == bset_tree_last(b));
+}
+
+/*
+ * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
+ * inserted into
+ *
+ * Safe to call if there already is an unwritten bset - will only add a new bset
+ * if @b doesn't already have one.
+ *
+ * If compacting required a sort (invalidating any linked iterators), @iter is
+ * re-initialized via bch2_btree_iter_reinit_node().
+ */
+void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
+                         struct btree_iter *iter)
+{
+       struct btree_node_entry *bne;
+       bool did_sort;
+
+       EBUG_ON(!(b->lock.state.seq & 1));
+       EBUG_ON(iter && iter->l[b->level].b != b);
+
+       did_sort = btree_node_compact(c, b, iter);
+
+       bne = want_new_bset(c, b);
+       if (bne)
+               bch2_bset_init_next(c, b, bne);
+
+       bch2_btree_build_aux_trees(b);
+
+       if (iter && did_sort)
+               bch2_btree_iter_reinit_node(iter, b);
+}
+
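+/*
+ * Nonce used for checksumming/encrypting a bset: built from the bset's sector
+ * offset within the node plus its seq and journal_seq fields, with
+ * BCH_NONCE_BTREE mixed in so btree nonces stay distinct from other metadata
+ * nonce types.
+ */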
+static struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+       return (struct nonce) {{
+               [0] = cpu_to_le32(offset),
+               [1] = ((__le32 *) &i->seq)[0],
+               [2] = ((__le32 *) &i->seq)[1],
+               [3] = ((__le32 *) &i->journal_seq)[0] ^ BCH_NONCE_BTREE,
+       }};
+}
+
+static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+       struct nonce nonce = btree_nonce(i, offset);
+
+       if (!offset) {
+               struct btree_node *bn = container_of(i, struct btree_node, keys);
+               unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+               bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
+                            bytes);
+
+               nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+       }
+
+       bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+                    vstruct_end(i) - (void *) i->_data);
+}
+
+static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
+                        unsigned offset, int write, char *buf, size_t len)
+{
+       char *out = buf, *end = buf + len;
+
+       out += scnprintf(out, end - out,
+                        "error validating btree node %s"
+                        "at btree %u level %u/%u\n"
+                        "pos %llu:%llu node offset %u",
+                        write ? "before write " : "",
+                        b->btree_id, b->level,
+                        c->btree_roots[b->btree_id].level,
+                        b->key.k.p.inode, b->key.k.p.offset,
+                        b->written);
+       if (i)
+               out += scnprintf(out, end - out,
+                                " bset u64s %u",
+                                le16_to_cpu(i->u64s));
+
+       return out - buf;
+}
+
+enum btree_err_type {
+       BTREE_ERR_FIXABLE,
+       BTREE_ERR_WANT_RETRY,
+       BTREE_ERR_MUST_RETRY,
+       BTREE_ERR_FATAL,
+};
+
+enum btree_validate_ret {
+       BTREE_RETRY_READ = 64,
+};
+
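+/*
+ * btree_err() formats a message describing the node and bset, then decides
+ * what to do based on the error type and whether we're validating a read or a
+ * write: fixable errors found before initial gc completes are repaired via
+ * mustfix_fsck_err(); otherwise we either retry the read from another replica
+ * (when one is available) or bail out through the fsck_err label.
+ */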
+#define btree_err(type, c, b, i, msg, ...)                             \
+({                                                                     \
+       __label__ out;                                                  \
+       char _buf[300], *out = _buf, *end = out + sizeof(_buf);         \
+                                                                       \
+       out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
+       out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__);      \
+                                                                       \
+       if (type == BTREE_ERR_FIXABLE &&                                \
+           write == READ &&                                            \
+           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
+               mustfix_fsck_err(c, "%s", _buf);                        \
+               goto out;                                               \
+       }                                                               \
+                                                                       \
+       switch (write) {                                                \
+       case READ:                                                      \
+               bch_err(c, "%s", _buf);                                 \
+                                                                       \
+               switch (type) {                                         \
+               case BTREE_ERR_FIXABLE:                                 \
+                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       goto fsck_err;                                  \
+               case BTREE_ERR_WANT_RETRY:                              \
+                       if (have_retry) {                               \
+                               ret = BTREE_RETRY_READ;                 \
+                               goto fsck_err;                          \
+                       }                                               \
+                       break;                                          \
+               case BTREE_ERR_MUST_RETRY:                              \
+                       ret = BTREE_RETRY_READ;                         \
+                       goto fsck_err;                                  \
+               case BTREE_ERR_FATAL:                                   \
+                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       goto fsck_err;                                  \
+               }                                                       \
+               break;                                                  \
+       case WRITE:                                                     \
+               bch_err(c, "corrupt metadata before write: %s", _buf);  \
+                                                                       \
+               if (bch2_fs_inconsistent(c)) {                          \
+                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       goto fsck_err;                                  \
+               }                                                       \
+               break;                                                  \
+       }                                                               \
+out:                                                                   \
+       true;                                                           \
+})
+
+#define btree_err_on(cond, ...)        ((cond) ? btree_err(__VA_ARGS__) : false)
+
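+/*
+ * Validate a single bset, during read or just before write: checks the node
+ * header (for the first bset), the bset version and size, and each key's
+ * size, format and contents, truncating the bset or dropping individual bad
+ * keys where that's safe. Also records, via *whiteout_u64s, where the
+ * separate whiteouts (if any) end and ordinary keys begin.
+ */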
+static int validate_bset(struct bch_fs *c, struct btree *b,
+                        struct bset *i, unsigned sectors,
+                        unsigned *whiteout_u64s, int write,
+                        bool have_retry)
+{
+       struct bkey_packed *k, *prev = NULL;
+       struct bpos prev_pos = POS_MIN;
+       enum bkey_type type = btree_node_type(b);
+       bool seen_non_whiteout = false;
+       const char *err;
+       int ret = 0;
+
+       if (i == &b->data->keys) {
+               /* These indicate that we read the wrong btree node: */
+               btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
+                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            "incorrect btree id");
+
+               btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
+                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            "incorrect level");
+
+               if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
+                       u64 *p = (u64 *) &b->data->ptr;
+
+                       *p = swab64(*p);
+                       bch2_bpos_swab(&b->data->min_key);
+                       bch2_bpos_swab(&b->data->max_key);
+               }
+
+               btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
+                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            "incorrect max key");
+
+               /* XXX: ideally we would be validating min_key too */
+#if 0
+               /*
+                * not correct anymore, due to btree node write error
+                * handling
+                *
+                * need to add b->data->seq to btree keys and verify
+                * against that
+                */
+               btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
+                                                 b->data->ptr),
+                            BTREE_ERR_FATAL, c, b, i,
+                            "incorrect backpointer");
+#endif
+               err = bch2_bkey_format_validate(&b->data->format);
+               btree_err_on(err,
+                            BTREE_ERR_FATAL, c, b, i,
+                            "invalid bkey format: %s", err);
+       }
+
+       if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
+                        BTREE_ERR_FIXABLE, c, b, i,
+                        "unsupported bset version")) {
+               i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+               i->u64s = 0;
+               return 0;
+       }
+
+       if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
+                        BTREE_ERR_FIXABLE, c, b, i,
+                        "bset past end of btree node")) {
+               i->u64s = 0;
+               return 0;
+       }
+
+       btree_err_on(b->written && !i->u64s,
+                    BTREE_ERR_FIXABLE, c, b, i,
+                    "empty bset");
+
+       if (!BSET_SEPARATE_WHITEOUTS(i)) {
+               seen_non_whiteout = true;
+               *whiteout_u64s = 0;
+       }
+
+       for (k = i->start;
+            k != vstruct_last(i);) {
+               struct bkey_s_c u;
+               struct bkey tmp;
+               const char *invalid;
+
+               if (btree_err_on(!k->u64s,
+                                BTREE_ERR_FIXABLE, c, b, i,
+                                "KEY_U64s 0: %zu bytes of metadata lost",
+                                vstruct_end(i) - (void *) k)) {
+                       i->u64s = cpu_to_le16((u64 *) k - i->_data);
+                       break;
+               }
+
+               if (btree_err_on(bkey_next(k) > vstruct_last(i),
+                                BTREE_ERR_FIXABLE, c, b, i,
+                                "key extends past end of bset")) {
+                       i->u64s = cpu_to_le16((u64 *) k - i->_data);
+                       break;
+               }
+
+               if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
+                                BTREE_ERR_FIXABLE, c, b, i,
+                                "invalid bkey format %u", k->format)) {
+                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+                       memmove_u64s_down(k, bkey_next(k),
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
+                       continue;
+               }
+
+               if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
+                       bch2_bkey_swab(type, &b->format, k);
+
+               u = bkey_disassemble(b, k, &tmp);
+
+               invalid = __bch2_bkey_invalid(c, type, u) ?:
+                       bch2_bkey_in_btree_node(b, u) ?:
+                       (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
+               if (invalid) {
+                       char buf[160];
+
+                       bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+                       btree_err(BTREE_ERR_FIXABLE, c, b, i,
+                                 "invalid bkey:\n%s\n%s", invalid, buf);
+
+                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+                       memmove_u64s_down(k, bkey_next(k),
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
+                       continue;
+               }
+
+               /*
+                * with the separate whiteouts thing (used for extents), the
+                * second set of keys actually can have whiteouts too, so we
+                * can't solely go off bkey_whiteout()...
+                */
+
+               if (!seen_non_whiteout &&
+                   (!bkey_whiteout(k) ||
+                    (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
+                       *whiteout_u64s = k->_data - i->_data;
+                       seen_non_whiteout = true;
+               } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
+                       btree_err(BTREE_ERR_FATAL, c, b, i,
+                                 "keys out of order: %llu:%llu > %llu:%llu",
+                                 prev_pos.inode,
+                                 prev_pos.offset,
+                                 u.k->p.inode,
+                                 bkey_start_offset(u.k));
+                       /* XXX: repair this */
+               }
+
+               prev_pos = u.k->p;
+               prev = k;
+               k = bkey_next(k);
+       }
+
+       SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+fsck_err:
+       return ret;
+}
+
+int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
+{
+       struct btree_node_entry *bne;
+       struct btree_node_iter_large *iter;
+       struct btree_node *sorted;
+       struct bkey_packed *k;
+       struct bset *i;
+       bool used_mempool;
+       unsigned u64s;
+       int ret, retry_read = 0, write = READ;
+
+       iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
+       __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b));
+
+       if (bch2_meta_read_fault("btree"))
+               btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
+                         "dynamic fault");
+
+       btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
+                    BTREE_ERR_MUST_RETRY, c, b, NULL,
+                    "bad magic");
+
+       btree_err_on(!b->data->keys.seq,
+                    BTREE_ERR_MUST_RETRY, c, b, NULL,
+                    "bad btree header");
+
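+       /*
+        * Walk the bsets in the node one at a time, verifying checksums and
+        * undoing encryption, then push each bset's keys (and its separate
+        * whiteouts, if any) onto the merge iterator:
+        */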
+       while (b->written < c->opts.btree_node_size) {
+               unsigned sectors, whiteout_u64s = 0;
+               struct nonce nonce;
+               struct bch_csum csum;
+               bool first = !b->written;
+
+               if (!b->written) {
+                       i = &b->data->keys;
+
+                       btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "unknown checksum type");
+
+                       nonce = btree_nonce(i, b->written << 9);
+                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+
+                       btree_err_on(bch2_crc_cmp(csum, b->data->csum),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "invalid checksum");
+
+                       bset_encrypt(c, i, b->written << 9);
+
+                       sectors = vstruct_sectors(b->data, c->block_bits);
+
+                       btree_node_set_format(b, b->data->format);
+               } else {
+                       bne = write_block(b);
+                       i = &bne->keys;
+
+                       if (i->seq != b->data->keys.seq)
+                               break;
+
+                       btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "unknown checksum type");
+
+                       nonce = btree_nonce(i, b->written << 9);
+                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+                       btree_err_on(bch2_crc_cmp(csum, bne->csum),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "invalid checksum");
+
+                       bset_encrypt(c, i, b->written << 9);
+
+                       sectors = vstruct_sectors(bne, c->block_bits);
+               }
+
+               ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
+                                   READ, have_retry);
+               if (ret)
+                       goto fsck_err;
+
+               b->written += sectors;
+
+               ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
+               if (ret < 0) {
+                       btree_err(BTREE_ERR_FATAL, c, b, i,
+                                 "insufficient memory");
+                       goto err;
+               }
+
+               if (ret) {
+                       btree_err_on(first,
+                                    BTREE_ERR_FIXABLE, c, b, i,
+                                    "first btree node bset has blacklisted journal seq");
+                       if (!first)
+                               continue;
+               }
+
+               bch2_btree_node_iter_large_push(iter, b,
+                                          i->start,
+                                          vstruct_idx(i, whiteout_u64s));
+
+               bch2_btree_node_iter_large_push(iter, b,
+                                          vstruct_idx(i, whiteout_u64s),
+                                          vstruct_last(i));
+       }
+
+       for (bne = write_block(b);
+            bset_byte_offset(b, bne) < btree_bytes(c);
+            bne = (void *) bne + block_bytes(c))
+               btree_err_on(bne->keys.seq == b->data->keys.seq,
+                            BTREE_ERR_WANT_RETRY, c, b, NULL,
+                            "found bset signature after last bset");
+
+       sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
+       sorted->keys.u64s = 0;
+
+       set_btree_bset(b, b->set, &b->data->keys);
+
+       b->nr = btree_node_is_extents(b)
+               ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
+               : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
+
+       u64s = le16_to_cpu(sorted->keys.u64s);
+       *sorted = *b->data;
+       sorted->keys.u64s = cpu_to_le16(u64s);
+       swap(sorted, b->data);
+       set_btree_bset(b, b->set, &b->data->keys);
+       b->nsets = 1;
+
+       BUG_ON(b->nr.live_u64s != u64s);
+
+       btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+
+       i = &b->data->keys;
+       for (k = i->start; k != vstruct_last(i);) {
+               enum bkey_type type = btree_node_type(b);
+               struct bkey tmp;
+               struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
+               const char *invalid = bch2_bkey_val_invalid(c, type, u);
+
+               if (invalid ||
+                   (inject_invalid_keys(c) &&
+                    !bversion_cmp(u.k->version, MAX_VERSION))) {
+                       char buf[160];
+
+                       bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+                       btree_err(BTREE_ERR_FIXABLE, c, b, i,
+                                 "invalid bkey %s: %s", buf, invalid);
+
+                       btree_keys_account_key_drop(&b->nr, 0, k);
+
+                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+                       memmove_u64s_down(k, bkey_next(k),
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
+                       set_btree_bset_end(b, b->set);
+                       continue;
+               }
+
+               k = bkey_next(k);
+       }
+
+       bch2_bset_build_aux_tree(b, b->set, false);
+
+       set_needs_whiteout(btree_bset_first(b));
+
+       btree_node_reset_sib_u64s(b);
+out:
+       mempool_free(iter, &c->fill_iter);
+       return retry_read;
+err:
+fsck_err:
+       if (ret == BTREE_RETRY_READ) {
+               retry_read = 1;
+       } else {
+               bch2_inconsistent_error(c);
+               set_btree_node_read_error(b);
+       }
+       goto out;
+}
+
+static void btree_node_read_work(struct work_struct *work)
+{
+       struct btree_read_bio *rb =
+               container_of(work, struct btree_read_bio, work);
+       struct bch_fs *c        = rb->c;
+       struct bch_dev *ca      = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+       struct btree *b         = rb->bio.bi_private;
+       struct bio *bio         = &rb->bio;
+       struct bch_devs_mask avoid;
+       bool can_retry;
+
+       memset(&avoid, 0, sizeof(avoid));
+
+       goto start;
+       while (1) {
+               bch_info(c, "retrying read");
+               ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+               rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
+               bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
+               bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
+               bio->bi_iter.bi_size    = btree_bytes(c);
+
+               if (rb->have_ioref) {
+                       bio_set_dev(bio, ca->disk_sb.bdev);
+                       submit_bio_wait(bio);
+               } else {
+                       bio->bi_status = BLK_STS_REMOVED;
+               }
+start:
+               bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+               if (rb->have_ioref)
+                       percpu_ref_put(&ca->io_ref);
+               rb->have_ioref = false;
+
+               __set_bit(rb->pick.ptr.dev, avoid.d);
+               can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
+
+               if (!bio->bi_status &&
+                   !bch2_btree_node_read_done(c, b, can_retry))
+                       break;
+
+               if (!can_retry) {
+                       set_btree_node_read_error(b);
+                       break;
+               }
+       }
+
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
+       bio_put(&rb->bio);
+       clear_btree_node_read_in_flight(b);
+       wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+       struct btree_read_bio *rb =
+               container_of(bio, struct btree_read_bio, bio);
+       struct bch_fs *c        = rb->c;
+
+       if (rb->have_ioref) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+               bch2_latency_acct(ca, rb->start_time, READ);
+       }
+
+       queue_work(system_unbound_wq, &rb->work);
+}
+
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+                         bool sync)
+{
+       struct extent_pick_ptr pick;
+       struct btree_read_bio *rb;
+       struct bch_dev *ca;
+       struct bio *bio;
+       int ret;
+
+       trace_btree_read(c, b);
+
+       ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
+       if (bch2_fs_fatal_err_on(ret <= 0, c,
+                       "btree node read error: no device to read from")) {
+               set_btree_node_read_error(b);
+               return;
+       }
+
+       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+       bio = bio_alloc_bioset(NULL,
+                              buf_pages(b->data, btree_bytes(c)),
+                              REQ_OP_READ|REQ_SYNC|REQ_META,
+                              GFP_NOIO,
+                              &c->btree_bio);
+       rb = container_of(bio, struct btree_read_bio, bio);
+       rb->c                   = c;
+       rb->start_time          = local_clock();
+       rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
+       rb->pick                = pick;
+       INIT_WORK(&rb->work, btree_node_read_work);
+       bio->bi_iter.bi_sector  = pick.ptr.offset;
+       bio->bi_iter.bi_size    = btree_bytes(c);
+       bio->bi_end_io          = btree_node_read_endio;
+       bio->bi_private         = b;
+       bch2_bio_map(bio, b->data);
+
+       set_btree_node_read_in_flight(b);
+
+       if (rb->have_ioref) {
+               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+                            bio_sectors(bio));
+               bio_set_dev(bio, ca->disk_sb.bdev);
+
+               if (sync) {
+                       submit_bio_wait(bio);
+
+                       bio->bi_private = b;
+                       btree_node_read_work(&rb->work);
+               } else {
+                       submit_bio(bio);
+               }
+       } else {
+               bio->bi_status = BLK_STS_REMOVED;
+
+               if (sync)
+                       btree_node_read_work(&rb->work);
+               else
+                       queue_work(system_unbound_wq, &rb->work);
+       }
+}
+
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+                       const struct bkey_i *k, unsigned level)
+{
+       struct closure cl;
+       struct btree *b;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       do {
+               ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+               closure_sync(&cl);
+       } while (ret);
+
+       b = bch2_btree_node_mem_alloc(c);
+       bch2_btree_cache_cannibalize_unlock(c);
+
+       BUG_ON(IS_ERR(b));
+
+       bkey_copy(&b->key, k);
+       BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+
+       bch2_btree_node_read(c, b, true);
+
+       if (btree_node_read_error(b)) {
+               bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+               mutex_lock(&c->btree_cache.lock);
+               list_move(&b->list, &c->btree_cache.freeable);
+               mutex_unlock(&c->btree_cache.lock);
+
+               ret = -EIO;
+               goto err;
+       }
+
+       bch2_btree_set_root_for_read(c, b);
+err:
+       six_unlock_write(&b->lock);
+       six_unlock_intent(&b->lock);
+
+       return ret;
+}
+
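+/*
+ * Called when a btree node write completes: clear the low bit of
+ * will_make_reachable (dropping the btree_update's closure ref if it was
+ * set), drop the write's journal pin and wake up anyone waiting on it.
+ */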
+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+                             struct btree_write *w)
+{
+       unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+
+       do {
+               old = new = v;
+               if (!(old & 1))
+                       break;
+
+               new &= ~1UL;
+       } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+
+       if (old & 1)
+               closure_put(&((struct btree_update *) new)->cl);
+
+       bch2_journal_pin_drop(&c->journal, &w->journal);
+       closure_wake_up(&w->wait);
+}
+
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+       struct btree_write *w = btree_prev_write(b);
+
+       bch2_btree_complete_write(c, b, w);
+       btree_node_io_unlock(b);
+}
+
+static void bch2_btree_node_write_error(struct bch_fs *c,
+                                       struct btree_write_bio *wbio)
+{
+       struct btree *b         = wbio->wbio.bio.bi_private;
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+       struct bkey_i_extent *new_key;
+       struct bkey_s_extent e;
+       struct bch_extent_ptr *ptr;
+       struct btree_iter iter;
+       int ret;
+
+       __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+                              BTREE_MAX_DEPTH,
+                              b->level, BTREE_ITER_NODES);
+retry:
+       ret = bch2_btree_iter_traverse(&iter);
+       if (ret)
+               goto err;
+
+       /* has node been freed? */
+       if (iter.l[b->level].b != b) {
+               /* node has been freed: */
+               BUG_ON(!btree_node_dying(b));
+               goto out;
+       }
+
+       BUG_ON(!btree_node_hashed(b));
+
+       bkey_copy(&tmp.k, &b->key);
+
+       new_key = bkey_i_to_extent(&tmp.k);
+       e = extent_i_to_s(new_key);
+       extent_for_each_ptr_backwards(e, ptr)
+               if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev))
+                       bch2_extent_drop_ptr(e, ptr);
+
+       if (!bch2_extent_nr_ptrs(e.c))
+               goto err;
+
+       ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+       if (ret == -EINTR)
+               goto retry;
+       if (ret)
+               goto err;
+out:
+       bch2_btree_iter_unlock(&iter);
+       bio_put(&wbio->wbio.bio);
+       btree_node_write_done(c, b);
+       return;
+err:
+       set_btree_node_noevict(b);
+       bch2_fs_fatal_error(c, "fatal error writing btree node");
+       goto out;
+}
+
+void bch2_btree_write_error_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs,
+                                       btree_write_error_work);
+       struct bio *bio;
+
+       while (1) {
+               spin_lock_irq(&c->btree_write_error_lock);
+               bio = bio_list_pop(&c->btree_write_error_list);
+               spin_unlock_irq(&c->btree_write_error_lock);
+
+               if (!bio)
+                       break;
+
+               bch2_btree_node_write_error(c,
+                       container_of(bio, struct btree_write_bio, wbio.bio));
+       }
+}
+
+static void btree_node_write_work(struct work_struct *work)
+{
+       struct btree_write_bio *wbio =
+               container_of(work, struct btree_write_bio, work);
+       struct bch_fs *c        = wbio->wbio.c;
+       struct btree *b         = wbio->wbio.bio.bi_private;
+
+       btree_bounce_free(c,
+               wbio->wbio.order,
+               wbio->wbio.used_mempool,
+               wbio->data);
+
+       if (wbio->wbio.failed.nr) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&c->btree_write_error_lock, flags);
+               bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
+               spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
+               queue_work(c->wq, &c->btree_write_error_work);
+               return;
+       }
+
+       bio_put(&wbio->wbio.bio);
+       btree_node_write_done(c, b);
+}
+
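+/*
+ * Write completion: account device latency, record any failed device so the
+ * error path can drop its pointer later, then either complete the parent bio
+ * (for split bios) or punt the rest of the completion work to process
+ * context.
+ */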
+static void btree_node_write_endio(struct bio *bio)
+{
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_write_bio *orig      = parent ?: wbio;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
+       unsigned long flags;
+
+       if (wbio->have_ioref)
+               bch2_latency_acct(ca, wbio->submit_time, WRITE);
+
+       if (bio->bi_status == BLK_STS_REMOVED ||
+           bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+           bch2_meta_write_fault("btree")) {
+               spin_lock_irqsave(&c->btree_write_error_lock, flags);
+               bch2_dev_list_add_dev(&orig->failed, wbio->dev);
+               spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+       }
+
+       if (wbio->have_ioref)
+               percpu_ref_put(&ca->io_ref);
+
+       if (parent) {
+               bio_put(bio);
+               bio_endio(&parent->bio);
+       } else {
+               struct btree_write_bio *wb =
+                       container_of(orig, struct btree_write_bio, wbio);
+
+               INIT_WORK(&wb->work, btree_node_write_work);
+               queue_work(system_unbound_wq, &wb->work);
+       }
+}
+
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+                                  struct bset *i, unsigned sectors)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned whiteout_u64s = 0;
+       int ret;
+
+       extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
+               break;
+
+       ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
+       if (ret)
+               bch2_inconsistent_error(c);
+
+       return ret;
+}
+
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+                           enum six_lock_type lock_type_held)
+{
+       struct btree_write_bio *wbio;
+       struct bset_tree *t;
+       struct bset *i;
+       struct btree_node *bn = NULL;
+       struct btree_node_entry *bne = NULL;
+       BKEY_PADDED(key) k;
+       struct bkey_s_extent e;
+       struct bch_extent_ptr *ptr;
+       struct sort_iter sort_iter;
+       struct nonce nonce;
+       unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
+       u64 seq = 0;
+       bool used_mempool;
+       unsigned long old, new;
+       void *data;
+
+       if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+               return;
+
+       /*
+        * We may only have a read lock on the btree node - the dirty bit is our
+        * "lock" against racing with other threads that may be trying to start
+        * a write, we do a write iff we clear the dirty bit. Since setting the
+        * dirty bit requires a write lock, we can't race with other threads
+        * redirtying it:
+        */
+       do {
+               old = new = READ_ONCE(b->flags);
+
+               if (!(old & (1 << BTREE_NODE_dirty)))
+                       return;
+
+               if (b->written &&
+                   !btree_node_may_write(b))
+                       return;
+
+               if (old & (1 << BTREE_NODE_write_in_flight)) {
+                       btree_node_wait_on_io(b);
+                       continue;
+               }
+
+               new &= ~(1 << BTREE_NODE_dirty);
+               new &= ~(1 << BTREE_NODE_need_write);
+               new |=  (1 << BTREE_NODE_write_in_flight);
+               new |=  (1 << BTREE_NODE_just_written);
+               new ^=  (1 << BTREE_NODE_write_idx);
+       } while (cmpxchg_acquire(&b->flags, old, new) != old);
+
+       BUG_ON(btree_node_fake(b));
+       BUG_ON(!list_empty(&b->write_blocked));
+       BUG_ON((b->will_make_reachable != 0) != !b->written);
+
+       BUG_ON(b->written >= c->opts.btree_node_size);
+       BUG_ON(b->written & (c->opts.block_size - 1));
+       BUG_ON(bset_written(b, btree_bset_last(b)));
+       BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
+       BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+
+       /*
+        * We can't block on six_lock_write() here; another thread might be
+        * trying to get a journal reservation with read locks held, and getting
+        * a journal reservation might be blocked on flushing the journal and
+        * doing btree writes:
+        */
+       if (lock_type_held == SIX_LOCK_intent &&
+           six_trylock_write(&b->lock)) {
+               __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
+               six_unlock_write(&b->lock);
+       } else {
+               __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
+       }
+
+       BUG_ON(b->uncompacted_whiteout_u64s);
+
+       sort_iter_init(&sort_iter, b);
+
+       bytes = !b->written
+               ? sizeof(struct btree_node)
+               : sizeof(struct btree_node_entry);
+
+       bytes += b->whiteout_u64s * sizeof(u64);
+
+       for_each_bset(b, t) {
+               i = bset(b, t);
+
+               if (bset_written(b, i))
+                       continue;
+
+               bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+               sort_iter_add(&sort_iter,
+                             btree_bkey_first(b, t),
+                             btree_bkey_last(b, t));
+               seq = max(seq, le64_to_cpu(i->journal_seq));
+       }
+
+       order = get_order(bytes);
+       data = btree_bounce_alloc(c, order, &used_mempool);
+
+       if (!b->written) {
+               bn = data;
+               *bn = *b->data;
+               i = &bn->keys;
+       } else {
+               bne = data;
+               bne->keys = b->data->keys;
+               i = &bne->keys;
+       }
+
+       i->journal_seq  = cpu_to_le64(seq);
+       i->u64s         = 0;
+
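+       /*
+        * For non-extent btrees the unwritten whiteouts are just sorted in
+        * with the rest of the keys; for extents they're kept separate at the
+        * start of the bset and flagged via BSET_SEPARATE_WHITEOUTS, so the
+        * read path (validate_bset()) knows where they end:
+        */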
+       if (!btree_node_is_extents(b)) {
+               sort_iter_add(&sort_iter,
+                             unwritten_whiteouts_start(c, b),
+                             unwritten_whiteouts_end(c, b));
+               SET_BSET_SEPARATE_WHITEOUTS(i, false);
+       } else {
+               memcpy_u64s(i->start,
+                           unwritten_whiteouts_start(c, b),
+                           b->whiteout_u64s);
+               i->u64s = cpu_to_le16(b->whiteout_u64s);
+               SET_BSET_SEPARATE_WHITEOUTS(i, true);
+       }
+
+       b->whiteout_u64s = 0;
+
+       u64s = btree_node_is_extents(b)
+               ? sort_extents(vstruct_last(i), &sort_iter, false)
+               : sort_keys(i->start, &sort_iter, false);
+       le16_add_cpu(&i->u64s, u64s);
+
+       clear_needs_whiteout(i);
+
+       /* do we have data to write? */
+       if (b->written && !i->u64s)
+               goto nowrite;
+
+       bytes_to_write = vstruct_end(i) - data;
+       sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+       memset(data + bytes_to_write, 0,
+              (sectors_to_write << 9) - bytes_to_write);
+
+       BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
+       BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+       BUG_ON(i->seq != b->data->keys.seq);
+
+       i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+       SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
+
+       /* if we're going to be encrypting, check metadata validity first: */
+       if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+           validate_bset_for_write(c, b, i, sectors_to_write))
+               goto err;
+
+       bset_encrypt(c, i, b->written << 9);
+
+       nonce = btree_nonce(i, b->written << 9);
+
+       if (bn)
+               bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+       else
+               bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+       /* if we're not encrypting, check metadata after checksumming: */
+       if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+           validate_bset_for_write(c, b, i, sectors_to_write))
+               goto err;
+
+       /*
+        * We handle btree write errors by immediately halting the journal -
+        * after we've done that, we can't issue any subsequent btree writes
+        * because they might have pointers to new nodes that failed to write.
+        *
+        * Furthermore, there's no point in doing any more btree writes because
+        * with the journal stopped, we're never going to update the journal to
+        * reflect that those writes were done and the data flushed from the
+        * journal:
+        *
+        * Make sure to update b->written so bch2_btree_init_next() doesn't
+        * break:
+        */
+       if (bch2_journal_error(&c->journal) ||
+           c->opts.nochanges)
+               goto err;
+
+       trace_btree_write(b, bytes_to_write, sectors_to_write);
+
+       wbio = container_of(bio_alloc_bioset(NULL, 1 << order,
+                               REQ_OP_WRITE|REQ_META|REQ_FUA,
+                               GFP_NOIO,
+                               &c->btree_bio),
+                           struct btree_write_bio, wbio.bio);
+       wbio_init(&wbio->wbio.bio);
+       wbio->data                      = data;
+       wbio->wbio.order                = order;
+       wbio->wbio.used_mempool         = used_mempool;
+       wbio->wbio.bio.bi_iter.bi_size  = sectors_to_write << 9;
+       wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
+       wbio->wbio.bio.bi_private       = b;
+
+       bch2_bio_map(&wbio->wbio.bio, data);
+
+       /*
+        * If we're appending to a leaf node, we don't technically need FUA -
+        * this write just needs to be persisted before the next journal write,
+        * which will be marked FLUSH|FUA.
+        *
+        * Similarly if we're writing a new btree root - the pointer is going to
+        * be in the next journal entry.
+        *
+        * But if we're writing a new btree node (that isn't a root) or
+        * appending to a non leaf btree node, we need either FUA or a flush
+        * when we write the parent with the new pointer. FUA is cheaper than a
+        * flush, and writes appending to leaf nodes aren't blocking anything so
+        * just make all btree node writes FUA to keep things sane.
+        */
+
+       bkey_copy(&k.key, &b->key);
+       e = bkey_i_to_s_extent(&k.key);
+
+       extent_for_each_ptr(e, ptr)
+               ptr->offset += b->written;
+
+       b->written += sectors_to_write;
+
+       bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
+       return;
+err:
+       set_btree_node_noevict(b);
+       b->written += sectors_to_write;
+nowrite:
+       btree_bounce_free(c, order, used_mempool, data);
+       btree_node_write_done(c, b);
+}
+
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
+{
+       bool invalidated_iter = false;
+       struct btree_node_entry *bne;
+       struct bset_tree *t;
+
+       if (!btree_node_just_written(b))
+               return false;
+
+       BUG_ON(b->whiteout_u64s);
+       BUG_ON(b->uncompacted_whiteout_u64s);
+
+       clear_btree_node_just_written(b);
+
+       /*
+        * Note: immediately after write, bset_unwritten()/bset_written() don't
+        * work - the amount of data we had to write after compaction might have
+        * been smaller than the offset of the last bset.
+        *
+        * However, we know that all bsets have been written here, as long as
+        * we're still holding the write lock:
+        */
+
+       /*
+        * XXX: decide if we really want to unconditionally sort down to a
+        * single bset:
+        */
+       if (b->nsets > 1) {
+               btree_node_sort(c, b, NULL, 0, b->nsets, true);
+               invalidated_iter = true;
+       } else {
+               invalidated_iter = bch2_drop_whiteouts(b);
+       }
+
+       for_each_bset(b, t)
+               set_needs_whiteout(bset(b, t));
+
+       bch2_btree_verify(c, b);
+
+       /*
+        * If later we don't unconditionally sort down to a single bset, we have
+        * to ensure this is still true:
+        */
+       BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
+
+       bne = want_new_bset(c, b);
+       if (bne)
+               bch2_bset_init_next(c, b, bne);
+
+       bch2_btree_build_aux_trees(b);
+
+       return invalidated_iter;
+}
+
+/*
+ * Use this one if the node is intent locked:
+ */
+void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+                         enum six_lock_type lock_type_held)
+{
+       BUG_ON(lock_type_held == SIX_LOCK_write);
+
+       if (lock_type_held == SIX_LOCK_intent ||
+           six_lock_tryupgrade(&b->lock)) {
+               __bch2_btree_node_write(c, b, SIX_LOCK_intent);
+
+               /* don't cycle lock unnecessarily: */
+               if (btree_node_just_written(b) &&
+                   six_trylock_write(&b->lock)) {
+                       bch2_btree_post_write_cleanup(c, b);
+                       six_unlock_write(&b->lock);
+               }
+
+               if (lock_type_held == SIX_LOCK_read)
+                       six_lock_downgrade(&b->lock);
+       } else {
+               __bch2_btree_node_write(c, b, SIX_LOCK_read);
+       }
+}
+
+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+{
+       struct bucket_table *tbl;
+       struct rhash_head *pos;
+       struct btree *b;
+       unsigned i;
+restart:
+       rcu_read_lock();
+       for_each_cached_btree(b, c, tbl, i, pos)
+               if (test_bit(flag, &b->flags)) {
+                       rcu_read_unlock();
+                       wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+                       goto restart;
+               }
+       rcu_read_unlock();
+}
+
+void bch2_btree_flush_all_reads(struct bch_fs *c)
+{
+       __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+}
+
+void bch2_btree_flush_all_writes(struct bch_fs *c)
+{
+       __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_verify_flushed(struct bch_fs *c)
+{
+       struct bucket_table *tbl;
+       struct rhash_head *pos;
+       struct btree *b;
+       unsigned i;
+
+       rcu_read_lock();
+       for_each_cached_btree(b, c, tbl, i, pos) {
+               unsigned long flags = READ_ONCE(b->flags);
+
+               BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
+                      (flags & (1 << BTREE_NODE_write_in_flight)));
+       }
+       rcu_read_unlock();
+}
+
+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
+{
+       char *out = buf, *end = buf + PAGE_SIZE;
+       struct bucket_table *tbl;
+       struct rhash_head *pos;
+       struct btree *b;
+       unsigned i;
+
+       rcu_read_lock();
+       for_each_cached_btree(b, c, tbl, i, pos) {
+               unsigned long flags = READ_ONCE(b->flags);
+               unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
+
+               if (/* !(flags & (1 << BTREE_NODE_dirty)) && */
+                   !b->writes[0].wait.list.first &&
+                   !b->writes[1].wait.list.first &&
+                   !(b->will_make_reachable & 1))
+                       continue;
+
+               out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
+                                b,
+                                (flags & (1 << BTREE_NODE_dirty)) != 0,
+                                b->level,
+                                b->written,
+                                !list_empty_careful(&b->write_blocked),
+                                b->will_make_reachable != 0,
+                                b->will_make_reachable & 1,
+                                b->writes[ idx].wait.list.first != NULL,
+                                b->writes[!idx].wait.list.first != NULL);
+       }
+       rcu_read_unlock();
+
+       return out - buf;
+}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
new file mode 100644 (file)
index 0000000..0688ce4
--- /dev/null
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_IO_H
+#define _BCACHEFS_BTREE_IO_H
+
+#include "bset.h"
+#include "extents.h"
+#include "io_types.h"
+
+struct bch_fs;
+struct btree_write;
+struct btree;
+struct btree_iter;
+
+struct btree_read_bio {
+       struct bch_fs           *c;
+       u64                     start_time;
+       unsigned                have_ioref:1;
+       struct extent_pick_ptr  pick;
+       struct work_struct      work;
+       struct bio              bio;
+};
+
+struct btree_write_bio {
+       void                    *data;
+       struct work_struct      work;
+       struct bch_write_bio    wbio;
+};
+
+static inline void btree_node_io_unlock(struct btree *b)
+{
+       EBUG_ON(!btree_node_write_in_flight(b));
+       clear_btree_node_write_in_flight(b);
+       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static inline void btree_node_io_lock(struct btree *b)
+{
+       wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+                           TASK_UNINTERRUPTIBLE);
+}
+
+static inline void btree_node_wait_on_io(struct btree *b)
+{
+       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+}
+
+static inline bool btree_node_may_write(struct btree *b)
+{
+       return list_empty_careful(&b->write_blocked) &&
+               !b->will_make_reachable;
+}
+
+enum compact_mode {
+       COMPACT_LAZY,
+       COMPACT_WRITTEN,
+       COMPACT_WRITTEN_NO_WRITE_LOCK,
+};
+
+bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
+
+static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
+{
+       unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
+       unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+
+       return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
+}
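+/*
+ * i.e. compact once more than 128 u64s of the bset are dead and more than a
+ * third of it is dead. Illustrative numbers: a bset of 600 u64s with 390
+ * still live has 210 dead u64s - 210 > 128 and 210 * 3 = 630 > 600 - so it
+ * qualifies; with 500 live (only 100 dead) it doesn't.
+ */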
+
+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
+{
+       struct bset_tree *t;
+
+       for_each_bset(b, t)
+               if (should_compact_bset_lazy(b, t))
+                       return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
+
+       return false;
+}
+
+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
+
+void bch2_btree_build_aux_trees(struct btree *);
+void bch2_btree_init_next(struct bch_fs *, struct btree *,
+                        struct btree_iter *);
+
+int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+int bch2_btree_root_read(struct bch_fs *, enum btree_id,
+                        const struct bkey_i *, unsigned);
+
+void bch2_btree_complete_write(struct bch_fs *, struct btree *,
+                             struct btree_write *);
+void bch2_btree_write_error_work(struct work_struct *);
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *,
+                           enum six_lock_type);
+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+
+void bch2_btree_node_write(struct bch_fs *, struct btree *,
+                         enum six_lock_type);
+
+/*
+ * btree_node_dirty() can be cleared while holding only a read lock, so for
+ * bch2_btree_node_write_cond() we want to set need_write iff the node is
+ * still dirty:
+ */
+static inline void set_btree_node_need_write_if_dirty(struct btree *b)
+{
+       unsigned long old, new, v = READ_ONCE(b->flags);
+
+       do {
+               old = new = v;
+
+               if (!(old & (1 << BTREE_NODE_dirty)))
+                       return;
+
+               new |= (1 << BTREE_NODE_need_write);
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+}
+
+#define bch2_btree_node_write_cond(_c, _b, cond)                       \
+do {                                                                   \
+       while ((_b)->written && btree_node_dirty(_b) && (cond)) {       \
+               if (!btree_node_may_write(_b)) {                        \
+                       set_btree_node_need_write_if_dirty(_b);         \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               if (!btree_node_write_in_flight(_b)) {                  \
+                       bch2_btree_node_write(_c, _b, SIX_LOCK_read);   \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               six_unlock_read(&(_b)->lock);                           \
+               btree_node_wait_on_io(_b);                              \
+       btree_node_lock_type(_c, _b, SIX_LOCK_read);            \
+       }                                                               \
+} while (0)
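+/*
+ * Usage sketch (hypothetical caller and condition, not taken from this
+ * patch): with the node read locked, write it out only while it's still
+ * dirty and the caller's condition holds:
+ *
+ *	btree_node_lock_type(c, b, SIX_LOCK_read);
+ *	bch2_btree_node_write_cond(c, b, allowed_to_write(c, b));
+ *	six_unlock_read(&b->lock);
+ *
+ * Note that the macro may itself drop and retake the read lock while waiting
+ * for a write already in flight.
+ */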
+
+void bch2_btree_flush_all_reads(struct bch_fs *);
+void bch2_btree_flush_all_writes(struct bch_fs *);
+void bch2_btree_verify_flushed(struct bch_fs *);
+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
+
+/* Sorting */
+
+struct btree_node_iter_large {
+       u8              is_extents;
+       u16             used;
+
+       struct btree_node_iter_set data[MAX_BSETS];
+};
+
+static inline void
+__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter,
+                                 bool is_extents)
+{
+       iter->used = 0;
+       iter->is_extents = is_extents;
+}
+
+void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
+                                       struct btree *);
+
+void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
+                                    struct btree *,
+                                    const struct bkey_packed *,
+                                    const struct bkey_packed *);
+
+static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
+{
+       return !iter->used;
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
+                                   struct btree *b)
+{
+       return bch2_btree_node_iter_large_end(iter)
+               ? NULL
+               : __btree_node_offset_to_key(b, iter->data->k);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
+                                   struct btree *b)
+{
+       struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
+
+       if (ret)
+               bch2_btree_node_iter_large_advance(iter, b);
+
+       return ret;
+}
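+/*
+ * Usage sketch (illustrative only - btree_bkey_first() is assumed to come
+ * from bset.h, and process() is a stand-in for whatever consumes the keys):
+ * walk all keys of a node in sorted order across its bsets:
+ *
+ *	struct btree_node_iter_large iter;
+ *	struct bset_tree *t;
+ *	struct bkey_packed *k;
+ *
+ *	__bch2_btree_node_iter_large_init(&iter, btree_node_is_extents(b));
+ *	for_each_bset(b, t)
+ *		bch2_btree_node_iter_large_push(&iter, b,
+ *				btree_bkey_first(b, t),
+ *				btree_bkey_last(b, t));
+ *
+ *	while ((k = bch2_btree_node_iter_large_next_all(&iter, b)))
+ *		process(k);
+ */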
+
+#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
new file mode 100644 (file)
index 0000000..2b4ba41
--- /dev/null
@@ -0,0 +1,1844 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+#include "trace.h"
+
+#include <linux/prefetch.h>
+
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
+                                                   struct btree_iter_level *,
+                                                   struct bkey *);
+
+#define BTREE_ITER_NOT_END     ((struct btree *) 1)
+
+static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+{
+       return l < BTREE_MAX_DEPTH &&
+               iter->l[l].b &&
+               iter->l[l].b != BTREE_ITER_NOT_END;
+}
+
+/* Btree node locking: */
+
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+{
+       struct btree_iter *linked;
+
+       EBUG_ON(iter->l[b->level].b != b);
+       EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
+
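+       /*
+        * A six lock's sequence number is bumped on write lock and again on
+        * write unlock (the low bit means "write locked" - see the assertion
+        * above and in btree_iter_node_set()), so adding 2 to each linked
+        * iterator's saved seq makes it match the node's seq once the write
+        * lock is dropped below.
+        */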
+       for_each_btree_iter_with_node(iter, b, linked)
+               linked->lock_seq[b->level] += 2;
+
+       six_unlock_write(&b->lock);
+}
+
+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+       struct bch_fs *c = iter->c;
+       struct btree_iter *linked;
+       unsigned readers = 0;
+
+       EBUG_ON(btree_node_read_locked(iter, b->level));
+
+       for_each_linked_btree_iter(iter, linked)
+               if (linked->l[b->level].b == b &&
+                   btree_node_read_locked(linked, b->level))
+                       readers++;
+
+       /*
+        * Must drop our read locks before calling six_lock_write() -
+        * six_unlock() won't do wakeups until the reader count
+        * goes to 0, and it's safe because we have the node intent
+        * locked:
+        */
+       atomic64_sub(__SIX_VAL(read_lock, readers),
+                    &b->lock.state.counter);
+       btree_node_lock_type(c, b, SIX_LOCK_write);
+       atomic64_add(__SIX_VAL(read_lock, readers),
+                    &b->lock.state.counter);
+}
+
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_iter *iter,
+                                            struct btree *b, unsigned level,
+                                            enum btree_node_locked_type want)
+{
+       struct btree_iter *linked;
+
+       for_each_linked_btree_iter(iter, linked)
+               if (linked->l[level].b == b &&
+                   btree_node_locked_type(linked, level) >= want) {
+                       six_lock_increment(&b->lock, (enum six_lock_type) want);
+                       return true;
+               }
+
+       return false;
+}
+
+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+{
+       struct btree *b = btree_iter_node(iter, level);
+       int want = __btree_lock_want(iter, level);
+
+       if (!b || b == BTREE_ITER_NOT_END)
+               return false;
+
+       if (race_fault())
+               return false;
+
+       if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) &&
+           !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+             btree_node_lock_increment(iter, b, level, want)))
+               return false;
+
+       mark_btree_node_locked(iter, level, want);
+       return true;
+}
+
+static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+{
+       struct btree *b = iter->l[level].b;
+
+       EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
+
+       if (!is_btree_node(iter, level))
+               return false;
+
+       if (btree_node_intent_locked(iter, level))
+               return true;
+
+       if (race_fault())
+               return false;
+
+       if (btree_node_locked(iter, level)
+           ? six_lock_tryupgrade(&b->lock)
+           : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level]))
+               goto success;
+
+       if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+           btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
+               btree_node_unlock(iter, level);
+               goto success;
+       }
+
+       return false;
+success:
+       mark_btree_node_intent_locked(iter, level);
+       return true;
+}
+
+static inline bool btree_iter_get_locks(struct btree_iter *iter,
+                                       bool upgrade)
+{
+       unsigned l = iter->level;
+       int fail_idx = -1;
+
+       do {
+               if (!btree_iter_node(iter, l))
+                       break;
+
+               if (!(upgrade
+                     ? bch2_btree_node_upgrade(iter, l)
+                     : bch2_btree_node_relock(iter, l))) {
+                       fail_idx = l;
+                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               }
+
+               l++;
+       } while (l < iter->locks_want);
+
+       /*
+        * When we fail to get a lock, we have to ensure that any child nodes
+        * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+        * the node that we failed to relock:
+        */
+       while (fail_idx >= 0) {
+               btree_node_unlock(iter, fail_idx);
+               iter->l[fail_idx].b = BTREE_ITER_NOT_END;
+               --fail_idx;
+       }
+
+       if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
+               iter->uptodate = BTREE_ITER_NEED_PEEK;
+
+       bch2_btree_iter_verify_locks(iter);
+       return iter->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+/* Slowpath: */
+bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
+                          unsigned level,
+                          struct btree_iter *iter,
+                          enum six_lock_type type,
+                          bool may_drop_locks)
+{
+       struct bch_fs *c = iter->c;
+       struct btree_iter *linked;
+       bool ret = true;
+
+       /* Can't have children locked before ancestors: */
+       EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
+
+       /*
+        * Can't hold any read locks while we block taking an intent lock - see
+        * below for reasoning, and we should have already dropped any read
+        * locks in the current iterator
+        */
+       EBUG_ON(type == SIX_LOCK_intent &&
+               iter->nodes_locked != iter->nodes_intent_locked);
+
+       if (btree_node_lock_increment(iter, b, level, (enum btree_node_locked_type) type))
+               return true;
+
+       /*
+        * Must lock btree nodes in key order - this case happens when locking
+        * the prev sibling in btree node merging:
+        */
+       if (iter->nodes_locked &&
+           __ffs(iter->nodes_locked) <= level &&
+           __btree_iter_cmp(iter->btree_id, pos, iter))
+               return false;
+
+       for_each_linked_btree_iter(iter, linked) {
+               if (!linked->nodes_locked)
+                       continue;
+
+               /* We have to lock btree nodes in key order: */
+               if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
+                       ret = false;
+
+               /*
+                * Can't block taking an intent lock if we have _any_ nodes read
+                * locked:
+                *
+                * - Our read lock blocks another thread with an intent lock on
+                *   the same node from getting a write lock, and thus from
+                *   dropping its intent lock
+                *
+                * - And the other thread may have multiple nodes intent locked:
+                *   both the node we want to intent lock, and the node we
+                *   already have read locked - deadlock:
+                */
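+               /*
+                * Concretely (illustrative threads/nodes): thread A holds
+                * node X read locked and wants Y intent locked; thread B
+                * holds both X and Y intent locked and needs a write lock on
+                * X (no readers) before it can drop its intent locks. B waits
+                * on A's read lock, A waits on B's intent lock on Y - neither
+                * makes progress.
+                */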
+               if (type == SIX_LOCK_intent &&
+                   linked->nodes_locked != linked->nodes_intent_locked) {
+                       if (may_drop_locks) {
+                               linked->locks_want = max_t(unsigned,
+                                               linked->locks_want,
+                                               __fls(linked->nodes_locked) + 1);
+                               btree_iter_get_locks(linked, true);
+                       }
+                       ret = false;
+               }
+
+               /*
+                * Interior nodes must be locked before their descendants: if
+                * another iterator has possible descendants locked of the node
+                * we're about to lock, it must have the ancestors locked too:
+                */
+               if (linked->btree_id == iter->btree_id &&
+                   level > __fls(linked->nodes_locked)) {
+                       if (may_drop_locks) {
+                               linked->locks_want = max_t(unsigned,
+                                                          linked->locks_want,
+                                                          iter->locks_want);
+                               btree_iter_get_locks(linked, true);
+                       }
+                       ret = false;
+               }
+       }
+
+       if (ret)
+               __btree_node_lock_type(c, b, type);
+       return ret;
+}
+
+/* Btree iterator locking: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+{
+       unsigned l;
+
+       for (l = 0; btree_iter_node(iter, l); l++) {
+               if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
+                   !btree_node_locked(iter, l))
+                       continue;
+
+               BUG_ON(btree_lock_want(iter, l) !=
+                      btree_node_locked_type(iter, l));
+       }
+}
+#endif
+
+__flatten
+static bool __bch2_btree_iter_relock(struct btree_iter *iter)
+{
+       return iter->uptodate >= BTREE_ITER_NEED_RELOCK
+               ? btree_iter_get_locks(iter, false)
+               : true;
+}
+
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+       struct btree_iter *linked;
+       bool ret = true;
+
+       for_each_btree_iter(iter, linked)
+               ret &= __bch2_btree_iter_relock(linked);
+
+       return ret;
+}
+
+bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+                              unsigned new_locks_want)
+{
+       struct btree_iter *linked;
+
+       EBUG_ON(iter->locks_want >= new_locks_want);
+
+       iter->locks_want = new_locks_want;
+
+       if (btree_iter_get_locks(iter, true))
+               return true;
+
+       /*
+        * Ancestor nodes must be locked before child nodes, so set locks_want
+        * on iterators that might lock ancestors before us to avoid getting
+        * -EINTR later:
+        */
+       for_each_linked_btree_iter(iter, linked)
+               if (linked->btree_id == iter->btree_id &&
+                   btree_iter_cmp(linked, iter) <= 0 &&
+                   linked->locks_want < new_locks_want) {
+                       linked->locks_want = new_locks_want;
+                       btree_iter_get_locks(linked, true);
+               }
+
+       return false;
+}
+
+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
+                                       unsigned new_locks_want)
+{
+       unsigned l = iter->level;
+
+       EBUG_ON(iter->locks_want >= new_locks_want);
+
+       iter->locks_want = new_locks_want;
+
+       do {
+               if (!btree_iter_node(iter, l))
+                       break;
+
+               if (!bch2_btree_node_upgrade(iter, l)) {
+                       iter->locks_want = l;
+                       return false;
+               }
+
+               l++;
+       } while (l < iter->locks_want);
+
+       return true;
+}
+
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+                                unsigned downgrade_to)
+{
+       struct btree_iter *linked;
+       unsigned l;
+
+       /*
+        * We downgrade linked iterators as well because btree_iter_upgrade
+        * might have had to modify locks_want on linked iterators due to lock
+        * ordering:
+        */
+       for_each_btree_iter(iter, linked) {
+               unsigned new_locks_want = downgrade_to ?:
+                       (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
+
+               if (linked->locks_want <= new_locks_want)
+                       continue;
+
+               linked->locks_want = new_locks_want;
+
+               while (linked->nodes_locked &&
+                      (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
+                       if (l > linked->level) {
+                               btree_node_unlock(linked, l);
+                       } else {
+                               if (btree_node_intent_locked(linked, l)) {
+                                       six_lock_downgrade(&linked->l[l].b->lock);
+                                       linked->nodes_intent_locked ^= 1 << l;
+                               }
+                               break;
+                       }
+               }
+
+               bch2_btree_iter_verify_locks(linked);
+       }
+}
+
+int bch2_btree_iter_unlock(struct btree_iter *iter)
+{
+       struct btree_iter *linked;
+
+       for_each_btree_iter(iter, linked)
+               __bch2_btree_iter_unlock(linked);
+
+       return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
+}
+
+/* Btree iterator: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void __bch2_btree_iter_verify(struct btree_iter *iter,
+                                    struct btree *b)
+{
+       struct btree_iter_level *l = &iter->l[b->level];
+       struct btree_node_iter tmp = l->iter;
+       struct bkey_packed *k;
+
+       bch2_btree_node_iter_verify(&l->iter, b);
+
+       /*
+        * For interior nodes, the iterator will have skipped past
+        * deleted keys:
+        */
+       k = b->level
+               ? bch2_btree_node_iter_prev(&tmp, b)
+               : bch2_btree_node_iter_prev_all(&tmp, b);
+       if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
+                               iter->flags & BTREE_ITER_IS_EXTENTS)) {
+               char buf[100];
+               struct bkey uk = bkey_unpack_key(b, k);
+
+               bch2_bkey_to_text(buf, sizeof(buf), &uk);
+               panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
+                     buf, iter->pos.inode, iter->pos.offset);
+       }
+
+       k = bch2_btree_node_iter_peek_all(&l->iter, b);
+       if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
+                               iter->flags & BTREE_ITER_IS_EXTENTS)) {
+               char buf[100];
+               struct bkey uk = bkey_unpack_key(b, k);
+
+               bch2_bkey_to_text(buf, sizeof(buf), &uk);
+               panic("next key should be after iter pos:\n%llu:%llu\n%s\n",
+                     iter->pos.inode, iter->pos.offset, buf);
+       }
+
+       if (iter->uptodate == BTREE_ITER_UPTODATE &&
+           (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) {
+               BUG_ON(!bkey_whiteout(&iter->k) &&
+                      bch2_btree_node_iter_end(&l->iter));
+       }
+}
+
+void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
+{
+       struct btree_iter *linked;
+
+       for_each_btree_iter_with_node(iter, b, linked)
+               __bch2_btree_iter_verify(linked, b);
+}
+
+#endif
+
+static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
+                                     struct btree *b,
+                                     struct btree_node_iter *node_iter,
+                                     struct bset_tree *t,
+                                     struct bkey_packed *where,
+                                     unsigned clobber_u64s,
+                                     unsigned new_u64s)
+{
+       const struct bkey_packed *end = btree_bkey_last(b, t);
+       struct btree_node_iter_set *set;
+       unsigned offset = __btree_node_key_to_offset(b, where);
+       int shift = new_u64s - clobber_u64s;
+       unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
+
+       btree_node_iter_for_each(node_iter, set)
+               if (set->end == old_end)
+                       goto found;
+
+       /* didn't find the bset in the iterator - might have to re-add it: */
+       if (new_u64s &&
+           btree_iter_pos_cmp_packed(b, &iter->pos, where,
+                                     iter->flags & BTREE_ITER_IS_EXTENTS)) {
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+               bch2_btree_node_iter_push(node_iter, b, where, end);
+
+               if (!b->level &&
+                   node_iter == &iter->l[0].iter)
+                       bkey_disassemble(b,
+                               bch2_btree_node_iter_peek_all(node_iter, b),
+                               &iter->k);
+       }
+       return;
+found:
+       set->end = (int) set->end + shift;
+
+       /* Iterator hasn't gotten to the key that changed yet: */
+       if (set->k < offset)
+               return;
+
+       if (new_u64s &&
+           btree_iter_pos_cmp_packed(b, &iter->pos, where,
+                               iter->flags & BTREE_ITER_IS_EXTENTS)) {
+               set->k = offset;
+       } else if (set->k < offset + clobber_u64s) {
+               set->k = offset + new_u64s;
+               if (set->k == set->end)
+                       bch2_btree_node_iter_set_drop(node_iter, set);
+       } else {
+               set->k = (int) set->k + shift;
+               goto iter_current_key_not_modified;
+       }
+
+       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+       bch2_btree_node_iter_sort(node_iter, b);
+       if (!b->level && node_iter == &iter->l[0].iter)
+               __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
+iter_current_key_not_modified:
+
+       /*
+        * Interior nodes are special because iterators for interior nodes don't
+        * obey the usual invariants regarding the iterator position:
+        *
+        * We may have whiteouts that compare greater than the iterator
+        * position, and logically should be in the iterator, but that we
+        * skipped past to find the first live key greater than the iterator
+        * position. This becomes an issue when we insert a new key that is
+        * greater than the current iterator position, but smaller than the
+        * whiteouts we've already skipped past - this happens in the course of
+        * a btree split.
+        *
+        * We have to rewind the iterator back to before those whiteouts here,
+        * else bch2_btree_node_iter_prev() is not going to work and who knows
+        * what else would happen. And we have to do it manually, because here
+        * we've already done the insert and the iterator is currently
+        * inconsistent:
+        *
+        * We've got multiple competing invariants, here - we have to be careful
+        * about rewinding iterators for interior nodes, because they should
+        * always point to the key for the child node the btree iterator points
+        * to.
+        */
+       if (b->level && new_u64s && !bkey_deleted(where) &&
+           btree_iter_pos_cmp_packed(b, &iter->pos, where,
+                               iter->flags & BTREE_ITER_IS_EXTENTS)) {
+               struct bset_tree *t;
+               struct bkey_packed *k;
+
+               for_each_bset(b, t) {
+                       if (bch2_bkey_to_bset(b, where) == t)
+                               continue;
+
+                       k = bch2_bkey_prev_all(b, t,
+                               bch2_btree_node_iter_bset_pos(node_iter, b, t));
+                       if (k &&
+                           __btree_node_iter_cmp(node_iter, b,
+                                                 k, where) > 0) {
+                               struct btree_node_iter_set *set;
+                               unsigned offset =
+                                       __btree_node_key_to_offset(b, bkey_next(k));
+
+                               btree_node_iter_for_each(node_iter, set)
+                                       if (set->k == offset) {
+                                               set->k = __btree_node_key_to_offset(b, k);
+                                               bch2_btree_node_iter_sort(node_iter, b);
+                                               goto next_bset;
+                                       }
+
+                               bch2_btree_node_iter_push(node_iter, b, k,
+                                               btree_bkey_last(b, t));
+                       }
+next_bset:
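+                       /*
+                        * no-op: a label can't be the last thing in a
+                        * compound statement, so it needs a statement to
+                        * attach to
+                        */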
+                       t = t;
+               }
+       }
+}
+
+void bch2_btree_node_iter_fix(struct btree_iter *iter,
+                            struct btree *b,
+                            struct btree_node_iter *node_iter,
+                            struct bset_tree *t,
+                            struct bkey_packed *where,
+                            unsigned clobber_u64s,
+                            unsigned new_u64s)
+{
+       struct btree_iter *linked;
+
+       if (node_iter != &iter->l[b->level].iter)
+               __bch2_btree_node_iter_fix(iter, b, node_iter, t,
+                                         where, clobber_u64s, new_u64s);
+
+       for_each_btree_iter_with_node(iter, b, linked)
+               __bch2_btree_node_iter_fix(linked, b,
+                                         &linked->l[b->level].iter, t,
+                                         where, clobber_u64s, new_u64s);
+
+       /*
+        * Interior node iterators don't obey the usual position invariants
+        * (see __bch2_btree_node_iter_fix()), so only verify leaf iterators:
+        */
+       if (!b->level)
+               bch2_btree_iter_verify(iter, b);
+}
+
+static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
+                                                 struct btree_iter_level *l,
+                                                 struct bkey *u,
+                                                 struct bkey_packed *k)
+{
+       struct bkey_s_c ret;
+
+       if (unlikely(!k)) {
+               /*
+                * signal to bch2_btree_iter_peek_slot() that we're currently at
+                * a hole
+                */
+               u->type = KEY_TYPE_DELETED;
+               return bkey_s_c_null;
+       }
+
+       ret = bkey_disassemble(l->b, k, u);
+
+       if (debug_check_bkeys(iter->c))
+               bch2_bkey_debugcheck(iter->c, l->b, ret);
+
+       return ret;
+}
+
+/* peek_all() doesn't skip deleted keys */
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
+                                                   struct btree_iter_level *l,
+                                                   struct bkey *u)
+{
+       return __btree_iter_unpack(iter, l, u,
+                       bch2_btree_node_iter_peek_all(&l->iter, l->b));
+}
+
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
+                                               struct btree_iter_level *l)
+{
+       return __btree_iter_unpack(iter, l, &iter->k,
+                       bch2_btree_node_iter_peek(&l->iter, l->b));
+}
+
+static inline void __btree_iter_advance(struct btree_iter_level *l)
+{
+       bch2_btree_node_iter_advance(&l->iter, l->b);
+}
+
+/*
+ * Verify that iterator for parent node points to child node:
+ */
+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+{
+       struct btree_iter_level *l;
+       unsigned plevel;
+       bool parent_locked;
+       struct bkey_packed *k;
+
+       if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+               return;
+
+       plevel = b->level + 1;
+       if (!btree_iter_node(iter, plevel))
+               return;
+
+       parent_locked = btree_node_locked(iter, plevel);
+
+       if (!bch2_btree_node_relock(iter, plevel))
+               return;
+
+       l = &iter->l[plevel];
+       k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+       if (!k ||
+           bkey_deleted(k) ||
+           bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
+               char buf[100];
+               struct bkey uk = bkey_unpack_key(b, k);
+
+               bch2_bkey_to_text(buf, sizeof(buf), &uk);
+               panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
+                     buf, b->key.k.p.inode, b->key.k.p.offset);
+       }
+
+       if (!parent_locked)
+               btree_node_unlock(iter, b->level + 1);
+}
+
+/* Returns true if @k is after iterator position @pos */
+static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
+                                     const struct bkey *k)
+{
+       int cmp = bkey_cmp(k->p, iter->pos);
+
+       return cmp > 0 ||
+               (cmp == 0 &&
+                !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
+}
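+/*
+ * E.g. when k->p equals iter->pos: for a non-extents iterator a live
+ * (non-deleted) key at exactly pos still counts as "after", so peek can
+ * return it; for extents iterators, and for deleted keys, only keys strictly
+ * greater than pos count.
+ */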
+
+static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+                                            struct btree *b)
+{
+       return !btree_iter_pos_cmp(iter, &b->key.k) &&
+               bkey_cmp(b->key.k.p, POS_MAX);
+}
+
+static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+                                         struct btree *b)
+{
+       return iter->btree_id == b->btree_id &&
+               bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
+               !btree_iter_pos_after_node(iter, b);
+}
+
+static inline void __btree_iter_init(struct btree_iter *iter,
+                                    struct btree *b)
+{
+       struct btree_iter_level *l = &iter->l[b->level];
+
+       bch2_btree_node_iter_init(&l->iter, b, iter->pos,
+                                 iter->flags & BTREE_ITER_IS_EXTENTS,
+                                 btree_node_is_extents(b));
+
+       /* Skip to first non whiteout: */
+       if (b->level)
+               bch2_btree_node_iter_peek(&l->iter, b);
+
+       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+}
+
+static inline void btree_iter_node_set(struct btree_iter *iter,
+                                      struct btree *b)
+{
+       btree_iter_verify_new_node(iter, b);
+
+       EBUG_ON(!btree_iter_pos_in_node(iter, b));
+       EBUG_ON(b->lock.state.seq & 1);
+
+       iter->lock_seq[b->level] = b->lock.state.seq;
+       iter->l[b->level].b = b;
+       __btree_iter_init(iter, b);
+}
+
+/*
+ * A btree node is being replaced - update the iterator to point to the new
+ * node:
+ */
+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+{
+       enum btree_node_locked_type t;
+       struct btree_iter *linked;
+
+       for_each_btree_iter(iter, linked)
+               if (btree_iter_pos_in_node(linked, b)) {
+                       /*
+                        * bch2_btree_iter_node_drop() has already been called -
+                        * the old node we're replacing has already been
+                        * unlocked and the pointer invalidated
+                        */
+                       BUG_ON(btree_node_locked(linked, b->level));
+
+                       t = btree_lock_want(linked, b->level);
+                       if (t != BTREE_NODE_UNLOCKED) {
+                               six_lock_increment(&b->lock, (enum six_lock_type) t);
+                               mark_btree_node_locked(linked, b->level, (enum six_lock_type) t);
+                       }
+
+                       btree_iter_node_set(linked, b);
+               }
+
+       six_unlock_intent(&b->lock);
+}
+
+void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
+{
+       struct btree_iter *linked;
+       unsigned level = b->level;
+
+       for_each_btree_iter(iter, linked)
+               if (linked->l[level].b == b) {
+                       btree_node_unlock(linked, level);
+                       linked->l[level].b = BTREE_ITER_NOT_END;
+               }
+}
+
+/*
+ * A btree node has been modified in such a way as to invalidate iterators - fix
+ * them:
+ */
+void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+{
+       struct btree_iter *linked;
+
+       for_each_btree_iter_with_node(iter, b, linked)
+               __btree_iter_init(linked, b);
+}
+
+static inline int btree_iter_lock_root(struct btree_iter *iter,
+                                      unsigned depth_want)
+{
+       struct bch_fs *c = iter->c;
+       struct btree *b;
+       enum six_lock_type lock_type;
+       unsigned i;
+
+       EBUG_ON(iter->nodes_locked);
+
+       while (1) {
+               b = READ_ONCE(c->btree_roots[iter->btree_id].b);
+               iter->level = READ_ONCE(b->level);
+
+               if (unlikely(iter->level < depth_want)) {
+                       /*
+                        * the root is at a lower depth than the depth we
+                        * want: either we've gotten to the end of the btree,
+                        * or we're only walking nodes at or above some depth
+                        * and no nodes exist at or above that depth
+                        */
+                       iter->level = depth_want;
+                       iter->l[iter->level].b = NULL;
+                       return 0;
+               }
+
+               lock_type = __btree_lock_want(iter, iter->level);
+               if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
+                                             iter, lock_type, true)))
+                       return -EINTR;
+
+               if (likely(b == c->btree_roots[iter->btree_id].b &&
+                          b->level == iter->level &&
+                          !race_fault())) {
+                       for (i = 0; i < iter->level; i++)
+                               iter->l[i].b = BTREE_ITER_NOT_END;
+                       iter->l[iter->level].b = b;
+
+                       mark_btree_node_locked(iter, iter->level, lock_type);
+                       btree_iter_node_set(iter, b);
+                       return 0;
+               }
+
+               six_unlock_type(&b->lock, lock_type);
+       }
+}
+
+noinline
+static void btree_iter_prefetch(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[iter->level];
+       struct btree_node_iter node_iter = l->iter;
+       struct bkey_packed *k;
+       BKEY_PADDED(k) tmp;
+       unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags)
+               ? (iter->level > 1 ? 0 :  2)
+               : (iter->level > 1 ? 1 : 16);
+       bool was_locked = btree_node_locked(iter, iter->level);
+
+       while (nr) {
+               if (!bch2_btree_node_relock(iter, iter->level))
+                       return;
+
+               bch2_btree_node_iter_advance(&node_iter, l->b);
+               k = bch2_btree_node_iter_peek(&node_iter, l->b);
+               if (!k)
+                       break;
+
+               bch2_bkey_unpack(l->b, &tmp.k, k);
+               bch2_btree_node_prefetch(iter->c, iter, &tmp.k,
+                                        iter->level - 1);
+       }
+
+       if (!was_locked)
+               btree_node_unlock(iter, iter->level);
+}
+
+static inline int btree_iter_down(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[iter->level];
+       struct btree *b;
+       unsigned level = iter->level - 1;
+       enum six_lock_type lock_type = __btree_lock_want(iter, level);
+       BKEY_PADDED(k) tmp;
+
+       BUG_ON(!btree_node_locked(iter, iter->level));
+
+       bch2_bkey_unpack(l->b, &tmp.k,
+                        bch2_btree_node_iter_peek(&l->iter, l->b));
+
+       b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type, true);
+       if (unlikely(IS_ERR(b)))
+               return PTR_ERR(b);
+
+       mark_btree_node_locked(iter, level, lock_type);
+       btree_iter_node_set(iter, b);
+
+       if (iter->flags & BTREE_ITER_PREFETCH)
+               btree_iter_prefetch(iter);
+
+       iter->level = level;
+
+       return 0;
+}
+
+static void btree_iter_up(struct btree_iter *iter)
+{
+       btree_node_unlock(iter, iter->level++);
+}
+
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
+
+static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
+{
+       struct bch_fs *c = iter->c;
+       struct btree_iter *linked, *sorted_iters, **i;
+retry_all:
+       bch2_btree_iter_unlock(iter);
+
+       if (ret != -ENOMEM && ret != -EINTR)
+               goto io_error;
+
+       if (ret == -ENOMEM) {
+               struct closure cl;
+
+               closure_init_stack(&cl);
+
+               do {
+                       ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+                       closure_sync(&cl);
+               } while (ret);
+       }
+
+       /*
+        * Linked iters are normally a circular singly linked list - break cycle
+        * while we sort them:
+        */
+       linked = iter->next;
+       iter->next = NULL;
+       sorted_iters = NULL;
+
+       while (linked) {
+               iter = linked;
+               linked = linked->next;
+
+               i = &sorted_iters;
+               while (*i && btree_iter_cmp(iter, *i) > 0)
+                       i = &(*i)->next;
+
+               iter->next = *i;
+               *i = iter;
+       }
+
+       /* Make list circular again: */
+       iter = sorted_iters;
+       while (iter->next)
+               iter = iter->next;
+       iter->next = sorted_iters;
+
+       /* Now, redo traversals in correct order: */
+
+       iter = sorted_iters;
+       do {
+retry:
+               ret = __bch2_btree_iter_traverse(iter);
+               if (unlikely(ret)) {
+                       if (ret == -EINTR)
+                               goto retry;
+                       goto retry_all;
+               }
+
+               iter = iter->next;
+       } while (iter != sorted_iters);
+
+       ret = btree_iter_linked(iter) ? -EINTR : 0;
+out:
+       bch2_btree_cache_cannibalize_unlock(c);
+       return ret;
+io_error:
+       BUG_ON(ret != -EIO);
+
+       iter->flags |= BTREE_ITER_ERROR;
+       iter->l[iter->level].b = BTREE_ITER_NOT_END;
+       goto out;
+}
+
+static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
+                                          bool check_pos)
+{
+       unsigned l = iter->level;
+
+       while (btree_iter_node(iter, l) &&
+              !(is_btree_node(iter, l) &&
+                bch2_btree_node_relock(iter, l) &&
+                (!check_pos ||
+                 btree_iter_pos_in_node(iter, iter->l[l].b)))) {
+               btree_node_unlock(iter, l);
+               iter->l[l].b = BTREE_ITER_NOT_END;
+               l++;
+       }
+
+       return l;
+}
+
+/*
+ * This is the main state machine for walking down the btree - walks down to a
+ * specified depth
+ *
+ * Returns 0 on success, -EIO on error (error reading in a btree node).
+ *
+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is
+ * stashed in the iterator and returned from bch2_btree_iter_unlock().
+ */
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+       unsigned depth_want = iter->level;
+
+       if (unlikely(iter->level >= BTREE_MAX_DEPTH))
+               return 0;
+
+       if (__bch2_btree_iter_relock(iter))
+               return 0;
+
+       iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
+
+       /*
+        * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
+        * here unnecessary
+        */
+       iter->level = btree_iter_up_until_locked(iter, true);
+
+       /*
+        * If we've got a btree node locked (i.e. we aren't about to relock the
+        * root) - advance its node iterator if necessary:
+        *
+        * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
+        */
+       if (btree_iter_node(iter, iter->level)) {
+               struct btree_iter_level *l = &iter->l[iter->level];
+               struct bkey_s_c k;
+               struct bkey u;
+
+               while ((k = __btree_iter_peek_all(iter, l, &u)).k &&
+                      !btree_iter_pos_cmp(iter, k.k))
+                       __btree_iter_advance(l);
+       }
+
+       /*
+        * Note: iter->l[iter->level].b may be temporarily NULL here - that
+        * would indicate to other code that we got to the end of the btree,
+        * here it indicates that relocking the root failed - it's critical that
+        * btree_iter_lock_root() comes next and that it can't fail
+        */
+       while (iter->level > depth_want) {
+               int ret = btree_iter_node(iter, iter->level)
+                       ? btree_iter_down(iter)
+                       : btree_iter_lock_root(iter, depth_want);
+               if (unlikely(ret)) {
+                       iter->level = depth_want;
+                       iter->l[iter->level].b = BTREE_ITER_NOT_END;
+                       return ret;
+               }
+       }
+
+       iter->uptodate = BTREE_ITER_NEED_PEEK;
+       bch2_btree_iter_verify_locks(iter);
+       return 0;
+}
+
+int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+       int ret;
+
+       ret = __bch2_btree_iter_traverse(iter);
+       if (unlikely(ret))
+               ret = btree_iter_traverse_error(iter, ret);
+
+       BUG_ON(ret == -EINTR && !btree_iter_linked(iter));
+
+       return ret;
+}
+
+static inline void bch2_btree_iter_checks(struct btree_iter *iter,
+                                         enum btree_iter_type type)
+{
+       EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+       EBUG_ON((iter->flags & BTREE_ITER_TYPE) != type);
+       EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
+               (iter->btree_id == BTREE_ID_EXTENTS &&
+                type != BTREE_ITER_NODES));
+
+       bch2_btree_iter_verify_locks(iter);
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+{
+       struct btree *b;
+       int ret;
+
+       bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
+
+       if (iter->uptodate == BTREE_ITER_UPTODATE)
+               return iter->l[iter->level].b;
+
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               return NULL;
+
+       b = btree_iter_node(iter, iter->level);
+       if (!b)
+               return NULL;
+
+       BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+
+       iter->pos = b->key.k.p;
+       iter->uptodate = BTREE_ITER_UPTODATE;
+
+       return b;
+}
+
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
+{
+       struct btree *b;
+       int ret;
+
+       bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
+
+       /* already got to end? */
+       if (!btree_iter_node(iter, iter->level))
+               return NULL;
+
+       btree_iter_up(iter);
+
+       if (!bch2_btree_node_relock(iter, iter->level))
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               return NULL;
+
+       /* got to end? */
+       b = btree_iter_node(iter, iter->level);
+       if (!b)
+               return NULL;
+
+       if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+               /*
+                * Haven't gotten to the end of the parent node: go back down to
+                * the next child node
+                */
+
+               /*
+                * We don't really want to be unlocking here, but we can't
+                * directly tell btree_iter_traverse() "traverse to this level"
+                * except by setting iter->level, so we have to unlock so we
+                * don't screw up our lock invariants:
+                */
+               if (btree_node_read_locked(iter, iter->level))
+                       btree_node_unlock(iter, iter->level);
+
+               /* ick: */
+               iter->pos       = iter->btree_id == BTREE_ID_INODES
+                       ? btree_type_successor(iter->btree_id, iter->pos)
+                       : bkey_successor(iter->pos);
+               iter->level     = depth;
+
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               ret = bch2_btree_iter_traverse(iter);
+               if (ret)
+                       return NULL;
+
+               b = iter->l[iter->level].b;
+       }
+
+       iter->pos = b->key.k.p;
+       iter->uptodate = BTREE_ITER_UPTODATE;
+
+       return b;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_packed *k;
+
+       EBUG_ON(iter->level != 0);
+       EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
+       EBUG_ON(!btree_node_locked(iter, 0));
+       EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
+
+       iter->pos = new_pos;
+       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+       while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
+              !btree_iter_pos_cmp_packed(l->b, &iter->pos, k,
+                                         iter->flags & BTREE_ITER_IS_EXTENTS))
+               __btree_iter_advance(l);
+
+       if (!k && btree_iter_pos_after_node(iter, l->b)) {
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
+       }
+}
+
+void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+       int cmp = bkey_cmp(new_pos, iter->pos);
+       unsigned level;
+
+       if (!cmp)
+               return;
+
+       iter->pos = new_pos;
+
+       level = btree_iter_up_until_locked(iter, true);
+
+       if (btree_iter_node(iter, level)) {
+               unsigned nr_advanced = 0;
+               struct btree_iter_level *l = &iter->l[level];
+               struct bkey_s_c k;
+               struct bkey u;
+
+               /*
+                * We might have to skip over many keys, or just a few: try
+                * advancing the node iterator, and if we have to skip over too
+                * many keys, or if we're rewinding at all, just reinit it
+                * (rewinding the node iterator is expensive).
+                */
+               if (cmp > 0) {
+                       while ((k = __btree_iter_peek_all(iter, l, &u)).k &&
+                              !btree_iter_pos_cmp(iter, k.k)) {
+                               if (nr_advanced > 8)
+                                       goto reinit_node;
+
+                               __btree_iter_advance(l);
+                               nr_advanced++;
+                       }
+               } else {
+reinit_node:
+                       __btree_iter_init(iter, iter->l[level].b);
+               }
+
+               /* Don't leave it locked if we're not supposed to: */
+               if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
+                       btree_node_unlock(iter, level);
+       }
+
+       if (level != iter->level)
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+       else
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+}
+
+static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_s_c ret = { .k = &iter->k };
+
+       if (!bkey_deleted(&iter->k)) {
+               EBUG_ON(bch2_btree_node_iter_end(&l->iter));
+               ret.v = bkeyp_val(&l->b->format,
+                       __bch2_btree_node_iter_peek_all(&l->iter, l->b));
+       }
+
+       if (debug_check_bkeys(iter->c) &&
+           !bkey_deleted(ret.k))
+               bch2_bkey_debugcheck(iter->c, l->b, ret);
+       return ret;
+}
+
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+       if (iter->uptodate == BTREE_ITER_UPTODATE)
+               return btree_iter_peek_uptodate(iter);
+
+       while (1) {
+               ret = bch2_btree_iter_traverse(iter);
+               if (unlikely(ret))
+                       return bkey_s_c_err(ret);
+
+               k = __btree_iter_peek(iter, l);
+               if (likely(k.k))
+                       break;
+
+               /* got to the end of the leaf, iterator needs to be traversed: */
+               iter->pos       = l->b->key.k.p;
+               iter->uptodate  = BTREE_ITER_NEED_TRAVERSE;
+
+               if (!bkey_cmp(iter->pos, POS_MAX))
+                       return bkey_s_c_null;
+
+               iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+       }
+
+       /*
+        * iter->pos should always be equal to the key we just
+        * returned - except extents can straddle iter->pos:
+        */
+       if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
+           bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+               iter->pos = bkey_start_pos(k.k);
+
+       iter->uptodate = BTREE_ITER_UPTODATE;
+       return k;
+}
+
+static noinline
+struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+
+       iter->pos       = l->b->key.k.p;
+       iter->uptodate  = BTREE_ITER_NEED_TRAVERSE;
+
+       if (!bkey_cmp(iter->pos, POS_MAX))
+               return bkey_s_c_null;
+
+       iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+
+       return bch2_btree_iter_peek(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_packed *p;
+       struct bkey_s_c k;
+
+       bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+       if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+               k = bch2_btree_iter_peek(iter);
+               if (IS_ERR_OR_NULL(k.k))
+                       return k;
+       }
+
+       do {
+               __btree_iter_advance(l);
+               p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+               if (unlikely(!p))
+                       return bch2_btree_iter_peek_next_leaf(iter);
+       } while (bkey_whiteout(p));
+
+       k = __btree_iter_unpack(iter, l, &iter->k, p);
+
+       EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
+       iter->pos = bkey_start_pos(k.k);
+       return k;
+}
+
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_packed *p;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+       if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+               k = bch2_btree_iter_peek(iter);
+               if (IS_ERR(k.k))
+                       return k;
+       }
+
+       while (1) {
+               p = bch2_btree_node_iter_prev(&l->iter, l->b);
+               if (likely(p))
+                       break;
+
+               iter->pos = l->b->data->min_key;
+               if (!bkey_cmp(iter->pos, POS_MIN))
+                       return bkey_s_c_null;
+
+               bch2_btree_iter_set_pos(iter,
+                       btree_type_predecessor(iter->btree_id, iter->pos));
+
+               ret = bch2_btree_iter_traverse(iter);
+               if (unlikely(ret))
+                       return bkey_s_c_err(ret);
+
+               p = bch2_btree_node_iter_peek(&l->iter, l->b);
+               if (p)
+                       break;
+       }
+
+       k = __btree_iter_unpack(iter, l, &iter->k, p);
+
+       EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
+
+       iter->pos       = bkey_start_pos(k.k);
+       iter->uptodate  = BTREE_ITER_UPTODATE;
+       return k;
+}
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_s_c k;
+       struct bkey n;
+       int ret;
+
+recheck:
+       while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
+              bkey_deleted(k.k) &&
+              bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
+               __btree_iter_advance(l);
+
+       /*
+        * If we got to the end of the node, check if we need to traverse to the
+        * next node:
+        */
+       if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               ret = bch2_btree_iter_traverse(iter);
+               if (unlikely(ret))
+                       return bkey_s_c_err(ret);
+
+               goto recheck;
+       }
+
+       if (k.k &&
+           !bkey_whiteout(k.k) &&
+           bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+               EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
+               EBUG_ON(bkey_deleted(k.k));
+               iter->uptodate = BTREE_ITER_UPTODATE;
+               return k;
+       }
+
+       /* hole */
+       bkey_init(&n);
+       n.p = iter->pos;
+
+       if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+               if (n.p.offset == KEY_OFFSET_MAX) {
+                       if (n.p.inode == KEY_INODE_MAX)
+                               return bkey_s_c_null;
+
+                       iter->pos = bkey_successor(iter->pos);
+                       goto recheck;
+               }
+
+               if (k.k && bkey_whiteout(k.k)) {
+                       struct btree_node_iter node_iter = l->iter;
+
+                       k = __btree_iter_unpack(iter, l, &iter->k,
+                               bch2_btree_node_iter_peek(&node_iter, l->b));
+               }
+
+               if (!k.k)
+                       k.k = &l->b->key.k;
+
+               bch2_key_resize(&n,
+                               min_t(u64, KEY_SIZE_MAX,
+                                     (k.k->p.inode == n.p.inode
+                                      ? bkey_start_offset(k.k)
+                                      : KEY_OFFSET_MAX) -
+                                     n.p.offset));
+
+               EBUG_ON(!n.size);
+       }
+
+       iter->k = n;
+       iter->uptodate = BTREE_ITER_UPTODATE;
+       return (struct bkey_s_c) { &iter->k, NULL };
+}
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+       int ret;
+
+       bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
+
+       if (iter->uptodate == BTREE_ITER_UPTODATE)
+               return btree_iter_peek_uptodate(iter);
+
+       ret = bch2_btree_iter_traverse(iter);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
+
+       return __bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+{
+       bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
+
+       iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
+
+       if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+               /*
+                * XXX: when we just need to relock we should be able to avoid
+                * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
+                * for that to work
+                */
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+
+               return bch2_btree_iter_peek_slot(iter);
+       }
+
+       if (!bkey_deleted(&iter->k))
+               __btree_iter_advance(&iter->l[0]);
+
+       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+       return __bch2_btree_iter_peek_slot(iter);
+}
+
+void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
+                           enum btree_id btree_id, struct bpos pos,
+                           unsigned locks_want, unsigned depth,
+                           unsigned flags)
+{
+       unsigned i;
+
+       EBUG_ON(depth >= BTREE_MAX_DEPTH);
+       EBUG_ON(locks_want > BTREE_MAX_DEPTH);
+
+       iter->c                         = c;
+       iter->pos                       = pos;
+       bkey_init(&iter->k);
+       iter->k.p                       = pos;
+       iter->flags                     = flags;
+       iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
+       iter->btree_id                  = btree_id;
+       iter->level                     = depth;
+       iter->locks_want                = locks_want;
+       iter->nodes_locked              = 0;
+       iter->nodes_intent_locked       = 0;
+       for (i = 0; i < ARRAY_SIZE(iter->l); i++)
+               iter->l[i].b            = NULL;
+       iter->l[iter->level].b          = BTREE_ITER_NOT_END;
+       iter->next                      = iter;
+
+       prefetch(c->btree_roots[btree_id].b);
+}
+
+void bch2_btree_iter_unlink(struct btree_iter *iter)
+{
+       struct btree_iter *linked;
+
+       __bch2_btree_iter_unlock(iter);
+
+       if (!btree_iter_linked(iter))
+               return;
+
+       for_each_linked_btree_iter(iter, linked)
+               if (linked->next == iter) {
+                       linked->next = iter->next;
+                       iter->next = iter;
+                       return;
+               }
+
+       BUG();
+}
+
+void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
+{
+       BUG_ON(btree_iter_linked(new));
+
+       new->next = iter->next;
+       iter->next = new;
+}
+
+void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
+{
+       unsigned i;
+
+       __bch2_btree_iter_unlock(dst);
+       memcpy(dst, src, offsetof(struct btree_iter, next));
+
+       for (i = 0; i < BTREE_MAX_DEPTH; i++)
+               if (btree_node_locked(dst, i))
+                       six_lock_increment(&dst->l[i].b->lock,
+                                          __btree_lock_want(dst, i));
+}
+
+/* new transactional stuff: */
+
+static void btree_trans_verify(struct btree_trans *trans)
+{
+       unsigned i;
+
+       for (i = 0; i < trans->nr_iters; i++) {
+               struct btree_iter *iter = &trans->iters[i];
+
+               BUG_ON(btree_iter_linked(iter) !=
+                      ((trans->iters_linked & (1 << i)) &&
+                       !is_power_of_2(trans->iters_linked)));
+       }
+}
+
+void bch2_trans_iter_free(struct btree_trans *trans,
+                         struct btree_iter *iter)
+{
+       unsigned idx;
+
+       for (idx = 0; idx < trans->nr_iters; idx++)
+               if (&trans->iters[idx] == iter)
+                       goto found;
+       BUG();
+found:
+       BUG_ON(!(trans->iters_linked & (1U << idx)));
+
+       trans->iters_live       &= ~(1U << idx);
+       trans->iters_linked     &= ~(1U << idx);
+       bch2_btree_iter_unlink(iter);
+}
+
+static int btree_trans_realloc_iters(struct btree_trans *trans)
+{
+       struct btree_iter *new_iters;
+       unsigned i;
+
+       bch2_trans_unlock(trans);
+
+       new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX,
+                           GFP_NOFS);
+       if (!new_iters)
+               return -ENOMEM;
+
+       memcpy(new_iters, trans->iters,
+              sizeof(struct btree_iter) * trans->nr_iters);
+       trans->iters = new_iters;
+
+       for (i = 0; i < trans->nr_iters; i++)
+               trans->iters[i].next = &trans->iters[i];
+
+       if (trans->iters_linked) {
+               unsigned first_linked = __ffs(trans->iters_linked);
+
+               for (i = first_linked + 1; i < trans->nr_iters; i++)
+                       if (trans->iters_linked & (1 << i))
+                               bch2_btree_iter_link(&trans->iters[first_linked],
+                                                    &trans->iters[i]);
+       }
+
+       btree_trans_verify(trans);
+
+       return trans->iters_live ? -EINTR : 0;
+}
+
+int bch2_trans_preload_iters(struct btree_trans *trans)
+{
+       if (trans->iters != trans->iters_onstack)
+               return 0;
+
+       return btree_trans_realloc_iters(trans);
+}
+
+static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
+                                                unsigned btree_id,
+                                                unsigned flags, u64 iter_id)
+{
+       struct btree_iter *iter;
+       int idx;
+
+       BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
+
+       for (idx = 0; idx < trans->nr_iters; idx++)
+               if (trans->iter_ids[idx] == iter_id)
+                       goto found;
+       idx = -1;
+found:
+       if (idx < 0) {
+               idx = ffz(trans->iters_linked);
+               if (idx < trans->nr_iters)
+                       goto got_slot;
+
+               BUG_ON(trans->nr_iters == BTREE_ITER_MAX);
+
+               if (trans->iters == trans->iters_onstack &&
+                   trans->nr_iters == ARRAY_SIZE(trans->iters_onstack)) {
+                       int ret = btree_trans_realloc_iters(trans);
+                       if (ret)
+                               return ERR_PTR(ret);
+               }
+
+               idx = trans->nr_iters++;
+got_slot:
+               trans->iter_ids[idx] = iter_id;
+               iter = &trans->iters[idx];
+
+               bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags);
+       } else {
+               iter = &trans->iters[idx];
+
+               BUG_ON(iter->btree_id != btree_id);
+               BUG_ON((iter->flags ^ flags) &
+                      (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS));
+
+               iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+               iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+       }
+
+       BUG_ON(trans->iters_live & (1 << idx));
+       trans->iters_live |= 1 << idx;
+
+       if (trans->iters_linked &&
+           !(trans->iters_linked & (1 << idx)))
+               bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)],
+                                    iter);
+
+       trans->iters_linked |= 1 << idx;
+
+       btree_trans_verify(trans);
+
+       return iter;
+}
+
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
+                                        enum btree_id btree_id,
+                                        struct bpos pos, unsigned flags,
+                                        u64 iter_id)
+{
+       struct btree_iter *iter =
+               __btree_trans_get_iter(trans, btree_id, flags, iter_id);
+
+       if (!IS_ERR(iter))
+               bch2_btree_iter_set_pos(iter, pos);
+       return iter;
+}
+
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
+                                         struct btree_iter *src,
+                                         u64 iter_id)
+{
+       struct btree_iter *iter =
+               __btree_trans_get_iter(trans, src->btree_id,
+                                      src->flags, iter_id);
+
+       if (!IS_ERR(iter))
+               bch2_btree_iter_copy(iter, src);
+       return iter;
+}
+
+void *bch2_trans_kmalloc(struct btree_trans *trans,
+                        size_t size)
+{
+       void *ret;
+
+       if (trans->mem_top + size > trans->mem_bytes) {
+               size_t old_bytes = trans->mem_bytes;
+               size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
+               void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+
+               if (!new_mem)
+                       return ERR_PTR(-ENOMEM);
+
+               trans->mem = new_mem;
+               trans->mem_bytes = new_bytes;
+
+               if (old_bytes)
+                       return ERR_PTR(-EINTR);
+       }
+
+       ret = trans->mem + trans->mem_top;
+       trans->mem_top += size;
+       return ret;
+}
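+
+/*
+ * Illustrative sketch of how bch2_trans_kmalloc() is meant to be used (the
+ * caller and allocation size here are hypothetical): allocations live until
+ * the next bch2_trans_begin()/bch2_trans_exit(), and if the buffer had to be
+ * reallocated the ERR_PTR(-EINTR) return tells the caller to restart the
+ * transaction:
+ *
+ *     struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ *     if (IS_ERR(k))
+ *             return PTR_ERR(k);
+ *     bkey_init(&k->k);
+ */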
+
+int bch2_trans_unlock(struct btree_trans *trans)
+{
+       unsigned iters = trans->iters_linked;
+       int ret = 0;
+
+       while (iters) {
+               unsigned idx = __ffs(iters);
+               struct btree_iter *iter = &trans->iters[idx];
+
+               if (iter->flags & BTREE_ITER_ERROR)
+                       ret = -EIO;
+
+               __bch2_btree_iter_unlock(iter);
+               iters ^= 1 << idx;
+       }
+
+       return ret;
+}
+
+void bch2_trans_begin(struct btree_trans *trans)
+{
+       unsigned idx;
+
+       btree_trans_verify(trans);
+
+       /*
+        * On transaction restart, the transaction isn't required to allocate
+        * all the same iterators it allocated on the last iteration:
+        *
+        * Unlink any iterators it didn't use this iteration, assuming it got
+        * further (allocated an iter with a higher idx) than where the iter
+        * was originally allocated:
+        */
+       if (!trans->iters_live)
+               return;
+
+       while (trans->iters_linked &&
+              (idx = __fls(trans->iters_linked)) >
+              __fls(trans->iters_live)) {
+               trans->iters_linked ^= 1 << idx;
+               bch2_btree_iter_unlink(&trans->iters[idx]);
+       }
+
+       trans->iters_live       = 0;
+       trans->nr_updates       = 0;
+       trans->mem_top          = 0;
+
+       btree_trans_verify(trans);
+}
+
+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
+{
+       trans->c                = c;
+       trans->nr_iters         = 0;
+       trans->iters_live       = 0;
+       trans->iters_linked     = 0;
+       trans->nr_updates       = 0;
+       trans->mem_top          = 0;
+       trans->mem_bytes        = 0;
+       trans->mem              = NULL;
+       trans->iters            = trans->iters_onstack;
+}
+
+int bch2_trans_exit(struct btree_trans *trans)
+{
+       int ret = bch2_trans_unlock(trans);
+
+       kfree(trans->mem);
+       if (trans->iters != trans->iters_onstack)
+               kfree(trans->iters);
+       trans->mem      = (void *) 0x1;
+       trans->iters    = (void *) 0x1;
+       return ret;
+}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
new file mode 100644 (file)
index 0000000..e686a7a
--- /dev/null
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_ITER_H
+#define _BCACHEFS_BTREE_ITER_H
+
+#include "btree_types.h"
+
+static inline void btree_iter_set_dirty(struct btree_iter *iter,
+                                       enum btree_iter_uptodate u)
+{
+       iter->uptodate = max_t(unsigned, iter->uptodate, u);
+}
+
+static inline struct btree *btree_iter_node(struct btree_iter *iter,
+                                           unsigned level)
+{
+       return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
+}
+
+static inline struct btree *btree_node_parent(struct btree_iter *iter,
+                                             struct btree *b)
+{
+       return btree_iter_node(iter, b->level + 1);
+}
+
+static inline bool btree_iter_linked(const struct btree_iter *iter)
+{
+       return iter->next != iter;
+}
+
+static inline bool __iter_has_node(const struct btree_iter *iter,
+                                  const struct btree *b)
+{
+       /*
+        * We don't compare the low bits of the lock sequence numbers because
+        * @iter might have taken a write lock on @b, and we don't want to skip
+        * the linked iterator if the sequence numbers were equal before taking
+        * that write lock. The lock sequence number is incremented by taking
+        * and releasing write locks and is even when unlocked:
+        */
+
+       return iter->l[b->level].b == b &&
+               iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
+}
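+
+/*
+ * Worked example of the comparison above (sequence numbers are illustrative):
+ * if @iter recorded lock_seq 6 for @b and then took the write lock itself,
+ * b->lock.state.seq is now 7; 6 >> 1 == 7 >> 1, so @iter still counts as
+ * having @b. Once that write lock is released the sequence becomes 8, the
+ * shifted values differ, and the iterator no longer matches.
+ */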
+
+static inline struct btree_iter *
+__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
+{
+       return linked->next != iter ? linked->next : NULL;
+}
+
+static inline struct btree_iter *
+__next_iter_with_node(struct btree_iter *iter, struct btree *b,
+                     struct btree_iter *linked)
+{
+       while (linked && !__iter_has_node(linked, b))
+               linked = __next_linked_iter(iter, linked);
+
+       return linked;
+}
+
+/**
+ * for_each_btree_iter - iterate over all iterators linked with @_iter,
+ * including @_iter
+ */
+#define for_each_btree_iter(_iter, _linked)                            \
+       for ((_linked) = (_iter); (_linked);                            \
+            (_linked) = __next_linked_iter(_iter, _linked))
+
+/**
+ * for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
+ * that also point to @_b
+ *
+ * @_b is assumed to be locked by @_iter
+ *
+ * Filters out iterators that don't have a valid btree_node iterator for @_b -
+ * i.e. iterators for which bch2_btree_node_relock() would not succeed.
+ */
+#define for_each_btree_iter_with_node(_iter, _b, _linked)              \
+       for ((_linked) = (_iter);                                       \
+            ((_linked) = __next_iter_with_node(_iter, _b, _linked));   \
+            (_linked) = __next_linked_iter(_iter, _linked))
+
+/**
+ * for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
+ * _not_ including @_iter
+ */
+#define for_each_linked_btree_iter(_iter, _linked)                     \
+       for ((_linked) = (_iter)->next;                                 \
+            (_linked) != (_iter);                                      \
+            (_linked) = (_linked)->next)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
+void bch2_btree_iter_verify_locks(struct btree_iter *);
+#else
+static inline void bch2_btree_iter_verify(struct btree_iter *iter,
+                                         struct btree *b) {}
+static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
+#endif
+
+void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
+                            struct btree_node_iter *, struct bset_tree *,
+                            struct bkey_packed *, unsigned, unsigned);
+
+int bch2_btree_iter_unlock(struct btree_iter *);
+
+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
+
+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+                                          unsigned new_locks_want,
+                                          bool may_drop_locks)
+{
+       new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+       return iter->locks_want < new_locks_want
+               ? (may_drop_locks
+                  ? __bch2_btree_iter_upgrade(iter, new_locks_want)
+                  : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
+               : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+}
+
+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
+
+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+{
+       if (iter->locks_want > ((iter->flags & BTREE_ITER_INTENT) ? 1 : 0))
+               __bch2_btree_iter_downgrade(iter, 0);
+}
+
+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
+void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
+
+void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+
+int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
+
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+
+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
+void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
+
+void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *,
+                          enum btree_id, struct bpos,
+                          unsigned, unsigned, unsigned);
+
+static inline void bch2_btree_iter_init(struct btree_iter *iter,
+                       struct bch_fs *c, enum btree_id btree_id,
+                       struct bpos pos, unsigned flags)
+{
+       __bch2_btree_iter_init(iter, c, btree_id, pos,
+                              flags & BTREE_ITER_INTENT ? 1 : 0, 0,
+                              (btree_id == BTREE_ID_EXTENTS
+                               ?  BTREE_ITER_IS_EXTENTS : 0)|flags);
+}
+
+void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
+void bch2_btree_iter_unlink(struct btree_iter *);
+void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
+
+static inline struct bpos btree_type_successor(enum btree_id id,
+                                              struct bpos pos)
+{
+       if (id == BTREE_ID_INODES) {
+               pos.inode++;
+               pos.offset = 0;
+       } else if (id != BTREE_ID_EXTENTS) {
+               pos = bkey_successor(pos);
+       }
+
+       return pos;
+}
+
+static inline struct bpos btree_type_predecessor(enum btree_id id,
+                                              struct bpos pos)
+{
+       if (id == BTREE_ID_INODES) {
+               --pos.inode;
+               pos.offset = 0;
+       } else /* if (id != BTREE_ID_EXTENTS) */ {
+               pos = bkey_predecessor(pos);
+       }
+
+       return pos;
+}
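+
+/*
+ * Illustrative examples of the rules above: in the inodes btree the successor
+ * of POS(5, 123) is POS(6, 0); in the extents btree the successor is the
+ * position itself (extent keys cover ranges); in every other btree the
+ * successor of POS(5, 10) is POS(5, 11).
+ */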
+
+static inline int __btree_iter_cmp(enum btree_id id,
+                                  struct bpos pos,
+                                  const struct btree_iter *r)
+{
+       if (id != r->btree_id)
+               return id < r->btree_id ? -1 : 1;
+       return bkey_cmp(pos, r->pos);
+}
+
+static inline int btree_iter_cmp(const struct btree_iter *l,
+                                const struct btree_iter *r)
+{
+       return __btree_iter_cmp(l->btree_id, l->pos, r);
+}
+
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
+ */
+static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
+{
+       if (need_resched()) {
+               bch2_btree_iter_unlock(iter);
+               schedule();
+       } else if (race_fault()) {
+               bch2_btree_iter_unlock(iter);
+       }
+}
+
+#define __for_each_btree_node(_iter, _c, _btree_id, _start,            \
+                             _locks_want, _depth, _flags, _b)          \
+       for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
+                                   _locks_want, _depth,                \
+                                   _flags|BTREE_ITER_NODES),           \
+            _b = bch2_btree_iter_peek_node(_iter);                     \
+            (_b);                                                      \
+            (_b) = bch2_btree_iter_next_node(_iter, _depth))
+
+#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b)  \
+       __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
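+
+/*
+ * Illustrative sketch of walking every node of one btree (the btree id and
+ * the per-node work are hypothetical):
+ *
+ *     struct btree_iter iter;
+ *     struct btree *b;
+ *
+ *     for_each_btree_node(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, b) {
+ *             ... inspect b ...
+ *     }
+ *     bch2_btree_iter_unlock(&iter);
+ */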
+
+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+                                                    unsigned flags)
+{
+       return flags & BTREE_ITER_SLOTS
+               ? bch2_btree_iter_peek_slot(iter)
+               : bch2_btree_iter_peek(iter);
+}
+
+static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
+                                                    unsigned flags)
+{
+       bch2_btree_iter_cond_resched(iter);
+
+       return flags & BTREE_ITER_SLOTS
+               ? bch2_btree_iter_next_slot(iter)
+               : bch2_btree_iter_next(iter);
+}
+
+#define for_each_btree_key(_iter, _c, _btree_id,  _start, _flags, _k)  \
+       for (bch2_btree_iter_init((_iter), (_c), (_btree_id),           \
+                                 (_start), (_flags)),                  \
+            (_k) = __bch2_btree_iter_peek(_iter, _flags);              \
+            !IS_ERR_OR_NULL((_k).k);                                   \
+            (_k) = __bch2_btree_iter_next(_iter, _flags))
+
+#define for_each_btree_key_continue(_iter, _flags, _k)                 \
+       for ((_k) = __bch2_btree_iter_peek(_iter, _flags);              \
+            !IS_ERR_OR_NULL((_k).k);                                   \
+            (_k) = __bch2_btree_iter_next(_iter, _flags))
+
+static inline int btree_iter_err(struct bkey_s_c k)
+{
+       return PTR_ERR_OR_ZERO(k.k);
+}
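+
+/*
+ * Illustrative sketch of iterating keys (the btree id, start position and
+ * flags are hypothetical):
+ *
+ *     struct btree_iter iter;
+ *     struct bkey_s_c k;
+ *     int ret;
+ *
+ *     for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) {
+ *             ... use k ...
+ *     }
+ *     ret = bch2_btree_iter_unlock(&iter) ?: btree_iter_err(k);
+ */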
+
+/* new multiple iterator interface: */
+
+int bch2_trans_preload_iters(struct btree_trans *);
+void bch2_trans_iter_free(struct btree_trans *,
+                               struct btree_iter *);
+
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
+                                        struct bpos, unsigned, u64);
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
+                                         struct btree_iter *, u64);
+
+static __always_inline u64 __btree_iter_id(void)
+{
+       u64 ret = 0;
+
+       ret <<= 32;
+       ret |= _RET_IP_ & U32_MAX;
+       ret <<= 32;
+       ret |= _THIS_IP_ & U32_MAX;
+       return ret;
+}
+
+static __always_inline struct btree_iter *
+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
+                   struct bpos pos, unsigned flags)
+{
+       return __bch2_trans_get_iter(trans, btree_id, pos, flags,
+                                    __btree_iter_id());
+}
+
+static __always_inline struct btree_iter *
+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
+{
+       return __bch2_trans_copy_iter(trans, src, __btree_iter_id());
+}
+
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+int bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_begin(struct btree_trans *);
+void bch2_trans_init(struct btree_trans *, struct bch_fs *);
+int bch2_trans_exit(struct btree_trans *);
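+
+/*
+ * Illustrative sketch of the transactional iterator interface (the btree id,
+ * position, flags and inode number here are hypothetical):
+ *
+ *     struct btree_trans trans;
+ *     struct btree_iter *iter;
+ *     struct bkey_s_c k;
+ *
+ *     bch2_trans_init(&trans, c);
+ *     iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inum),
+ *                                BTREE_ITER_SLOTS);
+ *     k = bch2_btree_iter_peek_slot(iter);
+ *     ...
+ *     bch2_trans_exit(&trans);
+ */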
+
+#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
new file mode 100644 (file)
index 0000000..de3fc0a
--- /dev/null
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_LOCKING_H
+#define _BCACHEFS_BTREE_LOCKING_H
+
+/*
+ * Only for internal btree use:
+ *
+ * The btree iterator tracks what locks it wants to take, and what locks it
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
+ * updating the iterator state
+ */
+
+#include "btree_iter.h"
+#include "btree_io.h"
+#include "six.h"
+
+/* matches six lock types */
+enum btree_node_locked_type {
+       BTREE_NODE_UNLOCKED             = -1,
+       BTREE_NODE_READ_LOCKED          = SIX_LOCK_read,
+       BTREE_NODE_INTENT_LOCKED        = SIX_LOCK_intent,
+};
+
+static inline int btree_node_locked_type(struct btree_iter *iter,
+                                        unsigned level)
+{
+       /*
+        * We're relying on the fact that if nodes_intent_locked is set
+        * nodes_locked must be set as well, so that we can compute without
+        * branches:
+        */
+       return BTREE_NODE_UNLOCKED +
+               ((iter->nodes_locked >> level) & 1) +
+               ((iter->nodes_intent_locked >> level) & 1);
+}
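+
+/*
+ * i.e. with the relevant bit extracted from each bitmask:
+ *   nodes_locked 0, nodes_intent_locked 0: -1 + 0 + 0 = BTREE_NODE_UNLOCKED
+ *   nodes_locked 1, nodes_intent_locked 0: -1 + 1 + 0 = BTREE_NODE_READ_LOCKED
+ *   nodes_locked 1, nodes_intent_locked 1: -1 + 1 + 1 = BTREE_NODE_INTENT_LOCKED
+ */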
+
+static inline bool btree_node_intent_locked(struct btree_iter *iter,
+                                           unsigned level)
+{
+       return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+}
+
+static inline bool btree_node_read_locked(struct btree_iter *iter,
+                                         unsigned level)
+{
+       return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+}
+
+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+{
+       return iter->nodes_locked & (1 << level);
+}
+
+static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+                                           unsigned level)
+{
+       iter->nodes_locked &= ~(1 << level);
+       iter->nodes_intent_locked &= ~(1 << level);
+}
+
+static inline void mark_btree_node_locked(struct btree_iter *iter,
+                                         unsigned level,
+                                         enum six_lock_type type)
+{
+       /* relying on this to avoid a branch */
+       BUILD_BUG_ON(SIX_LOCK_read   != 0);
+       BUILD_BUG_ON(SIX_LOCK_intent != 1);
+
+       iter->nodes_locked |= 1 << level;
+       iter->nodes_intent_locked |= type << level;
+}
+
+static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+                                                unsigned level)
+{
+       mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+}
+
+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
+{
+       return level < iter->locks_want
+               ? SIX_LOCK_intent
+               : SIX_LOCK_read;
+}
+
+static inline enum btree_node_locked_type
+btree_lock_want(struct btree_iter *iter, int level)
+{
+       if (level < iter->level)
+               return BTREE_NODE_UNLOCKED;
+       if (level < iter->locks_want)
+               return BTREE_NODE_INTENT_LOCKED;
+       if (level == iter->level)
+               return BTREE_NODE_READ_LOCKED;
+       return BTREE_NODE_UNLOCKED;
+}
+
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+{
+       int lock_type = btree_node_locked_type(iter, level);
+
+       EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+       if (lock_type != BTREE_NODE_UNLOCKED)
+               six_unlock_type(&iter->l[level].b->lock, lock_type);
+       mark_btree_node_unlocked(iter, level);
+}
+
+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+{
+       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+       while (iter->nodes_locked)
+               btree_node_unlock(iter, __ffs(iter->nodes_locked));
+}
+
+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+{
+       switch (type) {
+       case SIX_LOCK_read:
+               return BCH_TIME_btree_lock_contended_read;
+       case SIX_LOCK_intent:
+               return BCH_TIME_btree_lock_contended_intent;
+       case SIX_LOCK_write:
+               return BCH_TIME_btree_lock_contended_write;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * wrapper around six locks that just traces lock contended time
+ */
+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
+                                         enum six_lock_type type)
+{
+       u64 start_time = local_clock();
+
+       six_lock_type(&b->lock, type, NULL, NULL);
+       bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+}
+
+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
+                                       enum six_lock_type type)
+{
+       if (!six_trylock_type(&b->lock, type))
+               __btree_node_lock_type(c, b, type);
+}
+
+bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
+                           struct btree_iter *, enum six_lock_type, bool);
+
+static inline bool btree_node_lock(struct btree *b, struct bpos pos,
+                                  unsigned level,
+                                  struct btree_iter *iter,
+                                  enum six_lock_type type,
+                                  bool may_drop_locks)
+{
+       EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+       return likely(six_trylock_type(&b->lock, type)) ||
+               __bch2_btree_node_lock(b, pos, level, iter,
+                                      type, may_drop_locks);
+}
+
+bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+
+static inline bool bch2_btree_node_relock(struct btree_iter *iter,
+                                         unsigned level)
+{
+       EBUG_ON(btree_node_locked(iter, level) &&
+               btree_node_locked_type(iter, level) !=
+               __btree_lock_want(iter, level));
+
+       return likely(btree_node_locked(iter, level)) ||
+               __bch2_btree_node_relock(iter, level);
+}
+
+bool bch2_btree_iter_relock(struct btree_iter *);
+
+void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
+
+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+
+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+       EBUG_ON(iter->l[b->level].b != b);
+       EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
+
+       if (!six_trylock_write(&b->lock))
+               __bch2_btree_node_lock_write(b, iter);
+}
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
new file mode 100644 (file)
index 0000000..b922a8c
--- /dev/null
@@ -0,0 +1,479 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H
+
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+
+#include "bkey_methods.h"
+#include "journal_types.h"
+#include "six.h"
+
+struct open_bucket;
+struct btree_update;
+
+#define MAX_BSETS              3U
+
+struct btree_nr_keys {
+
+       /*
+        * Amount of live metadata (i.e. size of node after a compaction) in
+        * units of u64s
+        */
+       u16                     live_u64s;
+       u16                     bset_u64s[MAX_BSETS];
+
+       /* live keys only: */
+       u16                     packed_keys;
+       u16                     unpacked_keys;
+};
+
+struct bset_tree {
+       /*
+        * We construct a binary tree in an array as if the array
+        * started at 1, so that things line up on the same cachelines
+        * better: see comments in bset.c at cacheline_to_bkey() for
+        * details
+        */
+
+       /* size of the binary tree and prev array */
+       u16                     size;
+
+       /* function of size - precalculated for to_inorder() */
+       u16                     extra;
+
+       u16                     data_offset;
+       u16                     aux_data_offset;
+       u16                     end_offset;
+
+       struct bpos             max_key;
+};
+
+struct btree_write {
+       struct journal_entry_pin        journal;
+       struct closure_waitlist         wait;
+};
+
+struct btree_ob_ref {
+       u8                      nr;
+       u8                      refs[BCH_REPLICAS_MAX];
+};
+
+struct btree_alloc {
+       struct btree_ob_ref     ob;
+       BKEY_PADDED(k);
+};
+
+struct btree {
+       /* Hottest entries first */
+       struct rhash_head       hash;
+
+       /* Key/pointer for this btree node */
+       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+       struct six_lock         lock;
+
+       unsigned long           flags;
+       u16                     written;
+       u8                      level;
+       u8                      btree_id;
+       u8                      nsets;
+       u8                      nr_key_bits;
+
+       struct bkey_format      format;
+
+       struct btree_node       *data;
+       void                    *aux_data;
+
+       /*
+        * Sets of sorted keys - the real btree node - plus a binary search tree
+        *
+        * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+        * to the memory we have allocated for this btree node. Additionally,
+        * set[0]->data points to the entire btree node as it exists on disk.
+        */
+       struct bset_tree        set[MAX_BSETS];
+
+       struct btree_nr_keys    nr;
+       u16                     sib_u64s[2];
+       u16                     whiteout_u64s;
+       u16                     uncompacted_whiteout_u64s;
+       u8                      page_order;
+       u8                      unpack_fn_len;
+
+       /*
+        * XXX: add a delete sequence number, so when bch2_btree_node_relock()
+        * fails because the lock sequence number has changed - i.e. the
+        * contents were modified - we can still relock the node if it's still
+        * the one we want, without redoing the traversal
+        */
+
+       /*
+        * For asynchronous splits/interior node updates:
+        * When we do a split, we allocate new child nodes and update the parent
+        * node to point to them: we update the parent in memory immediately,
+        * but then we must wait until the children have been written out before
+        * the update to the parent can be written - this is a list of the
+        * btree_updates that are blocking this node from being
+        * written:
+        */
+       struct list_head        write_blocked;
+
+       /*
+        * Also for asynchronous splits/interior node updates:
+        * If a btree node isn't reachable yet, we don't want to kick off
+        * another write - because that write also won't yet be reachable and
+        * marking it as completed before it's reachable would be incorrect:
+        */
+       unsigned long           will_make_reachable;
+
+       struct btree_ob_ref     ob;
+
+       /* lru list */
+       struct list_head        list;
+
+       struct btree_write      writes[2];
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       bool                    *expensive_debug_checks;
+#endif
+};
+
+struct btree_cache {
+       struct rhashtable       table;
+       bool                    table_init_done;
+       /*
+        * We never free a struct btree, except on shutdown - we just put it on
+        * the btree_cache_freed list and reuse it later. This simplifies the
+        * code, and it doesn't cost us much memory as the memory usage is
+        * dominated by buffers that hold the actual btree node data and those
+        * can be freed - and the number of struct btrees allocated is
+        * effectively bounded.
+        *
+        * btree_cache_freeable effectively is a small cache - we use it because
+        * high order page allocations can be rather expensive, and it's quite
+        * common to delete and allocate btree nodes in quick succession. It
+        * should never grow past ~2-3 nodes in practice.
+        */
+       struct mutex            lock;
+       struct list_head        live;
+       struct list_head        freeable;
+       struct list_head        freed;
+
+       /* Number of elements in live + freeable lists */
+       unsigned                used;
+       unsigned                reserve;
+       struct shrinker         shrink;
+
+       /*
+        * If we need to allocate memory for a new btree node and that
+        * allocation fails, we can cannibalize another node in the btree cache
+        * to satisfy the allocation - lock to guarantee only one thread does
+        * this at a time:
+        */
+       struct task_struct      *alloc_lock;
+       struct closure_waitlist alloc_wait;
+};
+
+struct btree_node_iter {
+       u8              is_extents;
+
+       struct btree_node_iter_set {
+               u16     k, end;
+       } data[MAX_BSETS];
+};
+
+enum btree_iter_type {
+       BTREE_ITER_KEYS,
+       BTREE_ITER_SLOTS,
+       BTREE_ITER_NODES,
+};
+
+#define BTREE_ITER_TYPE                        ((1 << 2) - 1)
+
+#define BTREE_ITER_INTENT              (1 << 2)
+#define BTREE_ITER_PREFETCH            (1 << 3)
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+#define BTREE_ITER_IS_EXTENTS          (1 << 4)
+/*
+ * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
+ */
+#define BTREE_ITER_AT_END_OF_LEAF      (1 << 5)
+#define BTREE_ITER_ERROR               (1 << 6)
+
+enum btree_iter_uptodate {
+       BTREE_ITER_UPTODATE             = 0,
+       BTREE_ITER_NEED_PEEK            = 1,
+       BTREE_ITER_NEED_RELOCK          = 2,
+       BTREE_ITER_NEED_TRAVERSE        = 3,
+};
+
+/*
+ * @pos                        - iterator's current position
+ * @level              - current btree depth
+ * @locks_want         - btree level below which we start taking intent locks
+ * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked        - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+       struct bch_fs           *c;
+       struct bpos             pos;
+
+       u8                      flags;
+       enum btree_iter_uptodate uptodate:4;
+       enum btree_id           btree_id:4;
+       unsigned                level:4,
+                               locks_want:4,
+                               nodes_locked:4,
+                               nodes_intent_locked:4;
+
+       struct btree_iter_level {
+               struct btree    *b;
+               struct btree_node_iter iter;
+       }                       l[BTREE_MAX_DEPTH];
+
+       u32                     lock_seq[BTREE_MAX_DEPTH];
+
+       /*
+        * Current unpacked key - so that bch2_btree_iter_next()/
+        * bch2_btree_iter_next_slot() can correctly advance pos.
+        */
+       struct bkey             k;
+
+       /*
+        * Circular linked list of linked iterators: linked iterators share
+        * locks (e.g. two linked iterators may have the same node intent
+        * locked, or read and write locked, at the same time), and insertions
+        * through one iterator won't invalidate the other linked iterators.
+        */
+
+       /* Must come last: */
+       struct btree_iter       *next;
+};
+
+#define BTREE_ITER_MAX         8
+
+struct btree_insert_entry {
+       struct btree_iter *iter;
+       struct bkey_i   *k;
+       unsigned        extra_res;
+       /*
+        * true if entire key was inserted - can only be false for
+        * extents
+        */
+       bool            done;
+};
+
+struct btree_trans {
+       struct bch_fs           *c;
+
+       u8                      nr_iters;
+       u8                      iters_live;
+       u8                      iters_linked;
+       u8                      nr_updates;
+
+       unsigned                mem_top;
+       unsigned                mem_bytes;
+       void                    *mem;
+
+       struct btree_iter       *iters;
+       u64                     iter_ids[BTREE_ITER_MAX];
+
+       struct btree_insert_entry updates[BTREE_ITER_MAX];
+
+       struct btree_iter       iters_onstack[2];
+};
+
+#define BTREE_FLAG(flag)                                               \
+static inline bool btree_node_ ## flag(struct btree *b)                        \
+{      return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
+                                                                       \
+static inline void set_btree_node_ ## flag(struct btree *b)            \
+{      set_bit(BTREE_NODE_ ## flag, &b->flags); }                      \
+                                                                       \
+static inline void clear_btree_node_ ## flag(struct btree *b)          \
+{      clear_bit(BTREE_NODE_ ## flag, &b->flags); }
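+
+/*
+ * e.g. BTREE_FLAG(dirty) below expands to btree_node_dirty(),
+ * set_btree_node_dirty() and clear_btree_node_dirty(), all operating on
+ * BTREE_NODE_dirty in b->flags.
+ */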
+
+enum btree_flags {
+       BTREE_NODE_read_in_flight,
+       BTREE_NODE_read_error,
+       BTREE_NODE_dirty,
+       BTREE_NODE_need_write,
+       BTREE_NODE_noevict,
+       BTREE_NODE_write_idx,
+       BTREE_NODE_accessed,
+       BTREE_NODE_write_in_flight,
+       BTREE_NODE_just_written,
+       BTREE_NODE_dying,
+       BTREE_NODE_fake,
+};
+
+BTREE_FLAG(read_in_flight);
+BTREE_FLAG(read_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(need_write);
+BTREE_FLAG(noevict);
+BTREE_FLAG(write_idx);
+BTREE_FLAG(accessed);
+BTREE_FLAG(write_in_flight);
+BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
+BTREE_FLAG(fake);
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+       return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+       return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset_tree *bset_tree_last(struct btree *b)
+{
+       EBUG_ON(!b->nsets);
+       return b->set + b->nsets - 1;
+}
+
+static inline struct bset *bset(const struct btree *b,
+                               const struct bset_tree *t)
+{
+       return (void *) b->data + t->data_offset * sizeof(u64);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+       return bset(b, b->set);
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+       return bset(b, bset_tree_last(b));
+}
+
+static inline u16
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
+{
+       size_t ret = (u64 *) k - (u64 *) b->data - 1;
+
+       EBUG_ON(ret > U16_MAX);
+       return ret;
+}
+
+static inline struct bkey_packed *
+__btree_node_offset_to_key(const struct btree *b, u16 k)
+{
+       return (void *) ((u64 *) b->data + k + 1);
+}
+
+#define btree_bkey_first(_b, _t)       (bset(_b, _t)->start)
+
+#define btree_bkey_last(_b, _t)                                                \
+({                                                                     \
+       EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=     \
+               vstruct_last(bset(_b, _t)));                            \
+                                                                       \
+       __btree_node_offset_to_key(_b, (_t)->end_offset);               \
+})
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+       t->end_offset =
+               __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
+       btree_bkey_last(b, t);
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+                                 const struct bset *i)
+{
+       t->data_offset = (u64 *) i - (u64 *) b->data;
+
+       EBUG_ON(bset(b, t) != i);
+
+       set_btree_bset_end(b, t);
+}
+
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+       return i - (void *) b->data;
+}
+
+/* Type of keys @b contains: */
+static inline enum bkey_type btree_node_type(struct btree *b)
+{
+       return b->level ? BKEY_TYPE_BTREE : b->btree_id;
+}
+
+static inline const struct bkey_ops *btree_node_ops(struct btree *b)
+{
+       return &bch2_bkey_ops[btree_node_type(b)];
+}
+
+static inline bool btree_node_has_ptrs(struct btree *b)
+{
+       return btree_type_has_ptrs(btree_node_type(b));
+}
+
+static inline bool btree_node_is_extents(struct btree *b)
+{
+       return btree_node_type(b) == BKEY_TYPE_EXTENTS;
+}
+
+struct btree_root {
+       struct btree            *b;
+
+       struct btree_update     *as;
+
+       /* On disk root - see async splits: */
+       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+       u8                      level;
+       u8                      alive;
+};
+
+/*
+ * Optional hook that will be called just prior to a btree node update, when
+ * we're holding the write lock and we know what key is about to be overwritten:
+ */
+
+struct btree_iter;
+struct btree_node_iter;
+
+enum btree_insert_ret {
+       BTREE_INSERT_OK,
+       /* extent spanned multiple leaf nodes: have to traverse to next node: */
+       BTREE_INSERT_NEED_TRAVERSE,
+       /* write lock held for too long */
+       BTREE_INSERT_NEED_RESCHED,
+       /* leaf node needs to be split */
+       BTREE_INSERT_BTREE_NODE_FULL,
+       BTREE_INSERT_JOURNAL_RES_FULL,
+       BTREE_INSERT_ENOSPC,
+       BTREE_INSERT_NEED_GC_LOCK,
+};
+
+struct extent_insert_hook {
+       enum btree_insert_ret
+       (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
+             struct bkey_s_c, const struct bkey_i *);
+};
+
+enum btree_gc_coalesce_fail_reason {
+       BTREE_GC_COALESCE_FAIL_RESERVE_GET,
+       BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
+       BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
+};
+
+enum btree_node_sibling {
+       btree_prev_sib,
+       btree_next_sib,
+};
+
+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
+                                                       struct btree *,
+                                                       struct btree_node_iter *);
+
+#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
new file mode 100644 (file)
index 0000000..451d486
--- /dev/null
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_H
+#define _BCACHEFS_BTREE_UPDATE_H
+
+#include "btree_iter.h"
+#include "journal.h"
+
+struct bch_fs;
+struct btree;
+struct btree_insert;
+
+void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
+                                    struct btree_iter *);
+bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
+                               struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
+                           struct bkey_i *);
+
+/* Normal update interface: */
+
+struct btree_insert {
+       struct bch_fs           *c;
+       struct disk_reservation *disk_res;
+       struct journal_res      journal_res;
+       u64                     *journal_seq;
+       struct extent_insert_hook *hook;
+       unsigned                flags;
+       bool                    did_work;
+
+       unsigned short          nr;
+       struct btree_insert_entry  *entries;
+};
+
+int __bch2_btree_insert_at(struct btree_insert *);
+
+#define BTREE_INSERT_ENTRY(_iter, _k)                                  \
+       ((struct btree_insert_entry) {                                  \
+               .iter           = (_iter),                              \
+               .k              = (_k),                                 \
+               .done           = false,                                \
+       })
+
+#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra)                        \
+       ((struct btree_insert_entry) {                                  \
+               .iter           = (_iter),                              \
+               .k              = (_k),                                 \
+               .extra_res = (_extra),                                  \
+               .done           = false,                                \
+       })
+
+/**
+ * bch2_btree_insert_at - insert one or more keys at iterator positions
+ * @_c:                        filesystem
+ * @_disk_res:         disk reservation
+ * @_hook:             extent insert callback
+ * @_journal_seq:      pointer to journal sequence number, or NULL
+ * @_flags:            BTREE_INSERT_* flags
+ *
+ * The remaining arguments are BTREE_INSERT_ENTRY()s, each pairing an iterator
+ * with the key to insert at that iterator's position.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+#define bch2_btree_insert_at(_c, _disk_res, _hook,                     \
+                           _journal_seq, _flags, ...)                  \
+       __bch2_btree_insert_at(&(struct btree_insert) {                 \
+               .c              = (_c),                                 \
+               .disk_res       = (_disk_res),                          \
+               .journal_seq    = (_journal_seq),                       \
+               .hook           = (_hook),                              \
+               .flags          = (_flags),                             \
+               .nr             = COUNT_ARGS(__VA_ARGS__),              \
+               .entries        = (struct btree_insert_entry[]) {       \
+                       __VA_ARGS__                                     \
+               }})
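+
+/*
+ * Illustrative sketch of a single-key insert (the iterator @iter, key @k,
+ * reservation and flags are hypothetical):
+ *
+ *     ret = bch2_btree_insert_at(c, &disk_res, NULL, &journal_seq,
+ *                                BTREE_INSERT_ATOMIC|BTREE_INSERT_NOFAIL,
+ *                                BTREE_INSERT_ENTRY(iter, k));
+ */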
+
+enum {
+       __BTREE_INSERT_ATOMIC,
+       __BTREE_INSERT_NOUNLOCK,
+       __BTREE_INSERT_NOFAIL,
+       __BTREE_INSERT_USE_RESERVE,
+       __BTREE_INSERT_USE_ALLOC_RESERVE,
+       __BTREE_INSERT_JOURNAL_REPLAY,
+       __BTREE_INSERT_NOWAIT,
+       __BTREE_INSERT_GC_LOCK_HELD,
+       __BCH_HASH_SET_MUST_CREATE,
+       __BCH_HASH_SET_MUST_REPLACE,
+};
+
+/*
+ * Don't drop/retake locks before doing btree update, instead return -EINTR if
+ * we had to drop locks for any reason
+ */
+#define BTREE_INSERT_ATOMIC            (1 << __BTREE_INSERT_ATOMIC)
+
+/*
+ * Don't drop locks _after_ successfully updating btree:
+ */
+#define BTREE_INSERT_NOUNLOCK          (1 << __BTREE_INSERT_NOUNLOCK)
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL            (1 << __BTREE_INSERT_NOFAIL)
+
+/* for copygc, or when merging btree nodes */
+#define BTREE_INSERT_USE_RESERVE       (1 << __BTREE_INSERT_USE_RESERVE)
+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
+
+/*
+ * Insert is for journal replay: don't get journal reservations, or mark extents
+ * (bch_mark_key)
+ */
+#define BTREE_INSERT_JOURNAL_REPLAY    (1 << __BTREE_INSERT_JOURNAL_REPLAY)
+
+/* Don't block on allocation failure (for new btree nodes): */
+#define BTREE_INSERT_NOWAIT            (1 << __BTREE_INSERT_NOWAIT)
+#define BTREE_INSERT_GC_LOCK_HELD      (1 << __BTREE_INSERT_GC_LOCK_HELD)
+
+#define BCH_HASH_SET_MUST_CREATE       (1 << __BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE      (1 << __BCH_HASH_SET_MUST_REPLACE)
+
+int bch2_btree_delete_at(struct btree_iter *, unsigned);
+
+int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
+                            struct disk_reservation *,
+                            struct extent_insert_hook *, u64 *, unsigned);
+
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
+                    struct disk_reservation *,
+                    struct extent_insert_hook *, u64 *, int flags);
+
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
+                          struct bpos, struct bpos, struct bversion,
+                          struct disk_reservation *,
+                          struct extent_insert_hook *, u64 *);
+
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
+                           __le64, unsigned);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+                              struct btree *, struct bkey_i_extent *);
+
+/* new transactional interface: */
+
+void bch2_trans_update(struct btree_trans *, struct btree_iter *,
+                            struct bkey_i *, unsigned);
+int bch2_trans_commit(struct btree_trans *,
+                     struct disk_reservation *,
+                     struct extent_insert_hook *,
+                     u64 *, unsigned);
+
+#define bch2_trans_do(_c, _journal_seq, _flags, _do)                   \
+({                                                                     \
+       struct btree_trans trans;                                       \
+       int _ret;                                                       \
+                                                                       \
+       bch2_trans_init(&trans, (_c));                                  \
+                                                                       \
+       do {                                                            \
+               bch2_trans_begin(&trans);                               \
+                                                                       \
+               _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL,   \
+                                       (_journal_seq), (_flags));      \
+       } while (_ret == -EINTR);                                       \
+                                                                       \
+       bch2_trans_exit(&trans);                                        \
+       _ret;                                                           \
+})
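+
+/*
+ * Illustrative sketch (the helper and its arguments are hypothetical): the
+ * helper builds the transaction's iterators and updates against &trans, and
+ * bch2_trans_do() retries the whole sequence on -EINTR:
+ *
+ *     ret = bch2_trans_do(c, &journal_seq, BTREE_INSERT_NOFAIL,
+ *                         create_thing(&trans, arg));
+ */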
+
+#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
new file mode 100644 (file)
index 0000000..1710efd
--- /dev/null
@@ -0,0 +1,2171 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/random.h>
+
+static void btree_node_will_make_reachable(struct btree_update *,
+                                          struct btree *);
+static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
+
+/* Debug code: */
+
+static void btree_node_interior_verify(struct btree *b)
+{
+       struct btree_node_iter iter;
+       struct bkey_packed *k;
+
+       BUG_ON(!b->level);
+
+       bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
+#if 1
+       BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
+              bkey_cmp_left_packed(b, k, &b->key.k.p));
+
+       BUG_ON((bch2_btree_node_iter_advance(&iter, b),
+               !bch2_btree_node_iter_end(&iter)));
+#else
+       const char *msg;
+
+       msg = "not found";
+       k = bch2_btree_node_iter_peek(&iter, b);
+       if (!k)
+               goto err;
+
+       msg = "isn't what it should be";
+       if (bkey_cmp_left_packed(b, k, &b->key.k.p))
+               goto err;
+
+       bch2_btree_node_iter_advance(&iter, b);
+
+       msg = "isn't last key";
+       if (!bch2_btree_node_iter_end(&iter))
+               goto err;
+       return;
+err:
+       bch2_dump_btree_node(b);
+       printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
+              b->key.k.p.offset, msg);
+       BUG();
+#endif
+}
+
+/* Calculate ideal packed bkey format for new btree nodes: */
+
+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+{
+       struct bkey_packed *k;
+       struct bset_tree *t;
+       struct bkey uk;
+
+       bch2_bkey_format_add_pos(s, b->data->min_key);
+
+       for_each_bset(b, t)
+               for (k = btree_bkey_first(b, t);
+                    k != btree_bkey_last(b, t);
+                    k = bkey_next(k))
+                       if (!bkey_whiteout(k)) {
+                               uk = bkey_unpack_key(b, k);
+                               bch2_bkey_format_add_key(s, &uk);
+                       }
+}
+
+static struct bkey_format bch2_btree_calc_format(struct btree *b)
+{
+       struct bkey_format_state s;
+
+       bch2_bkey_format_init(&s);
+       __bch2_btree_calc_format(&s, b);
+
+       return bch2_bkey_format_done(&s);
+}
+
+static size_t btree_node_u64s_with_format(struct btree *b,
+                                         struct bkey_format *new_f)
+{
+       struct bkey_format *old_f = &b->format;
+
+       /* stupid integer promotion rules */
+       ssize_t delta =
+           (((int) new_f->key_u64s - old_f->key_u64s) *
+            (int) b->nr.packed_keys) +
+           (((int) new_f->key_u64s - BKEY_U64s) *
+            (int) b->nr.unpacked_keys);
+
+       BUG_ON(delta + b->nr.live_u64s < 0);
+
+       return b->nr.live_u64s + delta;
+}
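+
+/*
+ * Worked example, with made up numbers: with 100 packed and 10 unpacked keys,
+ * old key_u64s 3, new key_u64s 2 and (say) BKEY_U64s 5, delta comes to
+ * (2 - 3) * 100 + (2 - 5) * 10 = -130, i.e. rewriting with the new format
+ * would shrink the node by 130 u64s.
+ */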
+
+/**
+ * bch2_btree_node_format_fits - check if we could rewrite a node with a new format
+ *
+ * This assumes all keys can pack with the new format -- it just checks if
+ * the re-packed keys would fit inside the node itself.
+ */
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
+                                struct bkey_format *new_f)
+{
+       size_t u64s = btree_node_u64s_with_format(b, new_f);
+
+       return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+}
+
+/* Btree node freeing/allocation: */
+
+static bool btree_key_matches(struct bch_fs *c,
+                             struct bkey_s_c_extent l,
+                             struct bkey_s_c_extent r)
+{
+       const struct bch_extent_ptr *ptr1, *ptr2;
+
+       extent_for_each_ptr(l, ptr1)
+               extent_for_each_ptr(r, ptr2)
+                       if (ptr1->dev == ptr2->dev &&
+                           ptr1->gen == ptr2->gen &&
+                           ptr1->offset == ptr2->offset)
+                               return true;
+
+       return false;
+}
+
+/*
+ * We're doing the index update that makes @b unreachable; update accounting
+ * to reflect that.
+ *
+ * Must be called _before_ btree_update_updated_root() or
+ * btree_update_updated_node():
+ */
+static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
+                                      struct bkey_s_c k,
+                                      struct bch_fs_usage *stats)
+{
+       struct bch_fs *c = as->c;
+       struct pending_btree_node_free *d;
+       unsigned replicas;
+
+       /*
+        * The btree_interior_update_lock is only needed here to avoid racing
+        * with gc:
+        */
+       mutex_lock(&c->btree_interior_update_lock);
+
+       for (d = as->pending; d < as->pending + as->nr_pending; d++)
+               if (!bkey_cmp(k.k->p, d->key.k.p) &&
+                   btree_key_matches(c, bkey_s_c_to_extent(k),
+                                     bkey_i_to_s_c_extent(&d->key)))
+                       goto found;
+       BUG();
+found:
+       BUG_ON(d->index_update_done);
+       d->index_update_done = true;
+
+       /*
+        * Btree nodes are accounted as freed in bch_alloc_stats when they're
+        * freed from the index:
+        */
+       replicas = bch2_extent_nr_dirty_ptrs(k);
+       if (replicas)
+               stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size;
+
+       /*
+        * We're dropping @k from the btree, but it's still live until the
+        * index update is persistent so we need to keep a reference around for
+        * mark and sweep to find - that's primarily what the
+        * btree_node_pending_free list is for.
+        *
+        * So here (when we set index_update_done = true), we're moving an
+        * existing reference to a different part of the larger "gc keyspace" -
+        * and the new position comes after the old position, since GC marks
+        * the pending free list after it walks the btree.
+        *
+        * If we move the reference while mark and sweep is _between_ the old
+        * and the new position, mark and sweep will see the reference twice
+        * and it'll get double accounted - so check for that here and subtract
+        * to cancel out one of mark and sweep's markings if necessary:
+        */
+
+       /*
+        * bch2_mark_key() compares the current gc pos to the pos we're
+        * moving this reference from, hence one comparison here:
+        */
+       if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
+               struct bch_fs_usage tmp = { 0 };
+
+               bch2_mark_key(c, bkey_i_to_s_c(&d->key),
+                            -c->opts.btree_node_size, true, b
+                            ? gc_pos_btree_node(b)
+                            : gc_pos_btree_root(as->btree_id),
+                            &tmp, 0, 0);
+               /*
+                * Don't apply tmp - pending deletes aren't tracked in
+                * bch_alloc_stats:
+                */
+       }
+
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
+{
+       trace_btree_node_free(c, b);
+
+       BUG_ON(btree_node_dirty(b));
+       BUG_ON(btree_node_need_write(b));
+       BUG_ON(b == btree_node_root(c, b));
+       BUG_ON(b->ob.nr);
+       BUG_ON(!list_empty(&b->write_blocked));
+       BUG_ON(b->will_make_reachable);
+
+       clear_btree_node_noevict(b);
+
+       bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+       mutex_lock(&c->btree_cache.lock);
+       list_move(&b->list, &c->btree_cache.freeable);
+       mutex_unlock(&c->btree_cache.lock);
+}
+
+void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
+{
+       struct btree_ob_ref ob = b->ob;
+
+       btree_update_drop_new_node(c, b);
+
+       b->ob.nr = 0;
+
+       clear_btree_node_dirty(b);
+
+       btree_node_lock_type(c, b, SIX_LOCK_write);
+       __btree_node_free(c, b);
+       six_unlock_write(&b->lock);
+
+       bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
+}
+
+void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
+                               struct btree_iter *iter)
+{
+       /*
+        * Is this a node that isn't reachable on disk yet?
+        *
+        * Nodes that aren't reachable yet have writes blocked until they're
+        * reachable - now that we've cancelled any pending writes and moved
+        * things waiting on that write to wait on this update, we can drop this
+        * node from the list of nodes that the other update is making
+        * reachable, prior to freeing it:
+        */
+       btree_update_drop_new_node(c, b);
+
+       __bch2_btree_node_lock_write(b, iter);
+       __btree_node_free(c, b);
+       six_unlock_write(&b->lock);
+
+       bch2_btree_iter_node_drop(iter, b);
+}
+
+static void bch2_btree_node_free_ondisk(struct bch_fs *c,
+                                       struct pending_btree_node_free *pending)
+{
+       struct bch_fs_usage stats = { 0 };
+
+       BUG_ON(!pending->index_update_done);
+
+       bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+                    -c->opts.btree_node_size, true,
+                    gc_phase(GC_PHASE_PENDING_DELETE),
+                    &stats, 0, 0);
+       /*
+        * Don't apply stats - pending deletes aren't tracked in
+        * bch_alloc_stats:
+        */
+}
+
+void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
+{
+       bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs);
+}
+
+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+                                            struct disk_reservation *res,
+                                            struct closure *cl,
+                                            unsigned flags)
+{
+       struct write_point *wp;
+       struct btree *b;
+       BKEY_PADDED(k) tmp;
+       struct bkey_i_extent *e;
+       struct btree_ob_ref ob;
+       struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
+       unsigned nr_reserve;
+       enum alloc_reserve alloc_reserve;
+
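+       /*
+        * Pick which open bucket reserve to allocate from, and how many
+        * cached btree node allocations to leave in btree_reserve_cache for
+        * callers with stricter requirements:
+        */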
+       if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+               nr_reserve      = 0;
+               alloc_reserve   = RESERVE_ALLOC;
+       } else if (flags & BTREE_INSERT_USE_RESERVE) {
+               nr_reserve      = BTREE_NODE_RESERVE / 2;
+               alloc_reserve   = RESERVE_BTREE;
+       } else {
+               nr_reserve      = BTREE_NODE_RESERVE;
+               alloc_reserve   = RESERVE_NONE;
+       }
+
+       mutex_lock(&c->btree_reserve_cache_lock);
+       if (c->btree_reserve_cache_nr > nr_reserve) {
+               struct btree_alloc *a =
+                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+               ob = a->ob;
+               bkey_copy(&tmp.k, &a->k);
+               mutex_unlock(&c->btree_reserve_cache_lock);
+               goto mem_alloc;
+       }
+       mutex_unlock(&c->btree_reserve_cache_lock);
+
+retry:
+       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
+                                     writepoint_ptr(&c->btree_write_point),
+                                     &devs_have,
+                                     res->nr_replicas,
+                                     c->opts.metadata_replicas_required,
+                                     alloc_reserve, 0, cl);
+       if (IS_ERR(wp))
+               return ERR_CAST(wp);
+
+       if (wp->sectors_free < c->opts.btree_node_size) {
+               struct open_bucket *ob;
+               unsigned i;
+
+               writepoint_for_each_ptr(wp, ob, i)
+                       if (ob->sectors_free < c->opts.btree_node_size)
+                               ob->sectors_free = 0;
+
+               bch2_alloc_sectors_done(c, wp);
+               goto retry;
+       }
+
+       e = bkey_extent_init(&tmp.k);
+       bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
+
+       ob.nr = 0;
+       bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);
+       bch2_alloc_sectors_done(c, wp);
+mem_alloc:
+       b = bch2_btree_node_mem_alloc(c);
+
+       /* we hold cannibalize_lock: */
+       BUG_ON(IS_ERR(b));
+       BUG_ON(b->ob.nr);
+
+       bkey_copy(&b->key, &tmp.k);
+       b->ob = ob;
+
+       return b;
+}
+
+static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
+{
+       struct bch_fs *c = as->c;
+       struct btree *b;
+
+       BUG_ON(level >= BTREE_MAX_DEPTH);
+       BUG_ON(!as->reserve->nr);
+
+       b = as->reserve->b[--as->reserve->nr];
+
+       BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
+
+       set_btree_node_accessed(b);
+       set_btree_node_dirty(b);
+
+       bch2_bset_init_first(b, &b->data->keys);
+       memset(&b->nr, 0, sizeof(b->nr));
+       b->data->magic = cpu_to_le64(bset_magic(c));
+       b->data->flags = 0;
+       SET_BTREE_NODE_ID(b->data, as->btree_id);
+       SET_BTREE_NODE_LEVEL(b->data, level);
+       b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
+
+       bch2_btree_build_aux_trees(b);
+
+       btree_node_will_make_reachable(as, b);
+
+       trace_btree_node_alloc(c, b);
+       return b;
+}
+
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
+                                                 struct btree *b,
+                                                 struct bkey_format format)
+{
+       struct btree *n;
+
+       n = bch2_btree_node_alloc(as, b->level);
+
+       n->data->min_key        = b->data->min_key;
+       n->data->max_key        = b->data->max_key;
+       n->data->format         = format;
+       SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+
+       btree_node_set_format(n, format);
+
+       bch2_btree_sort_into(as->c, n, b);
+
+       btree_node_reset_sib_u64s(n);
+
+       n->key.k.p = b->key.k.p;
+       return n;
+}
+
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+                                                      struct btree *b)
+{
+       struct bkey_format new_f = bch2_btree_calc_format(b);
+
+       /*
+        * The keys might expand with the new format - if they wouldn't fit in
+        * the btree node anymore, use the old format for now:
+        */
+       if (!bch2_btree_node_format_fits(as->c, b, &new_f))
+               new_f = b->format;
+
+       return __bch2_btree_node_alloc_replacement(as, b, new_f);
+}
+
+static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
+{
+       struct btree *b = bch2_btree_node_alloc(as, level);
+
+       b->data->min_key = POS_MIN;
+       b->data->max_key = POS_MAX;
+       b->data->format = bch2_btree_calc_format(b);
+       b->key.k.p = POS_MAX;
+
+       btree_node_set_format(b, b->data->format);
+       bch2_btree_build_aux_trees(b);
+
+       six_unlock_write(&b->lock);
+
+       return b;
+}
+
+static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
+{
+       bch2_disk_reservation_put(c, &reserve->disk_res);
+
+       mutex_lock(&c->btree_reserve_cache_lock);
+
+       while (reserve->nr) {
+               struct btree *b = reserve->b[--reserve->nr];
+
+               six_unlock_write(&b->lock);
+
+               if (c->btree_reserve_cache_nr <
+                   ARRAY_SIZE(c->btree_reserve_cache)) {
+                       struct btree_alloc *a =
+                               &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+                       a->ob = b->ob;
+                       b->ob.nr = 0;
+                       bkey_copy(&a->k, &b->key);
+               } else {
+                       bch2_btree_open_bucket_put(c, b);
+               }
+
+               btree_node_lock_type(c, b, SIX_LOCK_write);
+               __btree_node_free(c, b);
+               six_unlock_write(&b->lock);
+
+               six_unlock_intent(&b->lock);
+       }
+
+       mutex_unlock(&c->btree_reserve_cache_lock);
+
+       mempool_free(reserve, &c->btree_reserve_pool);
+}
+
+static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
+                                                   unsigned nr_nodes,
+                                                   unsigned flags,
+                                                   struct closure *cl)
+{
+       struct btree_reserve *reserve;
+       struct btree *b;
+       struct disk_reservation disk_res = { 0, 0 };
+       unsigned sectors = nr_nodes * c->opts.btree_node_size;
+       int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
+
+       if (flags & BTREE_INSERT_NOFAIL)
+               disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
+
+       /*
+        * This check isn't necessary for correctness - it's just to potentially
+        * prevent us from doing a lot of work that'll end up being wasted:
+        */
+       ret = bch2_journal_error(&c->journal);
+       if (ret)
+               return ERR_PTR(ret);
+
+       if (bch2_disk_reservation_get(c, &disk_res, sectors,
+                                     c->opts.metadata_replicas,
+                                     disk_res_flags))
+               return ERR_PTR(-ENOSPC);
+
+       BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+
+       /*
+        * Protects reaping from the btree node cache and using the btree node
+        * open bucket reserve:
+        */
+       ret = bch2_btree_cache_cannibalize_lock(c, cl);
+       if (ret) {
+               bch2_disk_reservation_put(c, &disk_res);
+               return ERR_PTR(ret);
+       }
+
+       reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
+
+       reserve->disk_res = disk_res;
+       reserve->nr = 0;
+
+       while (reserve->nr < nr_nodes) {
+               b = __bch2_btree_node_alloc(c, &disk_res,
+                                           flags & BTREE_INSERT_NOWAIT
+                                           ? NULL : cl, flags);
+               if (IS_ERR(b)) {
+                       ret = PTR_ERR(b);
+                       goto err_free;
+               }
+
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                             bkey_i_to_s_c(&b->key));
+               if (ret)
+                       goto err_free;
+
+               reserve->b[reserve->nr++] = b;
+       }
+
+       bch2_btree_cache_cannibalize_unlock(c);
+       return reserve;
+err_free:
+       bch2_btree_reserve_put(c, reserve);
+       bch2_btree_cache_cannibalize_unlock(c);
+       trace_btree_reserve_get_fail(c, nr_nodes, cl);
+       return ERR_PTR(ret);
+}
+
+/* Asynchronous interior node update machinery */
+
+static void bch2_btree_update_free(struct btree_update *as)
+{
+       struct bch_fs *c = as->c;
+
+       BUG_ON(as->nr_new_nodes);
+       BUG_ON(as->nr_pending);
+
+       if (as->reserve)
+               bch2_btree_reserve_put(c, as->reserve);
+
+       mutex_lock(&c->btree_interior_update_lock);
+       list_del(&as->list);
+
+       closure_debug_destroy(&as->cl);
+       mempool_free(as, &c->btree_interior_update_pool);
+       percpu_ref_put(&c->writes);
+
+       closure_wake_up(&c->btree_interior_update_wait);
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_update_nodes_reachable(struct closure *cl)
+{
+       struct btree_update *as = container_of(cl, struct btree_update, cl);
+       struct bch_fs *c = as->c;
+
+       bch2_journal_pin_drop(&c->journal, &as->journal);
+
+       mutex_lock(&c->btree_interior_update_lock);
+
+       while (as->nr_new_nodes) {
+               struct btree *b = as->new_nodes[--as->nr_new_nodes];
+
+               BUG_ON(b->will_make_reachable != (unsigned long) as);
+               b->will_make_reachable = 0;
+               mutex_unlock(&c->btree_interior_update_lock);
+
+               /*
+                * b->will_make_reachable prevented it from being written, so
+                * write it now if it needs to be written:
+                */
+               btree_node_lock_type(c, b, SIX_LOCK_read);
+               bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
+               six_unlock_read(&b->lock);
+               mutex_lock(&c->btree_interior_update_lock);
+       }
+
+       while (as->nr_pending)
+               bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       closure_wake_up(&as->wait);
+
+       bch2_btree_update_free(as);
+}
+
+static void btree_update_wait_on_journal(struct closure *cl)
+{
+       struct btree_update *as = container_of(cl, struct btree_update, cl);
+       struct bch_fs *c = as->c;
+       int ret;
+
+       ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
+       if (ret < 0)
+               goto err;
+       if (!ret) {
+               continue_at(cl, btree_update_wait_on_journal, system_wq);
+               return;
+       }
+
+       bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
+err:
+       continue_at(cl, btree_update_nodes_reachable, system_wq);
+}
+
+static void btree_update_nodes_written(struct closure *cl)
+{
+       struct btree_update *as = container_of(cl, struct btree_update, cl);
+       struct bch_fs *c = as->c;
+       struct btree *b;
+
+       /*
+        * We did an update to a parent node where the pointers we added pointed
+        * to child nodes that weren't written yet: now, the child nodes have
+        * been written so we can write out the update to the interior node.
+        */
+retry:
+       mutex_lock(&c->btree_interior_update_lock);
+       as->nodes_written = true;
+
+       switch (as->mode) {
+       case BTREE_INTERIOR_NO_UPDATE:
+               BUG();
+       case BTREE_INTERIOR_UPDATING_NODE:
+               /* The usual case: */
+               b = READ_ONCE(as->b);
+
+               if (!six_trylock_read(&b->lock)) {
+                       mutex_unlock(&c->btree_interior_update_lock);
+                       btree_node_lock_type(c, b, SIX_LOCK_read);
+                       six_unlock_read(&b->lock);
+                       goto retry;
+               }
+
+               BUG_ON(!btree_node_dirty(b));
+               closure_wait(&btree_current_write(b)->wait, cl);
+
+               list_del(&as->write_blocked_list);
+               mutex_unlock(&c->btree_interior_update_lock);
+
+               /*
+                * b->write_blocked prevented it from being written, so
+                * write it now if it needs to be written:
+                */
+               bch2_btree_node_write_cond(c, b, true);
+               six_unlock_read(&b->lock);
+               break;
+
+       case BTREE_INTERIOR_UPDATING_AS:
+               /*
+                * The btree node we originally updated has been freed and is
+                * being rewritten - so we don't need to write anything here, we just
+                * need to signal to that btree_update that it's ok to make the
+                * new replacement node visible:
+                */
+               closure_put(&as->parent_as->cl);
+
+               /*
+                * and then we have to wait on that btree_update to finish:
+                */
+               closure_wait(&as->parent_as->wait, cl);
+               mutex_unlock(&c->btree_interior_update_lock);
+               break;
+
+       case BTREE_INTERIOR_UPDATING_ROOT:
+               /* b is the new btree root: */
+               b = READ_ONCE(as->b);
+
+               if (!six_trylock_read(&b->lock)) {
+                       mutex_unlock(&c->btree_interior_update_lock);
+                       btree_node_lock_type(c, b, SIX_LOCK_read);
+                       six_unlock_read(&b->lock);
+                       goto retry;
+               }
+
+               BUG_ON(c->btree_roots[b->btree_id].as != as);
+               c->btree_roots[b->btree_id].as = NULL;
+
+               bch2_btree_set_root_ondisk(c, b, WRITE);
+
+               /*
+                * We don't have to wait on anything here (before
+                * btree_update_nodes_reachable frees the old nodes
+                * ondisk) - we've ensured that the very next journal write will
+                * have the pointer to the new root, and before the allocator
+                * can reuse the old nodes it'll have to do a journal commit:
+                */
+               six_unlock_read(&b->lock);
+               mutex_unlock(&c->btree_interior_update_lock);
+
+               /*
+                * Bit of funny circularity going on here we have to break:
+                *
+                * We have to drop our journal pin before writing the journal
+                * entry that points to the new btree root: else, we could
+                * deadlock if the journal currently happens to be full.
+                *
+                * This means we're dropping the journal pin _before_ the new
+                * nodes are technically reachable - but this is safe, because
+                * after the bch2_btree_set_root_ondisk() call above they will
+                * be reachable as of the very next journal write:
+                */
+               bch2_journal_pin_drop(&c->journal, &as->journal);
+
+               as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
+
+               btree_update_wait_on_journal(cl);
+               return;
+       }
+
+       continue_at(cl, btree_update_nodes_reachable, system_wq);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ */
+static void btree_update_updated_node(struct btree_update *as, struct btree *b)
+{
+       struct bch_fs *c = as->c;
+
+       mutex_lock(&c->btree_interior_update_lock);
+
+       BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+       BUG_ON(!btree_node_dirty(b));
+
+       as->mode = BTREE_INTERIOR_UPDATING_NODE;
+       as->b = b;
+       list_add(&as->write_blocked_list, &b->write_blocked);
+
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       /*
+        * In general, when you're staging things in a journal that will later
+        * be written elsewhere, and you also want to guarantee ordering: that
+        * is, if you have updates a, b, c, after a crash you should never see c
+        * without also seeing a and b - there's a problem:
+        *
+        * If the final destination of the update(s) (i.e. btree node) can be
+        * written/flushed _before_ the relevant journal entry - oops, that
+        * breaks ordering, since the various leaf nodes can be written in any
+        * order.
+        *
+        * Normally we use bset->journal_seq to deal with this - if during
+        * recovery we find a btree node write that's newer than the newest
+        * journal entry, we just ignore it - we don't need it, anything we're
+        * supposed to have (that we reported as completed via fsync()) will
+        * still be in the journal, and as far as the state of the journal is
+        * concerned that btree node write never happened.
+        *
+        * That breaks when we're rewriting/splitting/merging nodes, since we're
+        * mixing btree node writes that haven't happened yet with previously
+        * written data that has been reported as completed to the journal.
+        *
+        * Thus, before making the new nodes reachable, we have to wait for
+        * the newest journal sequence number we have data for to be written
+        * (if it hasn't been yet).
+        */
+       bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
+}
+
+static void interior_update_flush(struct journal *j,
+                       struct journal_entry_pin *pin, u64 seq)
+{
+       struct btree_update *as =
+               container_of(pin, struct btree_update, journal);
+
+       bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
+}
+
+static void btree_update_reparent(struct btree_update *as,
+                                 struct btree_update *child)
+{
+       struct bch_fs *c = as->c;
+
+       child->b = NULL;
+       child->mode = BTREE_INTERIOR_UPDATING_AS;
+       child->parent_as = as;
+       closure_get(&as->cl);
+
+       /*
+        * When we write a new btree root, we have to drop our journal pin
+        * _before_ the new nodes are technically reachable; see
+        * btree_update_nodes_written().
+        *
+        * This goes for journal pins that are recursively blocked on us - so,
+        * just transfer the journal pin to the new interior update so
+        * btree_update_nodes_written() can drop it.
+        */
+       bch2_journal_pin_add_if_older(&c->journal, &child->journal,
+                                     &as->journal, interior_update_flush);
+       bch2_journal_pin_drop(&c->journal, &child->journal);
+
+       as->journal_seq = max(as->journal_seq, child->journal_seq);
+}
+
+static void btree_update_updated_root(struct btree_update *as)
+{
+       struct bch_fs *c = as->c;
+       struct btree_root *r = &c->btree_roots[as->btree_id];
+
+       mutex_lock(&c->btree_interior_update_lock);
+
+       BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+
+       /*
+        * Old root might not be persistent yet - if so, redirect its
+        * btree_update operation to point to us:
+        */
+       if (r->as)
+               btree_update_reparent(as, r->as);
+
+       as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+       as->b = r->b;
+       r->as = as;
+
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       /*
+        * When we're rewriting nodes and updating interior nodes, there's an
+        * issue with updates that haven't been written in the journal getting
+        * mixed together with older data - see btree_update_updated_node()
+        * for the explanation.
+        *
+        * However, this doesn't affect us when we're writing a new btree root -
+        * because to make that new root reachable we have to write out a new
+        * journal entry, which must necessarily be newer than as->journal_seq.
+        */
+}
+
+static void btree_node_will_make_reachable(struct btree_update *as,
+                                          struct btree *b)
+{
+       struct bch_fs *c = as->c;
+
+       mutex_lock(&c->btree_interior_update_lock);
+       BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
+       BUG_ON(b->will_make_reachable);
+
+       as->new_nodes[as->nr_new_nodes++] = b;
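+       /*
+        * Tagged pointer back to @as - the low bit records that the
+        * closure_get() below was taken on this node's behalf:
+        */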
+       b->will_make_reachable = 1UL|(unsigned long) as;
+
+       closure_get(&as->cl);
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
+{
+       struct btree_update *as;
+       unsigned long v;
+       unsigned i;
+
+       mutex_lock(&c->btree_interior_update_lock);
+       v = xchg(&b->will_make_reachable, 0);
+       as = (struct btree_update *) (v & ~1UL);
+
+       if (!as) {
+               mutex_unlock(&c->btree_interior_update_lock);
+               return;
+       }
+
+       for (i = 0; i < as->nr_new_nodes; i++)
+               if (as->new_nodes[i] == b)
+                       goto found;
+
+       BUG();
+found:
+       array_remove_item(as->new_nodes, as->nr_new_nodes, i);
+       mutex_unlock(&c->btree_interior_update_lock);
+
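+       /*
+        * Drop the ref taken in btree_node_will_make_reachable(), if it's
+        * still held:
+        */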
+       if (v & 1)
+               closure_put(&as->cl);
+}
+
+static void btree_interior_update_add_node_reference(struct btree_update *as,
+                                                    struct btree *b)
+{
+       struct bch_fs *c = as->c;
+       struct pending_btree_node_free *d;
+
+       mutex_lock(&c->btree_interior_update_lock);
+
+       /* Add this node to the list of nodes being freed: */
+       BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+       d = &as->pending[as->nr_pending++];
+       d->index_update_done    = false;
+       d->seq                  = b->data->keys.seq;
+       d->btree_id             = b->btree_id;
+       d->level                = b->level;
+       bkey_copy(&d->key, &b->key);
+
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding btree_updates - redirect @b's
+ * btree_updates to point to this btree_update:
+ */
+void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+                                              struct btree *b)
+{
+       struct bch_fs *c = as->c;
+       struct closure *cl, *cl_n;
+       struct btree_update *p, *n;
+       struct btree_write *w;
+       struct bset_tree *t;
+
+       set_btree_node_dying(b);
+
+       if (btree_node_fake(b))
+               return;
+
+       btree_interior_update_add_node_reference(as, b);
+
+       /*
+        * Does this node have data that hasn't been written in the journal?
+        *
+        * If so, we have to wait for the corresponding journal entry to be
+        * written before making the new nodes reachable - we can't just carry
+        * over the bset->journal_seq tracking, since we'll be mixing those keys
+        * in with keys that aren't in the journal anymore:
+        */
+       for_each_bset(b, t)
+               as->journal_seq = max(as->journal_seq,
+                                     le64_to_cpu(bset(b, t)->journal_seq));
+
+       mutex_lock(&c->btree_interior_update_lock);
+
+       /*
+        * Does this node have any btree_update operations preventing
+        * it from being written?
+        *
+        * If so, redirect them to point to this btree_update: we can
+        * write out our new nodes, but we won't make them visible until those
+        * operations complete
+        */
+       list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
+               list_del(&p->write_blocked_list);
+               btree_update_reparent(as, p);
+       }
+
+       clear_btree_node_dirty(b);
+       clear_btree_node_need_write(b);
+       w = btree_current_write(b);
+
+       /*
+        * Does this node have any btree_update operations waiting on this node
+        * to be written?
+        *
+        * If so, wake them up when this btree_update operation is reachable:
+        */
+       llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
+               llist_add(&cl->list, &as->wait.list);
+
+       /*
+        * Does this node have unwritten data that has a pin on the journal?
+        *
+        * If so, transfer that pin to the btree_update operation -
+        * note that if we're freeing multiple nodes, we only need to keep the
+        * oldest pin of any of the nodes we're freeing. We'll release the pin
+        * when the new nodes are persistent and reachable on disk:
+        */
+       bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+                                     &as->journal, interior_update_flush);
+       bch2_journal_pin_drop(&c->journal, &w->journal);
+
+       w = btree_prev_write(b);
+       bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+                                     &as->journal, interior_update_flush);
+       bch2_journal_pin_drop(&c->journal, &w->journal);
+
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+void bch2_btree_update_done(struct btree_update *as)
+{
+       BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+
+       bch2_btree_reserve_put(as->c, as->reserve);
+       as->reserve = NULL;
+
+       continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
+}
+
+struct btree_update *
+bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
+                       unsigned nr_nodes, unsigned flags,
+                       struct closure *cl)
+{
+       struct btree_reserve *reserve;
+       struct btree_update *as;
+
+       if (unlikely(!percpu_ref_tryget(&c->writes)))
+               return ERR_PTR(-EROFS);
+
+       reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
+       if (IS_ERR(reserve)) {
+               percpu_ref_put(&c->writes);
+               return ERR_CAST(reserve);
+       }
+
+       as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
+       memset(as, 0, sizeof(*as));
+       closure_init(&as->cl, NULL);
+       as->c           = c;
+       as->mode        = BTREE_INTERIOR_NO_UPDATE;
+       as->btree_id    = id;
+       as->reserve     = reserve;
+       INIT_LIST_HEAD(&as->write_blocked_list);
+
+       bch2_keylist_init(&as->parent_keys, as->inline_keys);
+
+       mutex_lock(&c->btree_interior_update_lock);
+       list_add_tail(&as->list, &c->btree_interior_update_list);
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       return as;
+}
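+
+/*
+ * Sketch of the typical lifecycle of a btree_update (see btree_split() and
+ * bch2_btree_split_leaf() below for real users):
+ *
+ *     as = bch2_btree_update_start(c, btree_id, nr_nodes, flags, &cl);
+ *     bch2_btree_interior_update_will_free_node(as, b);
+ *     n = bch2_btree_node_alloc_replacement(as, b);
+ *     ... write @n, then insert its key into the parent or set it as root ...
+ *     bch2_btree_update_done(as);
+ */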
+
+/* Btree root updates: */
+
+static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
+{
+       /* Root nodes cannot be reaped */
+       mutex_lock(&c->btree_cache.lock);
+       list_del_init(&b->list);
+       mutex_unlock(&c->btree_cache.lock);
+
+       mutex_lock(&c->btree_root_lock);
+       BUG_ON(btree_node_root(c, b) &&
+              (b->level < btree_node_root(c, b)->level ||
+               !btree_node_dying(btree_node_root(c, b))));
+
+       btree_node_root(c, b) = b;
+       mutex_unlock(&c->btree_root_lock);
+
+       bch2_recalc_btree_reserve(c);
+}
+
+static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
+{
+       struct bch_fs *c = as->c;
+       struct btree *old = btree_node_root(c, b);
+       struct bch_fs_usage stats = { 0 };
+
+       __bch2_btree_set_root_inmem(c, b);
+
+       bch2_mark_key(c, bkey_i_to_s_c(&b->key),
+                     c->opts.btree_node_size, true,
+                     gc_pos_btree_root(b->btree_id),
+                     &stats, 0, 0);
+
+       if (old && !btree_node_fake(old))
+               bch2_btree_node_free_index(as, NULL,
+                                          bkey_i_to_s_c(&old->key),
+                                          &stats);
+       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                           gc_pos_btree_root(b->btree_id));
+}
+
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
+{
+       struct btree_root *r = &c->btree_roots[b->btree_id];
+
+       mutex_lock(&c->btree_root_lock);
+
+       BUG_ON(b != r->b);
+       bkey_copy(&r->key, &b->key);
+       r->level = b->level;
+       r->alive = true;
+       if (rw == WRITE)
+               c->btree_roots_dirty = true;
+
+       mutex_unlock(&c->btree_root_lock);
+}
+
+/**
+ * bch2_btree_set_root - update the root in memory and on disk
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks. However, you must hold an intent lock on the
+ * old root.
+ *
+ * Note: This allocates a journal entry but doesn't add any keys to
+ * it.  All the btree roots are part of every journal write, so there
+ * is nothing new to be done.  This just guarantees that there is a
+ * journal write.
+ */
+static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
+                               struct btree_iter *iter)
+{
+       struct bch_fs *c = as->c;
+       struct btree *old;
+
+       trace_btree_set_root(c, b);
+       BUG_ON(!b->written);
+
+       old = btree_node_root(c, b);
+
+       /*
+        * Ensure no one is using the old root while we switch to the
+        * new root:
+        */
+       bch2_btree_node_lock_write(old, iter);
+
+       bch2_btree_set_root_inmem(as, b);
+
+       btree_update_updated_root(as);
+
+       /*
+        * Unlock old root after new root is visible:
+        *
+        * The new root isn't persistent, but that's ok: we still have
+        * an intent lock on the new root, and any updates that would
+        * depend on the new root would have to update the new root.
+        */
+       bch2_btree_node_unlock_write(old, iter);
+}
+
+/* Interior node updates: */
+
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
+                                       struct btree_iter *iter,
+                                       struct bkey_i *insert,
+                                       struct btree_node_iter *node_iter)
+{
+       struct bch_fs *c = as->c;
+       struct bch_fs_usage stats = { 0 };
+       struct bkey_packed *k;
+       struct bkey tmp;
+
+       BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
+
+       if (bkey_extent_is_data(&insert->k))
+               bch2_mark_key(c, bkey_i_to_s_c(insert),
+                            c->opts.btree_node_size, true,
+                            gc_pos_btree_node(b), &stats, 0, 0);
+
+       while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
+              !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
+               bch2_btree_node_iter_advance(node_iter, b);
+
+       /*
+        * If we're overwriting, look up the old key's pending delete entry and
+        * mark it done, so that gc marks it on the pending delete list:
+        */
+       if (k && !bkey_cmp_packed(b, k, &insert->k))
+               bch2_btree_node_free_index(as, b,
+                                          bkey_disassemble(b, k, &tmp),
+                                          &stats);
+
+       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                           gc_pos_btree_node(b));
+
+       bch2_btree_bset_insert_key(iter, b, node_iter, insert);
+       set_btree_node_dirty(b);
+       set_btree_node_need_write(b);
+}
+
+/*
+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher
+ * node)
+ */
+static struct btree *__btree_split_node(struct btree_update *as,
+                                       struct btree *n1,
+                                       struct btree_iter *iter)
+{
+       size_t nr_packed = 0, nr_unpacked = 0;
+       struct btree *n2;
+       struct bset *set1, *set2;
+       struct bkey_packed *k, *prev = NULL;
+
+       n2 = bch2_btree_node_alloc(as, n1->level);
+
+       n2->data->max_key       = n1->data->max_key;
+       n2->data->format        = n1->format;
+       SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
+       n2->key.k.p = n1->key.k.p;
+
+       btree_node_set_format(n2, n2->data->format);
+
+       set1 = btree_bset_first(n1);
+       set2 = btree_bset_first(n2);
+
+       /*
+        * Has to be a linear search because we don't have an auxiliary
+        * search tree yet
+        */
+       k = set1->start;
+       while (1) {
+               if (bkey_next(k) == vstruct_last(set1))
+                       break;
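+               /*
+                * Stop once roughly 3/5 of the u64s are staying in n1; @k and
+                * everything after it will be moved to n2:
+                */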
+               if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
+                       break;
+
+               if (bkey_packed(k))
+                       nr_packed++;
+               else
+                       nr_unpacked++;
+
+               prev = k;
+               k = bkey_next(k);
+       }
+
+       BUG_ON(!prev);
+
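+       /*
+        * n1 now ends at @prev, the last key we're keeping; n2 covers
+        * everything after it:
+        */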
+       n1->key.k.p = bkey_unpack_pos(n1, prev);
+       n1->data->max_key = n1->key.k.p;
+       n2->data->min_key =
+               btree_type_successor(n1->btree_id, n1->key.k.p);
+
+       set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
+       set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
+
+       set_btree_bset_end(n1, n1->set);
+       set_btree_bset_end(n2, n2->set);
+
+       n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
+       n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
+       n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
+       n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
+
+       n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
+       n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
+       n1->nr.packed_keys      = nr_packed;
+       n1->nr.unpacked_keys    = nr_unpacked;
+
+       BUG_ON(!set1->u64s);
+       BUG_ON(!set2->u64s);
+
+       memcpy_u64s(set2->start,
+                   vstruct_end(set1),
+                   le16_to_cpu(set2->u64s));
+
+       btree_node_reset_sib_u64s(n1);
+       btree_node_reset_sib_u64s(n2);
+
+       bch2_verify_btree_nr_keys(n1);
+       bch2_verify_btree_nr_keys(n2);
+
+       if (n1->level) {
+               btree_node_interior_verify(n1);
+               btree_node_interior_verify(n2);
+       }
+
+       return n2;
+}
+
+/*
+ * For updates to interior nodes, we've got to do the insert before we split
+ * because the stuff we're inserting has to be inserted atomically. Post split,
+ * the keys might have to go in different nodes and the split would no longer be
+ * atomic.
+ *
+ * Worse, if the insert is from btree node coalescing and we do the insert
+ * after the split (and pick the pivot), the pivot we pick might be between
+ * nodes that were coalesced, and thus in the middle of a child node post
+ * coalescing:
+ */
+static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
+                                   struct btree_iter *iter,
+                                   struct keylist *keys)
+{
+       struct btree_node_iter node_iter;
+       struct bkey_i *k = bch2_keylist_front(keys);
+       struct bkey_packed *p;
+       struct bset *i;
+
+       BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
+
+       bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
+
+       while (!bch2_keylist_empty(keys)) {
+               k = bch2_keylist_front(keys);
+
+               BUG_ON(bch_keylist_u64s(keys) >
+                      bch_btree_keys_u64s_remaining(as->c, b));
+               BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
+               BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
+
+               bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
+               bch2_keylist_pop_front(keys);
+       }
+
+       /*
+        * We can't tolerate whiteouts here - with whiteouts there can be
+        * duplicate keys, and it would be rather bad if we picked a duplicate
+        * for the pivot:
+        */
+       i = btree_bset_first(b);
+       p = i->start;
+       while (p != vstruct_last(i))
+               if (bkey_deleted(p)) {
+                       le16_add_cpu(&i->u64s, -p->u64s);
+                       set_btree_bset_end(b, b->set);
+                       memmove_u64s_down(p, bkey_next(p),
+                                         (u64 *) vstruct_last(i) -
+                                         (u64 *) p);
+               } else
+                       p = bkey_next(p);
+
+       BUG_ON(b->nsets != 1 ||
+              b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
+
+       btree_node_interior_verify(b);
+}
+
+static void btree_split(struct btree_update *as, struct btree *b,
+                       struct btree_iter *iter, struct keylist *keys,
+                       unsigned flags)
+{
+       struct bch_fs *c = as->c;
+       struct btree *parent = btree_node_parent(iter, b);
+       struct btree *n1, *n2 = NULL, *n3 = NULL;
+       u64 start_time = local_clock();
+
+       BUG_ON(!parent && (b != btree_node_root(c, b)));
+       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+
+       bch2_btree_interior_update_will_free_node(as, b);
+
+       n1 = bch2_btree_node_alloc_replacement(as, b);
+
+       if (keys)
+               btree_split_insert_keys(as, n1, iter, keys);
+
+       if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
+               trace_btree_split(c, b);
+
+               n2 = __btree_split_node(as, n1, iter);
+
+               bch2_btree_build_aux_trees(n2);
+               bch2_btree_build_aux_trees(n1);
+               six_unlock_write(&n2->lock);
+               six_unlock_write(&n1->lock);
+
+               bch2_btree_node_write(c, n2, SIX_LOCK_intent);
+
+               /*
+                * Note that on recursive splits, parent_keys == keys, so we
+                * can't start adding new keys to parent_keys before emptying it
+                * out (which we did with btree_split_insert_keys() above)
+                */
+               bch2_keylist_add(&as->parent_keys, &n1->key);
+               bch2_keylist_add(&as->parent_keys, &n2->key);
+
+               if (!parent) {
+                       /* Depth increases, make a new root */
+                       n3 = __btree_root_alloc(as, b->level + 1);
+
+                       n3->sib_u64s[0] = U16_MAX;
+                       n3->sib_u64s[1] = U16_MAX;
+
+                       btree_split_insert_keys(as, n3, iter, &as->parent_keys);
+
+                       bch2_btree_node_write(c, n3, SIX_LOCK_intent);
+               }
+       } else {
+               trace_btree_compact(c, b);
+
+               bch2_btree_build_aux_trees(n1);
+               six_unlock_write(&n1->lock);
+
+               bch2_keylist_add(&as->parent_keys, &n1->key);
+       }
+
+       bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+
+       /* New nodes all written, now make them visible: */
+
+       if (parent) {
+               /* Split a non root node */
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+       } else if (n3) {
+               bch2_btree_set_root(as, n3, iter);
+       } else {
+               /* Root filled up but didn't need to be split */
+               bch2_btree_set_root(as, n1, iter);
+       }
+
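+       /* The new nodes are visible now; release their open buckets: */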
+       bch2_btree_open_bucket_put(c, n1);
+       if (n2)
+               bch2_btree_open_bucket_put(c, n2);
+       if (n3)
+               bch2_btree_open_bucket_put(c, n3);
+
+       /*
+        * Note - at this point other linked iterators could still have @b read
+        * locked; we're depending on the bch2_btree_iter_node_replace() calls
+        * below removing all references to @b so we don't return with other
+        * iterators pointing to a node they have locked that's been freed.
+        *
+        * We have to free the node first because the
+        * bch2_btree_iter_node_replace() calls will drop _our_ iterator's
+        * reference - and intent lock - to @b.
+        */
+       bch2_btree_node_free_inmem(c, b, iter);
+
+       /* Successful split, update the iterator to point to the new nodes: */
+
+       if (n3)
+               bch2_btree_iter_node_replace(iter, n3);
+       if (n2)
+               bch2_btree_iter_node_replace(iter, n2);
+       bch2_btree_iter_node_replace(iter, n1);
+
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
+}
+
+static void
+bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
+                               struct btree_iter *iter, struct keylist *keys)
+{
+       struct btree_iter *linked;
+       struct btree_node_iter node_iter;
+       struct bkey_i *insert = bch2_keylist_front(keys);
+       struct bkey_packed *k;
+
+       /* Don't screw up @iter's position: */
+       node_iter = iter->l[b->level].iter;
+
+       /*
+        * btree_split(), btree_gc_coalesce() will insert keys before
+        * the iterator's current position - they know the keys go in
+        * the node the iterator points to:
+        */
+       while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+              (bkey_cmp_packed(b, k, &insert->k) >= 0))
+               ;
+
+       while (!bch2_keylist_empty(keys)) {
+               insert = bch2_keylist_front(keys);
+
+               bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
+               bch2_keylist_pop_front(keys);
+       }
+
+       btree_update_updated_node(as, b);
+
+       for_each_btree_iter_with_node(iter, b, linked)
+               bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
+
+       bch2_btree_iter_verify(iter, b);
+}
+
+/**
+ * bch2_btree_insert_node - insert bkeys into a given interior btree node
+ *
+ * @as:                        btree_update this insert is part of
+ * @b:                 interior node to insert into
+ * @iter:              btree iterator
+ * @keys:              list of keys to insert
+ * @flags:             BTREE_INSERT_* flags
+ *
+ * Inserts the keys into @b, splitting @b (and inserting the keys during the
+ * split) if they don't all fit: inserts into interior nodes have to be atomic.
+ */
+void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
+                           struct btree_iter *iter, struct keylist *keys,
+                           unsigned flags)
+{
+       struct bch_fs *c = as->c;
+       int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+       int old_live_u64s = b->nr.live_u64s;
+       int live_u64s_added, u64s_added;
+
+       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+       BUG_ON(!b->level);
+       BUG_ON(!as || as->b);
+       bch2_verify_keylist_sorted(keys);
+
+       if (as->must_rewrite)
+               goto split;
+
+       bch2_btree_node_lock_for_insert(c, b, iter);
+
+       if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
+               bch2_btree_node_unlock_write(b, iter);
+               goto split;
+       }
+
+       bch2_btree_insert_keys_interior(as, b, iter, keys);
+
+       live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+       u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+       if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+       if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+       if (u64s_added > live_u64s_added &&
+           bch2_maybe_compact_whiteouts(c, b))
+               bch2_btree_iter_reinit_node(iter, b);
+
+       bch2_btree_node_unlock_write(b, iter);
+
+       btree_node_interior_verify(b);
+
+       bch2_foreground_maybe_merge(c, iter, b->level, flags);
+       return;
+split:
+       btree_split(as, b, iter, keys, flags);
+}
+
+int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
+                         unsigned flags)
+{
+       struct btree *b = iter->l[0].b;
+       struct btree_update *as;
+       struct closure cl;
+       int ret = 0;
+       struct btree_iter *linked;
+
+       /*
+        * We already have a disk reservation and open buckets pinned; this
+        * allocation must not block:
+        */
+       for_each_btree_iter(iter, linked)
+               if (linked->btree_id == BTREE_ID_EXTENTS)
+                       flags |= BTREE_INSERT_USE_RESERVE;
+
+       closure_init_stack(&cl);
+
+       /* Hack, because gc and splitting nodes don't mix yet: */
+       if (!down_read_trylock(&c->gc_lock)) {
+               if (flags & BTREE_INSERT_NOUNLOCK)
+                       return -EINTR;
+
+               bch2_btree_iter_unlock(iter);
+               down_read(&c->gc_lock);
+
+               if (btree_iter_linked(iter))
+                       ret = -EINTR;
+       }
+
+       /*
+        * XXX: figure out how far we might need to split,
+        * instead of locking/reserving all the way to the root:
+        */
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX,
+                       !(flags & BTREE_INSERT_NOUNLOCK))) {
+               ret = -EINTR;
+               goto out;
+       }
+
+       as = bch2_btree_update_start(c, iter->btree_id,
+               btree_update_reserve_required(c, b), flags,
+               !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
+       if (IS_ERR(as)) {
+               ret = PTR_ERR(as);
+               if (ret == -EAGAIN) {
+                       BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
+                       bch2_btree_iter_unlock(iter);
+                       ret = -EINTR;
+               }
+               goto out;
+       }
+
+       btree_split(as, b, iter, NULL, flags);
+       bch2_btree_update_done(as);
+
+       /*
+        * We haven't successfully inserted yet, so don't downgrade all the way
+        * back to read locks;
+        */
+       __bch2_btree_iter_downgrade(iter, 1);
+out:
+       up_read(&c->gc_lock);
+       closure_sync(&cl);
+       return ret;
+}
+
+void __bch2_foreground_maybe_merge(struct bch_fs *c,
+                                  struct btree_iter *iter,
+                                  unsigned level,
+                                  unsigned flags,
+                                  enum btree_node_sibling sib)
+{
+       struct btree_update *as;
+       struct bkey_format_state new_s;
+       struct bkey_format new_f;
+       struct bkey_i delete;
+       struct btree *b, *m, *n, *prev, *next, *parent;
+       struct closure cl;
+       size_t sib_u64s;
+       int ret = 0;
+
+       closure_init_stack(&cl);
+retry:
+       BUG_ON(!btree_node_locked(iter, level));
+
+       b = iter->l[level].b;
+
+       parent = btree_node_parent(iter, b);
+       if (!parent)
+               goto out;
+
+       if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+               goto out;
+
+       /* XXX: can't be holding read locks */
+       m = bch2_btree_node_get_sibling(c, iter, b,
+                       !(flags & BTREE_INSERT_NOUNLOCK), sib);
+       if (IS_ERR(m)) {
+               ret = PTR_ERR(m);
+               goto err;
+       }
+
+       /* NULL means no sibling: */
+       if (!m) {
+               b->sib_u64s[sib] = U16_MAX;
+               goto out;
+       }
+
+       if (sib == btree_prev_sib) {
+               prev = m;
+               next = b;
+       } else {
+               prev = b;
+               next = m;
+       }
+
+       bch2_bkey_format_init(&new_s);
+       __bch2_btree_calc_format(&new_s, b);
+       __bch2_btree_calc_format(&new_s, m);
+       new_f = bch2_bkey_format_done(&new_s);
+
+       sib_u64s = btree_node_u64s_with_format(b, &new_f) +
+               btree_node_u64s_with_format(m, &new_f);
+
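+       /*
+        * If the combined size is over the hysteresis threshold, only count
+        * half of the excess towards the merge decision:
+        */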
+       if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
+               sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+               sib_u64s /= 2;
+               sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+       }
+
+       sib_u64s = min(sib_u64s, btree_max_u64s(c));
+       b->sib_u64s[sib] = sib_u64s;
+
+       if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
+               six_unlock_intent(&m->lock);
+               goto out;
+       }
+
+       /* We're changing btree topology, which doesn't mix with gc: */
+       if (!down_read_trylock(&c->gc_lock))
+               goto err_cycle_gc_lock;
+
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX,
+                       !(flags & BTREE_INSERT_NOUNLOCK))) {
+               ret = -EINTR;
+               goto err_unlock;
+       }
+
+       as = bch2_btree_update_start(c, iter->btree_id,
+                        btree_update_reserve_required(c, parent) + 1,
+                        BTREE_INSERT_NOFAIL|
+                        BTREE_INSERT_USE_RESERVE,
+                        !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
+       if (IS_ERR(as)) {
+               ret = PTR_ERR(as);
+               goto err_unlock;
+       }
+
+       trace_btree_merge(c, b);
+
+       bch2_btree_interior_update_will_free_node(as, b);
+       bch2_btree_interior_update_will_free_node(as, m);
+
+       n = bch2_btree_node_alloc(as, b->level);
+
+       n->data->min_key        = prev->data->min_key;
+       n->data->max_key        = next->data->max_key;
+       n->data->format         = new_f;
+       n->key.k.p              = next->key.k.p;
+
+       btree_node_set_format(n, new_f);
+
+       bch2_btree_sort_into(c, n, prev);
+       bch2_btree_sort_into(c, n, next);
+
+       bch2_btree_build_aux_trees(n);
+       six_unlock_write(&n->lock);
+
+       bkey_init(&delete.k);
+       delete.k.p = prev->key.k.p;
+       bch2_keylist_add(&as->parent_keys, &delete);
+       bch2_keylist_add(&as->parent_keys, &n->key);
+
+       bch2_btree_node_write(c, n, SIX_LOCK_intent);
+
+       bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+
+       bch2_btree_open_bucket_put(c, n);
+       bch2_btree_node_free_inmem(c, b, iter);
+       bch2_btree_node_free_inmem(c, m, iter);
+       bch2_btree_iter_node_replace(iter, n);
+
+       bch2_btree_iter_verify(iter, n);
+
+       bch2_btree_update_done(as);
+
+       six_unlock_intent(&m->lock);
+       up_read(&c->gc_lock);
+out:
+       /*
+        * Don't downgrade locks here: we're called after successful insert,
+        * and the caller will downgrade locks after a successful insert
+        * anyway (in case e.g. a split was required first)
+        *
+        * And we're also called when inserting into interior nodes in the
+        * split path, and downgrading to read locks in there is potentially
+        * confusing:
+        */
+       closure_sync(&cl);
+       return;
+
+err_cycle_gc_lock:
+       six_unlock_intent(&m->lock);
+
+       if (flags & BTREE_INSERT_NOUNLOCK)
+               goto out;
+
+       bch2_btree_iter_unlock(iter);
+
+       down_read(&c->gc_lock);
+       up_read(&c->gc_lock);
+       ret = -EINTR;
+       goto err;
+
+err_unlock:
+       six_unlock_intent(&m->lock);
+       up_read(&c->gc_lock);
+err:
+       BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
+
+       if ((ret == -EAGAIN || ret == -EINTR) &&
+           !(flags & BTREE_INSERT_NOUNLOCK)) {
+               bch2_btree_iter_unlock(iter);
+               closure_sync(&cl);
+               ret = bch2_btree_iter_traverse(iter);
+               if (ret)
+                       goto out;
+
+               goto retry;
+       }
+
+       goto out;
+}
+
+static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+                               struct btree *b, unsigned flags,
+                               struct closure *cl)
+{
+       struct btree *n, *parent = btree_node_parent(iter, b);
+       struct btree_update *as;
+
+       as = bch2_btree_update_start(c, iter->btree_id,
+               (parent
+                ? btree_update_reserve_required(c, parent)
+                : 0) + 1,
+               flags, cl);
+       if (IS_ERR(as)) {
+               trace_btree_gc_rewrite_node_fail(c, b);
+               return PTR_ERR(as);
+       }
+
+       bch2_btree_interior_update_will_free_node(as, b);
+
+       n = bch2_btree_node_alloc_replacement(as, b);
+
+       bch2_btree_build_aux_trees(n);
+       six_unlock_write(&n->lock);
+
+       trace_btree_gc_rewrite_node(c, b);
+
+       bch2_btree_node_write(c, n, SIX_LOCK_intent);
+
+       if (parent) {
+               bch2_keylist_add(&as->parent_keys, &n->key);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+       } else {
+               bch2_btree_set_root(as, n, iter);
+       }
+
+       bch2_btree_open_bucket_put(c, n);
+
+       bch2_btree_node_free_inmem(c, b, iter);
+
+       bch2_btree_iter_node_replace(iter, n);
+
+       bch2_btree_update_done(as);
+       return 0;
+}
+
+/**
+ * bch2_btree_node_rewrite - Rewrite/move a btree node
+ *
+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. getting the
+ * btree reserve had to block)
+ */
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+                           __le64 seq, unsigned flags)
+{
+       struct closure cl;
+       struct btree *b;
+       int ret;
+
+       flags |= BTREE_INSERT_NOFAIL;
+
+       closure_init_stack(&cl);
+
+       bch2_btree_iter_upgrade(iter, U8_MAX, true);
+
+       if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
+               if (!down_read_trylock(&c->gc_lock)) {
+                       bch2_btree_iter_unlock(iter);
+                       down_read(&c->gc_lock);
+               }
+       }
+
+       while (1) {
+               ret = bch2_btree_iter_traverse(iter);
+               if (ret)
+                       break;
+
+               b = bch2_btree_iter_peek_node(iter);
+               if (!b || b->data->keys.seq != seq)
+                       break;
+
+               ret = __btree_node_rewrite(c, iter, b, flags, &cl);
+               if (ret != -EAGAIN &&
+                   ret != -EINTR)
+                       break;
+
+               bch2_btree_iter_unlock(iter);
+               closure_sync(&cl);
+       }
+
+       bch2_btree_iter_downgrade(iter);
+
+       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+               up_read(&c->gc_lock);
+
+       closure_sync(&cl);
+       return ret;
+}
+
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+                                        struct btree_update *as,
+                                        struct btree_iter *iter,
+                                        struct btree *b, struct btree *new_hash,
+                                        struct bkey_i_extent *new_key)
+{
+       struct btree *parent;
+       int ret;
+
+       /*
+        * Two corner cases that need to be thought about here:
+        *
+        * @b may not be reachable yet - there might be another interior update
+        * operation waiting on @b to be written, and we're gonna deliver the
+        * write completion to that interior update operation _before_
+        * persisting the new_key update
+        *
+        * That ends up working without us having to do anything special here:
+        * the reason is, we do kick off (and do the in memory updates) for the
+        * update for @new_key before we return, creating a new interior_update
+        * operation here.
+        *
+        * The new interior update operation here will in effect override the
+        * previous one. The previous one was going to terminate - make @b
+        * reachable - in one of two ways:
+        * - updating the btree root pointer
+        *   In that case,
+        *   no, this doesn't work. argh.
+        */
+
+       if (b->will_make_reachable)
+               as->must_rewrite = true;
+
+       btree_interior_update_add_node_reference(as, b);
+
+       parent = btree_node_parent(iter, b);
+       if (parent) {
+               if (new_hash) {
+                       bkey_copy(&new_hash->key, &new_key->k_i);
+                       ret = bch2_btree_node_hash_insert(&c->btree_cache,
+                                       new_hash, b->level, b->btree_id);
+                       BUG_ON(ret);
+               }
+
+               bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
+
+               if (new_hash) {
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+
+                       bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+                       bkey_copy(&b->key, &new_key->k_i);
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+                       BUG_ON(ret);
+                       mutex_unlock(&c->btree_cache.lock);
+               } else {
+                       bkey_copy(&b->key, &new_key->k_i);
+               }
+       } else {
+               struct bch_fs_usage stats = { 0 };
+
+               BUG_ON(btree_node_root(c, b) != b);
+
+               bch2_btree_node_lock_write(b, iter);
+
+               bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
+                             c->opts.btree_node_size, true,
+                             gc_pos_btree_root(b->btree_id),
+                             &stats, 0, 0);
+               bch2_btree_node_free_index(as, NULL,
+                                          bkey_i_to_s_c(&b->key),
+                                          &stats);
+               bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                                   gc_pos_btree_root(b->btree_id));
+
+               if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+                       bkey_copy(&b->key, &new_key->k_i);
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+                       BUG_ON(ret);
+                       mutex_unlock(&c->btree_cache.lock);
+               } else {
+                       bkey_copy(&b->key, &new_key->k_i);
+               }
+
+               btree_update_updated_root(as);
+               bch2_btree_node_unlock_write(b, iter);
+       }
+
+       bch2_btree_update_done(as);
+}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+                              struct btree *b, struct bkey_i_extent *new_key)
+{
+       struct btree *parent = btree_node_parent(iter, b);
+       struct btree_update *as = NULL;
+       struct btree *new_hash = NULL;
+       struct closure cl;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX, true))
+               return -EINTR;
+
+       if (!down_read_trylock(&c->gc_lock)) {
+               bch2_btree_iter_unlock(iter);
+               down_read(&c->gc_lock);
+
+               if (!bch2_btree_iter_relock(iter)) {
+                       ret = -EINTR;
+                       goto err;
+               }
+       }
+
+       /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+       if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+               /* bch2_btree_reserve_get will unlock */
+               ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+               if (ret) {
+                       ret = -EINTR;
+
+                       bch2_btree_iter_unlock(iter);
+                       up_read(&c->gc_lock);
+                       closure_sync(&cl);
+                       down_read(&c->gc_lock);
+
+                       if (!bch2_btree_iter_relock(iter))
+                               goto err;
+               }
+
+               new_hash = bch2_btree_node_mem_alloc(c);
+       }
+
+       as = bch2_btree_update_start(c, iter->btree_id,
+               parent ? btree_update_reserve_required(c, parent) : 0,
+               BTREE_INSERT_NOFAIL|
+               BTREE_INSERT_USE_RESERVE|
+               BTREE_INSERT_USE_ALLOC_RESERVE,
+               &cl);
+
+       if (IS_ERR(as)) {
+               ret = PTR_ERR(as);
+               if (ret == -EAGAIN)
+                       ret = -EINTR;
+
+               if (ret != -EINTR)
+                       goto err;
+
+               bch2_btree_iter_unlock(iter);
+               up_read(&c->gc_lock);
+               closure_sync(&cl);
+               down_read(&c->gc_lock);
+
+               if (!bch2_btree_iter_relock(iter))
+                       goto err;
+       }
+
+       ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                     extent_i_to_s_c(new_key).s_c);
+       if (ret)
+               goto err_free_update;
+
+       __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+
+       bch2_btree_iter_downgrade(iter);
+err:
+       if (new_hash) {
+               mutex_lock(&c->btree_cache.lock);
+               list_move(&new_hash->list, &c->btree_cache.freeable);
+               mutex_unlock(&c->btree_cache.lock);
+
+               six_unlock_write(&new_hash->lock);
+               six_unlock_intent(&new_hash->lock);
+       }
+       up_read(&c->gc_lock);
+       closure_sync(&cl);
+       return ret;
+err_free_update:
+       bch2_btree_update_free(as);
+       goto err;
+}
+
+/* Init code: */
+
+/*
+ * Only for filesystem bringup, when first reading the btree roots or allocating
+ * btree roots when initializing a new filesystem:
+ */
+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
+{
+       BUG_ON(btree_node_root(c, b));
+
+       __bch2_btree_set_root_inmem(c, b);
+       bch2_btree_set_root_ondisk(c, b, READ);
+}
+
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+{
+       struct closure cl;
+       struct btree *b;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       do {
+               ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+               closure_sync(&cl);
+       } while (ret);
+
+       b = bch2_btree_node_mem_alloc(c);
+       bch2_btree_cache_cannibalize_unlock(c);
+
+       set_btree_node_fake(b);
+       b->level        = 0;
+       b->btree_id     = id;
+
+       bkey_extent_init(&b->key);
+       b->key.k.p = POS_MAX;
+       bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id;
+
+       bch2_bset_init_first(b, &b->data->keys);
+       bch2_btree_build_aux_trees(b);
+
+       b->data->min_key = POS_MIN;
+       b->data->max_key = POS_MAX;
+       b->data->format = bch2_btree_calc_format(b);
+       btree_node_set_format(b, b->data->format);
+
+       ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
+       BUG_ON(ret);
+
+       __bch2_btree_set_root_inmem(c, b);
+
+       six_unlock_write(&b->lock);
+       six_unlock_intent(&b->lock);
+}
+
+ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
+{
+       char *out = buf, *end = buf + PAGE_SIZE;
+       struct btree_update *as;
+
+       mutex_lock(&c->btree_interior_update_lock);
+       list_for_each_entry(as, &c->btree_interior_update_list, list)
+               out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n",
+                                as,
+                                as->mode,
+                                as->nodes_written,
+                                atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
+                                bch2_journal_pin_seq(&c->journal, &as->journal));
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       return out - buf;
+}
+
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+{
+       size_t ret = 0;
+       struct list_head *i;
+
+       mutex_lock(&c->btree_interior_update_lock);
+       list_for_each(i, &c->btree_interior_update_list)
+               ret++;
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       return ret;
+}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
new file mode 100644 (file)
index 0000000..7a19a52
--- /dev/null
@@ -0,0 +1,374 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+
+#include "btree_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+
+struct btree_reserve {
+       struct disk_reservation disk_res;
+       unsigned                nr;
+       struct btree            *b[BTREE_RESERVE_MAX];
+};
+
+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
+                               struct bkey_format *);
+
+/* Btree node freeing/allocation: */
+
+/*
+ * Tracks a btree node that has been (or is about to be) freed in memory, but
+ * has _not_ yet been freed on disk (because the write that makes the new
+ * node(s) visible and frees the old hasn't completed yet)
+ */
+struct pending_btree_node_free {
+       bool                    index_update_done;
+
+       __le64                  seq;
+       enum btree_id           btree_id;
+       unsigned                level;
+       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ */
+struct btree_update {
+       struct closure                  cl;
+       struct bch_fs                   *c;
+
+       struct list_head                list;
+
+       /* What kind of update are we doing? */
+       enum {
+               BTREE_INTERIOR_NO_UPDATE,
+               BTREE_INTERIOR_UPDATING_NODE,
+               BTREE_INTERIOR_UPDATING_ROOT,
+               BTREE_INTERIOR_UPDATING_AS,
+       } mode;
+
+       unsigned                        must_rewrite:1;
+       unsigned                        nodes_written:1;
+
+       enum btree_id                   btree_id;
+
+       struct btree_reserve            *reserve;
+
+       /*
+        * BTREE_INTERIOR_UPDATING_NODE:
+        * The update that made the new nodes visible was a regular update to an
+        * existing interior node - @b. We can't write out the update to @b
+        * until the new nodes we created are finished writing, so we block @b
+        * from writing by putting this btree_interior update on the
+        * @b->write_blocked list with @write_blocked_list:
+        */
+       struct btree                    *b;
+       struct list_head                write_blocked_list;
+
+       /*
+        * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so
+        * we're now blocking another btree_update
+        * @parent_as - btree_update that's waiting on our nodes to finish
+        * writing, before it can make new nodes visible on disk
+        * @wait - list of child btree_updates that are waiting on this
+        * btree_update to make all the new nodes visible before they can free
+        * their old btree nodes
+        */
+       struct btree_update             *parent_as;
+       struct closure_waitlist         wait;
+
+       /*
+        * We may be freeing nodes that were dirty, and thus had journal entries
+        * pinned: we need to transfer the oldest of those pins to the
+        * btree_update operation, and release it when the new node(s)
+        * are all persistent and reachable:
+        */
+       struct journal_entry_pin        journal;
+
+       u64                             journal_seq;
+
+       /*
+        * Nodes being freed:
+        * Protected by c->btree_node_pending_free_lock
+        */
+       struct pending_btree_node_free  pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
+       unsigned                        nr_pending;
+
+       /* New nodes, that will be made reachable by this update: */
+       struct btree                    *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
+       unsigned                        nr_new_nodes;
+
+       /* Only here to reduce stack usage on recursive splits: */
+       struct keylist                  parent_keys;
+       /*
+        * Enough room for btree_split's keys without realloc - btree node
+        * pointers never have crc/compression info, so we only need to account
+        * for the pointers for three keys
+        */
+       u64                             inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
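+
+/*
+ * Rough lifecycle of a btree_update, as an illustrative sketch only (see
+ * __btree_node_rewrite() in btree_update_interior.c for a real caller;
+ * @reserve here stands in for btree_update_reserve_required()):
+ *
+ *     as = bch2_btree_update_start(c, iter->btree_id, reserve, flags, &cl);
+ *     bch2_btree_interior_update_will_free_node(as, b);
+ *     n = bch2_btree_node_alloc_replacement(as, b);
+ *     ... build @n, write it, and insert its key into the parent ...
+ *     bch2_btree_update_done(as);
+ */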
+
+#define for_each_pending_btree_node_free(c, as, p)                     \
+       list_for_each_entry(as, &c->btree_interior_update_list, list)   \
+               for (p = as->pending; p < as->pending + as->nr_pending; p++)
+
+void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
+                               struct btree_iter *);
+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
+void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *);
+
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+                                                 struct btree *,
+                                                 struct bkey_format);
+
+void bch2_btree_update_done(struct btree_update *);
+struct btree_update *
+bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
+                       unsigned, struct closure *);
+
+void bch2_btree_interior_update_will_free_node(struct btree_update *,
+                                              struct btree *);
+
+void bch2_btree_insert_node(struct btree_update *, struct btree *,
+                           struct btree_iter *, struct keylist *,
+                           unsigned);
+int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
+
+void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+                                  unsigned, unsigned, enum btree_node_sibling);
+
+static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+                                       struct btree_iter *iter,
+                                       unsigned level, unsigned flags,
+                                       enum btree_node_sibling sib)
+{
+       struct btree *b;
+
+       /*
+        * iterators are inconsistent when they hit end of leaf, until
+        * traversed again
+        *
+        * XXX inconsistent how?
+        */
+       if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
+               return;
+
+       if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
+               return;
+
+       if (!bch2_btree_node_relock(iter, level))
+               return;
+
+       b = iter->l[level].b;
+       if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
+               return;
+
+       __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
+}
+
+static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
+                                              struct btree_iter *iter,
+                                              unsigned level,
+                                              unsigned flags)
+{
+       bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+                                           btree_prev_sib);
+       bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+                                           btree_next_sib);
+}
+
+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+static inline unsigned btree_update_reserve_required(struct bch_fs *c,
+                                                    struct btree *b)
+{
+       unsigned depth = btree_node_root(c, b)->level + 1;
+
+       /*
+        * Number of nodes we might have to allocate in a worst case btree
+        * split operation - we split all the way up to the root, then allocate
+        * a new root, unless we're already at max depth:
+        */
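+       /*
+        * e.g. splitting a leaf (level 0) under a root at level 2: depth is 3,
+        * so we reserve (3 - 0) * 2 + 1 = 7 nodes - two per level plus the new
+        * root.
+        */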
+       if (depth < BTREE_MAX_DEPTH)
+               return (depth - b->level) * 2 + 1;
+       else
+               return (depth - b->level) * 2 - 1;
+}
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+       b->sib_u64s[0] = b->nr.live_u64s;
+       b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+{
+       return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
+                                                           struct btree *b)
+{
+       return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
+                                                         struct btree *b)
+{
+       return btree_data_end(c, b);
+}
+
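+/* b->written is in units of 512 byte sectors: */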
+static inline void *write_block(struct btree *b)
+{
+       return (void *) b->data + (b->written << 9);
+}
+
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+       return (void *) i < write_block(b);
+}
+
+static inline bool bset_unwritten(struct btree *b, struct bset *i)
+{
+       return (void *) i > write_block(b);
+}
+
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
+                                                struct btree *b,
+                                                void *end)
+{
+       ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
+               b->whiteout_u64s +
+               b->uncompacted_whiteout_u64s;
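+       /* btree_node_size is in 512 byte sectors; sectors << 6 == u64s: */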
+       ssize_t total = c->opts.btree_node_size << 6;
+
+       return total - used;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+                                                  struct btree *b)
+{
+       ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+                               btree_bkey_last(b, bset_tree_last(b)));
+
+       BUG_ON(remaining < 0);
+
+       if (bset_written(b, btree_bset_last(b)))
+               return 0;
+
+       return remaining;
+}
+
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+       /*
+        * Could buffer up larger amounts of keys for btrees with larger keys,
+        * pending benchmarking:
+        */
+       return 4 << 10;
+}
+
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
+                                                    struct btree *b)
+{
+       struct bset *i = btree_bset_last(b);
+       struct btree_node_entry *bne = max(write_block(b),
+                       (void *) btree_bkey_last(b, bset_tree_last(b)));
+       ssize_t remaining_space =
+               __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+
+       if (unlikely(bset_written(b, i))) {
+               if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+                       return bne;
+       } else {
+               if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+                   remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
+                       return bne;
+       }
+
+       return NULL;
+}
+
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
+                                     struct bkey_packed *k)
+{
+       if (bset_written(b, bset(b, t))) {
+               EBUG_ON(b->uncompacted_whiteout_u64s <
+                       bkeyp_key_u64s(&b->format, k));
+               b->uncompacted_whiteout_u64s -=
+                       bkeyp_key_u64s(&b->format, k);
+       }
+}
+
+static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
+                                   struct bkey_packed *k)
+{
+       if (bset_written(b, bset(b, t))) {
+               BUG_ON(!k->needs_whiteout);
+               b->uncompacted_whiteout_u64s +=
+                       bkeyp_key_u64s(&b->format, k);
+       }
+}
+
+/*
+ * write lock must be held on @b (else the dirty bset that we were going to
+ * insert into could be written out from under us)
+ */
+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
+                                             struct btree *b, unsigned u64s)
+{
+       if (unlikely(btree_node_fake(b)))
+               return false;
+
+       if (btree_node_is_extents(b)) {
+               /*
+                * The insert key might split an existing key
+                * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
+                */
+               u64s += BKEY_EXTENT_U64s_MAX;
+       }
+
+       return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+static inline bool journal_res_insert_fits(struct btree_insert *trans,
+                                          struct btree_insert_entry *insert)
+{
+       unsigned u64s = 0;
+       struct btree_insert_entry *i;
+
+       /*
+        * If we didn't get a journal reservation, we're in journal replay and
+        * we're not journalling updates:
+        */
+       if (!trans->journal_res.ref)
+               return true;
+
+       for (i = insert; i < trans->entries + trans->nr; i++)
+               u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+
+       return u64s <= trans->journal_res.u64s;
+}
+
+ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
+
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
new file mode 100644 (file)
index 0000000..4d1d095
--- /dev/null
@@ -0,0 +1,737 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "trace.h"
+
+#include <linux/sort.h>
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents: */
+bool bch2_btree_bset_insert_key(struct btree_iter *iter,
+                               struct btree *b,
+                               struct btree_node_iter *node_iter,
+                               struct bkey_i *insert)
+{
+       const struct bkey_format *f = &b->format;
+       struct bkey_packed *k;
+       struct bset_tree *t;
+       unsigned clobber_u64s;
+
+       EBUG_ON(btree_node_just_written(b));
+       EBUG_ON(bset_written(b, btree_bset_last(b)));
+       EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+       EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
+               bkey_cmp(insert->k.p, b->data->max_key) > 0);
+
+       k = bch2_btree_node_iter_peek_all(node_iter, b);
+       if (k && !bkey_cmp_packed(b, k, &insert->k)) {
+               BUG_ON(bkey_whiteout(k));
+
+               t = bch2_bkey_to_bset(b, k);
+
+               if (bset_unwritten(b, bset(b, t)) &&
+                   bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
+                   !bkey_whiteout(&insert->k)) {
+                       k->type = insert->k.type;
+                       memcpy_u64s(bkeyp_val(f, k), &insert->v,
+                                   bkey_val_u64s(&insert->k));
+                       return true;
+               }
+
+               insert->k.needs_whiteout = k->needs_whiteout;
+
+               btree_keys_account_key_drop(&b->nr, t - b->set, k);
+
+               if (t == bset_tree_last(b)) {
+                       clobber_u64s = k->u64s;
+
+                       /*
+                        * If we're deleting, and the key we're deleting doesn't
+                        * need a whiteout (it wasn't overwriting a key that had
+                        * been written to disk) - just delete it:
+                        */
+                       if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
+                               bch2_bset_delete(b, k, clobber_u64s);
+                               bch2_btree_node_iter_fix(iter, b, node_iter, t,
+                                                       k, clobber_u64s, 0);
+                               return true;
+                       }
+
+                       goto overwrite;
+               }
+
+               k->type = KEY_TYPE_DELETED;
+               bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
+                                       k->u64s, k->u64s);
+
+               if (bkey_whiteout(&insert->k)) {
+                       reserve_whiteout(b, t, k);
+                       return true;
+               } else {
+                       k->needs_whiteout = false;
+               }
+       } else {
+               /*
+                * Deleting, but the key to delete wasn't found - nothing to do:
+                */
+               if (bkey_whiteout(&insert->k))
+                       return false;
+
+               insert->k.needs_whiteout = false;
+       }
+
+       t = bset_tree_last(b);
+       k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+       clobber_u64s = 0;
+overwrite:
+       bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+       if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
+               bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
+                                       clobber_u64s, k->u64s);
+       return true;
+}
+
+static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+                              unsigned i, u64 seq)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct btree_write *w = container_of(pin, struct btree_write, journal);
+       struct btree *b = container_of(w, struct btree, writes[i]);
+
+       btree_node_lock_type(c, b, SIX_LOCK_read);
+       bch2_btree_node_write_cond(c, b,
+                       (btree_current_write(b) == w &&
+                        w->journal.pin_list == journal_seq_pin(j, seq)));
+       six_unlock_read(&b->lock);
+}
+
+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+       return __btree_node_flush(j, pin, 0, seq);
+}
+
+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+       return __btree_node_flush(j, pin, 1, seq);
+}
+
+void bch2_btree_journal_key(struct btree_insert *trans,
+                          struct btree_iter *iter,
+                          struct bkey_i *insert)
+{
+       struct bch_fs *c = trans->c;
+       struct journal *j = &c->journal;
+       struct btree *b = iter->l[0].b;
+       struct btree_write *w = btree_current_write(b);
+
+       EBUG_ON(iter->level || b->level);
+       EBUG_ON(trans->journal_res.ref !=
+               !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
+
+       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+               u64 seq = trans->journal_res.seq;
+               bool needs_whiteout = insert->k.needs_whiteout;
+
+               /* ick */
+               insert->k.needs_whiteout = false;
+               bch2_journal_add_keys(j, &trans->journal_res,
+                                     iter->btree_id, insert);
+               insert->k.needs_whiteout = needs_whiteout;
+
+               bch2_journal_set_has_inode(j, &trans->journal_res,
+                                          insert->k.p.inode);
+
+               if (trans->journal_seq)
+                       *trans->journal_seq = seq;
+               btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
+       }
+
+       if (unlikely(!journal_pin_active(&w->journal))) {
+               u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+                       ? trans->journal_res.seq
+                       : j->replay_journal_seq;
+
+               bch2_journal_pin_add(j, seq, &w->journal,
+                                    btree_node_write_idx(b) == 0
+                                    ? btree_node_flush0
+                                    : btree_node_flush1);
+       }
+
+       if (unlikely(!btree_node_dirty(b)))
+               set_btree_node_dirty(b);
+}
+
+static enum btree_insert_ret
+bch2_insert_fixup_key(struct btree_insert *trans,
+                    struct btree_insert_entry *insert)
+{
+       struct btree_iter *iter = insert->iter;
+       struct btree_iter_level *l = &iter->l[0];
+
+       EBUG_ON(iter->level);
+       EBUG_ON(insert->k->k.u64s >
+               bch_btree_keys_u64s_remaining(trans->c, l->b));
+
+       if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
+                                      insert->k))
+               bch2_btree_journal_key(trans, iter, insert->k);
+
+       trans->did_work = true;
+       return BTREE_INSERT_OK;
+}
+
+/**
+ * btree_insert_key_leaf - insert one key into a leaf node
+ */
+static enum btree_insert_ret
+btree_insert_key_leaf(struct btree_insert *trans,
+                     struct btree_insert_entry *insert)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter *iter = insert->iter;
+       struct btree *b = iter->l[0].b;
+       enum btree_insert_ret ret;
+       int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+       int old_live_u64s = b->nr.live_u64s;
+       int live_u64s_added, u64s_added;
+
+       ret = !btree_node_is_extents(b)
+               ? bch2_insert_fixup_key(trans, insert)
+               : bch2_insert_fixup_extent(trans, insert);
+
+       live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+       u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+       if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+       if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+       if (u64s_added > live_u64s_added &&
+           bch2_maybe_compact_whiteouts(c, b))
+               bch2_btree_iter_reinit_node(iter, b);
+
+       trace_btree_insert_key(c, b, insert->k);
+       return ret;
+}
+
+#define trans_for_each_entry(trans, i)                                 \
+       for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+
+/*
+ * We sort transaction entries so that if multiple iterators point to the same
+ * leaf node they'll be adjacent:
+ */
+static bool same_leaf_as_prev(struct btree_insert *trans,
+                             struct btree_insert_entry *i)
+{
+       return i != trans->entries &&
+               i[0].iter->l[0].b == i[-1].iter->l[0].b;
+}
+
+static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
+                                                        struct btree_insert_entry *i)
+{
+       struct btree *b = i->iter->l[0].b;
+
+       do {
+               i++;
+       } while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
+
+       return i;
+}
+
+#define trans_for_each_leaf(trans, i)                                  \
+       for ((i) = (trans)->entries;                                    \
+            (i) < (trans)->entries + (trans)->nr;                      \
+            (i) = trans_next_leaf(trans, i))
+
+inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
+                                           struct btree_iter *iter)
+{
+       bch2_btree_node_lock_write(b, iter);
+
+       if (btree_node_just_written(b) &&
+           bch2_btree_post_write_cleanup(c, b))
+               bch2_btree_iter_reinit_node(iter, b);
+
+       /*
+        * If the last bset has been written, or if it's gotten too big - start
+        * a new bset to insert into:
+        */
+       if (want_new_bset(c, b))
+               bch2_btree_init_next(c, b, iter);
+}
+
+static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_leaf(trans, i)
+               bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
+}
+
+static void multi_unlock_write(struct btree_insert *trans)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_leaf(trans, i)
+               bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
+}
+
+static inline int btree_trans_cmp(struct btree_insert_entry l,
+                                 struct btree_insert_entry r)
+{
+       return btree_iter_cmp(l.iter, r.iter);
+}
+
+/* Normal update interface: */
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_btree_insert_at(struct btree_insert *trans,
+                                    struct btree_iter **split,
+                                    bool *cycle_gc_lock)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       unsigned u64s;
+       int ret;
+
+       trans_for_each_entry(trans, i) {
+               BUG_ON(i->done);
+               BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
+       }
+
+       u64s = 0;
+       trans_for_each_entry(trans, i)
+               u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+
+       memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+
+       ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
+               ? bch2_journal_res_get(&c->journal,
+                                     &trans->journal_res,
+                                     u64s, u64s)
+               : 0;
+       if (ret)
+               return ret;
+
+       multi_lock_write(c, trans);
+
+       if (race_fault()) {
+               ret = -EINTR;
+               goto out;
+       }
+
+       u64s = 0;
+       trans_for_each_entry(trans, i) {
+               /* Multiple inserts might go to same leaf: */
+               if (!same_leaf_as_prev(trans, i))
+                       u64s = 0;
+
+               /*
+                * bch2_btree_node_insert_fits() must be called under write lock:
+                * with only an intent lock, another thread can still call
+                * bch2_btree_node_write(), converting an unwritten bset to a
+                * written one
+                */
+               u64s += i->k->k.u64s + i->extra_res;
+               if (!bch2_btree_node_insert_fits(c,
+                               i->iter->l[0].b, u64s)) {
+                       ret = -EINTR;
+                       *split = i->iter;
+                       goto out;
+               }
+       }
+
+       if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+               if (journal_seq_verify(c))
+                       trans_for_each_entry(trans, i)
+                               i->k->k.version.lo = trans->journal_res.seq;
+               else if (inject_invalid_keys(c))
+                       trans_for_each_entry(trans, i)
+                               i->k->k.version = MAX_VERSION;
+       }
+
+       trans_for_each_entry(trans, i) {
+               switch (btree_insert_key_leaf(trans, i)) {
+               case BTREE_INSERT_OK:
+                       i->done = true;
+                       break;
+               case BTREE_INSERT_JOURNAL_RES_FULL:
+               case BTREE_INSERT_NEED_TRAVERSE:
+               case BTREE_INSERT_NEED_RESCHED:
+                       ret = -EINTR;
+                       break;
+               case BTREE_INSERT_BTREE_NODE_FULL:
+                       ret = -EINTR;
+                       *split = i->iter;
+                       break;
+               case BTREE_INSERT_ENOSPC:
+                       ret = -ENOSPC;
+                       break;
+               case BTREE_INSERT_NEED_GC_LOCK:
+                       ret = -EINTR;
+                       *cycle_gc_lock = true;
+                       break;
+               default:
+                       BUG();
+               }
+
+               /*
+                * If we did some work (i.e. inserted part of an extent),
+                * we have to do all the other updates as well:
+                */
+               if (!trans->did_work && (ret || *split))
+                       break;
+       }
+out:
+       multi_unlock_write(trans);
+       bch2_journal_res_put(&c->journal, &trans->journal_res);
+
+       return ret;
+}
+
+static inline void btree_insert_entry_checks(struct bch_fs *c,
+                                            struct btree_insert_entry *i)
+{
+       BUG_ON(i->iter->level);
+       BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+       BUG_ON(debug_check_bkeys(c) &&
+              !bkey_deleted(&i->k->k) &&
+              bch2_bkey_invalid(c, (enum bkey_type) i->iter->btree_id,
+                                bkey_i_to_s_c(i->k)));
+}
+
+/**
+ * __bch2_btree_insert_at - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+int __bch2_btree_insert_at(struct btree_insert *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       struct btree_iter *linked, *split = NULL;
+       bool cycle_gc_lock = false;
+       unsigned flags;
+       int ret;
+
+       BUG_ON(!trans->nr);
+
+       for_each_btree_iter(trans->entries[0].iter, linked)
+               bch2_btree_iter_verify_locks(linked);
+
+       /* for the sake of sanity: */
+       BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
+
+       trans_for_each_entry(trans, i)
+               btree_insert_entry_checks(c, i);
+
+       bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
+
+       if (unlikely(!percpu_ref_tryget(&c->writes)))
+               return -EROFS;
+retry:
+       split = NULL;
+       cycle_gc_lock = false;
+
+       trans_for_each_entry(trans, i) {
+               if (!bch2_btree_iter_upgrade(i->iter, 1, true)) {
+                       ret = -EINTR;
+                       goto err;
+               }
+
+               if (i->iter->flags & BTREE_ITER_ERROR) {
+                       ret = -EIO;
+                       goto err;
+               }
+       }
+
+       ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
+       if (unlikely(ret))
+               goto err;
+
+       trans_for_each_leaf(trans, i)
+               bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
+
+       trans_for_each_entry(trans, i)
+               bch2_btree_iter_downgrade(i->iter);
+out:
+       percpu_ref_put(&c->writes);
+
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+               /* make sure we didn't drop or screw up locks: */
+               for_each_btree_iter(trans->entries[0].iter, linked) {
+                       bch2_btree_iter_verify_locks(linked);
+                       BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
+                              trans->did_work &&
+                              linked->uptodate >= BTREE_ITER_NEED_RELOCK);
+               }
+
+               /* make sure we didn't lose an error: */
+               if (!ret)
+                       trans_for_each_entry(trans, i)
+                               BUG_ON(!i->done);
+       }
+
+       BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
+
+       return ret;
+err:
+       flags = trans->flags;
+
+       /*
+        * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
+        * update; if we haven't done anything yet it doesn't apply
+        */
+       if (!trans->did_work)
+               flags &= ~BTREE_INSERT_NOUNLOCK;
+
+       if (split) {
+               ret = bch2_btree_split_leaf(c, split, flags);
+
+               /*
+                * if the split succeeded without dropping locks the insert will
+                * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
+                * caller peeked() and is overwriting won't have changed)
+                */
+#if 0
+               /*
+                * XXX:
+                * split -> btree node merging (of parent node) might still drop
+                * locks when we're not passing it BTREE_INSERT_NOUNLOCK
+                */
+               if (!ret && !trans->did_work)
+                       goto retry;
+#endif
+
+               /*
+                * don't care if we got ENOSPC because we told split it
+                * couldn't block:
+                */
+               if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
+                       ret = -EINTR;
+       }
+
+       if (cycle_gc_lock) {
+               if (!down_read_trylock(&c->gc_lock)) {
+                       if (flags & BTREE_INSERT_NOUNLOCK)
+                               goto out;
+
+                       bch2_btree_iter_unlock(trans->entries[0].iter);
+                       down_read(&c->gc_lock);
+               }
+               up_read(&c->gc_lock);
+       }
+
+       if (ret == -EINTR) {
+               if (flags & BTREE_INSERT_NOUNLOCK)
+                       goto out;
+
+               trans_for_each_entry(trans, i) {
+                       int ret2 = bch2_btree_iter_traverse(i->iter);
+                       if (ret2) {
+                               ret = ret2;
+                               goto out;
+                       }
+
+                       BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
+               }
+
+               /*
+                * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
+                * dropped locks:
+                */
+               if (!(flags & BTREE_INSERT_ATOMIC))
+                       goto retry;
+       }
+
+       goto out;
+}
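+
+/*
+ * Typical single-entry usage goes through the bch2_btree_insert_at() wrapper
+ * (see e.g. bch2_btree_delete_at() below); an illustrative call:
+ *
+ *     ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+ *                                BTREE_INSERT_ENTRY(iter, k));
+ */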
+
+void bch2_trans_update(struct btree_trans *trans,
+                      struct btree_iter *iter,
+                      struct bkey_i *k,
+                      unsigned extra_journal_res)
+{
+       struct btree_insert_entry *i;
+
+       BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
+
+       i = &trans->updates[trans->nr_updates++];
+
+       *i = (struct btree_insert_entry) {
+               .iter   = iter,
+               .k              = k,
+               .extra_res      = extra_journal_res,
+       };
+
+       btree_insert_entry_checks(trans->c, i);
+}
+
+int bch2_trans_commit(struct btree_trans *trans,
+                     struct disk_reservation *disk_res,
+                     struct extent_insert_hook *hook,
+                     u64 *journal_seq,
+                     unsigned flags)
+{
+       struct btree_insert insert = {
+               .c              = trans->c,
+               .disk_res       = disk_res,
+               .journal_seq    = journal_seq,
+               .flags          = flags,
+               .nr             = trans->nr_updates,
+               .entries        = trans->updates,
+       };
+
+       if (!trans->nr_updates)
+               return 0;
+
+       trans->nr_updates = 0;
+
+       return __bch2_btree_insert_at(&insert);
+}
+
+int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
+{
+       struct bkey_i k;
+
+       bkey_init(&k.k);
+       k.k.p = iter->pos;
+
+       return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
+                                   BTREE_INSERT_NOFAIL|
+                                   BTREE_INSERT_USE_RESERVE|flags,
+                                   BTREE_INSERT_ENTRY(iter, &k));
+}
+
+int bch2_btree_insert_list_at(struct btree_iter *iter,
+                            struct keylist *keys,
+                            struct disk_reservation *disk_res,
+                            struct extent_insert_hook *hook,
+                            u64 *journal_seq, unsigned flags)
+{
+       BUG_ON(flags & BTREE_INSERT_ATOMIC);
+       BUG_ON(bch2_keylist_empty(keys));
+       bch2_verify_keylist_sorted(keys);
+
+       while (!bch2_keylist_empty(keys)) {
+               int ret = bch2_btree_insert_at(iter->c, disk_res, hook,
+                               journal_seq, flags,
+                               BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
+               if (ret)
+                       return ret;
+
+               bch2_keylist_pop_front(keys);
+       }
+
+       return 0;
+}
+
+/**
+ * bch2_btree_insert - insert a single key into the given btree
+ * @c:                 pointer to struct bch_fs
+ * @id:                        btree to insert into
+ * @k:                 key to insert
+ * @disk_res:          disk reservation, may be NULL
+ * @hook:              insert callback
+ * @journal_seq:       if non NULL, filled in with the journal sequence number
+ * @flags:             BTREE_INSERT_* flags
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
+                    struct bkey_i *k,
+                    struct disk_reservation *disk_res,
+                    struct extent_insert_hook *hook,
+                    u64 *journal_seq, int flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
+                            BTREE_ITER_INTENT);
+       ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+                                  BTREE_INSERT_ENTRY(&iter, k));
+       bch2_btree_iter_unlock(&iter);
+
+       return ret;
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+                          struct bpos start,
+                          struct bpos end,
+                          struct bversion version,
+                          struct disk_reservation *disk_res,
+                          struct extent_insert_hook *hook,
+                          u64 *journal_seq)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_btree_iter_init(&iter, c, id, start,
+                            BTREE_ITER_INTENT);
+
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
+              !(ret = btree_iter_err(k))) {
+               unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+               /* really shouldn't be using a bare, unpadded bkey_i */
+               struct bkey_i delete;
+
+               if (bkey_cmp(iter.pos, end) >= 0)
+                       break;
+
+               bkey_init(&delete.k);
+
+               /*
+                * For extents, iter.pos won't necessarily be the same as
+                * bkey_start_pos(k.k) (for non extents they always will be the
+                * same). It's important that we delete starting from iter.pos
+                * because the range we want to delete could start in the middle
+                * of k.
+                *
+                * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+                * bkey_start_pos(k.k)).
+                */
+               delete.k.p = iter.pos;
+               delete.k.version = version;
+
+               if (iter.flags & BTREE_ITER_IS_EXTENTS) {
+                       /* create the biggest key we can */
+                       bch2_key_resize(&delete.k, max_sectors);
+                       bch2_cut_back(end, &delete.k);
+               }
+
+               ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
+                                          BTREE_INSERT_NOFAIL,
+                                          BTREE_INSERT_ENTRY(&iter, &delete));
+               if (ret)
+                       break;
+
+               bch2_btree_iter_cond_resched(&iter);
+       }
+
+       bch2_btree_iter_unlock(&iter);
+       return ret;
+}
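+
+#if 0
+/*
+ * Usage sketch (illustrative only, not part of this patch): delete every
+ * extent belonging to one inode, i.e. the half open range
+ * [POS(inum, 0), POS(inum + 1, 0)):
+ */
+static int example_delete_inode_extents(struct bch_fs *c, u64 inum,
+                                        u64 *journal_seq)
+{
+       return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+                                      POS(inum, 0), POS(inum + 1, 0),
+                                      (struct bversion) { 0 },
+                                      NULL, NULL, journal_seq);
+}
+#endif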
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
new file mode 100644 (file)
index 0000000..f347c93
--- /dev/null
@@ -0,0 +1,975 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ *
+ * Bucket states:
+ * - free bucket: mark == 0
+ *   The bucket contains no data and will not be read
+ *
+ * - allocator bucket: owned_by_allocator == 1
+ *   The bucket is on a free list, or it is an open bucket
+ *
+ * - cached bucket: owned_by_allocator == 0 &&
+ *                  dirty_sectors == 0 &&
+ *                  cached_sectors > 0
+ *   The bucket contains data but may be safely discarded as there are
+ *   enough replicas of the data on other cache devices, or it has been
+ *   written back to the backing device
+ *
+ * - dirty bucket: owned_by_allocator == 0 &&
+ *                 dirty_sectors > 0
+ *   The bucket contains data that we must not discard (either only copy,
+ *   or one of the 'main copies' for data requiring multiple replicas)
+ *
+ * - metadata bucket: owned_by_allocator == 0 && data_type is a metadata type
+ *   This is a btree node, journal or gen/prio bucket
+ *
+ * Lifecycle:
+ *
+ * bucket invalidated => bucket on freelist => open bucket =>
+ *     [dirty bucket =>] cached bucket => bucket invalidated => ...
+ *
+ * Note that cache promotion can skip the dirty bucket step, as data
+ * is copied from a deeper tier to a shallower tier, onto a cached
+ * bucket.
+ * Note also that a cached bucket can spontaneously become dirty --
+ * see below.
+ *
+ * Only a traversal of the key space can determine whether a bucket is
+ * truly dirty or cached.
+ *
+ * Transitions:
+ *
+ * - free => allocator: bucket was invalidated
+ * - cached => allocator: bucket was invalidated
+ *
+ * - allocator => dirty: open bucket was filled up
+ * - allocator => cached: open bucket was filled up
+ * - allocator => metadata: metadata was allocated
+ *
+ * - dirty => cached: dirty sectors were copied to a deeper tier
+ * - dirty => free: dirty sectors were overwritten or moved (copy gc)
+ * - cached => free: cached sectors were overwritten
+ *
+ * - metadata => free: metadata was freed
+ *
+ * Oddities:
+ * - cached => dirty: a device was removed so formerly replicated data
+ *                    is no longer sufficiently replicated
+ * - free => cached: cannot happen
+ * - free => dirty: cannot happen
+ * - free => metadata: cannot happen
+ */
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "error.h"
+#include "movinggc.h"
+#include "trace.h"
+
+#include <linux/preempt.h>
+
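+/*
+ * Purely illustrative sketch of the state table in the comment above:
+ * bucket_state and bucket_mark_state() are hypothetical names (not used
+ * elsewhere in this patch), and "metadata" is approximated as any data_type
+ * other than BCH_DATA_USER.
+ */
+enum bucket_state {
+       BUCKET_FREE,
+       BUCKET_OWNED_BY_ALLOCATOR,
+       BUCKET_METADATA,
+       BUCKET_DIRTY,
+       BUCKET_CACHED,
+};
+
+static inline enum bucket_state bucket_mark_state(struct bucket_mark m)
+{
+       if (m.owned_by_allocator)
+               return BUCKET_OWNED_BY_ALLOCATOR;
+       if (m.data_type && m.data_type != BCH_DATA_USER)
+               return BUCKET_METADATA;
+       if (m.dirty_sectors)
+               return BUCKET_DIRTY;
+       if (m.cached_sectors)
+               return BUCKET_CACHED;
+       return BUCKET_FREE;
+}
+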
+#ifdef DEBUG_BUCKETS
+
+#define lg_local_lock  lg_global_lock
+#define lg_local_unlock        lg_global_unlock
+
+static void bch2_fs_stats_verify(struct bch_fs *c)
+{
+       struct bch_fs_usage stats =
+               __bch2_fs_usage_read(c);
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
+               if ((s64) stats.s[i].data[S_META] < 0)
+                       panic("replicas %u meta underflow: %lli\n",
+                             i + 1, stats.s[i].data[S_META]);
+
+               if ((s64) stats.s[i].data[S_DIRTY] < 0)
+                       panic("replicas %u dirty underflow: %lli\n",
+                             i + 1, stats.s[i].data[S_DIRTY]);
+
+               if ((s64) stats.s[i].persistent_reserved < 0)
+                       panic("replicas %u reserved underflow: %lli\n",
+                             i + 1, stats.s[i].persistent_reserved);
+       }
+
+       if ((s64) stats.online_reserved < 0)
+               panic("sectors_online_reserved underflow: %lli\n",
+                     stats.online_reserved);
+}
+
+static void bch2_dev_stats_verify(struct bch_dev *ca)
+{
+       struct bch_dev_usage stats =
+               __bch2_dev_usage_read(ca);
+       u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
+               BUG_ON(stats.buckets[i]         > n);
+       BUG_ON(stats.buckets_alloc              > n);
+       BUG_ON(stats.buckets_unavailable        > n);
+}
+
+static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
+{
+       if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
+               u64 used = __bch2_fs_sectors_used(c, __bch2_fs_usage_read(c));
+               u64 cached = 0;
+               u64 avail = atomic64_read(&c->sectors_available);
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
+
+               if (used + avail + cached > c->capacity)
+                       panic("used %llu avail %llu cached %llu capacity %llu\n",
+                             used, avail, cached, c->capacity);
+       }
+}
+
+#else
+
+static void bch2_fs_stats_verify(struct bch_fs *c) {}
+static void bch2_dev_stats_verify(struct bch_dev *ca) {}
+static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
+
+#endif
+
+/*
+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent
+ * wraparound:
+ */
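+/*
+ * Worked example of the wraparound this avoids (values are illustrative):
+ * only the low 16 bits of the journal sequence number are stored in the
+ * bucket mark, and bucket_needs_journal_commit() (buckets.h) compares them
+ * with a signed 16 bit subtraction.  For a bucket last modified at journal
+ * seq 0x1000:
+ *
+ *   last_seq_ondisk = 0x2000: (s16) (0x1000 - 0x2000) = -4096 <= 0
+ *     -> correctly treated as committed
+ *   ~32k entries later, last_seq_ondisk = 0xa000:
+ *     (s16) (0x1000 - 0xa000) = 28672 > 0
+ *     -> would spuriously appear to need a journal commit again
+ *
+ * Clearing journal_seq_valid once the entry is known to be on disk means the
+ * second comparison is never made.
+ */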
+void bch2_bucket_seq_cleanup(struct bch_fs *c)
+{
+       u16 last_seq_ondisk = c->journal.last_seq_ondisk;
+       struct bch_dev *ca;
+       struct bucket_array *buckets;
+       struct bucket *g;
+       struct bucket_mark m;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               down_read(&ca->bucket_lock);
+               buckets = bucket_array(ca);
+
+               for_each_bucket(g, buckets) {
+                       bucket_cmpxchg(g, m, ({
+                               if (!m.journal_seq_valid ||
+                                   bucket_needs_journal_commit(m, last_seq_ondisk))
+                                       break;
+
+                               m.journal_seq_valid = 0;
+                       }));
+               }
+               up_read(&ca->bucket_lock);
+       }
+}
+
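+/*
+ * Usage counters are kept per-cpu: bch2_usage_add() adds one counter struct
+ * into another u64 by u64, bch2_usage_read_raw() sums the per-cpu copies, and
+ * bch2_usage_read_cached() only trusts that sum once GC has finished
+ * (GC_PHASE_DONE), falling back to the cached copy while GC is rebuilding the
+ * counters; the gc_pos_lock seqcount retry guards against GC's phase changing
+ * mid read.
+ */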
+#define bch2_usage_add(_acc, _stats)                                   \
+do {                                                                   \
+       typeof(_acc) _a = (_acc), _s = (_stats);                        \
+       unsigned i;                                                     \
+                                                                       \
+       for (i = 0; i < sizeof(*_a) / sizeof(u64); i++)                 \
+               ((u64 *) (_a))[i] += ((u64 *) (_s))[i];                 \
+} while (0)
+
+#define bch2_usage_read_raw(_stats)                                    \
+({                                                                     \
+       typeof(*this_cpu_ptr(_stats)) _acc;                             \
+       int cpu;                                                        \
+                                                                       \
+       memset(&_acc, 0, sizeof(_acc));                                 \
+                                                                       \
+       for_each_possible_cpu(cpu)                                      \
+               bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu));      \
+                                                                       \
+       _acc;                                                           \
+})
+
+#define bch2_usage_read_cached(_c, _cached, _uncached)                 \
+({                                                                     \
+       typeof(_cached) _ret;                                           \
+       unsigned _seq;                                                  \
+                                                                       \
+       do {                                                            \
+               _seq = read_seqcount_begin(&(_c)->gc_pos_lock);         \
+               _ret = (_c)->gc_pos.phase == GC_PHASE_DONE              \
+                       ? bch2_usage_read_raw(_uncached)                        \
+                       : (_cached);                                    \
+       } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq));        \
+                                                                       \
+       _ret;                                                           \
+})
+
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
+{
+       return bch2_usage_read_raw(ca->usage_percpu);
+}
+
+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
+{
+       return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
+}
+
+struct bch_fs_usage
+__bch2_fs_usage_read(struct bch_fs *c)
+{
+       return bch2_usage_read_raw(c->usage_percpu);
+}
+
+struct bch_fs_usage
+bch2_fs_usage_read(struct bch_fs *c)
+{
+       return bch2_usage_read_cached(c,
+                                    c->usage_cached,
+                                    c->usage_percpu);
+}
+
+struct fs_usage_sum {
+       u64     data;
+       u64     reserved;
+};
+
+static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
+{
+       struct fs_usage_sum sum = { 0 };
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
+               sum.data += (stats.s[i].data[S_META] +
+                            stats.s[i].data[S_DIRTY]) * (i + 1);
+               sum.reserved += stats.s[i].persistent_reserved * (i + 1);
+       }
+
+       sum.reserved += stats.online_reserved;
+       return sum;
+}
+
+#define RESERVE_FACTOR 6
+
+static u64 reserve_factor(u64 r)
+{
+       return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
+}
+
+static u64 avail_factor(u64 r)
+{
+       return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
+}
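+/*
+ * Worked example with RESERVE_FACTOR == 6: reserve_factor() pads reserved
+ * space up by roughly 1/64th - reserve_factor(1000) = 1000 + (1024 >> 6) =
+ * 1016 - and avail_factor() is its approximate inverse, scaling by 64/65:
+ * avail_factor(1016) = (1016 * 64) / 65 = 1000.
+ */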
+
+u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
+{
+       struct fs_usage_sum sum = __fs_usage_sum(stats);
+
+       return sum.data + reserve_factor(sum.reserved);
+}
+
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
+{
+       return min(c->capacity, __bch2_fs_sectors_used(c, stats));
+}
+
+u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
+{
+       return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
+}
+
+static inline int is_unavailable_bucket(struct bucket_mark m)
+{
+       return !is_available_bucket(m);
+}
+
+static inline int is_fragmented_bucket(struct bucket_mark m,
+                                      struct bch_dev *ca)
+{
+       if (!m.owned_by_allocator &&
+           m.data_type == BCH_DATA_USER &&
+           bucket_sectors_used(m))
+               return max_t(int, 0, (int) ca->mi.bucket_size -
+                            bucket_sectors_used(m));
+       return 0;
+}
+
+static inline enum bch_data_type bucket_type(struct bucket_mark m)
+{
+       return m.cached_sectors && !m.dirty_sectors
+               ?  BCH_DATA_CACHED
+               : m.data_type;
+}
+
+static bool bucket_became_unavailable(struct bch_fs *c,
+                                     struct bucket_mark old,
+                                     struct bucket_mark new)
+{
+       return is_available_bucket(old) &&
+              !is_available_bucket(new) &&
+              (!c || c->gc_pos.phase == GC_PHASE_DONE);
+}
+
+void bch2_fs_usage_apply(struct bch_fs *c,
+                       struct bch_fs_usage *stats,
+                       struct disk_reservation *disk_res,
+                       struct gc_pos gc_pos)
+{
+       struct fs_usage_sum sum = __fs_usage_sum(*stats);
+       s64 added = sum.data + sum.reserved;
+
+       /*
+        * Not allowed to reduce sectors_available except by getting a
+        * reservation:
+        */
+       BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
+
+       if (added > 0) {
+               disk_res->sectors       -= added;
+               stats->online_reserved  -= added;
+       }
+
+       percpu_down_read(&c->usage_lock);
+       preempt_disable();
+       /* online_reserved not subject to gc: */
+       this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
+       stats->online_reserved = 0;
+
+       if (!gc_will_visit(c, gc_pos))
+               bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
+
+       bch2_fs_stats_verify(c);
+       preempt_enable();
+       percpu_up_read(&c->usage_lock);
+
+       memset(stats, 0, sizeof(*stats));
+}
+
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+                                 struct bucket_mark old, struct bucket_mark new)
+{
+       struct bch_dev_usage *dev_usage;
+
+       if (c)
+               percpu_rwsem_assert_held(&c->usage_lock);
+
+       if (old.data_type && new.data_type &&
+           old.data_type != new.data_type) {
+               BUG_ON(!c);
+               bch2_fs_inconsistent(c,
+                       "different types of data in same bucket: %s, %s",
+                       bch2_data_types[old.data_type],
+                       bch2_data_types[new.data_type]);
+       }
+
+       preempt_disable();
+       dev_usage = this_cpu_ptr(ca->usage_percpu);
+
+       dev_usage->buckets[bucket_type(old)]--;
+       dev_usage->buckets[bucket_type(new)]++;
+
+       dev_usage->buckets_alloc +=
+               (int) new.owned_by_allocator - (int) old.owned_by_allocator;
+       dev_usage->buckets_unavailable +=
+               is_unavailable_bucket(new) - is_unavailable_bucket(old);
+
+       dev_usage->sectors[old.data_type] -= old.dirty_sectors;
+       dev_usage->sectors[new.data_type] += new.dirty_sectors;
+       dev_usage->sectors[BCH_DATA_CACHED] +=
+               (int) new.cached_sectors - (int) old.cached_sectors;
+       dev_usage->sectors_fragmented +=
+               is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
+       preempt_enable();
+
+       if (!is_available_bucket(old) && is_available_bucket(new))
+               bch2_wake_allocator(ca);
+
+       bch2_dev_stats_verify(ca);
+}
+
+#define bucket_data_cmpxchg(c, ca, g, new, expr)               \
+({                                                             \
+       struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
+                                                               \
+       bch2_dev_usage_update(c, ca, _old, new);                \
+       _old;                                                   \
+})
+
+bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+                           size_t b, struct bucket_mark *old)
+{
+       struct bucket *g;
+       struct bucket_mark new;
+
+       percpu_rwsem_assert_held(&c->usage_lock);
+
+       g = bucket(ca, b);
+
+       *old = bucket_data_cmpxchg(c, ca, g, new, ({
+               if (!is_available_bucket(new))
+                       return false;
+
+               new.owned_by_allocator  = 1;
+               new.data_type           = 0;
+               new.cached_sectors      = 0;
+               new.dirty_sectors       = 0;
+               new.gen++;
+       }));
+
+       if (!old->owned_by_allocator && old->cached_sectors)
+               trace_invalidate(ca, bucket_to_sector(ca, b),
+                                old->cached_sectors);
+       return true;
+}
+
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                           size_t b, bool owned_by_allocator,
+                           struct gc_pos pos, unsigned flags)
+{
+       struct bucket *g;
+       struct bucket_mark old, new;
+
+       percpu_rwsem_assert_held(&c->usage_lock);
+       g = bucket(ca, b);
+
+       if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+           gc_will_visit(c, pos))
+               return;
+
+       old = bucket_data_cmpxchg(c, ca, g, new, ({
+               new.owned_by_allocator  = owned_by_allocator;
+       }));
+
+       BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
+              c->gc_pos.phase == GC_PHASE_DONE);
+}
+
+#define saturated_add(ca, dst, src, max)                       \
+do {                                                           \
+       BUG_ON((int) (dst) + (src) < 0);                        \
+       if ((dst) == (max))                                     \
+               ;                                               \
+       else if ((dst) + (src) <= (max))                        \
+               dst += (src);                                   \
+       else {                                                  \
+               dst = (max);                                    \
+               trace_sectors_saturated(ca);            \
+       }                                                       \
+} while (0)
+
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+                              size_t b, enum bch_data_type type,
+                              unsigned sectors, struct gc_pos pos,
+                              unsigned flags)
+{
+       struct bucket *g;
+       struct bucket_mark old, new;
+
+       BUG_ON(!type);
+
+       if (likely(c)) {
+               percpu_rwsem_assert_held(&c->usage_lock);
+
+               if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+                   gc_will_visit(c, pos))
+                       return;
+       }
+
+       rcu_read_lock();
+
+       g = bucket(ca, b);
+       old = bucket_data_cmpxchg(c, ca, g, new, ({
+               saturated_add(ca, new.dirty_sectors, sectors,
+                             GC_MAX_SECTORS_USED);
+               new.data_type           = type;
+       }));
+
+       rcu_read_unlock();
+
+       BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
+              bucket_became_unavailable(c, old, new));
+}
+
+/* Reverting this until the copygc + compression issue is fixed: */
+
+static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
+{
+       if (!sectors)
+               return 0;
+
+       return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
+                                   crc.uncompressed_size));
+}
+
+/*
+ * Checking against gc's position has to be done here, inside the cmpxchg()
+ * loop, to avoid racing with the start of gc clearing all the marks - GC does
+ * that with the gc pos seqlock held.
+ */
+static void bch2_mark_pointer(struct bch_fs *c,
+                             struct bkey_s_c_extent e,
+                             const struct bch_extent_ptr *ptr,
+                             struct bch_extent_crc_unpacked crc,
+                             s64 sectors, enum s_alloc type,
+                             struct bch_fs_usage *stats,
+                             u64 journal_seq, unsigned flags)
+{
+       struct bucket_mark old, new;
+       unsigned saturated;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       struct bucket *g = PTR_BUCKET(ca, ptr);
+       enum bch_data_type data_type = type == S_META
+               ? BCH_DATA_BTREE : BCH_DATA_USER;
+       u64 v;
+
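+       /*
+        * @sectors is a delta in uncompressed sectors; for compressed extents,
+        * convert it to a delta in on disk (compressed) sectors by comparing
+        * the extent's compressed footprint before and after the change:
+        */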
+       if (crc.compression_type) {
+               unsigned old_sectors, new_sectors;
+
+               if (sectors > 0) {
+                       old_sectors = 0;
+                       new_sectors = sectors;
+               } else {
+                       old_sectors = e.k->size;
+                       new_sectors = e.k->size + sectors;
+               }
+
+               sectors = -__disk_sectors(crc, old_sectors)
+                         +__disk_sectors(crc, new_sectors);
+       }
+
+       if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
+               if (journal_seq)
+                       bucket_cmpxchg(g, new, ({
+                               new.journal_seq_valid   = 1;
+                               new.journal_seq         = journal_seq;
+                       }));
+
+               return;
+       }
+
+       v = atomic64_read(&g->_mark.v);
+       do {
+               new.v.counter = old.v.counter = v;
+               saturated = 0;
+
+               /*
+                * Check this after reading bucket mark to guard against
+                * the allocator invalidating a bucket after we've already
+                * checked the gen
+                */
+               if (gen_after(new.gen, ptr->gen)) {
+                       BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
+                       EBUG_ON(!ptr->cached &&
+                               test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
+                       return;
+               }
+
+               if (!ptr->cached &&
+                   new.dirty_sectors == GC_MAX_SECTORS_USED &&
+                   sectors < 0)
+                       saturated = -sectors;
+
+               if (ptr->cached)
+                       saturated_add(ca, new.cached_sectors, sectors,
+                                     GC_MAX_SECTORS_USED);
+               else
+                       saturated_add(ca, new.dirty_sectors, sectors,
+                                     GC_MAX_SECTORS_USED);
+
+               if (!new.dirty_sectors &&
+                   !new.cached_sectors) {
+                       new.data_type   = 0;
+
+                       if (journal_seq) {
+                               new.journal_seq_valid = 1;
+                               new.journal_seq = journal_seq;
+                       }
+               } else {
+                       new.data_type = data_type;
+               }
+
+               if (flags & BCH_BUCKET_MARK_NOATOMIC) {
+                       g->_mark = new;
+                       break;
+               }
+       } while ((v = atomic64_cmpxchg(&g->_mark.v,
+                             old.v.counter,
+                             new.v.counter)) != old.v.counter);
+
+       bch2_dev_usage_update(c, ca, old, new);
+
+       BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
+              bucket_became_unavailable(c, old, new));
+
+       if (saturated &&
+           atomic_long_add_return(saturated,
+                                  &ca->saturated_count) >=
+           bucket_to_sector(ca, ca->free_inc.size)) {
+               if (c->gc_thread) {
+                       trace_gc_sectors_saturated(c);
+                       wake_up_process(c->gc_thread);
+               }
+       }
+}
+
+void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
+                  s64 sectors, bool metadata,
+                  struct gc_pos pos,
+                  struct bch_fs_usage *stats,
+                  u64 journal_seq, unsigned flags)
+{
+       /*
+        * synchronization w.r.t. GC:
+        *
+        * Normally, bucket sector counts/marks are updated on the fly, as
+        * references are added/removed from the btree, the lists of buckets the
+        * allocator owns, other metadata buckets, etc.
+        *
+        * When GC is in progress and going to mark this reference, we do _not_
+        * mark this reference here, to avoid double counting - GC will count it
+        * when it gets to it.
+        *
+        * To know whether we should mark a given reference (GC either isn't
+        * running, or has already marked references at this position) we
+        * construct a total order for everything GC walks. Then, we can simply
+        * compare the position of the reference we're marking - @pos - with
+        * GC's current position. If GC is going to mark this reference, GC's
+        * current position will be less than @pos; if GC's current position is
+        * greater than @pos GC has either already walked this position, or
+        * isn't running.
+        *
+        * To avoid racing with GC's position changing, we have to deal with
+        *  - GC's position being set to GC_POS_MIN when GC starts:
+        *    usage_lock guards against this
+        *  - GC's position overtaking @pos: we guard against this with
+        *    whatever lock protects the data structure the reference lives in
+        *    (e.g. the btree node lock, or the relevant allocator lock).
+        */
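+
+       /*
+        * Concrete example of the ordering rule (positions are illustrative):
+        * if GC is currently walking the extents btree at position 100 and
+        * we're marking a reference at position 200 in the same btree,
+        * gc_will_visit() returns true - GC hasn't gotten there yet and will
+        * count the reference itself - so only the journal_seq bookkeeping is
+        * done (BCH_BUCKET_MARK_GC_WILL_VISIT).  If GC is already past
+        * position 200, or not running at all, the reference is counted here.
+        */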
+
+       percpu_down_read(&c->usage_lock);
+       if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+           gc_will_visit(c, pos))
+               flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
+
+       if (!stats)
+               stats = this_cpu_ptr(c->usage_percpu);
+
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const struct bch_extent_ptr *ptr;
+               struct bch_extent_crc_unpacked crc;
+               enum s_alloc type = metadata ? S_META : S_DIRTY;
+               unsigned replicas = 0;
+
+               BUG_ON(metadata && bkey_extent_is_cached(e.k));
+               BUG_ON(!sectors);
+
+               extent_for_each_ptr_crc(e, ptr, crc) {
+                       bch2_mark_pointer(c, e, ptr, crc, sectors, type,
+                                         stats, journal_seq, flags);
+                       replicas += !ptr->cached;
+               }
+
+               if (replicas) {
+                       BUG_ON(replicas - 1 >= ARRAY_SIZE(stats->s));
+                       stats->s[replicas - 1].data[type] += sectors;
+               }
+               break;
+       }
+       case BCH_RESERVATION: {
+               struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+               if (r.v->nr_replicas) {
+                       BUG_ON(r.v->nr_replicas - 1 >= ARRAY_SIZE(stats->s));
+                       stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
+               }
+               break;
+       }
+       }
+       percpu_up_read(&c->usage_lock);
+}
+
+/* Disk reservations: */
+
+static u64 __recalc_sectors_available(struct bch_fs *c)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
+
+       return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
+}
+
+/* Used by gc when it's starting: */
+void bch2_recalc_sectors_available(struct bch_fs *c)
+{
+       percpu_down_write(&c->usage_lock);
+       atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
+       percpu_up_write(&c->usage_lock);
+}
+
+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
+{
+       percpu_down_read(&c->usage_lock);
+       this_cpu_sub(c->usage_percpu->online_reserved,
+                    res->sectors);
+
+       bch2_fs_stats_verify(c);
+       percpu_up_read(&c->usage_lock);
+
+       res->sectors = 0;
+}
+
+#define SECTORS_CACHE  1024
+
+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+                             unsigned sectors, int flags)
+{
+       struct bch_fs_usage *stats;
+       u64 old, v, get;
+       s64 sectors_available;
+       int ret;
+
+       percpu_down_read(&c->usage_lock);
+       preempt_disable();
+       stats = this_cpu_ptr(c->usage_percpu);
+
+       if (sectors <= stats->available_cache)
+               goto out;
+
+       v = atomic64_read(&c->sectors_available);
+       do {
+               old = v;
+               get = min((u64) sectors + SECTORS_CACHE, old);
+
+               if (get < sectors) {
+                       preempt_enable();
+                       percpu_up_read(&c->usage_lock);
+                       goto recalculate;
+               }
+       } while ((v = atomic64_cmpxchg(&c->sectors_available,
+                                      old, old - get)) != old);
+
+       stats->available_cache  += get;
+
+out:
+       stats->available_cache  -= sectors;
+       stats->online_reserved  += sectors;
+       res->sectors            += sectors;
+
+       bch2_disk_reservations_verify(c, flags);
+       bch2_fs_stats_verify(c);
+       preempt_enable();
+       percpu_up_read(&c->usage_lock);
+       return 0;
+
+recalculate:
+       /*
+        * GC recalculates sectors_available when it starts, so that hopefully
+        * we don't normally end up blocking here:
+        */
+
+       /*
+        * Awkwardly, we can be called from extent_insert_fixup() with btree
+        * locks held:
+        */
+
+       if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
+               if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
+                       down_read(&c->gc_lock);
+               else if (!down_read_trylock(&c->gc_lock))
+                       return -EINTR;
+       }
+
+       percpu_down_write(&c->usage_lock);
+       sectors_available = __recalc_sectors_available(c);
+
+       if (sectors <= sectors_available ||
+           (flags & BCH_DISK_RESERVATION_NOFAIL)) {
+               atomic64_set(&c->sectors_available,
+                            max_t(s64, 0, sectors_available - sectors));
+               stats->online_reserved  += sectors;
+               res->sectors            += sectors;
+               ret = 0;
+
+               bch2_disk_reservations_verify(c, flags);
+       } else {
+               atomic64_set(&c->sectors_available, sectors_available);
+               ret = -ENOSPC;
+       }
+
+       bch2_fs_stats_verify(c);
+       percpu_up_write(&c->usage_lock);
+
+       if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
+               up_read(&c->gc_lock);
+
+       return ret;
+}
+
+/* Startup/shutdown: */
+
+static void buckets_free_rcu(struct rcu_head *rcu)
+{
+       struct bucket_array *buckets =
+               container_of(rcu, struct bucket_array, rcu);
+
+       kvpfree(buckets,
+               sizeof(struct bucket_array) +
+               buckets->nbuckets * sizeof(struct bucket));
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+       struct bucket_array *buckets = NULL, *old_buckets = NULL;
+       unsigned long *buckets_dirty = NULL;
+       u8 *oldest_gens = NULL;
+       alloc_fifo      free[RESERVE_NR];
+       alloc_fifo      free_inc;
+       alloc_heap      alloc_heap;
+       copygc_heap     copygc_heap;
+
+       size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+                            ca->mi.bucket_size / c->opts.btree_node_size);
+       /* XXX: these should be tunable */
+       size_t reserve_none     = max_t(size_t, 4, ca->mi.nbuckets >> 9);
+       size_t copygc_reserve   = max_t(size_t, 16, ca->mi.nbuckets >> 7);
+       size_t free_inc_reserve = copygc_reserve / 2;
+       bool resize = ca->buckets != NULL,
+            start_copygc = ca->copygc_thread != NULL;
+       int ret = -ENOMEM;
+       unsigned i;
+
+       memset(&free,           0, sizeof(free));
+       memset(&free_inc,       0, sizeof(free_inc));
+       memset(&alloc_heap,     0, sizeof(alloc_heap));
+       memset(&copygc_heap,    0, sizeof(copygc_heap));
+
+       if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
+                                           nbuckets * sizeof(struct bucket),
+                                           GFP_KERNEL|__GFP_ZERO)) ||
+           !(oldest_gens       = kvpmalloc(nbuckets * sizeof(u8),
+                                           GFP_KERNEL|__GFP_ZERO)) ||
+           !(buckets_dirty     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+                                           sizeof(unsigned long),
+                                           GFP_KERNEL|__GFP_ZERO)) ||
+           !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
+           !init_fifo(&free[RESERVE_MOVINGGC],
+                      copygc_reserve, GFP_KERNEL) ||
+           !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
+           !init_fifo(&free_inc,       free_inc_reserve, GFP_KERNEL) ||
+           !init_heap(&alloc_heap,     free_inc_reserve, GFP_KERNEL) ||
+           !init_heap(&copygc_heap,    copygc_reserve, GFP_KERNEL))
+               goto err;
+
+       buckets->first_bucket   = ca->mi.first_bucket;
+       buckets->nbuckets       = nbuckets;
+
+       bch2_copygc_stop(ca);
+
+       if (resize) {
+               down_write(&c->gc_lock);
+               down_write(&ca->bucket_lock);
+               percpu_down_write(&c->usage_lock);
+       }
+
+       old_buckets = bucket_array(ca);
+
+       if (resize) {
+               size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+
+               memcpy(buckets->b,
+                      old_buckets->b,
+                      n * sizeof(struct bucket));
+               memcpy(oldest_gens,
+                      ca->oldest_gens,
+                      n * sizeof(u8));
+               memcpy(buckets_dirty,
+                      ca->buckets_dirty,
+                      BITS_TO_LONGS(n) * sizeof(unsigned long));
+       }
+
+       rcu_assign_pointer(ca->buckets, buckets);
+       buckets = old_buckets;
+
+       swap(ca->oldest_gens, oldest_gens);
+       swap(ca->buckets_dirty, buckets_dirty);
+
+       if (resize)
+               percpu_up_write(&c->usage_lock);
+
+       spin_lock(&c->freelist_lock);
+       for (i = 0; i < RESERVE_NR; i++) {
+               fifo_move(&free[i], &ca->free[i]);
+               swap(ca->free[i], free[i]);
+       }
+       fifo_move(&free_inc, &ca->free_inc);
+       swap(ca->free_inc, free_inc);
+       spin_unlock(&c->freelist_lock);
+
+       /* with gc lock held, alloc_heap can't be in use: */
+       swap(ca->alloc_heap, alloc_heap);
+
+       /* and we shut down copygc: */
+       swap(ca->copygc_heap, copygc_heap);
+
+       nbuckets = ca->mi.nbuckets;
+
+       if (resize) {
+               up_write(&ca->bucket_lock);
+               up_write(&c->gc_lock);
+       }
+
+       if (start_copygc &&
+           bch2_copygc_start(c, ca))
+               bch_err(ca, "error restarting copygc thread");
+
+       ret = 0;
+err:
+       free_heap(&copygc_heap);
+       free_heap(&alloc_heap);
+       free_fifo(&free_inc);
+       for (i = 0; i < RESERVE_NR; i++)
+               free_fifo(&free[i]);
+       kvpfree(buckets_dirty,
+               BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+       kvpfree(oldest_gens,
+               nbuckets * sizeof(u8));
+       if (buckets)
+               call_rcu(&old_buckets->rcu, buckets_free_rcu);
+
+       return ret;
+}
+
+void bch2_dev_buckets_free(struct bch_dev *ca)
+{
+       unsigned i;
+
+       free_heap(&ca->copygc_heap);
+       free_heap(&ca->alloc_heap);
+       free_fifo(&ca->free_inc);
+       for (i = 0; i < RESERVE_NR; i++)
+               free_fifo(&ca->free[i]);
+       kvpfree(ca->buckets_dirty,
+               BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+       kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
+       kvpfree(rcu_dereference_protected(ca->buckets, 1),
+               sizeof(struct bucket_array) +
+               ca->mi.nbuckets * sizeof(struct bucket));
+
+       free_percpu(ca->usage_percpu);
+}
+
+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+       if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
+               return -ENOMEM;
+
+       return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
+}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
new file mode 100644 (file)
index 0000000..a4ba6d7
--- /dev/null
@@ -0,0 +1,276 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#ifndef _BUCKETS_H
+#define _BUCKETS_H
+
+#include "buckets_types.h"
+#include "super.h"
+
+#define for_each_bucket(_b, _buckets)                          \
+       for (_b = (_buckets)->b + (_buckets)->first_bucket;     \
+            _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
+
+#define bucket_cmpxchg(g, new, expr)                           \
+({                                                             \
+       u64 _v = atomic64_read(&(g)->_mark.v);                  \
+       struct bucket_mark _old;                                \
+                                                               \
+       do {                                                    \
+               (new).v.counter = _old.v.counter = _v;          \
+               expr;                                           \
+       } while ((_v = atomic64_cmpxchg(&(g)->_mark.v,          \
+                              _old.v.counter,                  \
+                              (new).v.counter)) != _old.v.counter);\
+       _old;                                                   \
+})
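+
+/*
+ * Usage sketch (example_set_owned_by_allocator() is a hypothetical name;
+ * compare bch2_mark_alloc_bucket() in buckets.c): atomically update one field
+ * of a bucket's mark and get back the old mark.
+ */
+static inline struct bucket_mark
+example_set_owned_by_allocator(struct bucket *g, bool owned)
+{
+       struct bucket_mark new;
+
+       return bucket_cmpxchg(g, new, ({
+               new.owned_by_allocator  = owned;
+       }));
+}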
+
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+{
+       return rcu_dereference_check(ca->buckets,
+                                    !ca->fs ||
+                                    percpu_rwsem_is_held(&ca->fs->usage_lock) ||
+                                    lockdep_is_held(&ca->fs->gc_lock) ||
+                                    lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+{
+       struct bucket_array *buckets = bucket_array(ca);
+
+       BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+       return buckets->b + b;
+}
+
+static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
+                                        size_t b, int rw)
+{
+       bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
+}
+
+static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
+{
+       return c->bucket_clock[rw].hand - g->io_time[rw];
+}
+
+/*
+ * bucket_gc_gen() returns the difference between the bucket's current gen and
+ * the oldest gen of any pointer into that bucket in the btree.
+ */
+
+static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+{
+       return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
+}
+
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+                                  const struct bch_extent_ptr *ptr)
+{
+       return sector_to_bucket(ca, ptr->offset);
+}
+
+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
+                                       const struct bch_extent_ptr *ptr)
+{
+       return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
+                                                const struct bch_extent_ptr *ptr)
+{
+       struct bucket_mark m;
+
+       rcu_read_lock();
+       m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark);
+       rcu_read_unlock();
+
+       return m;
+}
+
+static inline int gen_cmp(u8 a, u8 b)
+{
+       return (s8) (a - b);
+}
+
+static inline int gen_after(u8 a, u8 b)
+{
+       int r = gen_cmp(a, b);
+
+       return r > 0 ? r : 0;
+}
+
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated.
+ */
+static inline u8 ptr_stale(struct bch_dev *ca,
+                          const struct bch_extent_ptr *ptr)
+{
+       return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
+}
+
+/* bucket gc marks */
+
+/*
+ * The dirty and cached sector counts saturate. If this occurs, reference
+ * counting alone will not free the bucket, and a btree GC must be performed.
+ */
+#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
+
+static inline unsigned bucket_sectors_used(struct bucket_mark mark)
+{
+       return mark.dirty_sectors + mark.cached_sectors;
+}
+
+static inline bool bucket_unused(struct bucket_mark mark)
+{
+       return !mark.owned_by_allocator &&
+               !mark.data_type &&
+               !bucket_sectors_used(mark);
+}
+
+/* Device usage: */
+
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
+
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+                                         struct bch_dev_usage stats)
+{
+       u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+
+       if (WARN_ONCE(stats.buckets_unavailable > total,
+                     "buckets_unavailable overflow (%llu > %llu)\n",
+                     stats.buckets_unavailable, total))
+               return 0;
+
+       return total - stats.buckets_unavailable;
+}
+
+/*
+ * Number of reclaimable buckets - only for use by the allocator thread:
+ */
+static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
+{
+       return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
+}
+
+static inline u64 __dev_buckets_free(struct bch_dev *ca,
+                                    struct bch_dev_usage stats)
+{
+       return __dev_buckets_available(ca, stats) +
+               fifo_used(&ca->free[RESERVE_NONE]) +
+               fifo_used(&ca->free_inc);
+}
+
+static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
+{
+       return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
+}
+
+/* Filesystem usage: */
+
+static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s)
+{
+       switch (s) {
+       case S_META:
+               return BCH_DATA_BTREE;
+       case S_DIRTY:
+               return BCH_DATA_USER;
+       default:
+               BUG();
+       }
+}
+
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
+void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+                        struct disk_reservation *, struct gc_pos);
+
+u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
+
+static inline bool is_available_bucket(struct bucket_mark mark)
+{
+       return (!mark.owned_by_allocator &&
+               !mark.dirty_sectors &&
+               !mark.nouse);
+}
+
+static inline bool bucket_needs_journal_commit(struct bucket_mark m,
+                                              u16 last_seq_ondisk)
+{
+       return m.journal_seq_valid &&
+               ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
+}
+
+void bch2_bucket_seq_cleanup(struct bch_fs *);
+
+bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
+                           size_t, struct bucket_mark *);
+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
+                           size_t, bool, struct gc_pos, unsigned);
+void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+                              size_t, enum bch_data_type, unsigned,
+                              struct gc_pos, unsigned);
+
+#define BCH_BUCKET_MARK_NOATOMIC               (1 << 0)
+#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE   (1 << 1)
+#define BCH_BUCKET_MARK_GC_WILL_VISIT          (1 << 2)
+#define BCH_BUCKET_MARK_GC_LOCK_HELD           (1 << 3)
+
+void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
+                  struct bch_fs_usage *, u64, unsigned);
+
+void bch2_recalc_sectors_available(struct bch_fs *);
+
+void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
+
+static inline void bch2_disk_reservation_put(struct bch_fs *c,
+                                            struct disk_reservation *res)
+{
+       if (res->sectors)
+               __bch2_disk_reservation_put(c, res);
+}
+
+#define BCH_DISK_RESERVATION_NOFAIL            (1 << 0)
+#define BCH_DISK_RESERVATION_GC_LOCK_HELD      (1 << 1)
+#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD  (1 << 2)
+
+int bch2_disk_reservation_add(struct bch_fs *,
+                            struct disk_reservation *,
+                            unsigned, int);
+
+static inline struct disk_reservation
+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
+{
+       return (struct disk_reservation) {
+               .sectors        = 0,
+#if 0
+               /* not used yet: */
+               .gen            = c->capacity_gen,
+#endif
+               .nr_replicas    = nr_replicas,
+       };
+}
+
+static inline int bch2_disk_reservation_get(struct bch_fs *c,
+                                           struct disk_reservation *res,
+                                           unsigned sectors,
+                                           unsigned nr_replicas,
+                                           int flags)
+{
+       *res = bch2_disk_reservation_init(c, nr_replicas);
+
+       return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
+}
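+
+/*
+ * Usage sketch (example_reserve() is a hypothetical name, the sector and
+ * replica counts are arbitrary): take a reservation, let btree updates
+ * consume it, then release whatever is left.
+ */
+static inline int example_reserve(struct bch_fs *c)
+{
+       struct disk_reservation res;
+       int ret;
+
+       ret = bch2_disk_reservation_get(c, &res, 128, 2, 0);
+       if (ret)
+               return ret;     /* typically -ENOSPC */
+
+       /* ... pass &res to btree updates, which decrement res.sectors ... */
+
+       bch2_disk_reservation_put(c, &res);
+       return 0;
+}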
+
+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
+void bch2_dev_buckets_free(struct bch_dev *);
+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
+
+#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
new file mode 100644 (file)
index 0000000..5be9013
--- /dev/null
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_TYPES_H
+#define _BUCKETS_TYPES_H
+
+#include "util.h"
+
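+/*
+ * The whole mark packs into a single u64 (gen, flags, two sector counts and
+ * the journal_seq low bits), so it can be read and updated atomically - see
+ * bucket_cmpxchg() in buckets.h.
+ */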
+struct bucket_mark {
+       union {
+       struct {
+               atomic64_t      v;
+       };
+
+       struct {
+               u8              gen;
+               u8              data_type:3,
+                               gen_valid:1,
+                               owned_by_allocator:1,
+                               nouse:1,
+                               journal_seq_valid:1;
+               u16             dirty_sectors;
+               u16             cached_sectors;
+
+               /*
+                * low bits of journal sequence number when this bucket was most
+                * recently modified: if journal_seq_valid is set, this bucket
+                * can't be reused until the journal sequence number written to
+                * disk is >= the bucket's journal sequence number:
+                */
+               u16             journal_seq;
+       };
+       };
+};
+
+struct bucket {
+       union {
+               struct bucket_mark      _mark;
+               const struct bucket_mark mark;
+       };
+
+       u16                             io_time[2];
+};
+
+struct bucket_array {
+       struct rcu_head         rcu;
+       u16                     first_bucket;
+       size_t                  nbuckets;
+       struct bucket           b[];
+};
+
+struct bch_dev_usage {
+       u64                     buckets[BCH_DATA_NR];
+       u64                     buckets_alloc;
+       u64                     buckets_unavailable;
+
+       /* _compressed_ sectors: */
+       u64                     sectors[BCH_DATA_NR];
+       u64                     sectors_fragmented;
+};
+
+/* kill, switch to bch_data_type? */
+enum s_alloc {
+       S_META,
+       S_DIRTY,
+       S_ALLOC_NR,
+};
+
+struct bch_fs_usage {
+       /* all fields are in units of 512 byte sectors: */
+       /* _uncompressed_ sectors: */
+       u64                     online_reserved;
+       u64                     available_cache;
+
+       struct {
+               u64             data[S_ALLOC_NR];
+               u64             persistent_reserved;
+       }                       s[BCH_REPLICAS_MAX];
+};
+
+/*
+ * A reservation for space on disk:
+ */
+struct disk_reservation {
+       u64             sectors;
+       u32             gen;
+       unsigned        nr_replicas;
+};
+
+struct copygc_heap_entry {
+       u8                      gen;
+       u32                     sectors;
+       u64                     offset;
+};
+
+typedef HEAP(struct copygc_heap_entry) copygc_heap;
+
+#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
new file mode 100644 (file)
index 0000000..2aa8633
--- /dev/null
@@ -0,0 +1,663 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_CHARDEV
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "bcachefs_ioctl.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "move.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/kthread.h>
+#include <linux/major.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
+                                         unsigned flags)
+{
+       struct bch_dev *ca;
+
+       if (flags & BCH_BY_INDEX) {
+               if (dev >= c->sb.nr_devices)
+                       return ERR_PTR(-EINVAL);
+
+               rcu_read_lock();
+               ca = rcu_dereference(c->devs[dev]);
+               if (ca)
+                       percpu_ref_get(&ca->ref);
+               rcu_read_unlock();
+
+               if (!ca)
+                       return ERR_PTR(-EINVAL);
+       } else {
+               char *path;
+
+               path = strndup_user((const char __user *)
+                                   (unsigned long) dev, PATH_MAX);
+               if (IS_ERR(path))
+                       return ERR_CAST(path);
+
+               ca = bch2_dev_lookup(c, path);
+               kfree(path);
+       }
+
+       return ca;
+}
+
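+/*
+ * Callers of bch2_device_lookup() must drop the ref when done; sketch of the
+ * pattern the ioctls below follow (example_with_dev() is a hypothetical
+ * name):
+ */
+static inline long example_with_dev(struct bch_fs *c, u64 dev, unsigned flags)
+{
+       struct bch_dev *ca = bch2_device_lookup(c, dev, flags);
+       long ret;
+
+       if (IS_ERR(ca))
+               return PTR_ERR(ca);
+
+       ret = 0;        /* ... operate on ca here ... */
+
+       percpu_ref_put(&ca->ref);
+       return ret;
+}
+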
+#if 0
+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+       struct bch_ioctl_assemble arg;
+       struct bch_fs *c;
+       u64 *user_devs = NULL;
+       char **devs = NULL;
+       unsigned i;
+       int ret = -EFAULT;
+
+       if (copy_from_user(&arg, user_arg, sizeof(arg)))
+               return -EFAULT;
+
+       if (arg.flags || arg.pad)
+               return -EINVAL;
+
+       user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+       if (!user_devs)
+               return -ENOMEM;
+
+       devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+
+       if (copy_from_user(user_devs, user_arg->devs,
+                          sizeof(u64) * arg.nr_devs))
+               goto err;
+
+       for (i = 0; i < arg.nr_devs; i++) {
+               devs[i] = strndup_user((const char __user *)(unsigned long)
+                                      user_devs[i],
+                                      PATH_MAX);
+               if (!devs[i]) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+       }
+
+       c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
+       ret = PTR_ERR_OR_ZERO(c);
+       if (!ret)
+               closure_put(&c->cl);
+err:
+       if (devs)
+               for (i = 0; i < arg.nr_devs; i++)
+                       kfree(devs[i]);
+       kfree(devs);
+       return ret;
+}
+
+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+       struct bch_ioctl_incremental arg;
+       const char *err;
+       char *path;
+
+       if (copy_from_user(&arg, user_arg, sizeof(arg)))
+               return -EFAULT;
+
+       if (arg.flags || arg.pad)
+               return -EINVAL;
+
+       path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+       if (!path)
+               return -ENOMEM;
+
+       err = bch2_fs_open_incremental(path);
+       kfree(path);
+
+       if (err) {
+               pr_err("Could not register bcachefs devices: %s", err);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+#endif
+
+static long bch2_global_ioctl(unsigned cmd, void __user *arg)
+{
+       switch (cmd) {
+#if 0
+       case BCH_IOCTL_ASSEMBLE:
+               return bch2_ioctl_assemble(arg);
+       case BCH_IOCTL_INCREMENTAL:
+               return bch2_ioctl_incremental(arg);
+#endif
+       default:
+               return -ENOTTY;
+       }
+}
+
+static long bch2_ioctl_query_uuid(struct bch_fs *c,
+                       struct bch_ioctl_query_uuid __user *user_arg)
+{
+       return copy_to_user(&user_arg->uuid,
+                           &c->sb.user_uuid,
+                           sizeof(c->sb.user_uuid));
+}
+
+#if 0
+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
+{
+       if (arg.flags || arg.pad)
+               return -EINVAL;
+
+       return bch2_fs_start(c) ? -EIO : 0;
+}
+
+static long bch2_ioctl_stop(struct bch_fs *c)
+{
+       bch2_fs_stop(c);
+       return 0;
+}
+#endif
+
+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+       char *path;
+       int ret;
+
+       if (arg.flags || arg.pad)
+               return -EINVAL;
+
+       path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+       if (!path)
+               return -ENOMEM;
+
+       ret = bch2_dev_add(c, path);
+       kfree(path);
+
+       return ret;
+}
+
+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+       struct bch_dev *ca;
+
+       if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+                          BCH_FORCE_IF_METADATA_LOST|
+                          BCH_FORCE_IF_DEGRADED|
+                          BCH_BY_INDEX)) ||
+           arg.pad)
+               return -EINVAL;
+
+       ca = bch2_device_lookup(c, arg.dev, arg.flags);
+       if (IS_ERR(ca))
+               return PTR_ERR(ca);
+
+       return bch2_dev_remove(c, ca, arg.flags);
+}
+
+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+       char *path;
+       int ret;
+
+       if (arg.flags || arg.pad)
+               return -EINVAL;
+
+       path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+       if (!path)
+               return -ENOMEM;
+
+       ret = bch2_dev_online(c, path);
+       kfree(path);
+       return ret;
+}
+
+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+       struct bch_dev *ca;
+       int ret;
+
+       if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+                          BCH_FORCE_IF_METADATA_LOST|
+                          BCH_FORCE_IF_DEGRADED|
+                          BCH_BY_INDEX)) ||
+           arg.pad)
+               return -EINVAL;
+
+       ca = bch2_device_lookup(c, arg.dev, arg.flags);
+       if (IS_ERR(ca))
+               return PTR_ERR(ca);
+
+       ret = bch2_dev_offline(c, ca, arg.flags);
+       percpu_ref_put(&ca->ref);
+       return ret;
+}
+
+static long bch2_ioctl_disk_set_state(struct bch_fs *c,
+                       struct bch_ioctl_disk_set_state arg)
+{
+       struct bch_dev *ca;
+       int ret;
+
+       if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+                          BCH_FORCE_IF_METADATA_LOST|
+                          BCH_FORCE_IF_DEGRADED|
+                          BCH_BY_INDEX)) ||
+           arg.pad[0] || arg.pad[1] || arg.pad[2])
+               return -EINVAL;
+
+       ca = bch2_device_lookup(c, arg.dev, arg.flags);
+       if (IS_ERR(ca))
+               return PTR_ERR(ca);
+
+       ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+
+       percpu_ref_put(&ca->ref);
+       return ret;
+}
+
+struct bch_data_ctx {
+       struct bch_fs                   *c;
+       struct bch_ioctl_data           arg;
+       struct bch_move_stats           stats;
+
+       int                             ret;
+
+       struct task_struct              *thread;
+};
+
+static int bch2_data_thread(void *arg)
+{
+       struct bch_data_ctx *ctx = arg;
+
+       ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+
+       ctx->stats.data_type = U8_MAX;
+       return 0;
+}
+
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+       struct bch_data_ctx *ctx = file->private_data;
+
+       kthread_stop(ctx->thread);
+       put_task_struct(ctx->thread);
+       kfree(ctx);
+       return 0;
+}
+
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+                                 size_t len, loff_t *ppos)
+{
+       struct bch_data_ctx *ctx = file->private_data;
+       struct bch_fs *c = ctx->c;
+       struct bch_ioctl_data_event e = {
+               .type                   = BCH_DATA_EVENT_PROGRESS,
+               .p.data_type            = ctx->stats.data_type,
+               .p.btree_id             = ctx->stats.iter.btree_id,
+               .p.pos                  = ctx->stats.iter.pos,
+               .p.sectors_done         = atomic64_read(&ctx->stats.sectors_seen),
+               .p.sectors_total        = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
+       };
+
+       if (len < sizeof(e))
+               return -EINVAL;
+
+       return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
+}
+
+static const struct file_operations bcachefs_data_ops = {
+       .release        = bch2_data_job_release,
+       .read           = bch2_data_job_read,
+       .llseek         = no_llseek,
+};
+
+static long bch2_ioctl_data(struct bch_fs *c,
+                           struct bch_ioctl_data arg)
+{
+       struct bch_data_ctx *ctx = NULL;
+       struct file *file = NULL;
+       unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+       int ret, fd = -1;
+
+       if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+               return -EINVAL;
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->c = c;
+       ctx->arg = arg;
+
+       ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+       if (IS_ERR(ctx->thread)) {
+               ret = PTR_ERR(ctx->thread);
+               goto err;
+       }
+
+       ret = get_unused_fd_flags(flags);
+       if (ret < 0)
+               goto err;
+       fd = ret;
+
+       file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+       if (IS_ERR(file)) {
+               ret = PTR_ERR(file);
+               goto err;
+       }
+
+       fd_install(fd, file);
+
+       get_task_struct(ctx->thread);
+       wake_up_process(ctx->thread);
+
+       return fd;
+err:
+       if (fd >= 0)
+               put_unused_fd(fd);
+       if (!IS_ERR_OR_NULL(ctx->thread))
+               kthread_stop(ctx->thread);
+       kfree(ctx);
+       return ret;
+}
+
+static long bch2_ioctl_usage(struct bch_fs *c,
+                            struct bch_ioctl_usage __user *user_arg)
+{
+       struct bch_ioctl_usage arg;
+       struct bch_dev *ca;
+       unsigned i, j;
+       int ret;
+
+       if (!test_bit(BCH_FS_STARTED, &c->flags))
+               return -EINVAL;
+
+       if (copy_from_user(&arg, user_arg, sizeof(arg)))
+               return -EFAULT;
+
+       for (i = 0; i < arg.nr_devices; i++) {
+               struct bch_ioctl_dev_usage dst = { .alive = 0 };
+
+               ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
+               if (ret)
+                       return -EFAULT;
+       }
+
+       {
+               struct bch_fs_usage src = bch2_fs_usage_read(c);
+               struct bch_ioctl_fs_usage dst = {
+                       .capacity               = c->capacity,
+                       .used                   = bch2_fs_sectors_used(c, src),
+                       .online_reserved        = src.online_reserved,
+               };
+
+               for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+                       dst.persistent_reserved[i] =
+                               src.s[i].persistent_reserved;
+
+                       for (j = 0; j < S_ALLOC_NR; j++)
+                               dst.sectors[s_alloc_to_data_type(j)][i] =
+                                       src.s[i].data[j];
+               }
+
+               ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
+               if (ret)
+                       return -EFAULT;
+       }
+
+       for_each_member_device(ca, c, i) {
+               struct bch_dev_usage src = bch2_dev_usage_read(c, ca);
+               struct bch_ioctl_dev_usage dst = {
+                       .alive          = 1,
+                       .state          = ca->mi.state,
+                       .bucket_size    = ca->mi.bucket_size,
+                       .nr_buckets     = ca->mi.nbuckets - ca->mi.first_bucket,
+               };
+
+               if (ca->dev_idx >= arg.nr_devices) {
+                       percpu_ref_put(&ca->ref);
+                       return -ERANGE;
+               }
+
+               if (percpu_ref_tryget(&ca->io_ref)) {
+                       dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev);
+                       percpu_ref_put(&ca->io_ref);
+               }
+
+               for (j = 0; j < BCH_DATA_NR; j++) {
+                       dst.buckets[j] = src.buckets[j];
+                       dst.sectors[j] = src.sectors[j];
+               }
+
+               ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
+               if (ret) {
+                       percpu_ref_put(&ca->ref);
+                       return -EFAULT;
+               }
+       }
+
+       return 0;
+}
+
+static long bch2_ioctl_read_super(struct bch_fs *c,
+                                 struct bch_ioctl_read_super arg)
+{
+       struct bch_dev *ca = NULL;
+       struct bch_sb *sb;
+       int ret = 0;
+
+       if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
+           arg.pad)
+               return -EINVAL;
+
+       mutex_lock(&c->sb_lock);
+
+       if (arg.flags & BCH_READ_DEV) {
+               ca = bch2_device_lookup(c, arg.dev, arg.flags);
+
+               if (IS_ERR(ca)) {
+                       ret = PTR_ERR(ca);
+                       goto err;
+               }
+
+               sb = ca->disk_sb.sb;
+       } else {
+               sb = c->disk_sb.sb;
+       }
+
+       if (vstruct_bytes(sb) > arg.size) {
+               ret = -ERANGE;
+               goto err;
+       }
+
+       if (copy_to_user((void __user *)(unsigned long)arg.sb,
+                        sb, vstruct_bytes(sb)))
+               ret = -EFAULT;
+err:
+       if (ca)
+               percpu_ref_put(&ca->ref);
+       mutex_unlock(&c->sb_lock);
+       return ret;
+}
+
+static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
+                                   struct bch_ioctl_disk_get_idx arg)
+{
+       dev_t dev = huge_decode_dev(arg.dev);
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_online_member(ca, c, i)
+               if (ca->disk_sb.bdev->bd_dev == dev) {
+                       percpu_ref_put(&ca->io_ref);
+                       return i;
+               }
+
+       return -ENOENT;
+}
+
+static long bch2_ioctl_disk_resize(struct bch_fs *c,
+                                  struct bch_ioctl_disk_resize arg)
+{
+       struct bch_dev *ca;
+       int ret;
+
+       if ((arg.flags & ~BCH_BY_INDEX) ||
+           arg.pad)
+               return -EINVAL;
+
+       ca = bch2_device_lookup(c, arg.dev, arg.flags);
+       if (IS_ERR(ca))
+               return PTR_ERR(ca);
+
+       ret = bch2_dev_resize(c, ca, arg.nbuckets);
+
+       percpu_ref_put(&ca->ref);
+       return ret;
+}
+
+#define BCH_IOCTL(_name, _argtype)                                     \
+do {                                                                   \
+       _argtype i;                                                     \
+                                                                       \
+       if (copy_from_user(&i, arg, sizeof(i)))                         \
+               return -EFAULT;                                         \
+       return bch2_ioctl_##_name(c, i);                                \
+} while (0)
+
+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
+{
+       /* ioctls that don't require admin cap: */
+       switch (cmd) {
+       case BCH_IOCTL_QUERY_UUID:
+               return bch2_ioctl_query_uuid(c, arg);
+       case BCH_IOCTL_USAGE:
+               return bch2_ioctl_usage(c, arg);
+       }
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       switch (cmd) {
+#if 0
+       case BCH_IOCTL_START:
+               BCH_IOCTL(start, struct bch_ioctl_start);
+       case BCH_IOCTL_STOP:
+               return bch2_ioctl_stop(c);
+#endif
+       case BCH_IOCTL_READ_SUPER:
+               BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+       case BCH_IOCTL_DISK_GET_IDX:
+               BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+       }
+
+       if (!test_bit(BCH_FS_STARTED, &c->flags))
+               return -EINVAL;
+
+       /* ioctls that do require admin cap: */
+       switch (cmd) {
+       case BCH_IOCTL_DISK_ADD:
+               BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+       case BCH_IOCTL_DISK_REMOVE:
+               BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+       case BCH_IOCTL_DISK_ONLINE:
+               BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+       case BCH_IOCTL_DISK_OFFLINE:
+               BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+       case BCH_IOCTL_DISK_SET_STATE:
+               BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+       case BCH_IOCTL_DATA:
+               BCH_IOCTL(data, struct bch_ioctl_data);
+       case BCH_IOCTL_DISK_RESIZE:
+               BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+
+       default:
+               return -ENOTTY;
+       }
+}
+
+static DEFINE_IDR(bch_chardev_minor);
+
+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+       unsigned minor = iminor(file_inode(filp));
+       struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
+       void __user *arg = (void __user *) v;
+
+       return c
+               ? bch2_fs_ioctl(c, cmd, arg)
+               : bch2_global_ioctl(cmd, arg);
+}
+
+static const struct file_operations bch_chardev_fops = {
+       .owner          = THIS_MODULE,
+       .unlocked_ioctl = bch2_chardev_ioctl,
+       .open           = nonseekable_open,
+};
+
+static int bch_chardev_major;
+static struct class *bch_chardev_class;
+static struct device *bch_chardev;
+
+void bch2_fs_chardev_exit(struct bch_fs *c)
+{
+       if (!IS_ERR_OR_NULL(c->chardev))
+               device_unregister(c->chardev);
+       if (c->minor >= 0)
+               idr_remove(&bch_chardev_minor, c->minor);
+}
+
+int bch2_fs_chardev_init(struct bch_fs *c)
+{
+       c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
+       if (c->minor < 0)
+               return c->minor;
+
+       c->chardev = device_create(bch_chardev_class, NULL,
+                                  MKDEV(bch_chardev_major, c->minor), c,
+                                  "bcachefs%u-ctl", c->minor);
+       if (IS_ERR(c->chardev))
+               return PTR_ERR(c->chardev);
+
+       return 0;
+}
+
+void bch2_chardev_exit(void)
+{
+       if (!IS_ERR_OR_NULL(bch_chardev_class))
+               device_destroy(bch_chardev_class,
+                              MKDEV(bch_chardev_major, U8_MAX));
+       if (!IS_ERR_OR_NULL(bch_chardev_class))
+               class_destroy(bch_chardev_class);
+       if (bch_chardev_major > 0)
+               unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
+}
+
+int __init bch2_chardev_init(void)
+{
+       bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
+       if (bch_chardev_major < 0)
+               return bch_chardev_major;
+
+       bch_chardev_class = class_create("bcachefs");
+       if (IS_ERR(bch_chardev_class))
+               return PTR_ERR(bch_chardev_class);
+
+       bch_chardev = device_create(bch_chardev_class, NULL,
+                                   MKDEV(bch_chardev_major, U8_MAX),
+                                   NULL, "bcachefs-ctl");
+       if (IS_ERR(bch_chardev))
+               return PTR_ERR(bch_chardev);
+
+       return 0;
+}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
new file mode 100644 (file)
index 0000000..3a4890d
--- /dev/null
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHARDEV_H
+#define _BCACHEFS_CHARDEV_H
+
+#ifndef NO_BCACHEFS_CHARDEV
+
+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
+
+void bch2_fs_chardev_exit(struct bch_fs *);
+int bch2_fs_chardev_init(struct bch_fs *);
+
+void bch2_chardev_exit(void);
+int __init bch2_chardev_init(void);
+
+#else
+
+static inline long bch2_fs_ioctl(struct bch_fs *c,
+                                unsigned cmd, void __user *arg)
+{
+       return -ENOSYS;
+}
+
+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
+
+static inline void bch2_chardev_exit(void) {}
+static inline int __init bch2_chardev_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_CHARDEV */
+
+#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
new file mode 100644 (file)
index 0000000..3733cbf
--- /dev/null
@@ -0,0 +1,753 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/crc32c.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+
+/*
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
+ * use permitted, subject to terms of PostgreSQL license; see.)
+ *
+ * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
+ * usual sort of implementation. (See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
+ * If we have no working 64-bit type, then fake it with two 32-bit registers.
+ *
+ * The present implementation is a normal (not "reflected", in Williams'
+ * terms) 64-bit CRC, using initial all-ones register contents and a final
+ * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
+ * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ */
+
+static const u64 crc_table[256] = {
+       0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
+       0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
+       0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
+       0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
+       0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
+       0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
+       0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
+       0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
+       0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
+       0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
+       0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
+       0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
+       0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
+       0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
+       0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
+       0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
+       0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
+       0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
+       0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
+       0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
+       0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
+       0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
+       0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
+       0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
+       0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
+       0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
+       0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
+       0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
+       0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
+       0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
+       0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
+       0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
+       0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
+       0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
+       0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
+       0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
+       0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
+       0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
+       0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
+       0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
+       0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
+       0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
+       0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
+       0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
+       0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
+       0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
+       0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
+       0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
+       0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
+       0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
+       0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
+       0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
+       0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
+       0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
+       0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
+       0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
+       0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
+       0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
+       0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
+       0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
+       0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
+       0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
+       0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
+       0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
+       0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
+       0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
+       0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
+       0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
+       0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
+       0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
+       0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
+       0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
+       0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
+       0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
+       0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
+       0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
+       0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
+       0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
+       0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
+       0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
+       0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
+       0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
+       0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
+       0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
+       0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
+       0x9AFCE626CE85B507ULL,
+};
+
+u64 bch2_crc64_update(u64 crc, const void *_data, size_t len)
+{
+       const unsigned char *data = _data;
+
+       while (len--) {
+               int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
+               crc = crc_table[i] ^ (crc << 8);
+       }
+
+       return crc;
+}
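+
+/*
+ * For reference, the table-driven loop above is equivalent to the following
+ * bit-at-a-time form (illustrative sketch only, not used anywhere), with the
+ * ECMA-182 polynomial written out explicitly - note that 0x42F0E1EBA9EA3693
+ * is also crc_table[1]:
+ *
+ *     u64 crc64_bitwise(u64 crc, const u8 *p, size_t len)
+ *     {
+ *             while (len--) {
+ *                     unsigned i;
+ *
+ *                     crc ^= (u64) *p++ << 56;
+ *                     for (i = 0; i < 8; i++)
+ *                             crc = (crc & (1ULL << 63))
+ *                                     ? (crc << 1) ^ 0x42F0E1EBA9EA3693ULL
+ *                                     : crc << 1;
+ *             }
+ *             return crc;
+ *     }
+ *
+ * The "initial all-ones register contents and a final bit inversion"
+ * described above are applied by the BCH_CSUM_CRC64_NONZERO variant via
+ * bch2_checksum_init()/bch2_checksum_final() below; plain BCH_CSUM_CRC64
+ * starts from zero with no final xor, which is what makes it mergeable (see
+ * bch2_checksum_merge()).
+ */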
+
+static u64 bch2_checksum_init(unsigned type)
+{
+       switch (type) {
+       case BCH_CSUM_NONE:
+               return 0;
+       case BCH_CSUM_CRC32C_NONZERO:
+               return U32_MAX;
+       case BCH_CSUM_CRC64_NONZERO:
+               return U64_MAX;
+       case BCH_CSUM_CRC32C:
+               return 0;
+       case BCH_CSUM_CRC64:
+               return 0;
+       default:
+               BUG();
+       }
+}
+
+static u64 bch2_checksum_final(unsigned type, u64 crc)
+{
+       switch (type) {
+       case BCH_CSUM_NONE:
+               return 0;
+       case BCH_CSUM_CRC32C_NONZERO:
+               return crc ^ U32_MAX;
+       case BCH_CSUM_CRC64_NONZERO:
+               return crc ^ U64_MAX;
+       case BCH_CSUM_CRC32C:
+               return crc;
+       case BCH_CSUM_CRC64:
+               return crc;
+       default:
+               BUG();
+       }
+}
+
+static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+{
+       switch (type) {
+       case BCH_CSUM_NONE:
+               return 0;
+       case BCH_CSUM_CRC32C_NONZERO:
+       case BCH_CSUM_CRC32C:
+               return crc32c(crc, data, len);
+       case BCH_CSUM_CRC64_NONZERO:
+       case BCH_CSUM_CRC64:
+               return bch2_crc64_update(crc, data, len);
+       default:
+               BUG();
+       }
+}
+
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+                                struct nonce nonce,
+                                struct scatterlist *sg, size_t len)
+{
+       SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+       int ret;
+
+       skcipher_request_set_sync_tfm(req, tfm);
+       skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+
+       ret = crypto_skcipher_encrypt(req);
+       BUG_ON(ret);
+}
+
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
+                             struct nonce nonce,
+                             void *buf, size_t len)
+{
+       struct scatterlist sg;
+
+       sg_init_one(&sg, buf, len);
+       do_encrypt_sg(tfm, nonce, &sg, len);
+}
+
+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+                           void *buf, size_t len)
+{
+       struct crypto_sync_skcipher *chacha20 =
+               crypto_alloc_sync_skcipher("chacha20", 0, 0);
+       int ret;
+
+       if (IS_ERR(chacha20)) {
+               pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
+               return PTR_ERR(chacha20);
+       }
+
+       ret = crypto_skcipher_setkey(&chacha20->base,
+                                    (void *) key, sizeof(*key));
+       if (ret) {
+               pr_err("crypto_skcipher_setkey() error: %i", ret);
+               goto err;
+       }
+
+       do_encrypt(chacha20, nonce, buf, len);
+err:
+       crypto_free_sync_skcipher(chacha20);
+       return ret;
+}
+
+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+                        struct nonce nonce)
+{
+       u8 key[POLY1305_KEY_SIZE];
+
+       nonce.d[3] ^= BCH_NONCE_POLY;
+
+       memset(key, 0, sizeof(key));
+       do_encrypt(c->chacha20, nonce, key, sizeof(key));
+
+       desc->tfm = c->poly1305;
+       crypto_shash_init(desc);
+       crypto_shash_update(desc, key, sizeof(key));
+}
+
+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
+                             struct nonce nonce, const void *data, size_t len)
+{
+       switch (type) {
+       case BCH_CSUM_NONE:
+       case BCH_CSUM_CRC32C_NONZERO:
+       case BCH_CSUM_CRC64_NONZERO:
+       case BCH_CSUM_CRC32C:
+       case BCH_CSUM_CRC64: {
+               u64 crc = bch2_checksum_init(type);
+
+               crc = bch2_checksum_update(type, crc, data, len);
+               crc = bch2_checksum_final(type, crc);
+
+               return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+       }
+
+       case BCH_CSUM_CHACHA20_POLY1305_80:
+       case BCH_CSUM_CHACHA20_POLY1305_128: {
+               SHASH_DESC_ON_STACK(desc, c->poly1305);
+               u8 digest[POLY1305_DIGEST_SIZE];
+               struct bch_csum ret = { 0 };
+
+               gen_poly_key(c, desc, nonce);
+
+               crypto_shash_update(desc, data, len);
+               crypto_shash_final(desc, digest);
+
+               memcpy(&ret, digest, bch_crc_bytes[type]);
+               return ret;
+       }
+       default:
+               BUG();
+       }
+}
+
+void bch2_encrypt(struct bch_fs *c, unsigned type,
+                 struct nonce nonce, void *data, size_t len)
+{
+       if (!bch2_csum_type_is_encryption(type))
+               return;
+
+       do_encrypt(c->chacha20, nonce, data, len);
+}
+
+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
+                                          struct nonce nonce, struct bio *bio,
+                                          struct bvec_iter *iter)
+{
+       struct bio_vec bv;
+
+       switch (type) {
+       case BCH_CSUM_NONE:
+               return (struct bch_csum) { 0 };
+       case BCH_CSUM_CRC32C_NONZERO:
+       case BCH_CSUM_CRC64_NONZERO:
+       case BCH_CSUM_CRC32C:
+       case BCH_CSUM_CRC64: {
+               u64 crc = bch2_checksum_init(type);
+
+#ifdef CONFIG_HIGHMEM
+               __bio_for_each_segment(bv, bio, *iter, *iter) {
+                       void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+                       crc = bch2_checksum_update(type,
+                               crc, p, bv.bv_len);
+                       kunmap_atomic(p);
+               }
+#else
+               __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+                       crc = bch2_checksum_update(type, crc,
+                               page_address(bv.bv_page) + bv.bv_offset,
+                               bv.bv_len);
+#endif
+               crc = bch2_checksum_final(type, crc);
+               return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+       }
+
+       case BCH_CSUM_CHACHA20_POLY1305_80:
+       case BCH_CSUM_CHACHA20_POLY1305_128: {
+               SHASH_DESC_ON_STACK(desc, c->poly1305);
+               u8 digest[POLY1305_DIGEST_SIZE];
+               struct bch_csum ret = { 0 };
+
+               gen_poly_key(c, desc, nonce);
+
+#ifdef CONFIG_HIGHMEM
+               __bio_for_each_segment(bv, bio, *iter, *iter) {
+                       void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+                       crypto_shash_update(desc, p, bv.bv_len);
+                       kunmap_atomic(p);
+               }
+#else
+               __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+                       crypto_shash_update(desc,
+                               page_address(bv.bv_page) + bv.bv_offset,
+                               bv.bv_len);
+#endif
+               crypto_shash_final(desc, digest);
+
+               memcpy(&ret, digest, bch_crc_bytes[type]);
+               return ret;
+       }
+       default:
+               BUG();
+       }
+}
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
+                                 struct nonce nonce, struct bio *bio)
+{
+       struct bvec_iter iter = bio->bi_iter;
+
+       return __bch2_checksum_bio(c, type, nonce, bio, &iter);
+}
+
+void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+                     struct nonce nonce, struct bio *bio)
+{
+       struct bio_vec bv;
+       struct bvec_iter iter;
+       struct scatterlist sgl[16], *sg = sgl;
+       size_t bytes = 0;
+
+       if (!bch2_csum_type_is_encryption(type))
+               return;
+
+       sg_init_table(sgl, ARRAY_SIZE(sgl));
+
+       bio_for_each_segment(bv, bio, iter) {
+               if (sg == sgl + ARRAY_SIZE(sgl)) {
+                       sg_mark_end(sg - 1);
+                       do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+                       nonce = nonce_add(nonce, bytes);
+                       bytes = 0;
+
+                       sg_init_table(sgl, ARRAY_SIZE(sgl));
+                       sg = sgl;
+               }
+
+               sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
+               bytes += bv.bv_len;
+       }
+
+       sg_mark_end(sg - 1);
+       do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+}
+
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+       switch (type) {
+       case BCH_CSUM_NONE:
+       case BCH_CSUM_CRC32C:
+       case BCH_CSUM_CRC64:
+               return true;
+       default:
+               return false;
+       }
+}
+
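+/*
+ * The checksums bch2_checksum_mergeable() allows are linear over GF(2) and
+ * use a zero initial value with no final inversion: for such a CRC,
+ * crc(A || B) == crc(A || zeroes(len(B))) ^ crc(B), because a zero register
+ * is left unchanged by leading zero bytes.  bch2_checksum_merge() therefore
+ * extends @a over b_len bytes of zeroes (via ZERO_PAGE) and xors in @b.
+ */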
+static struct bch_csum bch2_checksum_merge(unsigned type,
+                                          struct bch_csum a,
+                                          struct bch_csum b, size_t b_len)
+{
+       BUG_ON(!bch2_checksum_mergeable(type));
+
+       while (b_len) {
+               unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
+
+               a.lo = bch2_checksum_update(type, a.lo,
+                               page_address(ZERO_PAGE(0)), b);
+               b_len -= b;
+       }
+
+       a.lo ^= b.lo;
+       a.hi ^= b.hi;
+       return a;
+}
+
+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
+                       struct bversion version,
+                       struct bch_extent_crc_unpacked crc_old,
+                       struct bch_extent_crc_unpacked *crc_a,
+                       struct bch_extent_crc_unpacked *crc_b,
+                       unsigned len_a, unsigned len_b,
+                       unsigned new_csum_type)
+{
+       struct bvec_iter iter = bio->bi_iter;
+       struct nonce nonce = extent_nonce(version, crc_old);
+       struct bch_csum merged = { 0 };
+       struct crc_split {
+               struct bch_extent_crc_unpacked  *crc;
+               unsigned                        len;
+               unsigned                        csum_type;
+               struct bch_csum                 csum;
+       } splits[3] = {
+               { crc_a, len_a, new_csum_type },
+               { crc_b, len_b, new_csum_type },
+               { NULL,  bio_sectors(bio) - len_a - len_b, new_csum_type },
+       }, *i;
+       bool mergeable = crc_old.csum_type == new_csum_type &&
+               bch2_checksum_mergeable(new_csum_type);
+       unsigned crc_nonce = crc_old.nonce;
+
+       BUG_ON(len_a + len_b > bio_sectors(bio));
+       BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
+       BUG_ON(crc_old.compression_type);
+       BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
+              bch2_csum_type_is_encryption(new_csum_type));
+
+       for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+               iter.bi_size = i->len << 9;
+               if (mergeable || i->crc)
+                       i->csum = __bch2_checksum_bio(c, i->csum_type,
+                                                     nonce, bio, &iter);
+               else
+                       bio_advance_iter(bio, &iter, i->len << 9);
+               nonce = nonce_add(nonce, i->len << 9);
+       }
+
+       if (mergeable)
+               for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
+                       merged = bch2_checksum_merge(new_csum_type, merged,
+                                                    i->csum, i->len << 9);
+       else
+               merged = bch2_checksum_bio(c, crc_old.csum_type,
+                               extent_nonce(version, crc_old), bio);
+
+       if (bch2_crc_cmp(merged, crc_old.csum))
+               return -EIO;
+
+       for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+               if (i->crc)
+                       *i->crc = (struct bch_extent_crc_unpacked) {
+                               .csum_type              = i->csum_type,
+                               .compressed_size        = i->len,
+                               .uncompressed_size      = i->len,
+                               .offset                 = 0,
+                               .live_size              = i->len,
+                               .nonce                  = crc_nonce,
+                               .csum                   = i->csum,
+                       };
+
+               if (bch2_csum_type_is_encryption(new_csum_type))
+                       crc_nonce += i->len;
+       }
+
+       return 0;
+}
+
+#ifdef __KERNEL__
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+       char key_description[60];
+       struct key *keyring_key;
+       const struct user_key_payload *ukp;
+       int ret;
+
+       snprintf(key_description, sizeof(key_description),
+                "bcachefs:%pUb", &sb->user_uuid);
+
+       keyring_key = request_key(&key_type_logon, key_description, NULL);
+       if (IS_ERR(keyring_key))
+               return PTR_ERR(keyring_key);
+
+       down_read(&keyring_key->sem);
+       ukp = dereference_key_locked(keyring_key);
+       if (ukp->datalen == sizeof(*key)) {
+               memcpy(key, ukp->data, ukp->datalen);
+               ret = 0;
+       } else {
+               ret = -EINVAL;
+       }
+       up_read(&keyring_key->sem);
+       key_put(keyring_key);
+
+       return ret;
+}
+#else
+#include <keyutils.h>
+#include <uuid/uuid.h>
+
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+       key_serial_t key_id;
+       char key_description[60];
+       char uuid[40];
+
+       uuid_unparse_lower(sb->user_uuid.b, uuid);
+       sprintf(key_description, "bcachefs:%s", uuid);
+
+       key_id = request_key("user", key_description, NULL,
+                            KEY_SPEC_USER_KEYRING);
+       if (key_id < 0)
+               return -errno;
+
+       if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
+               return -1;
+
+       return 0;
+}
+#endif
+
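+/*
+ * The filesystem key lives in the superblock as a struct bch_encrypted_key.
+ * When a passphrase is in use it is stored wrapped - xored with a ChaCha20
+ * keystream derived from the user's key; since ChaCha20 is a stream cipher,
+ * bch2_chacha_encrypt_key() performs both wrapping and unwrapping, and a
+ * successful unwrap is detected by the BCH_KEY_MAGIC check in
+ * bch2_key_is_encrypted().
+ */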
+int bch2_decrypt_sb_key(struct bch_fs *c,
+                       struct bch_sb_field_crypt *crypt,
+                       struct bch_key *key)
+{
+       struct bch_encrypted_key sb_key = crypt->key;
+       struct bch_key user_key;
+       int ret = 0;
+
+       /* is key encrypted? */
+       if (!bch2_key_is_encrypted(&sb_key))
+               goto out;
+
+       ret = bch2_request_key(c->disk_sb.sb, &user_key);
+       if (ret) {
+               bch_err(c, "error requesting encryption key: %i", ret);
+               goto err;
+       }
+
+       /* decrypt real key: */
+       ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+                            &sb_key, sizeof(sb_key));
+       if (ret)
+               goto err;
+
+       if (bch2_key_is_encrypted(&sb_key)) {
+               bch_err(c, "incorrect encryption key");
+               ret = -EINVAL;
+               goto err;
+       }
+out:
+       *key = sb_key.key;
+err:
+       memzero_explicit(&sb_key, sizeof(sb_key));
+       memzero_explicit(&user_key, sizeof(user_key));
+       return ret;
+}
+
+static int bch2_alloc_ciphers(struct bch_fs *c)
+{
+       if (!c->chacha20)
+               c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+       if (IS_ERR(c->chacha20)) {
+               bch_err(c, "error requesting chacha20 module: %li",
+                       PTR_ERR(c->chacha20));
+               return PTR_ERR(c->chacha20);
+       }
+
+       if (!c->poly1305)
+               c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+       if (IS_ERR(c->poly1305)) {
+               bch_err(c, "error requesting poly1305 module: %li",
+                       PTR_ERR(c->poly1305));
+               return PTR_ERR(c->poly1305);
+       }
+
+       return 0;
+}
+
+int bch2_disable_encryption(struct bch_fs *c)
+{
+       struct bch_sb_field_crypt *crypt;
+       struct bch_key key;
+       int ret = -EINVAL;
+
+       mutex_lock(&c->sb_lock);
+
+       crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+       if (!crypt)
+               goto out;
+
+       /* is key encrypted? */
+       ret = 0;
+       if (bch2_key_is_encrypted(&crypt->key))
+               goto out;
+
+       ret = bch2_decrypt_sb_key(c, crypt, &key);
+       if (ret)
+               goto out;
+
+       crypt->key.magic        = BCH_KEY_MAGIC;
+       crypt->key.key          = key;
+
+       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
+       bch2_write_super(c);
+out:
+       mutex_unlock(&c->sb_lock);
+
+       return ret;
+}
+
+int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+{
+       struct bch_encrypted_key key;
+       struct bch_key user_key;
+       struct bch_sb_field_crypt *crypt;
+       int ret = -EINVAL;
+
+       mutex_lock(&c->sb_lock);
+
+       /* Do we already have an encryption key? */
+       if (bch2_sb_get_crypt(c->disk_sb.sb))
+               goto err;
+
+       ret = bch2_alloc_ciphers(c);
+       if (ret)
+               goto err;
+
+       key.magic = BCH_KEY_MAGIC;
+       get_random_bytes(&key.key, sizeof(key.key));
+
+       if (keyed) {
+               ret = bch2_request_key(c->disk_sb.sb, &user_key);
+               if (ret) {
+                       bch_err(c, "error requesting encryption key: %i", ret);
+                       goto err;
+               }
+
+               ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+                                             &key, sizeof(key));
+               if (ret)
+                       goto err;
+       }
+
+       ret = crypto_skcipher_setkey(&c->chacha20->base,
+                       (void *) &key.key, sizeof(key.key));
+       if (ret)
+               goto err;
+
+       crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
+       if (!crypt) {
+               ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
+               goto err;
+       }
+
+       crypt->key = key;
+
+       /* write superblock */
+       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
+       bch2_write_super(c);
+err:
+       mutex_unlock(&c->sb_lock);
+       memzero_explicit(&user_key, sizeof(user_key));
+       memzero_explicit(&key, sizeof(key));
+       return ret;
+}
+
+void bch2_fs_encryption_exit(struct bch_fs *c)
+{
+       if (!IS_ERR_OR_NULL(c->poly1305))
+               crypto_free_shash(c->poly1305);
+       if (!IS_ERR_OR_NULL(c->chacha20))
+               crypto_free_sync_skcipher(c->chacha20);
+       if (!IS_ERR_OR_NULL(c->sha256))
+               crypto_free_shash(c->sha256);
+}
+
+int bch2_fs_encryption_init(struct bch_fs *c)
+{
+       struct bch_sb_field_crypt *crypt;
+       struct bch_key key;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
+       c->sha256 = crypto_alloc_shash("sha256", 0, 0);
+       if (IS_ERR(c->sha256)) {
+               bch_err(c, "error requesting sha256 module");
+               ret = PTR_ERR(c->sha256);
+               goto out;
+       }
+
+       crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+       if (!crypt)
+               goto out;
+
+       ret = bch2_alloc_ciphers(c);
+       if (ret)
+               goto out;
+
+       ret = bch2_decrypt_sb_key(c, crypt, &key);
+       if (ret)
+               goto out;
+
+       ret = crypto_skcipher_setkey(&c->chacha20->base,
+                       (void *) &key.key, sizeof(key.key));
+       if (ret)
+               goto out;
+out:
+       memzero_explicit(&key, sizeof(key));
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
+}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
new file mode 100644 (file)
index 0000000..42c8646
--- /dev/null
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHECKSUM_H
+#define _BCACHEFS_CHECKSUM_H
+
+#include "bcachefs.h"
+#include "extents_types.h"
+#include "super-io.h"
+
+#include <crypto/chacha.h>
+
+u64 bch2_crc64_update(u64, const void *, size_t);
+
+#define BCH_NONCE_EXTENT       cpu_to_le32(1 << 28)
+#define BCH_NONCE_BTREE                cpu_to_le32(2 << 28)
+#define BCH_NONCE_JOURNAL      cpu_to_le32(3 << 28)
+#define BCH_NONCE_PRIO         cpu_to_le32(4 << 28)
+#define BCH_NONCE_POLY         cpu_to_le32(1 << 31)
+
+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
+                            const void *, size_t);
+
+/*
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
+ */
+#define csum_vstruct(_c, _type, _nonce, _i)                            \
+({                                                                     \
+       const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
+       const void *end = vstruct_end(_i);                              \
+                                                                       \
+       bch2_checksum(_c, _type, _nonce, start, end - start);           \
+})
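+
+/*
+ * For example, verifying a structure whose first field is "struct bch_csum
+ * csum" might look like this (sketch only - the real callers are the sb,
+ * btree and journal IO paths, with the appropriate csum type and nonce):
+ *
+ *     struct bch_csum got = csum_vstruct(c, BCH_CSUM_CRC32C, null_nonce(), i);
+ *
+ *     if (bch2_crc_cmp(got, i->csum))
+ *             return -EIO;
+ */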
+
+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+int bch2_request_key(struct bch_sb *, struct bch_key *);
+
+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+                void *data, size_t);
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
+                                 struct nonce, struct bio *);
+
+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
+                       struct bch_extent_crc_unpacked,
+                       struct bch_extent_crc_unpacked *,
+                       struct bch_extent_crc_unpacked *,
+                       unsigned, unsigned, unsigned);
+
+void bch2_encrypt_bio(struct bch_fs *, unsigned,
+                   struct nonce, struct bio *);
+
+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
+                       struct bch_key *);
+
+int bch2_disable_encryption(struct bch_fs *);
+int bch2_enable_encryption(struct bch_fs *, bool);
+
+void bch2_fs_encryption_exit(struct bch_fs *);
+int bch2_fs_encryption_init(struct bch_fs *);
+
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+                                                      bool data)
+{
+       switch (type) {
+       case BCH_CSUM_OPT_NONE:
+               return BCH_CSUM_NONE;
+       case BCH_CSUM_OPT_CRC32C:
+               return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
+       case BCH_CSUM_OPT_CRC64:
+               return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
+       default:
+               BUG();
+       }
+}
+
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+                                                        unsigned opt)
+{
+       if (c->sb.encryption_type)
+               return c->opts.wide_macs
+                       ? BCH_CSUM_CHACHA20_POLY1305_128
+                       : BCH_CSUM_CHACHA20_POLY1305_80;
+
+       return bch2_csum_opt_to_type(opt, true);
+}
+
+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
+{
+       if (c->sb.encryption_type)
+               return BCH_CSUM_CHACHA20_POLY1305_128;
+
+       return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
+}
+
+static const unsigned bch2_compression_opt_to_type[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
+       BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
+                                          unsigned type)
+{
+       if (type >= BCH_CSUM_NR)
+               return false;
+
+       if (bch2_csum_type_is_encryption(type) && !c->chacha20)
+               return false;
+
+       return true;
+}
+
+/* returns true if not equal */
+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+       /*
+        * XXX: need some way of preventing the compiler from optimizing this
+        * into a form that isn't constant time..
+        */
+       return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
+}
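+
+/*
+ * One possible way to get the constant-time guarantee mentioned above would
+ * be crypto_memneq() from <crypto/algapi.h>, which is defined not to leak
+ * timing information, e.g.:
+ *
+ *     return crypto_memneq(&l, &r, sizeof(l));
+ */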
+
+/* for skipping ahead and encrypting/decrypting at an offset: */
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+       EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
+
+       le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
+       return nonce;
+}
+
+static inline struct nonce null_nonce(void)
+{
+       struct nonce ret;
+
+       memset(&ret, 0, sizeof(ret));
+       return ret;
+}
+
+static inline struct nonce extent_nonce(struct bversion version,
+                                       struct bch_extent_crc_unpacked crc)
+{
+       unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
+       struct nonce nonce = (struct nonce) {{
+               [0] = cpu_to_le32(size << 22),
+               [1] = cpu_to_le32(version.lo),
+               [2] = cpu_to_le32(version.lo >> 32),
+               [3] = cpu_to_le32(version.hi |
+                                 (crc.compression_type << 24)) ^ BCH_NONCE_EXTENT,
+       }};
+
+       return nonce_add(nonce, crc.nonce << 9);
+}
+
+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
+{
+       return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
+}
+
+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
+{
+       __le64 magic = __bch2_sb_magic(sb);
+
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = 0,
+               [2] = ((__le32 *) &magic)[0],
+               [3] = ((__le32 *) &magic)[1],
+       }};
+}
+
+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
+{
+       __le64 magic = bch2_sb_magic(c);
+
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = 0,
+               [2] = ((__le32 *) &magic)[0],
+               [3] = ((__le32 *) &magic)[1],
+       }};
+}
+
+#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
new file mode 100644 (file)
index 0000000..96f8030
--- /dev/null
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "clock.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+
+static inline long io_timer_cmp(io_timer_heap *h,
+                               struct io_timer *l,
+                               struct io_timer *r)
+{
+       return l->expire - r->expire;
+}
+
+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+       size_t i;
+
+       spin_lock(&clock->timer_lock);
+       for (i = 0; i < clock->timers.used; i++)
+               if (clock->timers.data[i] == timer)
+                       goto out;
+
+       BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
+out:
+       spin_unlock(&clock->timer_lock);
+}
+
+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+       size_t i;
+
+       spin_lock(&clock->timer_lock);
+
+       for (i = 0; i < clock->timers.used; i++)
+               if (clock->timers.data[i] == timer) {
+                       heap_del(&clock->timers, i, io_timer_cmp);
+                       break;
+               }
+
+       spin_unlock(&clock->timer_lock);
+}
+
+struct io_clock_wait {
+       struct io_timer         io_timer;
+       struct timer_list       cpu_timer;
+       struct task_struct      *task;
+       int                     expired;
+};
+
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+       struct io_clock_wait *wait = container_of(timer,
+                               struct io_clock_wait, io_timer);
+
+       wait->expired = 1;
+       wake_up_process(wait->task);
+}
+
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+       struct io_clock_wait *wait = container_of(timer,
+                               struct io_clock_wait, cpu_timer);
+
+       wait->expired = 1;
+       wake_up_process(wait->task);
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+       struct io_clock_wait wait;
+
+       /* XXX: calculate sleep time rigorously */
+       wait.io_timer.expire    = until;
+       wait.io_timer.fn        = io_clock_wait_fn;
+       wait.task               = current;
+       wait.expired            = 0;
+       bch2_io_timer_add(clock, &wait.io_timer);
+
+       schedule();
+
+       bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+                               unsigned long io_until,
+                               unsigned long cpu_timeout)
+{
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
+       struct io_clock_wait wait;
+
+       wait.io_timer.expire    = io_until;
+       wait.io_timer.fn        = io_clock_wait_fn;
+       wait.task               = current;
+       wait.expired            = 0;
+       bch2_io_timer_add(clock, &wait.io_timer);
+
+       timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+       if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+               mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread && kthread_should_stop())
+                       break;
+
+               if (wait.expired)
+                       break;
+
+               schedule();
+               try_to_freeze();
+       }
+
+       __set_current_state(TASK_RUNNING);
+       del_timer_sync(&wait.cpu_timer);
+       destroy_timer_on_stack(&wait.cpu_timer);
+       bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+                                         unsigned long now)
+{
+       struct io_timer *ret = NULL;
+
+       spin_lock(&clock->timer_lock);
+
+       if (clock->timers.used &&
+           time_after_eq(now, clock->timers.data[0]->expire))
+               heap_pop(&clock->timers, ret, io_timer_cmp);
+
+       spin_unlock(&clock->timer_lock);
+
+       return ret;
+}
+
+void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
+{
+       struct io_clock *clock = &c->io_clock[rw];
+       struct io_timer *timer;
+       unsigned long now;
+
+       /* Batch up to IO_CLOCK_PCPU_SECTORS of IO in the percpu counter */
+       preempt_disable();
+
+       if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
+                  IO_CLOCK_PCPU_SECTORS)) {
+               preempt_enable();
+               return;
+       }
+
+       sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
+       preempt_enable();
+       now = atomic_long_add_return(sectors, &clock->now);
+
+       while ((timer = get_expired_timer(clock, now)))
+               timer->fn(timer);
+}
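+
+/*
+ * Typical use (illustrative sketch only): a background thread that wants to
+ * run again after a given amount of IO, rather than wall-clock time, can do
+ *
+ *     unsigned long now = atomic_long_read(&c->io_clock[WRITE].now);
+ *
+ *     bch2_kthread_io_clock_wait(&c->io_clock[WRITE], now + sectors,
+ *                                MAX_SCHEDULE_TIMEOUT);
+ *
+ * and will be woken once writes have advanced the clock past the target,
+ * the cpu timeout fires, or the kthread is asked to stop.
+ */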
+
+void bch2_io_clock_exit(struct io_clock *clock)
+{
+       free_heap(&clock->timers);
+       free_percpu(clock->pcpu_buf);
+}
+
+int bch2_io_clock_init(struct io_clock *clock)
+{
+       atomic_long_set(&clock->now, 0);
+       spin_lock_init(&clock->timer_lock);
+
+       clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+       if (!clock->pcpu_buf)
+               return -ENOMEM;
+
+       if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+               return -ENOMEM;
+
+       return 0;
+}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
new file mode 100644 (file)
index 0000000..5cb043c
--- /dev/null
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_H
+#define _BCACHEFS_CLOCK_H
+
+void bch2_io_timer_add(struct io_clock *, struct io_timer *);
+void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+                               unsigned long);
+void bch2_increment_clock(struct bch_fs *, unsigned, int);
+
+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+
+#define bch2_kthread_wait_event_ioclock_timeout(wq, condition, timeout)\
+({                                                                     \
+       long __ret = timeout;                                           \
+       might_sleep();                                                  \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __wait_event_timeout(wq, condition, timeout);   \
+       __ret;                                                          \
+})
+
+void bch2_io_clock_exit(struct io_clock *);
+int bch2_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
new file mode 100644 (file)
index 0000000..2b5e499
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_TYPES_H
+#define _BCACHEFS_CLOCK_TYPES_H
+
+#include "util.h"
+
+#define NR_IO_TIMERS           (BCH_SB_MEMBERS_MAX * 3)
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+struct io_timer {
+       io_timer_fn             fn;
+       unsigned long           expire;
+};
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS  128
+
+typedef HEAP(struct io_timer *)        io_timer_heap;
+
+struct io_clock {
+       atomic_long_t           now;
+       u16 __percpu            *pcpu_buf;
+
+       spinlock_t              timer_lock;
+       io_timer_heap           timers;
+};
+
+#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
new file mode 100644 (file)
index 0000000..42ae4cf
--- /dev/null
@@ -0,0 +1,621 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "compress.h"
+#include "extents.h"
+#include "io.h"
+#include "super-io.h"
+
+#include <linux/lz4.h>
+#include <linux/zlib.h>
+#include <linux/zstd.h>
+
+/* Bounce buffer: */
+struct bbuf {
+       void            *b;
+       enum {
+               BB_NONE,
+               BB_VMAP,
+               BB_KMALLOC,
+               BB_VMALLOC,
+               BB_MEMPOOL,
+       }               type;
+       int             rw;
+};
+
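+/*
+ * Allocation strategy for bounce buffers, cheapest first: try kmalloc()
+ * without warning, then a non-blocking mempool_alloc(), then vmalloc(), and
+ * finally fall back to a blocking mempool_alloc(), which will eventually
+ * succeed.
+ */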
+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
+{
+       void *b;
+
+       BUG_ON(size > c->sb.encoded_extent_max << 9);
+
+       b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
+       if (b)
+               return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
+
+       b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
+       b = b ? page_address(b) : NULL;
+       if (b)
+               return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+       b = vmalloc(size);
+       if (b)
+               return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
+
+       b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
+       b = b ? page_address(b) : NULL;
+       if (b)
+               return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+       BUG();
+}
+
+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
+                                      struct bvec_iter start, int rw)
+{
+       struct bbuf ret;
+       struct bio_vec bv;
+       struct bvec_iter iter;
+       unsigned nr_pages = 0;
+       struct page *stack_pages[16];
+       struct page **pages = NULL;
+       bool first = true;
+       unsigned prev_end = PAGE_SIZE;
+       void *data;
+
+       BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
+
+#ifndef CONFIG_HIGHMEM
+       __bio_for_each_contig_segment(bv, bio, iter, start) {
+               if (bv.bv_len == start.bi_size)
+                       return (struct bbuf) {
+                               .b = page_address(bv.bv_page) + bv.bv_offset,
+                               .type = BB_NONE, .rw = rw
+                       };
+       }
+#endif
+       __bio_for_each_segment(bv, bio, iter, start) {
+               if ((!first && bv.bv_offset) ||
+                   prev_end != PAGE_SIZE)
+                       goto bounce;
+
+               prev_end = bv.bv_offset + bv.bv_len;
+               nr_pages++;
+       }
+
+       BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
+
+       pages = nr_pages > ARRAY_SIZE(stack_pages)
+               ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
+               : stack_pages;
+       if (!pages)
+               goto bounce;
+
+       nr_pages = 0;
+       __bio_for_each_segment(bv, bio, iter, start)
+               pages[nr_pages++] = bv.bv_page;
+
+       data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+       if (pages != stack_pages)
+               kfree(pages);
+
+       if (data)
+               return (struct bbuf) {
+                       .b = data + bio_iter_offset(bio, start),
+                       .type = BB_VMAP, .rw = rw
+               };
+bounce:
+       ret = __bounce_alloc(c, start.bi_size, rw);
+
+       if (rw == READ)
+               memcpy_from_bio(ret.b, bio, start);
+
+       return ret;
+}
+
+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
+{
+       return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
+}
+
+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
+{
+       switch (buf.type) {
+       case BB_NONE:
+               break;
+       case BB_VMAP:
+               vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
+               break;
+       case BB_KMALLOC:
+               kfree(buf.b);
+               break;
+       case BB_VMALLOC:
+               vfree(buf.b);
+               break;
+       case BB_MEMPOOL:
+               mempool_free(virt_to_page(buf.b),
+                            &c->compression_bounce[buf.rw]);
+               break;
+       }
+}
+
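+/*
+ * The kernel zlib requires the caller to supply a workspace via z_stream;
+ * outside of __KERNEL__ (presumably the userspace tools build of this file)
+ * that member doesn't exist, so this helper compiles away:
+ */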
+static inline void zlib_set_workspace(z_stream *strm, void *workspace)
+{
+#ifdef __KERNEL__
+       strm->workspace = workspace;
+#endif
+}
+
+static int __bio_uncompress(struct bch_fs *c, struct bio *src,
+                           void *dst_data, struct bch_extent_crc_unpacked crc)
+{
+       struct bbuf src_data = { NULL };
+       size_t src_len = src->bi_iter.bi_size;
+       size_t dst_len = crc.uncompressed_size << 9;
+       void *workspace;
+       int ret;
+
+       src_data = bio_map_or_bounce(c, src, READ);
+
+       switch (crc.compression_type) {
+       case BCH_COMPRESSION_LZ4_OLD:
+       case BCH_COMPRESSION_LZ4:
+               ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+                                                 src_len, dst_len, dst_len);
+               if (ret != dst_len)
+                       goto err;
+               break;
+       case BCH_COMPRESSION_GZIP: {
+               z_stream strm = {
+                       .next_in        = src_data.b,
+                       .avail_in       = src_len,
+                       .next_out       = dst_data,
+                       .avail_out      = dst_len,
+               };
+
+               workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+
+               zlib_set_workspace(&strm, workspace);
+               zlib_inflateInit2(&strm, -MAX_WBITS);
+               ret = zlib_inflate(&strm, Z_FINISH);
+
+               mempool_free(workspace, &c->decompress_workspace);
+
+               if (ret != Z_STREAM_END)
+                       goto err;
+               break;
+       }
+       case BCH_COMPRESSION_ZSTD: {
+               ZSTD_DCtx *ctx;
+               size_t len;
+
+               workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+               ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
+
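+               /* the compress path stores the zstd payload size as a __le32 header: */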
+               src_len = le32_to_cpup(src_data.b);
+
+               len = zstd_decompress_dctx(ctx,
+                               dst_data,       dst_len,
+                               src_data.b + 4, src_len);
+
+               mempool_free(workspace, &c->decompress_workspace);
+
+               if (len != dst_len)
+                       goto err;
+               break;
+       }
+       default:
+               BUG();
+       }
+       ret = 0;
+out:
+       bio_unmap_or_unbounce(c, src_data);
+       return ret;
+err:
+       ret = -EIO;
+       goto out;
+}
+
+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
+                               struct bch_extent_crc_unpacked *crc)
+{
+       struct bbuf data = { NULL };
+       size_t dst_len = crc->uncompressed_size << 9;
+
+       /* bio must own its pages: */
+       BUG_ON(!bio->bi_vcnt);
+       BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
+
+       if (crc->uncompressed_size      > c->sb.encoded_extent_max ||
+           crc->compressed_size        > c->sb.encoded_extent_max) {
+               bch_err(c, "error rewriting existing data: extent too big");
+               return -EIO;
+       }
+
+       data = __bounce_alloc(c, dst_len, WRITE);
+
+       if (__bio_uncompress(c, bio, data.b, *crc)) {
+               bch_err(c, "error rewriting existing data: decompression error");
+               bio_unmap_or_unbounce(c, data);
+               return -EIO;
+       }
+
+       /*
+        * might have to free existing pages and retry allocation from mempool -
+        * do this _after_ decompressing:
+        */
+       bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
+
+       memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
+
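+       /* the extent is now stored uncompressed, with no checksum: */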
+       crc->csum_type          = 0;
+       crc->compression_type   = 0;
+       crc->compressed_size    = crc->live_size;
+       crc->uncompressed_size  = crc->live_size;
+       crc->offset             = 0;
+       crc->csum               = (struct bch_csum) { 0, 0 };
+
+       bio_unmap_or_unbounce(c, data);
+       return 0;
+}
+
+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
+                      struct bio *dst, struct bvec_iter dst_iter,
+                      struct bch_extent_crc_unpacked crc)
+{
+       struct bbuf dst_data = { NULL };
+       size_t dst_len = crc.uncompressed_size << 9;
+       int ret = -ENOMEM;
+
+       if (crc.uncompressed_size       > c->sb.encoded_extent_max ||
+           crc.compressed_size         > c->sb.encoded_extent_max)
+               return -EIO;
+
+       dst_data = dst_len == dst_iter.bi_size
+               ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
+               : __bounce_alloc(c, dst_len, WRITE);
+
+       ret = __bio_uncompress(c, src, dst_data.b, crc);
+       if (ret)
+               goto err;
+
+       if (dst_data.type != BB_NONE)
+               memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
+err:
+       bio_unmap_or_unbounce(c, dst_data);
+       return ret;
+}
+
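+/*
+ * Returns the number of bytes of compressed output on success, 0 on failure;
+ * the LZ4 path can also return a negative value -n as a hint that only n bytes
+ * of input would have fit in the output buffer:
+ */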
+static int attempt_compress(struct bch_fs *c,
+                           void *workspace,
+                           void *dst, size_t dst_len,
+                           void *src, size_t src_len,
+                           unsigned compression_type)
+{
+       switch (compression_type) {
+       case BCH_COMPRESSION_LZ4: {
+               int len = src_len;
+               int ret = LZ4_compress_destSize(
+                               src,            dst,
+                               &len,           dst_len,
+                               workspace);
+
+               if (len < src_len)
+                       return -len;
+
+               return ret;
+       }
+       case BCH_COMPRESSION_GZIP: {
+               z_stream strm = {
+                       .next_in        = src,
+                       .avail_in       = src_len,
+                       .next_out       = dst,
+                       .avail_out      = dst_len,
+               };
+
+               zlib_set_workspace(&strm, workspace);
+               zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                                 Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+                                 Z_DEFAULT_STRATEGY);
+
+               if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+                       return 0;
+
+               if (zlib_deflateEnd(&strm) != Z_OK)
+                       return 0;
+
+               return strm.total_out;
+       }
+       case BCH_COMPRESSION_ZSTD: {
+               ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
+                       zstd_cctx_workspace_bound(&c->zstd_params.cParams));
+
+               size_t len = zstd_compress_cctx(ctx,
+                               dst + 4,        dst_len - 4,
+                               src,            src_len,
+                               &c->zstd_params);
+               if (zstd_is_error(len))
+                       return 0;
+
+               *((__le32 *) dst) = cpu_to_le32(len);
+               return len + 4;
+       }
+       default:
+               BUG();
+       }
+}
+
+static unsigned __bio_compress(struct bch_fs *c,
+                              struct bio *dst, size_t *dst_len,
+                              struct bio *src, size_t *src_len,
+                              unsigned compression_type)
+{
+       struct bbuf src_data = { NULL }, dst_data = { NULL };
+       void *workspace;
+       unsigned pad;
+       int ret = 0;
+
+       BUG_ON(compression_type >= BCH_COMPRESSION_NR);
+       BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
+       /* If it's only one block, don't bother trying to compress: */
+       if (bio_sectors(src) <= c->opts.block_size)
+               return 0;
+
+       dst_data = bio_map_or_bounce(c, dst, WRITE);
+       src_data = bio_map_or_bounce(c, src, READ);
+
+       workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
+
+       *src_len = src->bi_iter.bi_size;
+       *dst_len = dst->bi_iter.bi_size;
+
+       /*
+        * XXX: this algorithm sucks when the compression code doesn't tell us
+        * how much input would have fit, the way LZ4 does:
+        */
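+       /*
+        * For example (hypothetical sizes): with *src_len == 256k, *dst_len ==
+        * 128k and no hint from the compressor, a failed attempt shrinks
+        * *src_len to 256k - (256k - 128k) / 2 = 192k (rounded down to a block
+        * boundary), and we keep halving the gap until the output fits,
+        * *src_len drops to *dst_len, or we're down to a single block:
+        */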
+       while (1) {
+               if (*src_len <= block_bytes(c)) {
+                       ret = -1;
+                       break;
+               }
+
+               ret = attempt_compress(c, workspace,
+                                      dst_data.b,      *dst_len,
+                                      src_data.b,      *src_len,
+                                      compression_type);
+               if (ret > 0) {
+                       *dst_len = ret;
+                       ret = 0;
+                       break;
+               }
+
+               /* Didn't fit: should we retry with a smaller amount?  */
+               if (*src_len <= *dst_len) {
+                       ret = -1;
+                       break;
+               }
+
+               /*
+                * If ret is negative, it's a hint as to how much data would fit
+                */
+               BUG_ON(-ret >= *src_len);
+
+               if (ret < 0)
+                       *src_len = -ret;
+               else
+                       *src_len -= (*src_len - *dst_len) / 2;
+               *src_len = round_down(*src_len, block_bytes(c));
+       }
+
+       mempool_free(workspace, &c->compress_workspace[compression_type]);
+
+       if (ret)
+               goto err;
+
+       /* Didn't get smaller: */
+       if (round_up(*dst_len, block_bytes(c)) >= *src_len)
+               goto err;
+
+       pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+       memset(dst_data.b + *dst_len, 0, pad);
+       *dst_len += pad;
+
+       if (dst_data.type != BB_NONE)
+               memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
+
+       BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+       BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+       BUG_ON(*dst_len & (block_bytes(c) - 1));
+       BUG_ON(*src_len & (block_bytes(c) - 1));
+out:
+       bio_unmap_or_unbounce(c, src_data);
+       bio_unmap_or_unbounce(c, dst_data);
+       return compression_type;
+err:
+       compression_type = 0;
+       goto out;
+}
+
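+/*
+ * Compress as much of @src as will fit in @dst, capped at the superblock's
+ * encoded_extent_max: returns the compression type actually used - 0 if the
+ * data didn't compress - and on success reports the bytes consumed and
+ * produced (both block aligned) via @src_len and @dst_len:
+ */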
+unsigned bch2_bio_compress(struct bch_fs *c,
+                          struct bio *dst, size_t *dst_len,
+                          struct bio *src, size_t *src_len,
+                          unsigned compression_type)
+{
+       unsigned orig_dst = dst->bi_iter.bi_size;
+       unsigned orig_src = src->bi_iter.bi_size;
+
+       /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
+       src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
+                                    c->sb.encoded_extent_max << 9);
+       /* Don't generate a bigger output than input: */
+       dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+       if (compression_type == BCH_COMPRESSION_LZ4_OLD)
+               compression_type = BCH_COMPRESSION_LZ4;
+
+       compression_type =
+               __bio_compress(c, dst, dst_len, src, src_len, compression_type);
+
+       dst->bi_iter.bi_size = orig_dst;
+       src->bi_iter.bi_size = orig_src;
+       return compression_type;
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
+
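+/*
+ * The x() expansion below needs a BCH_FEATURE_ constant for every compression
+ * option, including NONE, which has no real feature bit - give it a
+ * placeholder of 0 and drop it again afterwards:
+ */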
+#define BCH_FEATURE_NONE       0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+       BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+#undef BCH_FEATURE_NONE
+
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
+{
+       int ret = 0;
+
+       if ((c->sb.features & f) == f)
+               return 0;
+
+       mutex_lock(&c->sb_lock);
+
+       if ((c->sb.features & f) == f) {
+               mutex_unlock(&c->sb_lock);
+               return 0;
+       }
+
+       ret = __bch2_fs_compress_init(c, c->sb.features|f);
+       if (ret) {
+               mutex_unlock(&c->sb_lock);
+               return ret;
+       }
+
+       c->disk_sb.sb->features[0] |= cpu_to_le64(f);
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
+                                      unsigned compression_type)
+{
+       BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+       return compression_type
+               ? __bch2_check_set_has_compressed_data(c,
+                               1ULL << bch2_compression_opt_to_feature[compression_type])
+               : 0;
+}
+
+void bch2_fs_compress_exit(struct bch_fs *c)
+{
+       unsigned i;
+
+       mempool_exit(&c->decompress_workspace);
+       for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+               mempool_exit(&c->compress_workspace[i]);
+       mempool_exit(&c->compression_bounce[WRITE]);
+       mempool_exit(&c->compression_bounce[READ]);
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+{
+       size_t max_extent = c->sb.encoded_extent_max << 9;
+       size_t order = get_order(max_extent);
+       size_t decompress_workspace_size = 0;
+       bool decompress_workspace_needed;
+       ZSTD_parameters params = zstd_get_params(0, max_extent);
+       struct {
+               unsigned        feature;
+               unsigned        type;
+               size_t          compress_workspace;
+               size_t          decompress_workspace;
+       } compression_types[] = {
+               { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
+               { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
+                       zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+                       zlib_inflate_workspacesize(), },
+               { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
+                       zstd_cctx_workspace_bound(&params.cParams),
+                       zstd_dctx_workspace_bound() },
+       }, *i;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
+       c->zstd_params = params;
+
+       for (i = compression_types;
+            i < compression_types + ARRAY_SIZE(compression_types);
+            i++)
+               if (features & (1 << i->feature))
+                       goto have_compressed;
+
+       goto out;
+have_compressed:
+
+       if (!mempool_initialized(&c->compression_bounce[READ])) {
+               ret = mempool_init_page_pool(&c->compression_bounce[READ],
+                                            1, order);
+               if (ret)
+                       goto out;
+       }
+
+       if (!mempool_initialized(&c->compression_bounce[WRITE])) {
+               ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
+                                            1, order);
+               if (ret)
+                       goto out;
+       }
+
+       for (i = compression_types;
+            i < compression_types + ARRAY_SIZE(compression_types);
+            i++) {
+               decompress_workspace_size =
+                       max(decompress_workspace_size, i->decompress_workspace);
+
+               if (!(features & (1 << i->feature)))
+                       continue;
+
+               if (i->decompress_workspace)
+                       decompress_workspace_needed = true;
+
+               if (mempool_initialized(&c->compress_workspace[i->type]))
+                       continue;
+
+               ret = mempool_init_kvpmalloc_pool(
+                               &c->compress_workspace[i->type],
+                               1, i->compress_workspace);
+               if (ret)
+                       goto out;
+       }
+
+       ret = mempool_init_kmalloc_pool(
+                       &c->decompress_workspace,
+                       1, decompress_workspace_size);
+       if (ret)
+               goto out;
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
+}
+
+int bch2_fs_compress_init(struct bch_fs *c)
+{
+       u64 f = c->sb.features;
+
+       if (c->opts.compression)
+               f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];
+
+       if (c->opts.background_compression)
+               f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];
+
+       return __bch2_fs_compress_init(c, f);
+}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
new file mode 100644 (file)
index 0000000..4bab1f6
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COMPRESS_H
+#define _BCACHEFS_COMPRESS_H
+
+#include "extents_types.h"
+
+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
+                               struct bch_extent_crc_unpacked *);
+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
+                      struct bvec_iter, struct bch_extent_crc_unpacked);
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+                          struct bio *, size_t *, unsigned);
+
+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
+void bch2_fs_compress_exit(struct bch_fs *);
+int bch2_fs_compress_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
new file mode 100644 (file)
index 0000000..7db0e65
--- /dev/null
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Assorted bcachefs debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io.h"
+#include "super.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bch_debug;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
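+/*
+ * Read a btree node back from disk and check that the sorted keys match what
+ * we have in memory, dumping both (and the on disk bsets) and panicking on a
+ * mismatch:
+ */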
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+       struct btree *v = c->verify_data;
+       struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
+       struct bset *sorted, *inmemory;
+       struct extent_pick_ptr pick;
+       struct bch_dev *ca;
+       struct bio *bio;
+
+       if (c->opts.nochanges)
+               return;
+
+       btree_node_io_lock(b);
+       mutex_lock(&c->verify_lock);
+
+       n_ondisk = c->verify_ondisk;
+       n_sorted = c->verify_data->data;
+       n_inmemory = b->data;
+
+       bkey_copy(&v->key, &b->key);
+       v->written      = 0;
+       v->level        = b->level;
+       v->btree_id     = b->btree_id;
+       bch2_btree_keys_init(v, &c->expensive_debug_checks);
+
+       if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
+               goto out;
+
+       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+       if (!bch2_dev_get_ioref(ca, READ))
+               goto out;
+
+       bio = bio_alloc_bioset(ca->disk_sb.bdev,
+                              buf_pages(n_sorted, btree_bytes(c)),
+                              REQ_OP_READ|REQ_META,
+                              GFP_NOIO,
+                              &c->btree_bio);
+       bio->bi_iter.bi_sector  = pick.ptr.offset;
+       bio->bi_iter.bi_size    = btree_bytes(c);
+       bch2_bio_map(bio, n_sorted);
+
+       submit_bio_wait(bio);
+
+       bio_put(bio);
+       percpu_ref_put(&ca->io_ref);
+
+       memcpy(n_ondisk, n_sorted, btree_bytes(c));
+
+       if (bch2_btree_node_read_done(c, v, false))
+               goto out;
+
+       n_sorted = c->verify_data->data;
+       sorted = &n_sorted->keys;
+       inmemory = &n_inmemory->keys;
+
+       if (inmemory->u64s != sorted->u64s ||
+           memcmp(inmemory->start,
+                  sorted->start,
+                  vstruct_end(inmemory) - (void *) inmemory->start)) {
+               unsigned offset = 0, sectors;
+               struct bset *i;
+               unsigned j;
+
+               console_lock();
+
+               printk(KERN_ERR "*** in memory:\n");
+               bch2_dump_bset(b, inmemory, 0);
+
+               printk(KERN_ERR "*** read back in:\n");
+               bch2_dump_bset(v, sorted, 0);
+
+               while (offset < b->written) {
+                       if (!offset) {
+                               i = &n_ondisk->keys;
+                               sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
+                                       c->block_bits;
+                       } else {
+                               struct btree_node_entry *bne =
+                                       (void *) n_ondisk + (offset << 9);
+                               i = &bne->keys;
+
+                               sectors = vstruct_blocks(bne, c->block_bits) <<
+                                       c->block_bits;
+                       }
+
+                       printk(KERN_ERR "*** on disk block %u:\n", offset);
+                       bch2_dump_bset(b, i, offset);
+
+                       offset += sectors;
+               }
+
+               printk(KERN_ERR "*** block %u/%u not written\n",
+                      offset >> c->block_bits, btree_blocks(c));
+
+               for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
+                       if (inmemory->_data[j] != sorted->_data[j])
+                               break;
+
+               printk(KERN_ERR "b->written %u\n", b->written);
+
+               console_unlock();
+               panic("verify failed at %u\n", j);
+       }
+out:
+       mutex_unlock(&c->verify_lock);
+       btree_node_io_unlock(b);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: bch_fs refcounting */
+
+struct dump_iter {
+       struct bpos             from;
+       struct bch_fs   *c;
+       enum btree_id           id;
+
+       char                    buf[PAGE_SIZE];
+       size_t                  bytes;  /* what's currently in buf */
+
+       char __user             *ubuf;  /* destination user buffer */
+       size_t                  size;   /* size of requested read */
+       ssize_t                 ret;    /* bytes read so far */
+};
+
+static int flush_buf(struct dump_iter *i)
+{
+       if (i->bytes) {
+               size_t bytes = min(i->bytes, i->size);
+
+               if (copy_to_user(i->ubuf, i->buf, bytes))
+                       return -EFAULT;
+
+               i->ret   += bytes;
+               i->ubuf  += bytes;
+               i->size  -= bytes;
+               i->bytes -= bytes;
+               memmove(i->buf, i->buf + bytes, i->bytes);
+       }
+
+       return 0;
+}
+
+static int bch2_dump_open(struct inode *inode, struct file *file)
+{
+       struct btree_debug *bd = inode->i_private;
+       struct dump_iter *i;
+
+       i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+       if (!i)
+               return -ENOMEM;
+
+       file->private_data = i;
+       i->from = POS_MIN;
+       i->c    = container_of(bd, struct bch_fs, btree_debug[bd->id]);
+       i->id   = bd->id;
+
+       return 0;
+}
+
+static int bch2_dump_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+       return 0;
+}
+
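+/*
+ * The read handlers below share a pattern: format one entry at a time into
+ * i->buf, flush as much as fits to userspace, and record i->from so a
+ * subsequent read() can resume where this one left off:
+ */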
+static ssize_t bch2_read_btree(struct file *file, char __user *buf,
+                              size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       err = flush_buf(i);
+       if (err)
+               return err;
+
+       if (!i->size)
+               return i->ret;
+
+       bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
+       k = bch2_btree_iter_peek(&iter);
+
+       while (k.k && !(err = btree_iter_err(k))) {
+               bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
+                                     i->buf, sizeof(i->buf), k);
+               i->bytes = strlen(i->buf);
+               BUG_ON(i->bytes >= PAGE_SIZE);
+               i->buf[i->bytes] = '\n';
+               i->bytes++;
+
+               k = bch2_btree_iter_next(&iter);
+               i->from = iter.pos;
+
+               err = flush_buf(i);
+               if (err)
+                       break;
+
+               if (!i->size)
+                       break;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_debug_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_read_btree,
+};
+
+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
+                                      size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct btree_iter iter;
+       struct btree *b;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       err = flush_buf(i);
+       if (err)
+               return err;
+
+       if (!i->size || !bkey_cmp(POS_MAX, i->from))
+               return i->ret;
+
+       for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
+               i->bytes = bch2_print_btree_node(i->c, b, i->buf,
+                                               sizeof(i->buf));
+               err = flush_buf(i);
+               if (err)
+                       break;
+
+               /*
+                * we can't easily restart a btree node traversal correctly
+                * across all nodes, meh
+                */
+               i->from = bkey_cmp(POS_MAX, b->key.k.p)
+                       ? bkey_successor(b->key.k.p)
+                       : b->key.k.p;
+
+               if (!i->size)
+                       break;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_format_debug_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_read_btree_formats,
+};
+
+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
+                                      size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct btree *prev_node = NULL;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       err = flush_buf(i);
+       if (err)
+               return err;
+
+       if (!i->size)
+               return i->ret;
+
+       bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
+
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
+              !(err = btree_iter_err(k))) {
+               struct btree_iter_level *l = &iter.l[0];
+               struct bkey_packed *_k =
+                       bch2_btree_node_iter_peek(&l->iter, l->b);
+
+               if (l->b != prev_node) {
+                       i->bytes = bch2_print_btree_node(i->c, l->b, i->buf,
+                                                       sizeof(i->buf));
+                       err = flush_buf(i);
+                       if (err)
+                               break;
+               }
+               prev_node = l->b;
+
+               i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf,
+                                                 sizeof(i->buf));
+
+               err = flush_buf(i);
+               if (err)
+                       break;
+
+               bch2_btree_iter_next(&iter);
+               i->from = iter.pos;
+
+               err = flush_buf(i);
+               if (err)
+                       break;
+
+               if (!i->size)
+                       break;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_read_bfloat_failed,
+};
+
+void bch2_fs_debug_exit(struct bch_fs *c)
+{
+       if (!IS_ERR_OR_NULL(c->debug))
+               debugfs_remove_recursive(c->debug);
+}
+
+void bch2_fs_debug_init(struct bch_fs *c)
+{
+       struct btree_debug *bd;
+       char name[100];
+
+       if (IS_ERR_OR_NULL(bch_debug))
+               return;
+
+       snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+       c->debug = debugfs_create_dir(name, bch_debug);
+       if (IS_ERR_OR_NULL(c->debug))
+               return;
+
+       for (bd = c->btree_debug;
+            bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
+            bd++) {
+               bd->id = bd - c->btree_debug;
+               bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
+                                               0400, c->debug, bd,
+                                               &btree_debug_ops);
+
+               snprintf(name, sizeof(name), "%s-formats",
+                        bch2_btree_ids[bd->id]);
+
+               bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
+                                                      &btree_format_debug_ops);
+
+               snprintf(name, sizeof(name), "%s-bfloat-failed",
+                        bch2_btree_ids[bd->id]);
+
+               bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
+                                                &bfloat_failed_debug_ops);
+       }
+}
+
+#endif
+
+void bch2_debug_exit(void)
+{
+       if (!IS_ERR_OR_NULL(bch_debug))
+               debugfs_remove_recursive(bch_debug);
+}
+
+int __init bch2_debug_init(void)
+{
+       int ret = 0;
+
+       bch_debug = debugfs_create_dir("bcachefs", NULL);
+       return ret;
+}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
new file mode 100644 (file)
index 0000000..56c2d1a
--- /dev/null
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DEBUG_H
+#define _BCACHEFS_DEBUG_H
+
+#include "bcachefs.h"
+
+struct bio;
+struct btree;
+struct bch_fs;
+
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#define BCH_DEBUG_PARAM(name, description)                             \
+       static inline bool name(struct bch_fs *c)                       \
+       { return bch2_##name || c->name;        }
+BCH_DEBUG_PARAMS_ALWAYS()
+#undef BCH_DEBUG_PARAM
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+#define BCH_DEBUG_PARAM(name, description)                             \
+       static inline bool name(struct bch_fs *c)                       \
+       { return bch2_##name || c->name;        }
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+void __bch2_btree_verify(struct bch_fs *, struct btree *);
+
+#define bypass_torture_test(d)         ((d)->bypass_torture_test)
+
+#else /* DEBUG */
+
+#define BCH_DEBUG_PARAM(name, description)                             \
+       static inline bool name(struct bch_fs *c) { return false; }
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
+
+#define bypass_torture_test(d)         0
+
+#endif
+
+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+       if (verify_btree_ondisk(c))
+               __bch2_btree_verify(c, b);
+}
+
+#ifdef CONFIG_DEBUG_FS
+void bch2_fs_debug_exit(struct bch_fs *);
+void bch2_fs_debug_init(struct bch_fs *);
+#else
+static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
+static inline void bch2_fs_debug_init(struct bch_fs *c) {}
+#endif
+
+void bch2_debug_exit(void);
+int bch2_debug_init(void);
+
+#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
new file mode 100644 (file)
index 0000000..9e5936f
--- /dev/null
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "dirent.h"
+#include "fs.h"
+#include "keylist.h"
+#include "str_hash.h"
+
+#include <linux/dcache.h>
+
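+/*
+ * Dirent names are NUL padded out to the end of the (u64 aligned) bkey value;
+ * the real length is the value size minus that trailing padding:
+ */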
+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+       unsigned len = bkey_val_bytes(d.k) -
+               offsetof(struct bch_dirent, d_name);
+
+       while (len && !d.v->d_name[len - 1])
+               --len;
+
+       return len;
+}
+
+static unsigned dirent_val_u64s(unsigned len)
+{
+       return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
+                           sizeof(u64));
+}
+
+static u64 bch2_dirent_hash(const struct bch_hash_info *info,
+                           const struct qstr *name)
+{
+       struct bch_str_hash_ctx ctx;
+
+       bch2_str_hash_init(&ctx, info);
+       bch2_str_hash_update(&ctx, info, name->name, name->len);
+
+       /* [0,2) reserved for dots */
+       return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
+}
+
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
+{
+       return bch2_dirent_hash(info, key);
+}
+
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+       struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+
+       return bch2_dirent_hash(info, &name);
+}
+
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+       struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+       int len = bch2_dirent_name_bytes(l);
+       const struct qstr *r = _r;
+
+       return len - r->len ?: memcmp(l.v->d_name, r->name, len);
+}
+
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+       struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+       struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
+       int l_len = bch2_dirent_name_bytes(l);
+       int r_len = bch2_dirent_name_bytes(r);
+
+       return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
+}
+
+const struct bch_hash_desc bch2_dirent_hash_desc = {
+       .btree_id       = BTREE_ID_DIRENTS,
+       .key_type       = BCH_DIRENT,
+       .whiteout_type  = BCH_DIRENT_WHITEOUT,
+       .hash_key       = dirent_hash_key,
+       .hash_bkey      = dirent_hash_bkey,
+       .cmp_key        = dirent_cmp_key,
+       .cmp_bkey       = dirent_cmp_bkey,
+};
+
+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_dirent d;
+       unsigned len;
+
+       switch (k.k->type) {
+       case BCH_DIRENT:
+               if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
+                       return "value too small";
+
+               d = bkey_s_c_to_dirent(k);
+               len = bch2_dirent_name_bytes(d);
+
+               if (!len)
+                       return "empty name";
+
+               /*
+                * older versions of bcachefs were buggy and created dirent
+                * keys that were bigger than necessary:
+                */
+               if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
+                       return "value too big";
+
+               if (len > BCH_NAME_MAX)
+                       return "dirent name too big";
+
+               if (memchr(d.v->d_name, '/', len))
+                       return "dirent name has invalid characters";
+
+               return NULL;
+       case BCH_DIRENT_WHITEOUT:
+               return bkey_val_bytes(k.k) != 0
+                       ? "value size should be zero"
+                       : NULL;
+
+       default:
+               return "invalid type";
+       }
+}
+
+void bch2_dirent_to_text(struct bch_fs *c, char *buf,
+                        size_t size, struct bkey_s_c k)
+{
+       struct bkey_s_c_dirent d;
+       size_t n = 0;
+
+       switch (k.k->type) {
+       case BCH_DIRENT:
+               d = bkey_s_c_to_dirent(k);
+
+               n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
+                                  bch2_dirent_name_bytes(d));
+               n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
+               break;
+       case BCH_DIRENT_WHITEOUT:
+               scnprintf(buf, size, "whiteout");
+               break;
+       }
+}
+
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+                               u8 type, const struct qstr *name, u64 dst)
+{
+       struct bkey_i_dirent *dirent;
+       unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
+
+       if (name->len > BCH_NAME_MAX)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       BUG_ON(u64s > U8_MAX);
+
+       dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+       if (IS_ERR(dirent))
+               return dirent;
+
+       bkey_dirent_init(&dirent->k_i);
+       dirent->k.u64s = u64s;
+       dirent->v.d_inum = cpu_to_le64(dst);
+       dirent->v.d_type = type;
+
+       memcpy(dirent->v.d_name, name->name, name->len);
+       memset(dirent->v.d_name + name->len, 0,
+              bkey_val_bytes(&dirent->k) -
+              offsetof(struct bch_dirent, d_name) -
+              name->len);
+
+       EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+
+       return dirent;
+}
+
+int __bch2_dirent_create(struct btree_trans *trans,
+                        u64 dir_inum, const struct bch_hash_info *hash_info,
+                        u8 type, const struct qstr *name, u64 dst_inum,
+                        int flags)
+{
+       struct bkey_i_dirent *dirent;
+       int ret;
+
+       dirent = dirent_create_key(trans, type, name, dst_inum);
+       ret = PTR_ERR_OR_ZERO(dirent);
+       if (ret)
+               return ret;
+
+       return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+                              dir_inum, &dirent->k_i, flags);
+}
+
+int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
+                      const struct bch_hash_info *hash_info,
+                      u8 type, const struct qstr *name, u64 dst_inum,
+                      u64 *journal_seq, int flags)
+{
+       return bch2_trans_do(c, journal_seq, flags,
+               __bch2_dirent_create(&trans, dir_inum, hash_info,
+                                    type, name, dst_inum, flags));
+}
+
+static void dirent_copy_target(struct bkey_i_dirent *dst,
+                              struct bkey_s_c_dirent src)
+{
+       dst->v.d_inum = src.v->d_inum;
+       dst->v.d_type = src.v->d_type;
+}
+
+static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
+                                  const struct qstr *name)
+{
+       return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
+}
+
+int bch2_dirent_rename(struct btree_trans *trans,
+               struct bch_inode_info *src_dir, const struct qstr *src_name,
+               struct bch_inode_info *dst_dir, const struct qstr *dst_name,
+               enum bch_rename_mode mode)
+{
+       struct btree_iter *src_iter, *dst_iter;
+       struct bkey_s_c old_src, old_dst;
+       struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
+       struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
+       int ret;
+
+       /*
+        * Lookup dst:
+        *
+        * Note that in BCH_RENAME mode, we're _not_ checking if
+        * the target already exists - we're relying on the VFS
+        * to do that check for us for correctness:
+        */
+       dst_iter = mode == BCH_RENAME
+               ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
+                                &dst_dir->ei_str_hash,
+                                dst_dir->v.i_ino, dst_name)
+               : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+                                  &dst_dir->ei_str_hash,
+                                  dst_dir->v.i_ino, dst_name,
+                                  BTREE_ITER_INTENT);
+       if (IS_ERR(dst_iter))
+               return PTR_ERR(dst_iter);
+       old_dst = bch2_btree_iter_peek_slot(dst_iter);
+
+       /* Lookup src: */
+       src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+                                   &src_dir->ei_str_hash,
+                                   src_dir->v.i_ino, src_name,
+                                   BTREE_ITER_INTENT);
+       if (IS_ERR(src_iter))
+               return PTR_ERR(src_iter);
+       old_src = bch2_btree_iter_peek_slot(src_iter);
+
+       /* Create new dst key: */
+       new_dst = dirent_create_key(trans, 0, dst_name, 0);
+       if (IS_ERR(new_dst))
+               return PTR_ERR(new_dst);
+
+       dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+       new_dst->k.p = dst_iter->pos;
+
+       /* Create new src key: */
+       if (mode == BCH_RENAME_EXCHANGE) {
+               new_src = dirent_create_key(trans, 0, src_name, 0);
+               if (IS_ERR(new_src))
+                       return PTR_ERR(new_src);
+
+               dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+               new_src->k.p = src_iter->pos;
+       } else {
+               new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+               if (IS_ERR(new_src))
+                       return PTR_ERR(new_src);
+               bkey_init(&new_src->k);
+               new_src->k.p = src_iter->pos;
+
+               if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
+                   bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
+                       /*
+                        * We have a hash collision for the new dst key,
+                        * and new_src - the key we're deleting - is between
+                        * new_dst's hashed slot and the slot we're going to be
+                        * inserting it into - oops.  This will break the hash
+                        * table if we don't deal with it:
+                        */
+                       if (mode == BCH_RENAME) {
+                               /*
+                                * If we're not overwriting, we can just insert
+                                * new_dst at the src position:
+                                */
+                               new_dst->k.p = src_iter->pos;
+                               bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
+                               return 0;
+                       } else {
+                               /* If we're overwriting, we can't insert new_dst
+                               /*
+                                * If we're overwriting, we can't insert new_dst
+                                * overwrite old_dst - just make sure to use a
+                                * whiteout when deleting src:
+                                */
+                               new_src->k.type = BCH_DIRENT_WHITEOUT;
+                       }
+               } else {
+                       /* Check if we need a whiteout to delete src: */
+                       ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
+                                                      &src_dir->ei_str_hash,
+                                                      src_iter);
+                       if (ret < 0)
+                               return ret;
+
+                       if (ret)
+                               new_src->k.type = BCH_DIRENT_WHITEOUT;
+               }
+       }
+
+       bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
+       bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+       return 0;
+}
+
+int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
+                        const struct bch_hash_info *hash_info,
+                        const struct qstr *name)
+{
+       return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
+                               dir_inum, name);
+}
+
+int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
+                      const struct bch_hash_info *hash_info,
+                      const struct qstr *name,
+                      u64 *journal_seq)
+{
+       return bch2_trans_do(c, journal_seq,
+                            BTREE_INSERT_ATOMIC|
+                            BTREE_INSERT_NOFAIL,
+               __bch2_dirent_delete(&trans, dir_inum, hash_info, name));
+}
+
+u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
+                      const struct bch_hash_info *hash_info,
+                      const struct qstr *name)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       u64 inum = 0;
+
+       bch2_trans_init(&trans, c);
+
+       iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
+                               hash_info, dir_inum, name, 0);
+       if (IS_ERR(iter)) {
+               BUG_ON(PTR_ERR(iter) == -EINTR);
+               goto out;
+       }
+
+       k = bch2_btree_iter_peek_slot(iter);
+       inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+out:
+       bch2_trans_exit(&trans);
+       return inum;
+}
+
+int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) {
+               if (k.k->p.inode > dir_inum)
+                       break;
+
+               if (k.k->type == BCH_DIRENT) {
+                       ret = -ENOTEMPTY;
+                       break;
+               }
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return ret;
+}
+
+int bch2_readdir(struct bch_fs *c, struct file *file,
+                struct dir_context *ctx)
+{
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent dirent;
+       unsigned len;
+
+       if (!dir_emit_dots(file, ctx))
+               return 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+                          POS(inode->v.i_ino, ctx->pos), 0, k) {
+               if (k.k->type != BCH_DIRENT)
+                       continue;
+
+               dirent = bkey_s_c_to_dirent(k);
+
+               if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0)
+                       continue;
+
+               if (k.k->p.inode > inode->v.i_ino)
+                       break;
+
+               len = bch2_dirent_name_bytes(dirent);
+
+               /*
+                * XXX: dir_emit() can fault and block while we're holding
+                * btree locks
+                */
+               if (!dir_emit(ctx, dirent.v->d_name, len,
+                             le64_to_cpu(dirent.v->d_inum),
+                             dirent.v->d_type))
+                       break;
+
+               ctx->pos = k.k->p.offset + 1;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return 0;
+}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
new file mode 100644 (file)
index 0000000..d02dc3e
--- /dev/null
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_H
+#define _BCACHEFS_DIRENT_H
+
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
+
+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_dirent_ops (struct bkey_ops) {       \
+       .key_invalid    = bch2_dirent_invalid,          \
+       .val_to_text    = bch2_dirent_to_text,          \
+}
+
+struct qstr;
+struct file;
+struct dir_context;
+struct bch_fs;
+struct bch_hash_info;
+struct bch_inode_info;
+
+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
+
+int __bch2_dirent_create(struct btree_trans *, u64,
+                        const struct bch_hash_info *, u8,
+                        const struct qstr *, u64, int);
+int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
+                      u8, const struct qstr *, u64, u64 *, int);
+
+int __bch2_dirent_delete(struct btree_trans *, u64,
+                        const struct bch_hash_info *,
+                        const struct qstr *);
+int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
+                      const struct qstr *, u64 *);
+
+enum bch_rename_mode {
+       BCH_RENAME,
+       BCH_RENAME_OVERWRITE,
+       BCH_RENAME_EXCHANGE,
+};
+
+int bch2_dirent_rename(struct btree_trans *,
+                      struct bch_inode_info *, const struct qstr *,
+                      struct bch_inode_info *, const struct qstr *,
+                      enum bch_rename_mode);
+
+u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
+                      const struct qstr *);
+
+int bch2_empty_dir(struct bch_fs *, u64);
+int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);
+
+#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
new file mode 100644 (file)
index 0000000..48f472a
--- /dev/null
@@ -0,0 +1,494 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+
+static int group_cmp(const void *_l, const void *_r)
+{
+       const struct bch_disk_group *l = _l;
+       const struct bch_disk_group *r = _r;
+
+       return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+               (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
+               ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+                (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
+               strncmp(l->label, r->label, sizeof(l->label));
+}
+
+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
+                                               struct bch_sb_field *f)
+{
+       struct bch_sb_field_disk_groups *groups =
+               field_to_type(f, disk_groups);
+       struct bch_disk_group *g, *sorted = NULL;
+       struct bch_sb_field_members *mi;
+       struct bch_member *m;
+       unsigned i, nr_groups, len;
+       const char *err = NULL;
+
+       mi              = bch2_sb_get_members(sb);
+       groups          = bch2_sb_get_disk_groups(sb);
+       nr_groups       = disk_groups_nr(groups);
+
+       for (m = mi->members;
+            m < mi->members + sb->nr_devices;
+            m++) {
+               unsigned g;
+
+               if (!BCH_MEMBER_GROUP(m))
+                       continue;
+
+               g = BCH_MEMBER_GROUP(m) - 1;
+
+               if (g >= nr_groups ||
+                   BCH_GROUP_DELETED(&groups->entries[g]))
+                       return "disk has invalid group";
+       }
+
+       if (!nr_groups)
+               return NULL;
+
+       for (g = groups->entries;
+            g < groups->entries + nr_groups;
+            g++) {
+               if (BCH_GROUP_DELETED(g))
+                       continue;
+
+               len = strnlen(g->label, sizeof(g->label));
+               if (!len) {
+                       err = "group with empty label";
+                       goto err;
+               }
+       }
+
+       sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+       if (!sorted)
+               return "cannot allocate memory";
+
+       memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+       sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+       for (i = 0; i + 1 < nr_groups; i++)
+               if (!BCH_GROUP_DELETED(sorted + i) &&
+                   !group_cmp(sorted + i, sorted + i + 1)) {
+                       err = "duplicate groups";
+                       goto err;
+               }
+
+       err = NULL;
+err:
+       kfree(sorted);
+       return err;
+}
+
+static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
+                                       struct bch_sb *sb,
+                                       struct bch_sb_field *f)
+{
+       char *out = buf, *end = buf + size;
+       struct bch_sb_field_disk_groups *groups =
+               field_to_type(f, disk_groups);
+       struct bch_disk_group *g;
+       unsigned nr_groups = disk_groups_nr(groups);
+
+       for (g = groups->entries;
+            g < groups->entries + nr_groups;
+            g++) {
+               if (g != groups->entries)
+                       out += scnprintf(out, end - out, " ");
+
+               if (BCH_GROUP_DELETED(g))
+                       out += scnprintf(out, end - out, "[deleted]");
+               else
+                       out += scnprintf(out, end - out,
+                                        "[parent %llu name %s]",
+                                        BCH_GROUP_PARENT(g),
+                                        g->label);
+       }
+
+       return out - buf;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
+       .validate       = bch2_sb_disk_groups_validate,
+       .to_text        = bch2_sb_disk_groups_to_text
+};
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_sb_field_disk_groups *groups;
+       struct bch_disk_groups_cpu *cpu_g, *old_g;
+       unsigned i, g, nr_groups;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       mi              = bch2_sb_get_members(c->disk_sb.sb);
+       groups          = bch2_sb_get_disk_groups(c->disk_sb.sb);
+       nr_groups       = disk_groups_nr(groups);
+
+       if (!groups)
+               return 0;
+
+       cpu_g = kzalloc(sizeof(*cpu_g) +
+                       sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+       if (!cpu_g)
+               return -ENOMEM;
+
+       cpu_g->nr = nr_groups;
+
+       for (i = 0; i < nr_groups; i++) {
+               struct bch_disk_group *src      = &groups->entries[i];
+               struct bch_disk_group_cpu *dst  = &cpu_g->entries[i];
+
+               dst->deleted    = BCH_GROUP_DELETED(src);
+               dst->parent     = BCH_GROUP_PARENT(src);
+       }
+
+       for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
+               struct bch_disk_group_cpu *dst =
+                       &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+
+               if (!bch2_member_exists(m))
+                       continue;
+
+               g = BCH_MEMBER_GROUP(m);
+               while (g) {
+                       dst = &cpu_g->entries[g - 1];
+                       __set_bit(i, dst->devs.d);
+                       g = dst->parent;
+               }
+       }
+
+       old_g = rcu_dereference_protected(c->disk_groups,
+                               lockdep_is_held(&c->sb_lock));
+       rcu_assign_pointer(c->disk_groups, cpu_g);
+       if (old_g)
+               kfree_rcu(old_g, rcu);
+
+       return 0;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+       struct target t = target_decode(target);
+
+       switch (t.type) {
+       case TARGET_NULL:
+               return NULL;
+       case TARGET_DEV: {
+               struct bch_dev *ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+               return ca ? &ca->self : NULL;
+       }
+       case TARGET_GROUP: {
+               struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+               return t.group < g->nr && !g->entries[t.group].deleted
+                       ? &g->entries[t.group].devs
+                       : NULL;
+       }
+       default:
+               BUG();
+       }
+}
+
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+       struct target t = target_decode(target);
+
+       switch (t.type) {
+       case TARGET_NULL:
+               return false;
+       case TARGET_DEV:
+               return dev == t.dev;
+       case TARGET_GROUP: {
+               struct bch_disk_groups_cpu *g;
+               const struct bch_devs_mask *m;
+               bool ret;
+
+               rcu_read_lock();
+               g = rcu_dereference(c->disk_groups);
+               m = t.group < g->nr && !g->entries[t.group].deleted
+                       ? &g->entries[t.group].devs
+                       : NULL;
+
+               ret = m ? test_bit(dev, m->d) : false;
+               rcu_read_unlock();
+
+               return ret;
+       }
+       default:
+               BUG();
+       }
+}
+
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+                                 unsigned parent,
+                                 const char *name, unsigned namelen)
+{
+       unsigned i, nr_groups = disk_groups_nr(groups);
+
+       if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+               return -EINVAL;
+
+       for (i = 0; i < nr_groups; i++) {
+               struct bch_disk_group *g = groups->entries + i;
+
+               if (!BCH_GROUP_DELETED(g) &&
+                   BCH_GROUP_PARENT(g) == parent &&
+                   strnlen(g->label, sizeof(g->label)) == namelen &&
+                   !memcmp(name, g->label, namelen))
+                       return i;
+       }
+
+       return -1;
+}
+
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+                                const char *name, unsigned namelen)
+{
+       struct bch_sb_field_disk_groups *groups =
+               bch2_sb_get_disk_groups(sb->sb);
+       unsigned i, nr_groups = disk_groups_nr(groups);
+       struct bch_disk_group *g;
+
+       if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+               return -EINVAL;
+
+       for (i = 0;
+            i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+            i++)
+               ;
+
+       if (i == nr_groups) {
+               unsigned u64s =
+                       (sizeof(struct bch_sb_field_disk_groups) +
+                        sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+                       sizeof(u64);
+
+               groups = bch2_sb_resize_disk_groups(sb, u64s);
+               if (!groups)
+                       return -ENOSPC;
+
+               nr_groups = disk_groups_nr(groups);
+       }
+
+       BUG_ON(i >= nr_groups);
+
+       g = &groups->entries[i];
+
+       memcpy(g->label, name, namelen);
+       if (namelen < sizeof(g->label))
+               g->label[namelen] = '\0';
+       SET_BCH_GROUP_DELETED(g, 0);
+       SET_BCH_GROUP_PARENT(g, parent);
+       SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+       return i;
+}
+
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+       struct bch_sb_field_disk_groups *groups =
+               bch2_sb_get_disk_groups(sb->sb);
+       int v = -1;
+
+       do {
+               const char *next = strchrnul(name, '.');
+               unsigned len = next - name;
+
+               if (*next == '.')
+                       next++;
+
+               v = __bch2_disk_group_find(groups, v + 1, name, len);
+               name = next;
+       } while (*name && v >= 0);
+
+       return v;
+}
+
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+       struct bch_sb_field_disk_groups *groups;
+       unsigned parent = 0;
+       int v = -1;
+
+       do {
+               const char *next = strchrnul(name, '.');
+               unsigned len = next - name;
+
+               if (*next == '.')
+                       next++;
+
+               groups = bch2_sb_get_disk_groups(sb->sb);
+
+               v = __bch2_disk_group_find(groups, parent, name, len);
+               if (v < 0)
+                       v = __bch2_disk_group_add(sb, parent, name, len);
+               if (v < 0)
+                       return v;
+
+               parent = v + 1;
+               name = next;
+       } while (*name && v >= 0);
+
+       return v;
+}
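+
+/*
+ * Illustrative example (the label names are hypothetical, not from this
+ * patch): with a group "ssd" containing a subgroup "fast", the path
+ * "ssd.fast" is resolved one dot-separated component at a time - "ssd"
+ * is looked up with parent 0 (top level), then "fast" with parent set to
+ * the index of "ssd" plus one:
+ *
+ *        int v = bch2_disk_path_find_or_create(&c->disk_sb, "ssd.fast");
+ *        // v is the index of "fast", or a negative error on failure
+ */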
+
+int bch2_disk_path_print(struct bch_sb_handle *sb,
+                        char *buf, size_t len, unsigned v)
+{
+       char *out = buf, *end = out + len;
+       struct bch_sb_field_disk_groups *groups =
+               bch2_sb_get_disk_groups(sb->sb);
+       struct bch_disk_group *g;
+       unsigned nr = 0;
+       u16 path[32];
+
+       while (1) {
+               if (nr == ARRAY_SIZE(path))
+                       goto inval;
+
+               if (v >= disk_groups_nr(groups))
+                       goto inval;
+
+               g = groups->entries + v;
+
+               if (BCH_GROUP_DELETED(g))
+                       goto inval;
+
+               path[nr++] = v;
+
+               if (!BCH_GROUP_PARENT(g))
+                       break;
+
+               v = BCH_GROUP_PARENT(g) - 1;
+       }
+
+       while (nr) {
+               unsigned b = 0;
+
+               v = path[--nr];
+               g = groups->entries + v;
+
+               if (end != out)
+                       b = min_t(size_t, end - out,
+                                 strnlen(g->label, sizeof(g->label)));
+               memcpy(out, g->label, b);
+               if (b < end - out)
+                       out[b] = '\0';
+               out += b;
+
+               if (nr)
+                       out += scnprintf(out, end - out, ".");
+       }
+
+       return out - buf;
+inval:
+       return scnprintf(buf, len, "invalid group %u", v);
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+       struct bch_member *mi;
+       int v = -1;
+
+       mutex_lock(&c->sb_lock);
+
+       if (!strlen(name) || !strcmp(name, "none"))
+               goto write_sb;
+
+       v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+       if (v < 0) {
+               mutex_unlock(&c->sb_lock);
+               return v;
+       }
+
+write_sb:
+       mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+       SET_BCH_MEMBER_GROUP(mi, v + 1);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+{
+       struct bch_dev *ca;
+       int g;
+
+       if (!strlen(buf) || !strcmp(buf, "none")) {
+               *v = 0;
+               return 0;
+       }
+
+       /* Is it a device? */
+       ca = bch2_dev_lookup(c, buf);
+       if (!IS_ERR(ca)) {
+               *v = dev_to_target(ca->dev_idx);
+               percpu_ref_put(&ca->ref);
+               return 0;
+       }
+
+       mutex_lock(&c->sb_lock);
+       g = bch2_disk_path_find(&c->disk_sb, buf);
+       mutex_unlock(&c->sb_lock);
+
+       if (g >= 0) {
+               *v = group_to_target(g);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
+{
+       struct target t = target_decode(v);
+       int ret;
+
+       switch (t.type) {
+       case TARGET_NULL:
+               return scnprintf(buf, len, "none");
+       case TARGET_DEV: {
+               struct bch_dev *ca;
+
+               rcu_read_lock();
+               ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+
+               if (ca && percpu_ref_tryget(&ca->io_ref)) {
+                       ret = scnprintf(buf, len, "/dev/%pg",
+                                       ca->disk_sb.bdev);
+                       percpu_ref_put(&ca->io_ref);
+               } else if (ca) {
+                       ret = scnprintf(buf, len, "offline device %u", t.dev);
+               } else {
+                       ret = scnprintf(buf, len, "invalid device %u", t.dev);
+               }
+
+               rcu_read_unlock();
+               break;
+       }
+       case TARGET_GROUP:
+               mutex_lock(&c->sb_lock);
+               ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
+               mutex_unlock(&c->sb_lock);
+               break;
+       default:
+               BUG();
+       }
+
+       return ret;
+}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
new file mode 100644 (file)
index 0000000..d202eb3
--- /dev/null
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_H
+#define _BCACHEFS_DISK_GROUPS_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+       return groups
+               ? (vstruct_end(&groups->field) -
+                  (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
+               : 0;
+}
+
+struct target {
+       enum {
+               TARGET_NULL,
+               TARGET_DEV,
+               TARGET_GROUP,
+       }                       type;
+       union {
+               unsigned        dev;
+               unsigned        group;
+       };
+};
+
+#define TARGET_DEV_START       1
+#define TARGET_GROUP_START     (256 + TARGET_DEV_START)
+
+static inline u16 dev_to_target(unsigned dev)
+{
+       return TARGET_DEV_START + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+       return TARGET_GROUP_START + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+       if (target >= TARGET_GROUP_START)
+               return (struct target) {
+                       .type   = TARGET_GROUP,
+                       .group  = target - TARGET_GROUP_START
+               };
+
+       if (target >= TARGET_DEV_START)
+               return (struct target) {
+                       .type   = TARGET_DEV,
+                       .dev    = target - TARGET_DEV_START
+               };
+
+       return (struct target) { .type = TARGET_NULL };
+}
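+
+/*
+ * Sketch of the encoding used above: target 0 means "none", values
+ * TARGET_DEV_START..TARGET_DEV_START + 255 encode device indices 0..255,
+ * and values >= TARGET_GROUP_START encode disk group indices. For
+ * example, target_decode(dev_to_target(3)) yields
+ * { .type = TARGET_DEV, .dev = 3 }.
+ */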
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
+
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
+int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+
+const char *bch2_sb_validate_disk_groups(struct bch_sb *,
+                                        struct bch_sb_field *);
+
+#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
new file mode 100644 (file)
index 0000000..e975fab
--- /dev/null
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "error.h"
+#include "io.h"
+#include "super.h"
+
+bool bch2_inconsistent_error(struct bch_fs *c)
+{
+       set_bit(BCH_FS_ERROR, &c->flags);
+
+       switch (c->opts.errors) {
+       case BCH_ON_ERROR_CONTINUE:
+               return false;
+       case BCH_ON_ERROR_RO:
+               if (bch2_fs_emergency_read_only(c))
+                       bch_err(c, "emergency read only");
+               return true;
+       case BCH_ON_ERROR_PANIC:
+               panic(bch2_fmt(c, "panic after error"));
+               return true;
+       default:
+               BUG();
+       }
+}
+
+void bch2_fatal_error(struct bch_fs *c)
+{
+       if (bch2_fs_emergency_read_only(c))
+               bch_err(c, "emergency read only");
+}
+
+void bch2_io_error_work(struct work_struct *work)
+{
+       struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
+       struct bch_fs *c = ca->fs;
+       bool dev;
+
+       mutex_lock(&c->state_lock);
+       dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
+                                   BCH_FORCE_IF_DEGRADED);
+       if (dev
+           ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
+                                 BCH_FORCE_IF_DEGRADED)
+           : bch2_fs_emergency_read_only(c))
+               bch_err(ca,
+                       "too many IO errors, setting %s RO",
+                       dev ? "device" : "filesystem");
+       mutex_unlock(&c->state_lock);
+}
+
+void bch2_io_error(struct bch_dev *ca)
+{
+       //queue_work(system_long_wq, &ca->io_error_work);
+}
+
+#ifdef __KERNEL__
+#define ask_yn()       false
+#else
+#include "tools-util.h"
+#endif
+
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
+                               const char *fmt, ...)
+{
+       struct fsck_err_state *s;
+       va_list args;
+       bool fix = false, print = true, suppressing = false;
+       char _buf[sizeof(s->buf)], *buf = _buf;
+
+       mutex_lock(&c->fsck_error_lock);
+
+       if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+               goto print;
+
+       list_for_each_entry(s, &c->fsck_errors, list)
+               if (s->fmt == fmt)
+                       goto found;
+
+       s = kzalloc(sizeof(*s), GFP_KERNEL);
+       if (!s) {
+               if (!c->fsck_alloc_err)
+                       bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+               c->fsck_alloc_err = true;
+               buf = _buf;
+               goto print;
+       }
+
+       INIT_LIST_HEAD(&s->list);
+       s->fmt = fmt;
+found:
+       list_move(&s->list, &c->fsck_errors);
+       s->nr++;
+       suppressing     = s->nr == 10;
+       print           = s->nr <= 10;
+       buf             = s->buf;
+print:
+       va_start(args, fmt);
+       vscnprintf(buf, sizeof(_buf), fmt, args);
+       va_end(args);
+
+       if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+               bch_err(c, "%s, exiting", buf);
+               mutex_unlock(&c->fsck_error_lock);
+               return FSCK_ERR_EXIT;
+       }
+
+       if (flags & FSCK_CAN_FIX) {
+               if (c->opts.fix_errors == FSCK_OPT_ASK) {
+                       printk(KERN_ERR "%s: fix?", buf);
+                       fix = ask_yn();
+               } else if (c->opts.fix_errors == FSCK_OPT_YES ||
+                          (c->opts.nochanges &&
+                           !(flags & FSCK_CAN_IGNORE))) {
+                       if (print)
+                               bch_err(c, "%s, fixing", buf);
+                       fix = true;
+               } else {
+                       if (print)
+                               bch_err(c, "%s, not fixing", buf);
+                       fix = false;
+               }
+       } else if (flags & FSCK_NEED_FSCK) {
+               if (print)
+                       bch_err(c, "%s (run fsck to correct)", buf);
+       } else {
+               if (print)
+                       bch_err(c, "%s (repair unimplemented)", buf);
+       }
+
+       if (suppressing)
+               bch_err(c, "Ratelimiting new instances of previous error");
+
+       mutex_unlock(&c->fsck_error_lock);
+
+       if (fix)
+               set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);
+
+       return fix                              ? FSCK_ERR_FIX
+               : flags & FSCK_CAN_IGNORE       ? FSCK_ERR_IGNORE
+                                               : FSCK_ERR_EXIT;
+}
+
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+       struct fsck_err_state *s, *n;
+
+       mutex_lock(&c->fsck_error_lock);
+       set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+       list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
+               if (s->nr > 10)
+                       bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->buf);
+
+               list_del(&s->list);
+               kfree(s);
+       }
+
+       mutex_unlock(&c->fsck_error_lock);
+}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
new file mode 100644 (file)
index 0000000..2591e12
--- /dev/null
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERROR_H
+#define _BCACHEFS_ERROR_H
+
+#include <linux/list.h>
+#include <linux/printk.h>
+
+struct bch_dev;
+struct bch_fs;
+struct work_struct;
+
+/*
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
+ * superblock as such
+ */
+
+/* Error messages: */
+
+/*
+ * Very fatal logic/inconsistency errors: these indicate that we've majorly
+ * screwed up at runtime, i.e. it's not likely that it was just caused by the
+ * data on disk being inconsistent. These BUG():
+ *
+ * XXX: audit and convert to inconsistent() checks
+ */
+
+#define bch2_fs_bug(c, ...)                                            \
+do {                                                                   \
+       bch_err(c, __VA_ARGS__);                                        \
+       BUG();                                                          \
+} while (0)
+
+#define bch2_fs_bug_on(cond, c, ...)                                   \
+do {                                                                   \
+       if (cond)                                                       \
+               bch2_fs_bug(c, __VA_ARGS__);                            \
+} while (0)
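+
+/*
+ * Usage sketch (the condition and message are illustrative, not from
+ * this patch): assert an invariant that only a bug in the running code
+ * could violate:
+ *
+ *        bch2_fs_bug_on(bkey_cmp(k.k->p, b->key.k.p) > 0, c,
+ *                       "key past end of btree node");
+ */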
+
+/*
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
+ * initial recovery, they don't indicate a bug in the running code - we walk all
+ * the metadata before modifying anything. If they occur at runtime, they
+ * indicate either a bug in the running code or (less likely) data is being
+ * silently corrupted under us.
+ *
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
+ * BCH_ON_ERROR_CONTINUE mode
+ */
+
+bool bch2_inconsistent_error(struct bch_fs *);
+
+#define bch2_fs_inconsistent(c, ...)                                   \
+({                                                                     \
+       bch_err(c, __VA_ARGS__);                                        \
+       bch2_inconsistent_error(c);                                     \
+})
+
+#define bch2_fs_inconsistent_on(cond, c, ...)                          \
+({                                                                     \
+       int _ret = !!(cond);                                            \
+                                                                       \
+       if (_ret)                                                       \
+               bch2_fs_inconsistent(c, __VA_ARGS__);                   \
+       _ret;                                                           \
+})
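+
+/*
+ * Typical use (condition and message are illustrative, not from this
+ * patch): bail out of a metadata read path when an on disk invariant
+ * doesn't hold - the macro logs, flags the error per the errors mount
+ * option, and evaluates to the truth value of the condition:
+ *
+ *        if (bch2_fs_inconsistent_on(seq > journal_seq, c,
+ *                        "bset sequence number %llu ahead of journal", seq))
+ *                return -EIO;
+ */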
+
+/*
+ * Later we might want to mark only the particular device inconsistent, not the
+ * entire filesystem:
+ */
+
+#define bch2_dev_inconsistent(ca, ...)                                 \
+do {                                                                   \
+       bch_err(ca, __VA_ARGS__);                                       \
+       bch2_inconsistent_error((ca)->fs);                              \
+} while (0)
+
+#define bch2_dev_inconsistent_on(cond, ca, ...)                                \
+({                                                                     \
+       int _ret = !!(cond);                                            \
+                                                                       \
+       if (_ret)                                                       \
+               bch2_dev_inconsistent(ca, __VA_ARGS__);                 \
+       _ret;                                                           \
+})
+
+/*
+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally
+ * be able to repair:
+ */
+
+enum {
+       BCH_FSCK_OK                     = 0,
+       BCH_FSCK_ERRORS_NOT_FIXED       = 1,
+       BCH_FSCK_REPAIR_UNIMPLEMENTED   = 2,
+       BCH_FSCK_REPAIR_IMPOSSIBLE      = 3,
+       BCH_FSCK_UNKNOWN_VERSION        = 4,
+};
+
+enum fsck_err_opts {
+       FSCK_OPT_EXIT,
+       FSCK_OPT_YES,
+       FSCK_OPT_NO,
+       FSCK_OPT_ASK,
+};
+
+enum fsck_err_ret {
+       FSCK_ERR_IGNORE = 0,
+       FSCK_ERR_FIX    = 1,
+       FSCK_ERR_EXIT   = 2,
+};
+
+struct fsck_err_state {
+       struct list_head        list;
+       const char              *fmt;
+       u64                     nr;
+       char                    buf[512];
+};
+
+#define FSCK_CAN_FIX           (1 << 0)
+#define FSCK_CAN_IGNORE                (1 << 1)
+#define FSCK_NEED_FSCK         (1 << 2)
+
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
+                               unsigned, const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+#define __fsck_err(c, _flags, msg, ...)                                        \
+({                                                                     \
+       int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
+                                                                       \
+       if (_fix == FSCK_ERR_EXIT) {                                    \
+               bch_err(c, "Unable to continue, halting");              \
+               ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
+               goto fsck_err;                                          \
+       }                                                               \
+                                                                       \
+       _fix;                                                           \
+})
+
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+#define __fsck_err_on(cond, c, _flags, ...)                            \
+       ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+
+#define need_fsck_err_on(cond, c, ...)                                 \
+       __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
+#define need_fsck_err(c, ...)                                          \
+       __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
+#define mustfix_fsck_err(c, ...)                                       \
+       __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, ...)                              \
+       __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+
+#define fsck_err(c, ...)                                               \
+       __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+
+#define fsck_err_on(cond, c, ...)                                      \
+       __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
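+
+/*
+ * These macros expect an 'int ret' and a 'fsck_err:' label in the
+ * calling function, since __fsck_err() sets ret and jumps there when
+ * fsck cannot continue. Sketch of a caller (names are illustrative,
+ * not from this patch):
+ *
+ *        int check_foo(struct bch_fs *c)
+ *        {
+ *                int ret = 0;
+ *
+ *                if (fsck_err_on(foo_is_bad, c, "bad foo"))
+ *                        repair_foo(c);
+ *        fsck_err:
+ *                return ret;
+ *        }
+ */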
+
+/*
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
+ * mode - pretty much just due to metadata IO errors:
+ */
+
+void bch2_fatal_error(struct bch_fs *);
+
+#define bch2_fs_fatal_error(c, ...)                                    \
+do {                                                                   \
+       bch_err(c, __VA_ARGS__);                                        \
+       bch2_fatal_error(c);                                            \
+} while (0)
+
+#define bch2_fs_fatal_err_on(cond, c, ...)                             \
+({                                                                     \
+       int _ret = !!(cond);                                            \
+                                                                       \
+       if (_ret)                                                       \
+               bch2_fs_fatal_error(c, __VA_ARGS__);                    \
+       _ret;                                                           \
+})
+
+/*
+ * IO errors: either recoverable metadata IO (because we have replicas), or data
+ * IO - we need to log it and print out a message, but we don't (necessarily)
+ * want to shut down the fs:
+ */
+
+void bch2_io_error_work(struct work_struct *);
+
+/* Does the error handling without logging a message */
+void bch2_io_error(struct bch_dev *);
+
+/* Logs message and handles the error: */
+#define bch2_dev_io_error(ca, fmt, ...)                                        \
+do {                                                                   \
+       printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs,                  \
+               "IO error on %s for " fmt),                             \
+               (ca)->name, ##__VA_ARGS__);                             \
+       bch2_io_error(ca);                                              \
+} while (0)
+
+#define bch2_dev_io_err_on(cond, ca, ...)                              \
+({                                                                     \
+       bool _ret = (cond);                                             \
+                                                                       \
+       if (_ret)                                                       \
+               bch2_dev_io_error(ca, __VA_ARGS__);                     \
+       _ret;                                                           \
+})
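+
+/*
+ * Typical use (the handler is illustrative, not from this patch): check
+ * a completed bio and log/handle a device level IO error without
+ * necessarily shutting down the filesystem:
+ *
+ *        if (bch2_dev_io_err_on(bio->bi_status, ca, "btree read error"))
+ *                set_btree_node_read_error(b);
+ */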
+
+/* kill? */
+
+#define __bcache_io_error(c, fmt, ...)                                 \
+       printk_ratelimited(KERN_ERR bch2_fmt(c,                         \
+                       "IO error: " fmt), ##__VA_ARGS__)
+
+#define bcache_io_error(c, bio, fmt, ...)                              \
+do {                                                                   \
+       __bcache_io_error(c, fmt, ##__VA_ARGS__);                       \
+       (bio)->bi_status = BLK_STS_IOERR;                                       \
+} while (0)
+
+#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
new file mode 100644 (file)
index 0000000..2c1cf29
--- /dev/null
@@ -0,0 +1,2395 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Code for managing the extent btree and dynamically updating the writeback
+ * dirty sector count.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "dirent.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+#include "util.h"
+#include "xattr.h"
+
+static void sort_key_next(struct btree_node_iter_large *iter,
+                         struct btree *b,
+                         struct btree_node_iter_set *i)
+{
+       i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+
+       if (i->k == i->end)
+               *i = iter->data[--iter->used];
+}
+
+/*
+ * Heap comparator for bch2_key_sort_fix_overlapping(): sort by key, and
+ * when keys compare equal, by position within the node - so that of two
+ * equal keys the older one (from the earlier bset) comes first, and
+ * should_drop_next_key() can drop it in favor of the newer one.
+ */
+#define key_sort_cmp(h, l, r)                                          \
+({                                                                     \
+       bkey_cmp_packed(b,                                              \
+                       __btree_node_offset_to_key(b, (l).k),           \
+                       __btree_node_offset_to_key(b, (r).k))           \
+                                                                       \
+       ?: (l).k - (r).k;                                               \
+})
+
+static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
+                                       struct btree *b)
+{
+       struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
+       struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
+
+       if (bkey_whiteout(k))
+               return true;
+
+       if (iter->used < 2)
+               return false;
+
+       if (iter->used > 2 &&
+           key_sort_cmp(iter, r[0], r[1]) >= 0)
+               r++;
+
+       /*
+        * key_sort_cmp() ensures that when keys compare equal the older key
+        * comes first; so if l->k compares equal to r->k then l->k is older and
+        * should be dropped.
+        */
+       return !bkey_cmp_packed(b,
+                               __btree_node_offset_to_key(b, l->k),
+                               __btree_node_offset_to_key(b, r->k));
+}
+
+struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
+                                       struct btree *b,
+                                       struct btree_node_iter_large *iter)
+{
+       struct bkey_packed *out = dst->start;
+       struct btree_nr_keys nr;
+
+       memset(&nr, 0, sizeof(nr));
+
+       heap_resort(iter, key_sort_cmp);
+
+       while (!bch2_btree_node_iter_large_end(iter)) {
+               if (!should_drop_next_key(iter, b)) {
+                       struct bkey_packed *k =
+                               __btree_node_offset_to_key(b, iter->data->k);
+
+                       bkey_copy(out, k);
+                       btree_keys_account_key_add(&nr, 0, out);
+                       out = bkey_next(out);
+               }
+
+               sort_key_next(iter, b, iter->data);
+               heap_sift_down(iter, 0, key_sort_cmp);
+       }
+
+       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+       return nr;
+}
+
+/* Common among btree and extent ptrs */
+
+const struct bch_extent_ptr *
+bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (ptr->dev == dev)
+                       return ptr;
+
+       return NULL;
+}
+
+bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
+{
+       struct bch_extent_ptr *ptr;
+       bool dropped = false;
+
+       extent_for_each_ptr_backwards(e, ptr)
+               if (ptr->dev == dev) {
+                       __bch2_extent_drop_ptr(e, ptr);
+                       dropped = true;
+               }
+
+       if (dropped)
+               bch2_extent_drop_redundant_crcs(e);
+       return dropped;
+}
+
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (ca->mi.group &&
+                   ca->mi.group - 1 == group)
+                       return ptr;
+       }
+
+       return NULL;
+}
+
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (bch2_dev_in_target(c, ptr->dev, target) &&
+                   (!ptr->cached ||
+                    !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+                       return ptr;
+
+       return NULL;
+}
+
+unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned nr_ptrs = 0;
+
+       extent_for_each_ptr(e, ptr)
+               nr_ptrs++;
+
+       return nr_ptrs;
+}
+
+unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
+{
+       struct bkey_s_c_extent e;
+       const struct bch_extent_ptr *ptr;
+       unsigned nr_ptrs = 0;
+
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               e = bkey_s_c_to_extent(k);
+
+               extent_for_each_ptr(e, ptr)
+                       nr_ptrs += !ptr->cached;
+               break;
+
+       case BCH_RESERVATION:
+               nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
+               break;
+       }
+
+       return nr_ptrs;
+}
+
+unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+                                   const struct bch_extent_ptr *ptr)
+{
+       struct bch_dev *ca;
+
+       if (ptr->cached)
+               return 0;
+
+       ca = bch_dev_bkey_exists(c, ptr->dev);
+
+       if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+               return 0;
+
+       return ca->mi.durability;
+}
+
+unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned durability = 0;
+
+       extent_for_each_ptr(e, ptr)
+               durability += bch2_extent_ptr_durability(c, ptr);
+
+       return durability;
+}
+
+unsigned bch2_extent_is_compressed(struct bkey_s_c k)
+{
+       struct bkey_s_c_extent e;
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+       unsigned ret = 0;
+
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               e = bkey_s_c_to_extent(k);
+
+               extent_for_each_ptr_crc(e, ptr, crc)
+                       if (!ptr->cached &&
+                           crc.compression_type != BCH_COMPRESSION_NONE &&
+                           crc.compressed_size < crc.live_size)
+                               ret = max_t(unsigned, ret, crc.compressed_size);
+       }
+
+       return ret;
+}
+
+bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
+                            struct bch_extent_ptr m, u64 offset)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (ptr->dev    == m.dev &&
+                   ptr->gen    == m.gen &&
+                   (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
+                   (s64) m.offset  - offset)
+                       return true;
+
+       return false;
+}
+
+/* Doesn't cleanup redundant crcs */
+void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+       EBUG_ON(ptr < &e.v->start->ptr ||
+               ptr >= &extent_entry_last(e)->ptr);
+       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+       memmove_u64s_down(ptr, ptr + 1,
+                         (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
+       e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+       __bch2_extent_drop_ptr(e, ptr);
+       bch2_extent_drop_redundant_crcs(e);
+}
+
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+                                 struct bch_extent_crc_unpacked n)
+{
+       return !u.compression_type &&
+               u.csum_type &&
+               u.uncompressed_size > u.live_size &&
+               bch2_csum_type_is_encryption(u.csum_type) ==
+               bch2_csum_type_is_encryption(n.csum_type);
+}
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+                                struct bch_extent_crc_unpacked n)
+{
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+
+       if (!n.csum_type)
+               return false;
+
+       extent_for_each_crc(e, crc, i)
+               if (can_narrow_crc(crc, n))
+                       return true;
+
+       return false;
+}
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ */
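+/*
+ * Illustrative example (the numbers are made up): a 128k checksummed,
+ * uncompressed extent of which only 4k is still live can, once we have a
+ * checksum of just those 4k, have its other replicas' crc entries
+ * narrowed to cover only the live 4k, with the pointer offsets advanced
+ * past the dead prefix - readers of those replicas then read and verify
+ * 4k instead of bouncing the full 128k.
+ */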
+bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
+                            struct bch_extent_crc_unpacked n)
+{
+       struct bch_extent_crc_unpacked u;
+       struct bch_extent_ptr *ptr;
+       union bch_extent_entry *i;
+
+       /* Find a checksum entry that covers only live data: */
+       if (!n.csum_type)
+               extent_for_each_crc(extent_i_to_s(e), u, i)
+                       if (!u.compression_type &&
+                           u.csum_type &&
+                           u.live_size == u.uncompressed_size) {
+                               n = u;
+                               break;
+                       }
+
+       if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
+               return false;
+
+       BUG_ON(n.compression_type);
+       BUG_ON(n.offset);
+       BUG_ON(n.live_size != e->k.size);
+
+       bch2_extent_crc_append(e, n);
+restart_narrow_pointers:
+       extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
+               if (can_narrow_crc(u, n)) {
+                       ptr->offset += u.offset;
+                       extent_ptr_append(e, *ptr);
+                       __bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
+                       goto restart_narrow_pointers;
+               }
+
+       bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
+       return true;
+}
+
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+                                        struct bch_extent_crc_unpacked r)
+{
+       return (l.csum_type             != r.csum_type ||
+               l.compression_type      != r.compression_type ||
+               l.compressed_size       != r.compressed_size ||
+               l.uncompressed_size     != r.uncompressed_size ||
+               l.offset                != r.offset ||
+               l.live_size             != r.live_size ||
+               l.nonce                 != r.nonce ||
+               bch2_crc_cmp(l.csum, r.csum));
+}
+
+void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
+{
+       union bch_extent_entry *entry = e.v->start;
+       union bch_extent_crc *crc, *prev = NULL;
+       struct bch_extent_crc_unpacked u, prev_u = { 0 };
+
+       while (entry != extent_entry_last(e)) {
+               union bch_extent_entry *next = extent_entry_next(entry);
+               size_t crc_u64s = extent_entry_u64s(entry);
+
+               if (!extent_entry_is_crc(entry))
+                       goto next;
+
+               crc = entry_to_crc(entry);
+               u = bch2_extent_crc_unpack(e.k, crc);
+
+               if (next == extent_entry_last(e)) {
+                       /* crc entry with no pointers after it: */
+                       goto drop;
+               }
+
+               if (extent_entry_is_crc(next)) {
+                       /* no pointers before next crc entry: */
+                       goto drop;
+               }
+
+               if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) {
+                       /* identical to previous crc entry: */
+                       goto drop;
+               }
+
+               if (!prev &&
+                   !u.csum_type &&
+                   !u.compression_type) {
+                       /* null crc entry: */
+                       union bch_extent_entry *e2;
+
+                       extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
+                               if (!extent_entry_is_ptr(e2))
+                                       break;
+
+                               e2->ptr.offset += u.offset;
+                       }
+                       goto drop;
+               }
+
+               prev = crc;
+               prev_u = u;
+next:
+               entry = next;
+               continue;
+drop:
+               memmove_u64s_down(crc, next,
+                                 (u64 *) extent_entry_last(e) - (u64 *) next);
+               e.k->u64s -= crc_u64s;
+       }
+
+       EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c));
+}
+
+static bool should_drop_ptr(const struct bch_fs *c,
+                           struct bkey_s_c_extent e,
+                           const struct bch_extent_ptr *ptr)
+{
+       return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
+}
+
+static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
+{
+       struct bch_extent_ptr *ptr = &e.v->start->ptr;
+       bool dropped = false;
+
+       while ((ptr = extent_ptr_next(e, ptr)))
+               if (should_drop_ptr(c, e.c, ptr)) {
+                       __bch2_extent_drop_ptr(e, ptr);
+                       dropped = true;
+               } else
+                       ptr++;
+
+       if (dropped)
+               bch2_extent_drop_redundant_crcs(e);
+}
+
+bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
+{
+       return bch2_extent_normalize(c, k);
+}
+
+void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+{
+       switch (k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED: {
+               union bch_extent_entry *entry;
+               u64 *d = (u64 *) bkeyp_val(f, k);
+               unsigned i;
+
+               for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+                       d[i] = swab64(d[i]);
+
+               for (entry = (union bch_extent_entry *) d;
+                    entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+                    entry = extent_entry_next(entry)) {
+                       switch (extent_entry_type(entry)) {
+                       case BCH_EXTENT_ENTRY_crc32:
+                               entry->crc32.csum = swab32(entry->crc32.csum);
+                               break;
+                       case BCH_EXTENT_ENTRY_crc64:
+                               entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+                               entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+                               break;
+                       case BCH_EXTENT_ENTRY_crc128:
+                               entry->crc128.csum.hi = (__force __le64)
+                                       swab64((__force u64) entry->crc128.csum.hi);
+                               entry->crc128.csum.lo = (__force __le64)
+                                       swab64((__force u64) entry->crc128.csum.lo);
+                               break;
+                       case BCH_EXTENT_ENTRY_ptr:
+                               break;
+                       }
+               }
+               break;
+       }
+       }
+}
+
+static const char *extent_ptr_invalid(const struct bch_fs *c,
+                                     struct bkey_s_c_extent e,
+                                     const struct bch_extent_ptr *ptr,
+                                     unsigned size_ondisk,
+                                     bool metadata)
+{
+       const struct bch_extent_ptr *ptr2;
+       struct bch_dev *ca;
+
+       if (ptr->dev >= c->sb.nr_devices ||
+           !c->devs[ptr->dev])
+               return "pointer to invalid device";
+
+       ca = bch_dev_bkey_exists(c, ptr->dev);
+       if (!ca)
+               return "pointer to invalid device";
+
+       extent_for_each_ptr(e, ptr2)
+               if (ptr != ptr2 && ptr->dev == ptr2->dev)
+                       return "multiple pointers to same device";
+
+       if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
+               return "offset past end of device";
+
+       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
+               return "offset before first bucket";
+
+       if (bucket_remainder(ca, ptr->offset) +
+           size_ondisk > ca->mi.bucket_size)
+               return "spans multiple buckets";
+
+       return NULL;
+}
+
+static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
+                               size_t size, struct bkey_s_c_extent e)
+{
+       char *out = buf, *end = buf + size;
+       const union bch_extent_entry *entry;
+       struct bch_extent_crc_unpacked crc;
+       const struct bch_extent_ptr *ptr;
+       struct bch_dev *ca;
+       bool first = true;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+       extent_for_each_entry(e, entry) {
+               if (!first)
+                       p(" ");
+
+               switch (__extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_crc32:
+               case BCH_EXTENT_ENTRY_crc64:
+               case BCH_EXTENT_ENTRY_crc128:
+                       crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+
+                       p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+                         crc.compressed_size,
+                         crc.uncompressed_size,
+                         crc.offset, crc.nonce,
+                         crc.csum_type,
+                         crc.compression_type);
+                       break;
+               case BCH_EXTENT_ENTRY_ptr:
+                       ptr = entry_to_ptr(entry);
+                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+                               ? bch_dev_bkey_exists(c, ptr->dev)
+                               : NULL;
+
+                       p("ptr: %u:%llu gen %u%s%s", ptr->dev,
+                         (u64) ptr->offset, ptr->gen,
+                         ptr->cached ? " cached" : "",
+                         ca && ptr_stale(ca, ptr)
+                         ? " stale" : "");
+                       break;
+               default:
+                       p("(invalid extent entry %.16llx)", *((u64 *) entry));
+                       goto out;
+               }
+
+               first = false;
+       }
+out:
+       if (bkey_extent_is_cached(e.k))
+               p(" cached");
+#undef p
+       return out - buf;
+}
+
+static inline bool dev_latency_better(struct bch_fs *c,
+                             const struct bch_extent_ptr *ptr1,
+                             const struct bch_extent_ptr *ptr2)
+{
+       struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev);
+       struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev);
+       u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+       u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+
+       /*
+        * Pick at random, biased in favor of the faster device: ptr1 wins
+        * with probability l2 / (l1 + l2), i.e. inversely proportional to
+        * its own recent read latency.
+        */
+
+       return bch2_rand_range(l1 + l2) > l1;
+}
+
+static int extent_pick_read_device(struct bch_fs *c,
+                                  struct bkey_s_c_extent e,
+                                  struct bch_devs_mask *avoid,
+                                  struct extent_pick_ptr *pick)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+       struct bch_dev *ca;
+       int ret = 0;
+
+       extent_for_each_ptr_crc(e, ptr, crc) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (ptr->cached && ptr_stale(ca, ptr))
+                       continue;
+
+               if (avoid && test_bit(ptr->dev, avoid->d))
+                       continue;
+
+               if (ret && !dev_latency_better(c, ptr, &pick->ptr))
+                       continue;
+
+               *pick = (struct extent_pick_ptr) {
+                       .ptr    = *ptr,
+                       .crc    = crc,
+               };
+
+               ret = 1;
+       }
+
+       return ret;
+}
+
+/* Btree ptrs */
+
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (bkey_extent_is_cached(k.k))
+               return "cached";
+
+       if (k.k->size)
+               return "nonzero key size";
+
+       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+               return "value too big";
+
+       switch (k.k->type) {
+       case BCH_EXTENT: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const union bch_extent_entry *entry;
+               const struct bch_extent_ptr *ptr;
+               const char *reason;
+
+               extent_for_each_entry(e, entry) {
+                       if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+                               return "invalid extent entry type";
+
+                       if (extent_entry_is_crc(entry))
+                               return "has crc field";
+               }
+
+               extent_for_each_ptr(e, ptr) {
+                       reason = extent_ptr_invalid(c, e, ptr,
+                                                   c->opts.btree_node_size,
+                                                   true);
+                       if (reason)
+                               return reason;
+               }
+
+               return NULL;
+       }
+
+       default:
+               return "invalid value type";
+       }
+}
+
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
+                              struct bkey_s_c k)
+{
+       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+       const struct bch_extent_ptr *ptr;
+       unsigned seq;
+       const char *err;
+       char buf[160];
+       struct bucket_mark mark;
+       struct bch_dev *ca;
+       unsigned replicas = 0;
+       bool bad;
+
+       extent_for_each_ptr(e, ptr) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+               replicas++;
+
+               if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
+                       continue;
+
+               err = "stale";
+               if (ptr_stale(ca, ptr))
+                       goto err;
+
+               do {
+                       seq = read_seqcount_begin(&c->gc_pos_lock);
+                       mark = ptr_bucket_mark(ca, ptr);
+
+                       bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+                               (mark.data_type != BCH_DATA_BTREE ||
+                                mark.dirty_sectors < c->opts.btree_node_size);
+               } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+               err = "inconsistent";
+               if (bad)
+                       goto err;
+       }
+
+       if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
+               bch2_bkey_val_to_text(c, btree_node_type(b),
+                                    buf, sizeof(buf), k);
+               bch2_fs_bug(c,
+                       "btree key bad (replicas not marked in superblock):\n%s",
+                       buf);
+               return;
+       }
+
+       return;
+err:
+       bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
+       bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
+                     "gen %i mark %08x",
+                     err, buf, PTR_BUCKET_NR(ca, ptr),
+                     mark.gen, (unsigned) mark.v.counter);
+}
+
+void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
+                           size_t size, struct bkey_s_c k)
+{
+       char *out = buf, *end = buf + size;
+       const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+       if (bkey_extent_is_data(k.k))
+               out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+       invalid = bch2_btree_ptr_invalid(c, k);
+       if (invalid)
+               p(" invalid: %s", invalid);
+#undef p
+}
+
+int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
+                       struct bch_devs_mask *avoid,
+                       struct extent_pick_ptr *pick)
+{
+       return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
+                                      avoid, pick);
+}
+
+/* Extents */
+
+static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
+{
+       u64 len = 0;
+
+       if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+               return false;
+
+       EBUG_ON(bkey_cmp(where, k.k->p) > 0);
+
+       len = k.k->p.offset - where.offset;
+
+       BUG_ON(len > k.k->size);
+
+       /*
+        * Don't readjust offset if the key size is now 0, because that could
+        * cause offset to point to the next bucket:
+        */
+       if (!len)
+               k.k->type = KEY_TYPE_DELETED;
+       else if (bkey_extent_is_data(k.k)) {
+               struct bkey_s_extent e = bkey_s_to_extent(k);
+               union bch_extent_entry *entry;
+               bool seen_crc = false;
+
+               extent_for_each_entry(e, entry) {
+                       switch (extent_entry_type(entry)) {
+                       case BCH_EXTENT_ENTRY_ptr:
+                               if (!seen_crc)
+                                       entry->ptr.offset += e.k->size - len;
+                               break;
+                       case BCH_EXTENT_ENTRY_crc32:
+                               entry->crc32.offset += e.k->size - len;
+                               break;
+                       case BCH_EXTENT_ENTRY_crc64:
+                               entry->crc64.offset += e.k->size - len;
+                               break;
+                       case BCH_EXTENT_ENTRY_crc128:
+                               entry->crc128.offset += e.k->size - len;
+                               break;
+                       }
+
+                       if (extent_entry_is_crc(entry))
+                               seen_crc = true;
+               }
+       }
+
+       k.k->size = len;
+
+       return true;
+}
+
+bool bch2_cut_front(struct bpos where, struct bkey_i *k)
+{
+       return __bch2_cut_front(where, bkey_i_to_s(k));
+}
+
+bool bch2_cut_back(struct bpos where, struct bkey *k)
+{
+       u64 len = 0;
+
+       if (bkey_cmp(where, k->p) >= 0)
+               return false;
+
+       EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
+
+       len = where.offset - bkey_start_offset(k);
+
+       BUG_ON(len > k->size);
+
+       k->p = where;
+       k->size = len;
+
+       if (!len)
+               k->type = KEY_TYPE_DELETED;
+
+       return true;
+}
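+
+/*
+ * Worked example (illustrative): for an extent covering sectors [0, 8)
+ * of an inode, bch2_cut_front(POS(inode, 2), k) drops the first two
+ * sectors - the size becomes 6, k->k.p is unchanged, and pointer/crc
+ * offsets are advanced by 2 - while bch2_cut_back(POS(inode, 6), &k->k)
+ * drops the last two, shrinking both p.offset and size to 6.
+ */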
+
+/**
+ * bch2_key_resize - adjust the size of @k
+ *
+ * bkey_start_offset(k) is preserved; only where the extent ends is changed.
+ */
+void bch2_key_resize(struct bkey *k,
+                   unsigned new_size)
+{
+       k->p.offset -= k->size;
+       k->p.offset += new_size;
+       k->size = new_size;
+}
+
+/*
+ * In extent_sort_fix_overlapping(), insert_fixup_extent(),
+ * extent_merge_inline() - we're modifying keys in place that are packed. To do
+ * that we have to unpack the key, modify the unpacked key - then this
+ * copies/repacks the unpacked to the original as necessary.
+ */
+static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
+                         struct bkey_packed *dst, struct bkey *src)
+{
+       struct bkey_format *f = &b->format;
+       struct bkey_i *dst_unpacked;
+       bool ret;
+
+       if ((dst_unpacked = packed_to_bkey(dst))) {
+               dst_unpacked->k = *src;
+               ret = true;
+       } else {
+               ret = bch2_bkey_pack_key(dst, src, f);
+       }
+
+       if (ret && iter)
+               bch2_verify_key_order(b, iter, dst);
+
+       return ret;
+}
+
+static void extent_save(struct btree *b, struct btree_node_iter *iter,
+                       struct bkey_packed *dst, struct bkey *src)
+{
+       BUG_ON(!__extent_save(b, iter, dst, src));
+}
+
+/*
+ * Heap comparator for bch2_extent_sort_fix_overlapping(): sort by start
+ * position, and when start positions compare equal, by position within
+ * the node in reverse - if there are multiple keys that compare equal in
+ * different bsets, we have to process them newest to oldest.
+ */
+#define extent_sort_cmp(h, l, r)                                       \
+({                                                                     \
+       struct bkey _ul = bkey_unpack_key(b,                            \
+                               __btree_node_offset_to_key(b, (l).k));  \
+       struct bkey _ur = bkey_unpack_key(b,                            \
+                               __btree_node_offset_to_key(b, (r).k));  \
+                                                                       \
+       bkey_cmp(bkey_start_pos(&_ul),                                  \
+                bkey_start_pos(&_ur)) ?: (r).k - (l).k;                \
+})
+
+static inline void extent_sort_sift(struct btree_node_iter_large *iter,
+                                   struct btree *b, size_t i)
+{
+       heap_sift_down(iter, i, extent_sort_cmp);
+}
+
+static inline void extent_sort_next(struct btree_node_iter_large *iter,
+                                   struct btree *b,
+                                   struct btree_node_iter_set *i)
+{
+       sort_key_next(iter, b, i);
+       heap_sift_down(iter, i - iter->data, extent_sort_cmp);
+}
+
+static void extent_sort_append(struct bch_fs *c,
+                              struct btree *b,
+                              struct btree_nr_keys *nr,
+                              struct bkey_packed *start,
+                              struct bkey_packed **prev,
+                              struct bkey_packed *k)
+{
+       struct bkey_format *f = &b->format;
+       BKEY_PADDED(k) tmp;
+
+       if (bkey_whiteout(k))
+               return;
+
+       bch2_bkey_unpack(b, &tmp.k, k);
+
+       if (*prev &&
+           bch2_extent_merge(c, b, (void *) *prev, &tmp.k))
+               return;
+
+       if (*prev) {
+               bch2_bkey_pack(*prev, (void *) *prev, f);
+
+               btree_keys_account_key_add(nr, 0, *prev);
+               *prev = bkey_next(*prev);
+       } else {
+               *prev = start;
+       }
+
+       bkey_copy(*prev, &tmp.k);
+}
+
+struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
+                                       struct bset *dst,
+                                       struct btree *b,
+                                       struct btree_node_iter_large *iter)
+{
+       struct bkey_format *f = &b->format;
+       struct btree_node_iter_set *_l = iter->data, *_r;
+       struct bkey_packed *prev = NULL, *out, *lk, *rk;
+       struct bkey l_unpacked, r_unpacked;
+       struct bkey_s l, r;
+       struct btree_nr_keys nr;
+
+       memset(&nr, 0, sizeof(nr));
+
+       heap_resort(iter, extent_sort_cmp);
+
+       while (!bch2_btree_node_iter_large_end(iter)) {
+               lk = __btree_node_offset_to_key(b, _l->k);
+
+               if (iter->used == 1) {
+                       extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+                       extent_sort_next(iter, b, _l);
+                       continue;
+               }
+
+               _r = iter->data + 1;
+               if (iter->used > 2 &&
+                   extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
+                       _r++;
+
+               rk = __btree_node_offset_to_key(b, _r->k);
+
+               l = __bkey_disassemble(b, lk, &l_unpacked);
+               r = __bkey_disassemble(b, rk, &r_unpacked);
+
+               /* If current key and next key don't overlap, just append */
+               if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+                       extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+                       extent_sort_next(iter, b, _l);
+                       continue;
+               }
+
+               /* Skip 0 size keys */
+               if (!r.k->size) {
+                       extent_sort_next(iter, b, _r);
+                       continue;
+               }
+
+               /*
+                * overlap: keep the newer key and trim the older key so they
+                * don't overlap. comparing pointers tells us which one is
+                * newer, since the bsets are appended one after the other.
+                */
+
+               /* can't happen because of comparison func */
+               BUG_ON(_l->k < _r->k &&
+                      !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+               if (_l->k > _r->k) {
+                       /* l wins, trim r */
+                       if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+                               sort_key_next(iter, b, _r);
+                       } else {
+                               __bch2_cut_front(l.k->p, r);
+                               extent_save(b, NULL, rk, r.k);
+                       }
+
+                       extent_sort_sift(iter, b, _r - iter->data);
+               } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+                       BKEY_PADDED(k) tmp;
+
+                       /*
+                        * r wins, but it overlaps in the middle of l - split l:
+                        */
+                       bkey_reassemble(&tmp.k, l.s_c);
+                       bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
+
+                       __bch2_cut_front(r.k->p, l);
+                       extent_save(b, NULL, lk, l.k);
+
+                       extent_sort_sift(iter, b, 0);
+
+                       extent_sort_append(c, b, &nr, dst->start, &prev,
+                                          bkey_to_packed(&tmp.k));
+               } else {
+                       bch2_cut_back(bkey_start_pos(r.k), l.k);
+                       extent_save(b, NULL, lk, l.k);
+               }
+       }
+
+       if (prev) {
+               bch2_bkey_pack(prev, (void *) prev, f);
+               btree_keys_account_key_add(&nr, 0, prev);
+               out = bkey_next(prev);
+       } else {
+               out = dst->start;
+       }
+
+       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+       return nr;
+}
+
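+/*
+ * State for a single extent insert: tracks how far into the new key we've
+ * committed so far (s->committed), the resulting change in filesystem usage
+ * (s->stats), and, for deletions, the whiteout key that gets journalled.
+ */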
+struct extent_insert_state {
+       struct btree_insert             *trans;
+       struct btree_insert_entry       *insert;
+       struct bpos                     committed;
+       struct bch_fs_usage             stats;
+
+       /* for deleting: */
+       struct bkey_i                   whiteout;
+       bool                            do_journal;
+       bool                            deleting;
+};
+
+static void bch2_add_sectors(struct extent_insert_state *s,
+                            struct bkey_s_c k, u64 offset, s64 sectors)
+{
+       struct bch_fs *c = s->trans->c;
+       struct btree *b = s->insert->iter->l[0].b;
+
+       EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
+
+       if (!sectors)
+               return;
+
+       bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
+                     &s->stats, s->trans->journal_res.seq, 0);
+}
+
+static void bch2_subtract_sectors(struct extent_insert_state *s,
+                                struct bkey_s_c k, u64 offset, s64 sectors)
+{
+       bch2_add_sectors(s, k, offset, -sectors);
+}
+
+/* These wrappers subtract exactly the sectors that we're removing from @k */
+static void bch2_cut_subtract_back(struct extent_insert_state *s,
+                                 struct bpos where, struct bkey_s k)
+{
+       bch2_subtract_sectors(s, k.s_c, where.offset,
+                            k.k->p.offset - where.offset);
+       bch2_cut_back(where, k.k);
+}
+
+static void bch2_cut_subtract_front(struct extent_insert_state *s,
+                                  struct bpos where, struct bkey_s k)
+{
+       bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
+                            where.offset - bkey_start_offset(k.k));
+       __bch2_cut_front(where, k);
+}
+
+static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
+{
+       if (k.k->size)
+               bch2_subtract_sectors(s, k.s_c,
+                                    bkey_start_offset(k.k), k.k->size);
+       k.k->size = 0;
+       k.k->type = KEY_TYPE_DELETED;
+}
+
+static bool bch2_extent_merge_inline(struct bch_fs *,
+                                    struct btree_iter *,
+                                    struct bkey_packed *,
+                                    struct bkey_packed *,
+                                    bool);
+
+#define MAX_LOCK_HOLD_TIME     (5 * NSEC_PER_MSEC)
+
+static enum btree_insert_ret
+extent_insert_should_stop(struct extent_insert_state *s)
+{
+       struct btree *b = s->insert->iter->l[0].b;
+
+       /*
+        * Check if we have sufficient space in both the btree node and the
+        * journal reservation:
+        *
+        * Each insert checks for room in the journal entry, but we check for
+        * room in the btree node up-front. In the worst case, bkey_cmpxchg()
+        * will insert two keys, and one iteration of this loop will insert one
+        * key, so we need room for three keys.
+        */
+       if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
+               return BTREE_INSERT_BTREE_NODE_FULL;
+       else if (!journal_res_insert_fits(s->trans, s->insert))
+               return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
+       else
+               return BTREE_INSERT_OK;
+}
+
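+/*
+ * Insert @insert into the last bset: first try to merge it inline with the
+ * previous key and with the next live key; the run of deleted keys between the
+ * insert position and the next live key is either overwritten by @insert or
+ * dropped outright if a merge succeeds.
+ */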
+static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
+                              struct bkey_i *insert)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bset_tree *t = bset_tree_last(l->b);
+       struct bkey_packed *where =
+               bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
+       struct bkey_packed *prev = bch2_bkey_prev_filter(l->b, t, where,
+                                                        KEY_TYPE_DISCARD);
+       struct bkey_packed *next_live_key = where;
+       unsigned clobber_u64s;
+
+       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+
+       if (prev)
+               where = bkey_next(prev);
+
+       while (next_live_key != btree_bkey_last(l->b, t) &&
+              bkey_deleted(next_live_key))
+               next_live_key = bkey_next(next_live_key);
+
+       /*
+        * Everything between where and next_live_key is now deleted keys, and
+        * is overwritten:
+        */
+       clobber_u64s = (u64 *) next_live_key - (u64 *) where;
+
+       if (prev &&
+           bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
+               goto drop_deleted_keys;
+
+       if (next_live_key != btree_bkey_last(l->b, t) &&
+           bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
+                                   next_live_key, false))
+               goto drop_deleted_keys;
+
+       bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
+       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
+                               clobber_u64s, where->u64s);
+       return;
+drop_deleted_keys:
+       bch2_bset_delete(l->b, where, clobber_u64s);
+       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+                                where, clobber_u64s, 0);
+}
+
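+/*
+ * Journal (and, unless we're deleting, insert into the btree node) the part of
+ * the insert key that has been processed so far - everything up to
+ * s->committed - and trim that prefix off the front of the insert key.
+ */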
+static void extent_insert_committed(struct extent_insert_state *s)
+{
+       struct bch_fs *c = s->trans->c;
+       struct btree_iter *iter = s->insert->iter;
+       struct bkey_i *insert = !s->deleting
+               ? s->insert->k
+               : &s->whiteout;
+       BKEY_PADDED(k) split;
+
+       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
+       EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
+
+       if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
+               return;
+
+       if (s->deleting && !s->do_journal) {
+               bch2_cut_front(s->committed, insert);
+               goto done;
+       }
+
+       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+
+       bkey_copy(&split.k, insert);
+
+       if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+           bkey_cmp(s->committed, insert->k.p) &&
+           bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
+               /* XXX: possibly need to increase our reservation? */
+               bch2_cut_subtract_back(s, s->committed,
+                                     bkey_i_to_s(&split.k));
+               bch2_cut_front(s->committed, insert);
+               bch2_add_sectors(s, bkey_i_to_s_c(insert),
+                               bkey_start_offset(&insert->k),
+                               insert->k.size);
+       } else {
+               bch2_cut_back(s->committed, &split.k.k);
+               bch2_cut_front(s->committed, insert);
+       }
+
+       if (debug_check_bkeys(c))
+               bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
+
+       bch2_btree_journal_key(s->trans, iter, &split.k);
+
+       if (!s->deleting)
+               extent_bset_insert(c, iter, &split.k);
+done:
+       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
+
+       insert->k.needs_whiteout        = false;
+       s->do_journal                   = false;
+       s->trans->did_work              = true;
+}
+
+static enum btree_insert_ret
+__extent_insert_advance_pos(struct extent_insert_state *s,
+                           struct bpos next_pos,
+                           struct bkey_s_c k)
+{
+       struct extent_insert_hook *hook = s->trans->hook;
+       enum btree_insert_ret ret;
+
+       if (hook)
+               ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
+       else
+               ret = BTREE_INSERT_OK;
+
+       if (ret == BTREE_INSERT_OK)
+               s->committed = next_pos;
+
+       return ret;
+}
+
+/*
+ * Update iter->pos, marking how much of @insert we've processed, and call hook
+ * fn:
+ */
+static enum btree_insert_ret
+extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
+{
+       struct btree *b = s->insert->iter->l[0].b;
+       struct bpos next_pos = bpos_min(s->insert->k->k.p,
+                                       k.k ? k.k->p : b->key.k.p);
+       enum btree_insert_ret ret;
+
+       if (race_fault())
+               return BTREE_INSERT_NEED_TRAVERSE;
+
+       /* hole? */
+       if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
+               ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
+                                                   bkey_s_c_null);
+               if (ret != BTREE_INSERT_OK)
+                       return ret;
+       }
+
+       /* avoid redundant calls to hook fn: */
+       if (!bkey_cmp(s->committed, next_pos))
+               return BTREE_INSERT_OK;
+
+       return __extent_insert_advance_pos(s, next_pos, k);
+}
+
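+/*
+ * Splitting a compressed extent in the middle leaves both halves pointing at
+ * the whole compressed allocation on disk, so the number of sectors we account
+ * for goes up - check that the disk reservation can be extended to cover it.
+ */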
+static enum btree_insert_ret
+extent_insert_check_split_compressed(struct extent_insert_state *s,
+                                    struct bkey_s_c k,
+                                    enum bch_extent_overlap overlap)
+{
+       struct bch_fs *c = s->trans->c;
+       unsigned sectors;
+
+       if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+           (sectors = bch2_extent_is_compressed(k))) {
+               int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
+
+               if (s->trans->flags & BTREE_INSERT_NOFAIL)
+                       flags |= BCH_DISK_RESERVATION_NOFAIL;
+
+               switch (bch2_disk_reservation_add(c,
+                               s->trans->disk_res,
+                               sectors * bch2_extent_nr_dirty_ptrs(k),
+                               flags)) {
+               case 0:
+                       break;
+               case -ENOSPC:
+                       return BTREE_INSERT_ENOSPC;
+               case -EINTR:
+                       return BTREE_INSERT_NEED_GC_LOCK;
+               default:
+                       BUG();
+               }
+       }
+
+       return BTREE_INSERT_OK;
+}
+
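+/*
+ * Trim or split the existing key @k so that it no longer overlaps with
+ * @insert, handling each of the four overlap cases (front, back, all, middle)
+ * and keeping the sector accounting and the node iterator consistent.
+ */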
+static enum btree_insert_ret
+extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
+             struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
+             enum bch_extent_overlap overlap)
+{
+       struct bch_fs *c = s->trans->c;
+       struct btree_iter *iter = s->insert->iter;
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
+       enum btree_insert_ret ret;
+
+       switch (overlap) {
+       case BCH_EXTENT_OVERLAP_FRONT:
+               /* insert overlaps with start of k: */
+               bch2_cut_subtract_front(s, insert->k.p, k);
+               BUG_ON(bkey_deleted(k.k));
+               extent_save(b, node_iter, _k, k.k);
+               break;
+
+       case BCH_EXTENT_OVERLAP_BACK:
+               /* insert overlaps with end of k: */
+               bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
+               BUG_ON(bkey_deleted(k.k));
+               extent_save(b, node_iter, _k, k.k);
+
+               /*
+                * As the auxiliary tree is indexed by the end of the
+                * key and we've just changed the end, update the
+                * auxiliary tree.
+                */
+               bch2_bset_fix_invalidated_key(b, t, _k);
+               bch2_btree_node_iter_fix(iter, b, node_iter, t,
+                                       _k, _k->u64s, _k->u64s);
+               break;
+
+       case BCH_EXTENT_OVERLAP_ALL: {
+               struct bpos orig_pos = k.k->p;
+
+               /* The insert key completely covers k, invalidate k */
+               if (!bkey_whiteout(k.k))
+                       btree_keys_account_key_drop(&b->nr,
+                                               t - b->set, _k);
+
+               bch2_drop_subtract(s, k);
+               k.k->p = bkey_start_pos(&insert->k);
+               if (!__extent_save(b, node_iter, _k, k.k)) {
+                       /*
+                        * Couldn't repack: we aren't necessarily able
+                        * to repack if the new key is outside the range
+                        * of the old extent, so we have to split
+                        * @insert:
+                        */
+                       k.k->p = orig_pos;
+                       extent_save(b, node_iter, _k, k.k);
+
+                       ret = extent_insert_advance_pos(s, k.s_c);
+                       if (ret != BTREE_INSERT_OK)
+                               return ret;
+
+                       extent_insert_committed(s);
+                       /*
+                        * We split and inserted up to k.k->p - that
+                        * has to coincide with iter->pos, so that we
+                        * don't have anything more we have to insert
+                        * until we recheck our journal reservation:
+                        */
+                       EBUG_ON(bkey_cmp(s->committed, k.k->p));
+               } else {
+                       bch2_bset_fix_invalidated_key(b, t, _k);
+                       bch2_btree_node_iter_fix(iter, b, node_iter, t,
+                                               _k, _k->u64s, _k->u64s);
+               }
+
+               break;
+       }
+       case BCH_EXTENT_OVERLAP_MIDDLE: {
+               BKEY_PADDED(k) split;
+               /*
+                * The insert key falls 'in the middle' of k
+                * The insert key splits k in 3:
+                * - start only in k, preserve
+                * - middle common section, invalidate in k
+                * - end only in k, preserve
+                *
+                * We update the old key to preserve the start,
+                * insert will be the new common section,
+                * we manually insert the end that we are preserving.
+                *
+                * modify k _before_ doing the insert (which will move
+                * what k points to)
+                */
+               bkey_reassemble(&split.k, k.s_c);
+               split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
+
+               bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
+               BUG_ON(bkey_deleted(&split.k.k));
+
+               bch2_cut_subtract_front(s, insert->k.p, k);
+               BUG_ON(bkey_deleted(k.k));
+               extent_save(b, node_iter, _k, k.k);
+
+               bch2_add_sectors(s, bkey_i_to_s_c(&split.k),
+                               bkey_start_offset(&split.k.k),
+                               split.k.k.size);
+               extent_bset_insert(c, iter, &split.k);
+               break;
+       }
+       }
+
+       return BTREE_INSERT_OK;
+}
+
+static enum btree_insert_ret
+__bch2_delete_fixup_extent(struct extent_insert_state *s)
+{
+       struct bch_fs *c = s->trans->c;
+       struct btree_iter *iter = s->insert->iter;
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
+       struct bkey_packed *_k;
+       struct bkey unpacked;
+       struct bkey_i *insert = s->insert->k;
+       enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+       s->whiteout = *insert;
+       s->whiteout.k.type = KEY_TYPE_DISCARD;
+
+       while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+              (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
+               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
+               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+               enum bch_extent_overlap overlap;
+
+               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+               EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+               if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+                       break;
+
+               if (bkey_whiteout(k.k)) {
+                       s->committed = bpos_min(insert->k.p, k.k->p);
+                       goto next;
+               }
+
+               overlap = bch2_extent_overlap(&insert->k, k.k);
+
+               ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+               if (ret)
+                       break;
+
+               ret = extent_insert_advance_pos(s, k.s_c);
+               if (ret)
+                       break;
+
+               s->do_journal = true;
+
+               if (overlap == BCH_EXTENT_OVERLAP_ALL) {
+                       btree_keys_account_key_drop(&b->nr,
+                                               t - b->set, _k);
+                       bch2_subtract_sectors(s, k.s_c,
+                                            bkey_start_offset(k.k), k.k->size);
+                       _k->type = KEY_TYPE_DISCARD;
+                       reserve_whiteout(b, t, _k);
+               } else if (k.k->needs_whiteout ||
+                          bset_written(b, bset(b, t))) {
+                       struct bkey_i discard = *insert;
+
+                       discard.k.type = KEY_TYPE_DISCARD;
+
+                       switch (overlap) {
+                       case BCH_EXTENT_OVERLAP_FRONT:
+                               bch2_cut_front(bkey_start_pos(k.k), &discard);
+                               break;
+                       case BCH_EXTENT_OVERLAP_BACK:
+                               bch2_cut_back(k.k->p, &discard.k);
+                               break;
+                       default:
+                               break;
+                       }
+
+                       discard.k.needs_whiteout = true;
+
+                       ret = extent_squash(s, insert, t, _k, k, overlap);
+                       BUG_ON(ret != BTREE_INSERT_OK);
+
+                       extent_bset_insert(c, iter, &discard);
+               } else {
+                       ret = extent_squash(s, insert, t, _k, k, overlap);
+                       BUG_ON(ret != BTREE_INSERT_OK);
+               }
+next:
+               bch2_cut_front(s->committed, insert);
+               bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
+       }
+
+       return ret;
+}
+
+static enum btree_insert_ret
+__bch2_insert_fixup_extent(struct extent_insert_state *s)
+{
+       struct btree_iter *iter = s->insert->iter;
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
+       struct bkey_packed *_k;
+       struct bkey unpacked;
+       struct bkey_i *insert = s->insert->k;
+       enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+       while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+              (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
+               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
+               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+               enum bch_extent_overlap overlap;
+
+               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+               EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+               if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+                       break;
+
+               overlap = bch2_extent_overlap(&insert->k, k.k);
+
+               ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+               if (ret)
+                       break;
+
+               if (!k.k->size)
+                       goto squash;
+
+               /*
+                * Only call advance pos & call hook for nonzero size extents:
+                */
+               ret = extent_insert_advance_pos(s, k.s_c);
+               if (ret)
+                       break;
+
+               if (k.k->size &&
+                   (k.k->needs_whiteout || bset_written(b, bset(b, t))))
+                       insert->k.needs_whiteout = true;
+
+               if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+                   bkey_whiteout(k.k) &&
+                   k.k->needs_whiteout) {
+                       unreserve_whiteout(b, t, _k);
+                       _k->needs_whiteout = false;
+               }
+squash:
+               ret = extent_squash(s, insert, t, _k, k, overlap);
+               if (ret != BTREE_INSERT_OK)
+                       break;
+       }
+
+       return ret;
+}
+
+/**
+ * bch2_insert_fixup_extent - insert a new extent and deal with overlaps
+ *
+ * this may result in not actually doing the insert, or inserting some subset
+ * of the insert key. For cmpxchg operations this is where that logic lives.
+ *
+ * All subsets of @insert that need to be inserted are inserted using
+ * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
+ * returns false, setting @iter->pos for the prefix of @insert that actually got
+ * inserted.
+ *
+ * BSET INVARIANTS: this function is responsible for maintaining all the
+ * invariants for bsets of extents in memory. Things get really hairy with 0
+ * size extents.
+ *
+ * within one bset:
+ *
+ * bkey_start_pos(bkey_next(k)) >= k->p
+ * or bkey_start_offset(bkey_next(k)) >= k->p.offset
+ *
+ * i.e. strict ordering, no overlapping extents.
+ *
+ * multiple bsets (i.e. full btree node):
+ *
+ * ∀ k, j
+ *   k.size != 0 ∧ j.size != 0 →
+ *     ¬ (k > bkey_start_pos(j) ∧ k < j)
+ *
+ * i.e. no two overlapping keys _of nonzero size_
+ *
+ * We can't realistically maintain this invariant for zero size keys because of
+ * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
+ * there may be another 0 size key between them in another bset, and it will
+ * thus overlap with the merged key.
+ *
+ * In addition, iter->pos indicates how much of @insert has been processed:
+ * if iter->pos is not yet at the end of @insert, key insertion needs to
+ * continue/be retried.
+ */
+enum btree_insert_ret
+bch2_insert_fixup_extent(struct btree_insert *trans,
+                        struct btree_insert_entry *insert)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter *iter = insert->iter;
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+       struct extent_insert_state s = {
+               .trans          = trans,
+               .insert         = insert,
+               .committed      = insert->iter->pos,
+               .deleting       = bkey_whiteout(&insert->k->k),
+       };
+
+       EBUG_ON(iter->level);
+       EBUG_ON(!insert->k->k.size);
+
+       /*
+        * As we process overlapping extents, we advance @iter->pos both to
+        * signal to our caller (btree_insert_key()) how much of @insert->k has
+        * been inserted, and also to keep @iter->pos consistent with
+        * @insert->k and the node iterator that we're advancing:
+        */
+       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+
+       if (!s.deleting &&
+           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+               bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
+                               bkey_start_offset(&insert->k->k),
+                               insert->k->k.size);
+
+       ret = !s.deleting
+               ? __bch2_insert_fixup_extent(&s)
+               : __bch2_delete_fixup_extent(&s);
+
+       if (ret == BTREE_INSERT_OK &&
+           bkey_cmp(s.committed, insert->k->k.p) < 0)
+               ret = extent_insert_advance_pos(&s, bkey_s_c_null);
+
+       extent_insert_committed(&s);
+
+       if (s.deleting)
+               bch2_cut_front(iter->pos, insert->k);
+
+       /*
+        * Subtract any remaining sectors from @insert, if we bailed out early
+        * and didn't fully insert @insert:
+        */
+       if (!s.deleting &&
+           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+           insert->k->k.size)
+               bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
+                                    bkey_start_offset(&insert->k->k),
+                                    insert->k->k.size);
+
+       bch2_fs_usage_apply(c, &s.stats, trans->disk_res,
+                          gc_pos_btree_node(b));
+
+       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+       EBUG_ON(bkey_cmp(iter->pos, s.committed));
+       EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
+               !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
+
+       if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
+               ret = BTREE_INSERT_NEED_TRAVERSE;
+
+       WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0),
+                 "ret %u insert->k.size %u", ret, insert->k->k.size);
+
+       return ret;
+}
+
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+               return "value too big";
+
+       if (!k.k->size)
+               return "zero key size";
+
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const union bch_extent_entry *entry;
+               struct bch_extent_crc_unpacked crc;
+               const struct bch_extent_ptr *ptr;
+               unsigned size_ondisk = e.k->size;
+               const char *reason;
+               unsigned nonce = UINT_MAX;
+
+               extent_for_each_entry(e, entry) {
+                       if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+                               return "invalid extent entry type";
+
+                       if (extent_entry_is_crc(entry)) {
+                               crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+
+                               if (crc.offset + e.k->size >
+                                   crc.uncompressed_size)
+                                       return "checksum offset + key size > uncompressed size";
+
+                               size_ondisk = crc.compressed_size;
+
+                               if (!bch2_checksum_type_valid(c, crc.csum_type))
+                                       return "invalid checksum type";
+
+                               if (crc.compression_type >= BCH_COMPRESSION_NR)
+                                       return "invalid compression type";
+
+                               if (bch2_csum_type_is_encryption(crc.csum_type)) {
+                                       if (nonce == UINT_MAX)
+                                               nonce = crc.offset + crc.nonce;
+                                       else if (nonce != crc.offset + crc.nonce)
+                                               return "incorrect nonce";
+                               }
+                       } else {
+                               ptr = entry_to_ptr(entry);
+
+                               reason = extent_ptr_invalid(c, e, &entry->ptr,
+                                                           size_ondisk, false);
+                               if (reason)
+                                       return reason;
+                       }
+               }
+
+               return NULL;
+       }
+
+       case BCH_RESERVATION: {
+               struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+               if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+                       return "incorrect value size";
+
+               if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
+                       return "invalid nr_replicas";
+
+               return NULL;
+       }
+
+       default:
+               return "invalid value type";
+       }
+}
+
+static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
+                                         struct bkey_s_c_extent e)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_dev *ca;
+       struct bucket_mark mark;
+       unsigned seq, stale;
+       char buf[160];
+       bool bad;
+       unsigned replicas = 0;
+
+       /*
+        * XXX: we should be doing most/all of these checks at startup time,
+        * where we check bch2_bkey_invalid() in btree_node_read_done()
+        *
+        * But note that we can't check for stale pointers or incorrect gc marks
+        * until after journal replay is done (it might be an extent that's
+        * going to get overwritten during replay)
+        */
+
+       extent_for_each_ptr(e, ptr) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+               replicas++;
+
+               /*
+                * If journal replay hasn't finished, we might be seeing keys
+                * that will be overwritten by the time journal replay is done:
+                */
+               if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+                       continue;
+
+               stale = 0;
+
+               do {
+                       seq = read_seqcount_begin(&c->gc_pos_lock);
+                       mark = ptr_bucket_mark(ca, ptr);
+
+                       /* between mark and bucket gen */
+                       smp_rmb();
+
+                       stale = ptr_stale(ca, ptr);
+
+                       bch2_fs_bug_on(stale && !ptr->cached, c,
+                                        "stale dirty pointer");
+
+                       bch2_fs_bug_on(stale > 96, c,
+                                        "key too stale: %i",
+                                        stale);
+
+                       if (stale)
+                               break;
+
+                       bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+                               (mark.data_type != BCH_DATA_USER ||
+                                !(ptr->cached
+                                  ? mark.cached_sectors
+                                  : mark.dirty_sectors));
+               } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+               if (bad)
+                       goto bad_ptr;
+       }
+
+       if (replicas > BCH_REPLICAS_MAX) {
+               bch2_bkey_val_to_text(c, btree_node_type(b), buf,
+                                    sizeof(buf), e.s_c);
+               bch2_fs_bug(c,
+                       "extent key bad (too many replicas: %u): %s",
+                       replicas, buf);
+               return;
+       }
+
+       if (!bkey_extent_is_cached(e.k) &&
+           !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
+               bch2_bkey_val_to_text(c, btree_node_type(b),
+                                    buf, sizeof(buf), e.s_c);
+               bch2_fs_bug(c,
+                       "extent key bad (replicas not marked in superblock):\n%s",
+                       buf);
+               return;
+       }
+
+       return;
+
+bad_ptr:
+       bch2_bkey_val_to_text(c, btree_node_type(b), buf,
+                            sizeof(buf), e.s_c);
+       bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
+                  "gen %i type %u", buf,
+                  PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
+       return;
+}
+
+void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
+               break;
+       case BCH_RESERVATION:
+               break;
+       default:
+               BUG();
+       }
+}
+
+void bch2_extent_to_text(struct bch_fs *c, char *buf,
+                        size_t size, struct bkey_s_c k)
+{
+       char *out = buf, *end = buf + size;
+       const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+       if (bkey_extent_is_data(k.k))
+               out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+       invalid = bch2_extent_invalid(c, k);
+       if (invalid)
+               p(" invalid: %s", invalid);
+#undef p
+}
+
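+/*
+ * Initialize a packed on-disk crc entry from an unpacked crc, using the
+ * smallest of the three formats (crc32/crc64/crc128) that can represent the
+ * checksum, size and nonce fields.
+ */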
+static void bch2_extent_crc_init(union bch_extent_crc *crc,
+                                struct bch_extent_crc_unpacked new)
+{
+#define common_fields(_crc)                                            \
+               .csum_type              = _crc.csum_type,               \
+               .compression_type       = _crc.compression_type,        \
+               ._compressed_size       = _crc.compressed_size - 1,     \
+               ._uncompressed_size     = _crc.uncompressed_size - 1,   \
+               .offset                 = _crc.offset
+
+       if (bch_crc_bytes[new.csum_type]        <= 4 &&
+           new.uncompressed_size               <= CRC32_SIZE_MAX &&
+           new.nonce                           <= CRC32_NONCE_MAX) {
+               crc->crc32 = (struct bch_extent_crc32) {
+                       .type = 1 << BCH_EXTENT_ENTRY_crc32,
+                       common_fields(new),
+                       .csum                   = *((__le32 *) &new.csum.lo),
+               };
+               return;
+       }
+
+       if (bch_crc_bytes[new.csum_type]        <= 10 &&
+           new.uncompressed_size               <= CRC64_SIZE_MAX &&
+           new.nonce                           <= CRC64_NONCE_MAX) {
+               crc->crc64 = (struct bch_extent_crc64) {
+                       .type = 1 << BCH_EXTENT_ENTRY_crc64,
+                       common_fields(new),
+                       .nonce                  = new.nonce,
+                       .csum_lo                = new.csum.lo,
+                       .csum_hi                = *((__le16 *) &new.csum.hi),
+               };
+               return;
+       }
+
+       if (bch_crc_bytes[new.csum_type]        <= 16 &&
+           new.uncompressed_size               <= CRC128_SIZE_MAX &&
+           new.nonce                           <= CRC128_NONCE_MAX) {
+               crc->crc128 = (struct bch_extent_crc128) {
+                       .type = 1 << BCH_EXTENT_ENTRY_crc128,
+                       common_fields(new),
+                       .nonce                  = new.nonce,
+                       .csum                   = new.csum,
+               };
+               return;
+       }
+#undef common_fields
+       BUG();
+}
+
+void bch2_extent_crc_append(struct bkey_i_extent *e,
+                           struct bch_extent_crc_unpacked new)
+{
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+
+       BUG_ON(new.compressed_size > new.uncompressed_size);
+       BUG_ON(new.live_size != e->k.size);
+       BUG_ON(!new.compressed_size || !new.uncompressed_size);
+
+       /*
+        * Look up the last crc entry, so we can check if we need to add
+        * another:
+        */
+       extent_for_each_crc(extent_i_to_s(e), crc, i)
+               ;
+
+       if (!bch2_crc_unpacked_cmp(crc, new))
+               return;
+
+       bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
+       __extent_entry_push(e);
+}
+
+/*
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+{
+       struct bkey_s_extent e;
+
+       switch (k.k->type) {
+       case KEY_TYPE_ERROR:
+               return false;
+
+       case KEY_TYPE_DELETED:
+               return true;
+       case KEY_TYPE_DISCARD:
+               return bversion_zero(k.k->version);
+       case KEY_TYPE_COOKIE:
+               return false;
+
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               e = bkey_s_to_extent(k);
+
+               bch2_extent_drop_stale(c, e);
+
+               if (!bkey_val_u64s(e.k)) {
+                       if (bkey_extent_is_cached(e.k)) {
+                               k.k->type = KEY_TYPE_DISCARD;
+                               if (bversion_zero(k.k->version))
+                                       return true;
+                       } else {
+                               k.k->type = KEY_TYPE_ERROR;
+                       }
+               }
+
+               return false;
+       case BCH_RESERVATION:
+               return false;
+       default:
+               BUG();
+       }
+}
+
+void bch2_extent_mark_replicas_cached(struct bch_fs *c,
+                                     struct bkey_s_extent e,
+                                     unsigned target,
+                                     unsigned nr_desired_replicas)
+{
+       struct bch_extent_ptr *ptr;
+       int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
+
+       if (target && extra > 0)
+               extent_for_each_ptr(e, ptr) {
+                       int n = bch2_extent_ptr_durability(c, ptr);
+
+                       if (n && n <= extra &&
+                           !bch2_dev_in_target(c, ptr->dev, target)) {
+                               ptr->cached = true;
+                               extra -= n;
+                       }
+               }
+
+       if (extra > 0)
+               extent_for_each_ptr(e, ptr) {
+                       int n = bch2_extent_ptr_durability(c, ptr);
+
+                       if (n && n <= extra) {
+                               ptr->cached = true;
+                               extra -= n;
+                       }
+               }
+}
+
+/*
+ * This picks a non-stale pointer, preferably from a device not in @avoid.
+ * @avoid may be NULL, meaning pick any. If there are no non-stale pointers on
+ * other devices, it will still pick a pointer from an avoided device.
+ */
+int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
+                        struct bch_devs_mask *avoid,
+                        struct extent_pick_ptr *pick)
+{
+       int ret;
+
+       switch (k.k->type) {
+       case KEY_TYPE_ERROR:
+               return -EIO;
+
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
+                                             avoid, pick);
+
+               if (!ret && !bkey_extent_is_cached(k.k))
+                       ret = -EIO;
+
+               return ret;
+
+       default:
+               return 0;
+       }
+}
+
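+/*
+ * Attempt to merge @l and @r into a single key: they must have the same type,
+ * version and value size, @r must start exactly where @l ends, and for extents
+ * every pointer must be contiguous on the same device and generation without
+ * crossing a bucket boundary. Returns BCH_MERGE_PARTIAL if the combined size
+ * would overflow the key size field.
+ */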
+enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
+                                   struct bkey_i *l, struct bkey_i *r)
+{
+       struct bkey_s_extent el, er;
+       union bch_extent_entry *en_l, *en_r;
+
+       if (key_merging_disabled(c))
+               return BCH_MERGE_NOMERGE;
+
+       /*
+        * Generic header checks
+        * Assumes left and right are in order
+        * Left and right must be exactly aligned
+        */
+
+       if (l->k.u64s           != r->k.u64s ||
+           l->k.type           != r->k.type ||
+           bversion_cmp(l->k.version, r->k.version) ||
+           bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
+               return BCH_MERGE_NOMERGE;
+
+       switch (l->k.type) {
+       case KEY_TYPE_DISCARD:
+       case KEY_TYPE_ERROR:
+               /* These types are mergeable, and no val to check */
+               break;
+
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               el = bkey_i_to_s_extent(l);
+               er = bkey_i_to_s_extent(r);
+
+               extent_for_each_entry(el, en_l) {
+                       struct bch_extent_ptr *lp, *rp;
+                       struct bch_dev *ca;
+
+                       en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
+
+                       if ((extent_entry_type(en_l) !=
+                            extent_entry_type(en_r)) ||
+                           extent_entry_is_crc(en_l))
+                               return BCH_MERGE_NOMERGE;
+
+                       lp = &en_l->ptr;
+                       rp = &en_r->ptr;
+
+                       if (lp->offset + el.k->size     != rp->offset ||
+                           lp->dev                     != rp->dev ||
+                           lp->gen                     != rp->gen)
+                               return BCH_MERGE_NOMERGE;
+
+                       /* We don't allow extents to straddle buckets: */
+                       ca = bch_dev_bkey_exists(c, lp->dev);
+
+                       if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
+                               return BCH_MERGE_NOMERGE;
+               }
+
+               break;
+       case BCH_RESERVATION: {
+               struct bkey_i_reservation *li = bkey_i_to_reservation(l);
+               struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
+
+               if (li->v.generation != ri->v.generation ||
+                   li->v.nr_replicas != ri->v.nr_replicas)
+                       return BCH_MERGE_NOMERGE;
+               break;
+       }
+       default:
+               return BCH_MERGE_NOMERGE;
+       }
+
+       l->k.needs_whiteout |= r->k.needs_whiteout;
+
+       /* Keys with no pointers aren't restricted to one bucket and could
+        * overflow KEY_SIZE
+        */
+       if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
+               bch2_key_resize(&l->k, KEY_SIZE_MAX);
+               bch2_cut_front(l->k.p, r);
+               return BCH_MERGE_PARTIAL;
+       }
+
+       bch2_key_resize(&l->k, l->k.size + r->k.size);
+
+       return BCH_MERGE_MERGE;
+}
+
+static void extent_i_save(struct btree *b, struct bkey_packed *dst,
+                         struct bkey_i *src)
+{
+       struct bkey_format *f = &b->format;
+       struct bkey_i *dst_unpacked;
+
+       BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
+
+       /*
+        * We don't want the bch2_verify_key_order() call in extent_save(),
+        * because we may be out of order with deleted keys that are about to be
+        * removed by extent_bset_insert()
+        */
+
+       if ((dst_unpacked = packed_to_bkey(dst)))
+               bkey_copy(dst_unpacked, src);
+       else
+               BUG_ON(!bch2_bkey_pack(dst, src, f));
+}
+
+static bool extent_merge_one_overlapping(struct btree_iter *iter,
+                                        struct bpos new_pos,
+                                        struct bset_tree *t,
+                                        struct bkey_packed *k, struct bkey uk,
+                                        bool check, bool could_pack)
+{
+       struct btree_iter_level *l = &iter->l[0];
+
+       BUG_ON(!bkey_deleted(k));
+
+       if (check) {
+               return !bkey_packed(k) || could_pack;
+       } else {
+               uk.p = new_pos;
+               extent_save(l->b, &l->iter, k, &uk);
+               bch2_bset_fix_invalidated_key(l->b, t, k);
+               bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+                                        k, k->u64s, k->u64s);
+               return true;
+       }
+}
+
+static bool extent_merge_do_overlapping(struct btree_iter *iter,
+                                       struct bkey *m, bool back_merge)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
+       struct bset_tree *t;
+       struct bkey_packed *k;
+       struct bkey uk;
+       struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
+       bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
+       bool check = true;
+
+       /*
+        * @m is the new merged extent:
+        *
+        * The merge took place in the last bset; we know there can't be any 0
+        * size extents overlapping with m there because if so they would have
+        * been between the two extents we merged.
+        *
+        * But in the other bsets, we have to check for and fix such extents:
+        */
+do_fixup:
+       for_each_bset(b, t) {
+               if (t == bset_tree_last(b))
+                       break;
+
+               /*
+                * if we don't find this bset in the iterator we already got to
+                * the end of that bset, so start searching from the end.
+                */
+               k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+               if (k == btree_bkey_last(b, t))
+                       k = bch2_bkey_prev_all(b, t, k);
+               if (!k)
+                       continue;
+
+               if (back_merge) {
+                       /*
+                        * Back merge: 0 size extents will be before the key
+                        * that was just inserted (and thus the iterator
+                        * position) - walk backwards to find them
+                        */
+                       for (;
+                            k &&
+                            (uk = bkey_unpack_key(b, k),
+                             bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
+                            k = bch2_bkey_prev_all(b, t, k)) {
+                               if (bkey_cmp(uk.p, m->p) >= 0)
+                                       continue;
+
+                               if (!extent_merge_one_overlapping(iter, new_pos,
+                                               t, k, uk, check, could_pack))
+                                       return false;
+                       }
+               } else {
+                       /* Front merge - walk forwards */
+                       for (;
+                            k != btree_bkey_last(b, t) &&
+                            (uk = bkey_unpack_key(b, k),
+                             bkey_cmp(uk.p, m->p) < 0);
+                            k = bkey_next(k)) {
+                               if (bkey_cmp(uk.p,
+                                            bkey_start_pos(m)) <= 0)
+                                       continue;
+
+                               if (!extent_merge_one_overlapping(iter, new_pos,
+                                               t, k, uk, check, could_pack))
+                                       return false;
+                       }
+               }
+       }
+
+       if (check) {
+               check = false;
+               goto do_fixup;
+       }
+
+       return true;
+}
+
+/*
+ * When merging an extent that we're inserting into a btree node, the new merged
+ * extent could overlap with an existing 0 size extent - if we don't fix that,
+ * it'll break the btree node iterator - so this code finds those 0 size
+ * extents and shifts them out of the way.
+ *
+ * Also unpacks and repacks.
+ */
+static bool bch2_extent_merge_inline(struct bch_fs *c,
+                                    struct btree_iter *iter,
+                                    struct bkey_packed *l,
+                                    struct bkey_packed *r,
+                                    bool back_merge)
+{
+       struct btree *b = iter->l[0].b;
+       struct btree_node_iter *node_iter = &iter->l[0].iter;
+       const struct bkey_format *f = &b->format;
+       struct bset_tree *t = bset_tree_last(b);
+       struct bkey_packed *m;
+       BKEY_PADDED(k) li;
+       BKEY_PADDED(k) ri;
+       struct bkey_i *mi;
+       struct bkey tmp;
+
+       /*
+        * We need to save copies of both l and r, because we might get a
+        * partial merge (which modifies both) and then fail to repack
+        */
+       bch2_bkey_unpack(b, &li.k, l);
+       bch2_bkey_unpack(b, &ri.k, r);
+
+       m = back_merge ? l : r;
+       mi = back_merge ? &li.k : &ri.k;
+
+       /* l & r should be in last bset: */
+       EBUG_ON(bch2_bkey_to_bset(b, m) != t);
+
+       switch (bch2_extent_merge(c, b, &li.k, &ri.k)) {
+       case BCH_MERGE_NOMERGE:
+               return false;
+       case BCH_MERGE_PARTIAL:
+               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f))
+                       return false;
+
+               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+                       return false;
+
+               extent_i_save(b, m, mi);
+               bch2_bset_fix_invalidated_key(b, t, m);
+
+               /*
+                * Update iterator to reflect what we just inserted - otherwise,
+                * the iter_fix() call is going to put us _before_ the key we
+                * just partially merged with:
+                */
+               if (back_merge)
+                       bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
+
+               bch2_btree_node_iter_fix(iter, b, node_iter,
+                                        t, m, m->u64s, m->u64s);
+
+               if (!back_merge)
+                       bkey_copy(packed_to_bkey(l), &li.k);
+               else
+                       bkey_copy(packed_to_bkey(r), &ri.k);
+               return false;
+       case BCH_MERGE_MERGE:
+               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f))
+                       return false;
+
+               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+                       return false;
+
+               extent_i_save(b, m, &li.k);
+               bch2_bset_fix_invalidated_key(b, t, m);
+
+               bch2_btree_node_iter_fix(iter, b, node_iter,
+                                        t, m, m->u64s, m->u64s);
+               return true;
+       default:
+               BUG();
+       }
+}
+
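+/*
+ * Returns 0 if every key in [pos, pos + size) is an allocation (extent or
+ * reservation) that isn't compressed, -ENOSPC otherwise.
+ */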
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+       struct btree_iter iter;
+       struct bpos end = pos;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       end.offset += size;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+                            BTREE_ITER_SLOTS, k) {
+               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+                       break;
+
+               if (!bch2_extent_is_fully_allocated(k)) {
+                       ret = -ENOSPC;
+                       break;
+               }
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return ret;
+}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
new file mode 100644 (file)
index 0000000..15aed3c
--- /dev/null
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_H
+#define _BCACHEFS_EXTENTS_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "extents_types.h"
+
+struct bch_fs;
+struct journal_res;
+struct btree_node_iter;
+struct btree_node_iter_large;
+struct btree_insert;
+struct btree_insert_entry;
+struct extent_insert_hook;
+struct bch_devs_mask;
+union bch_extent_crc;
+
+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
+                              struct bkey_s_c);
+void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+
+#define bch2_bkey_btree_ops (struct bkey_ops) {                        \
+       .key_invalid    = bch2_btree_ptr_invalid,               \
+       .key_debugcheck = bch2_btree_ptr_debugcheck,            \
+       .val_to_text    = bch2_btree_ptr_to_text,               \
+       .swab           = bch2_ptr_swab,                        \
+}
+
+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
+enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
+                                   struct bkey_i *, struct bkey_i *);
+
+#define bch2_bkey_extent_ops (struct bkey_ops) {               \
+       .key_invalid    = bch2_extent_invalid,                  \
+       .key_debugcheck = bch2_extent_debugcheck,               \
+       .val_to_text    = bch2_extent_to_text,                  \
+       .swab           = bch2_ptr_swab,                        \
+       .key_normalize  = bch2_ptr_normalize,                   \
+       .key_merge      = bch2_extent_merge,                    \
+       .is_extents     = true,                                 \
+}
+
+struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
+                                                 struct btree *,
+                                                 struct btree_node_iter_large *);
+struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
+                                                    struct bset *,
+                                                    struct btree *,
+                                                    struct btree_node_iter_large *);
+
+int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
+                       struct bch_devs_mask *avoid,
+                       struct extent_pick_ptr *);
+
+int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
+                        struct bch_devs_mask *,
+                        struct extent_pick_ptr *);
+
+enum btree_insert_ret
+bch2_insert_fixup_extent(struct btree_insert *,
+                       struct btree_insert_entry *);
+
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
+                                     unsigned, unsigned);
+
+const struct bch_extent_ptr *
+bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
+bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
+
+unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
+unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_is_compressed(struct bkey_s_c);
+
+unsigned bch2_extent_ptr_durability(struct bch_fs *,
+                                   const struct bch_extent_ptr *);
+unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
+
+bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
+                            struct bch_extent_ptr, u64);
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+       switch (k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+       switch (k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+       case BCH_RESERVATION:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
+{
+       return bkey_extent_is_allocation(k.k) &&
+               !bch2_extent_is_compressed(k);
+}
+
+static inline bool bkey_extent_is_cached(const struct bkey *k)
+{
+       return k->type == BCH_EXTENT_CACHED;
+}
+
+static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
+{
+       EBUG_ON(k->type != BCH_EXTENT &&
+               k->type != BCH_EXTENT_CACHED);
+
+       k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
+}
+
+static inline unsigned
+__extent_entry_type(const union bch_extent_entry *e)
+{
+       return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
+}
+
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+       int ret = __ffs(e->type);
+
+       EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
+
+       return ret;
+}
+
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
+{
+       switch (extent_entry_type(entry)) {
+       case BCH_EXTENT_ENTRY_crc32:
+               return sizeof(struct bch_extent_crc32);
+       case BCH_EXTENT_ENTRY_crc64:
+               return sizeof(struct bch_extent_crc64);
+       case BCH_EXTENT_ENTRY_crc128:
+               return sizeof(struct bch_extent_crc128);
+       case BCH_EXTENT_ENTRY_ptr:
+               return sizeof(struct bch_extent_ptr);
+       default:
+               BUG();
+       }
+}
+
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
+{
+       return extent_entry_bytes(entry) / sizeof(u64);
+}
+
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+       return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+       return !extent_entry_is_ptr(e);
+}
+
+union bch_extent_crc {
+       u8                              type;
+       struct bch_extent_crc32         crc32;
+       struct bch_extent_crc64         crc64;
+       struct bch_extent_crc128        crc128;
+};
+
+/* downcast, preserves const */
+#define to_entry(_entry)                                               \
+({                                                                     \
+       BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&        \
+                    !type_is(_entry, struct bch_extent_ptr *));        \
+                                                                       \
+       __builtin_choose_expr(                                          \
+               (type_is_exact(_entry, const union bch_extent_crc *) || \
+                type_is_exact(_entry, const struct bch_extent_ptr *)), \
+               (const union bch_extent_entry *) (_entry),              \
+               (union bch_extent_entry *) (_entry));                   \
+})
+
+#define __entry_to_crc(_entry)                                         \
+       __builtin_choose_expr(                                          \
+               type_is_exact(_entry, const union bch_extent_entry *),  \
+               (const union bch_extent_crc *) (_entry),                \
+               (union bch_extent_crc *) (_entry))
+
+#define entry_to_crc(_entry)                                           \
+({                                                                     \
+       EBUG_ON((_entry) && !extent_entry_is_crc(_entry));              \
+                                                                       \
+       __entry_to_crc(_entry);                                         \
+})
+
+#define entry_to_ptr(_entry)                                           \
+({                                                                     \
+       EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));              \
+                                                                       \
+       __builtin_choose_expr(                                          \
+               type_is_exact(_entry, const union bch_extent_entry *),  \
+               (const struct bch_extent_ptr *) (_entry),               \
+               (struct bch_extent_ptr *) (_entry));                    \
+})
+
+/* checksum entries: */
+
+enum bch_extent_crc_type {
+       BCH_EXTENT_CRC_NONE,
+       BCH_EXTENT_CRC32,
+       BCH_EXTENT_CRC64,
+       BCH_EXTENT_CRC128,
+};
+
+static inline enum bch_extent_crc_type
+__extent_crc_type(const union bch_extent_crc *crc)
+{
+       if (!crc)
+               return BCH_EXTENT_CRC_NONE;
+
+       switch (extent_entry_type(to_entry(crc))) {
+       case BCH_EXTENT_ENTRY_crc32:
+               return BCH_EXTENT_CRC32;
+       case BCH_EXTENT_ENTRY_crc64:
+               return BCH_EXTENT_CRC64;
+       case BCH_EXTENT_ENTRY_crc128:
+               return BCH_EXTENT_CRC128;
+       default:
+               BUG();
+       }
+}
+
+#define extent_crc_type(_crc)                                          \
+({                                                                     \
+       BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) &&       \
+                    !type_is(_crc, struct bch_extent_crc64 *) &&       \
+                    !type_is(_crc, struct bch_extent_crc128 *) &&      \
+                    !type_is(_crc, union bch_extent_crc *));           \
+                                                                       \
+         type_is(_crc, struct bch_extent_crc32 *)  ? BCH_EXTENT_CRC32  \
+       : type_is(_crc, struct bch_extent_crc64 *)  ? BCH_EXTENT_CRC64  \
+       : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
+       : __extent_crc_type((union bch_extent_crc *) _crc);             \
+})
+
+static inline struct bch_extent_crc_unpacked
+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
+{
+#define common_fields(_crc)                                            \
+               .csum_type              = _crc.csum_type,               \
+               .compression_type       = _crc.compression_type,        \
+               .compressed_size        = _crc._compressed_size + 1,    \
+               .uncompressed_size      = _crc._uncompressed_size + 1,  \
+               .offset                 = _crc.offset,                  \
+               .live_size              = k->size
+
+       switch (extent_crc_type(crc)) {
+       case BCH_EXTENT_CRC_NONE:
+               return (struct bch_extent_crc_unpacked) {
+                       .compressed_size        = k->size,
+                       .uncompressed_size      = k->size,
+                       .live_size              = k->size,
+               };
+       case BCH_EXTENT_CRC32: {
+               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+                       common_fields(crc->crc32),
+               };
+
+               *((__le32 *) &ret.csum.lo) = crc->crc32.csum;
+
+               return ret;
+       }
+       case BCH_EXTENT_CRC64: {
+               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+                       common_fields(crc->crc64),
+                       .nonce                  = crc->crc64.nonce,
+                       .csum.lo                = (__force __le64) crc->crc64.csum_lo,
+               };
+
+               *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
+
+               return ret;
+       }
+       case BCH_EXTENT_CRC128: {
+               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+                       common_fields(crc->crc128),
+                       .nonce                  = crc->crc128.nonce,
+                       .csum                   = crc->crc128.csum,
+               };
+
+               return ret;
+       }
+       default:
+               BUG();
+       }
+#undef common_fields
+}
+
+/* Extent entry iteration: */
+
+#define extent_entry_next(_entry)                                      \
+       ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+#define extent_entry_last(_e)                                          \
+       vstruct_idx((_e).v, bkey_val_u64s((_e).k))
+
+/* Iterate over all entries: */
+
+#define extent_for_each_entry_from(_e, _entry, _start)                 \
+       for ((_entry) = _start;                                         \
+            (_entry) < extent_entry_last(_e);                          \
+            (_entry) = extent_entry_next(_entry))
+
+#define extent_for_each_entry(_e, _entry)                              \
+       extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+/* Iterate over crcs only: */
+
+#define __extent_crc_next(_e, _p)                                      \
+({                                                                     \
+       typeof(&(_e).v->start[0]) _entry = _p;                          \
+                                                                       \
+       while ((_entry) < extent_entry_last(_e) &&                      \
+              !extent_entry_is_crc(_entry))                            \
+               (_entry) = extent_entry_next(_entry);                   \
+                                                                       \
+       entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL);   \
+})
+
+#define __extent_for_each_crc(_e, _crc)                                        \
+       for ((_crc) = __extent_crc_next(_e, (_e).v->start);             \
+            (_crc);                                                    \
+            (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
+
+#define extent_crc_next(_e, _crc, _iter)                               \
+({                                                                     \
+       extent_for_each_entry_from(_e, _iter, _iter)                    \
+               if (extent_entry_is_crc(_iter)) {                       \
+                       (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
+                       break;                                          \
+               }                                                       \
+                                                                       \
+       (_iter) < extent_entry_last(_e);                                \
+})
+
+#define extent_for_each_crc(_e, _crc, _iter)                           \
+       for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL),             \
+            (_iter) = (_e).v->start;                                   \
+            extent_crc_next(_e, _crc, _iter);                          \
+            (_iter) = extent_entry_next(_iter))
+
+/* Iterate over pointers, with crcs: */
+
+#define extent_ptr_crc_next(_e, _ptr, _crc)                            \
+({                                                                     \
+       __label__ out;                                                  \
+       typeof(&(_e).v->start[0]) _entry;                               \
+                                                                       \
+       extent_for_each_entry_from(_e, _entry, to_entry(_ptr))          \
+               if (extent_entry_is_crc(_entry)) {                      \
+                       (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
+               } else {                                                \
+                       _ptr = entry_to_ptr(_entry);                    \
+                       goto out;                                       \
+               }                                                       \
+                                                                       \
+       _ptr = NULL;                                                    \
+out:                                                                   \
+       _ptr;                                                           \
+})
+
+#define extent_for_each_ptr_crc(_e, _ptr, _crc)                                \
+       for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL),             \
+            (_ptr) = &(_e).v->start->ptr;                              \
+            ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc));            \
+            (_ptr)++)
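+
+/*
+ * Example (illustrative sketch, assuming @k is a struct bkey_s_c known to
+ * be an extent): walk each pointer together with the unpacked
+ * checksum/compression info that applies to it:
+ *
+ *     struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ *     const struct bch_extent_ptr *ptr;
+ *     struct bch_extent_crc_unpacked crc;
+ *
+ *     extent_for_each_ptr_crc(e, ptr, crc)
+ *             if (!ptr->cached)
+ *                     pr_info("dev %u csum type %u\n",
+ *                             ptr->dev, crc.csum_type);
+ */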
+
+/* Iterate over pointers only, and from a given position: */
+
+#define extent_ptr_next(_e, _ptr)                                      \
+({                                                                     \
+       struct bch_extent_crc_unpacked _crc;                            \
+                                                                       \
+       extent_ptr_crc_next(_e, _ptr, _crc);                            \
+})
+
+#define extent_for_each_ptr(_e, _ptr)                                  \
+       for ((_ptr) = &(_e).v->start->ptr;                              \
+            ((_ptr) = extent_ptr_next(_e, _ptr));                      \
+            (_ptr)++)
+
+#define extent_ptr_prev(_e, _ptr)                                      \
+({                                                                     \
+       typeof(&(_e).v->start->ptr) _p;                                 \
+       typeof(&(_e).v->start->ptr) _prev = NULL;                       \
+                                                                       \
+       extent_for_each_ptr(_e, _p) {                                   \
+               if (_p == (_ptr))                                       \
+                       break;                                          \
+               _prev = _p;                                             \
+       }                                                               \
+                                                                       \
+       _prev;                                                          \
+})
+
+/*
+ * Use this when you'll be dropping pointers as you iterate. Quadratic,
+ * unfortunately:
+ */
+#define extent_for_each_ptr_backwards(_e, _ptr)                                \
+       for ((_ptr) = extent_ptr_prev(_e, NULL);                        \
+            (_ptr);                                                    \
+            (_ptr) = extent_ptr_prev(_e, _ptr))
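+
+/*
+ * Example (illustrative sketch, assuming @e is a writeable struct
+ * bkey_s_extent): dropping an entry shifts the rest of the value down, so
+ * only drop pointers while walking backwards, e.g. to drop all cached
+ * pointers:
+ *
+ *     struct bch_extent_ptr *ptr;
+ *
+ *     extent_for_each_ptr_backwards(e, ptr)
+ *             if (ptr->cached)
+ *                     bch2_extent_drop_ptr(e, ptr);
+ */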
+
+void bch2_extent_crc_append(struct bkey_i_extent *,
+                           struct bch_extent_crc_unpacked);
+
+static inline void __extent_entry_push(struct bkey_i_extent *e)
+{
+       union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
+
+       EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
+               BKEY_EXTENT_VAL_U64s_MAX);
+
+       e->k.u64s += extent_entry_u64s(entry);
+}
+
+static inline void extent_ptr_append(struct bkey_i_extent *e,
+                                    struct bch_extent_ptr ptr)
+{
+       ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+       extent_entry_last(extent_i_to_s(e))->ptr = ptr;
+       __extent_entry_push(e);
+}
+
+static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
+{
+       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               ret.devs[ret.nr++] = ptr->dev;
+
+       return ret;
+}
+
+static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e)
+{
+       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached)
+                       ret.devs[ret.nr++] = ptr->dev;
+
+       return ret;
+}
+
+static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
+{
+       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (ptr->cached)
+                       ret.devs[ret.nr++] = ptr->dev;
+
+       return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               return bch2_extent_devs(bkey_s_c_to_extent(k));
+       default:
+               return (struct bch_devs_list) { .nr = 0 };
+       }
+}
+
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
+       default:
+               return (struct bch_devs_list) { .nr = 0 };
+       }
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
+       default:
+               return (struct bch_devs_list) { .nr = 0 };
+       }
+}
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
+                                struct bch_extent_crc_unpacked);
+bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
+void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
+
+void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
+void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
+
+bool bch2_cut_front(struct bpos, struct bkey_i *);
+bool bch2_cut_back(struct bpos, struct bkey *);
+void bch2_key_resize(struct bkey *, unsigned);
+
+int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+
+#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
new file mode 100644 (file)
index 0000000..27b2bde
--- /dev/null
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_TYPES_H
+#define _BCACHEFS_EXTENTS_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_extent_crc_unpacked {
+       u8                      csum_type;
+       u8                      compression_type;
+
+       u16                     compressed_size;
+       u16                     uncompressed_size;
+
+       u16                     offset;
+       u16                     live_size;
+
+       u16                     nonce;
+
+       struct bch_csum         csum;
+};
+
+struct extent_pick_ptr {
+       struct bch_extent_ptr           ptr;
+       struct bch_extent_crc_unpacked  crc;
+};
+
+#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
new file mode 100644 (file)
index 0000000..7cb4942
--- /dev/null
@@ -0,0 +1,283 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
+
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "util.h"
+
+/*
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
+ * array
+ */
+
+/*
+ * One based indexing version:
+ *
+ * With one based indexing each level of the tree starts at a power of two -
+ * good for cacheline alignment:
+ *
+ * The size parameter is treated as if we were using 0 based indexing; however,
+ * valid nodes and inorder indices are in the range [1..size) - that is, there
+ * are actually size - 1 elements.
+ */
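+
+/*
+ * Worked example: with size == 8 (i.e. 7 elements), eytzinger1 indices
+ * 1..7 map to sorted (inorder) positions as
+ *
+ *     eytzinger index:   1  2  3  4  5  6  7
+ *     inorder position:  4  2  6  1  3  5  7
+ *
+ * i.e. index 1 holds the median, indices 2-3 the next level down, and so
+ * on, with each level starting at a power of two index.
+ */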
+
+static inline unsigned eytzinger1_child(unsigned i, unsigned child)
+{
+       EBUG_ON(child > 1);
+
+       return (i << 1) + child;
+}
+
+static inline unsigned eytzinger1_left_child(unsigned i)
+{
+       return eytzinger1_child(i, 0);
+}
+
+static inline unsigned eytzinger1_right_child(unsigned i)
+{
+       return eytzinger1_child(i, 1);
+}
+
+static inline unsigned eytzinger1_first(unsigned size)
+{
+       return rounddown_pow_of_two(size - 1);
+}
+
+static inline unsigned eytzinger1_last(unsigned size)
+{
+       return rounddown_pow_of_two(size) - 1;
+}
+
+/*
+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that
+ *
+ * eytzinger1_next(0) == eytzinger1_first()
+ * eytzinger1_prev(0) == eytzinger1_last()
+ *
+ * eytzinger1_prev(eytzinger1_first()) == 0
+ * eytzinger1_next(eytzinger1_last()) == 0
+ */
+
+static inline unsigned eytzinger1_next(unsigned i, unsigned size)
+{
+       EBUG_ON(i >= size);
+
+       if (eytzinger1_right_child(i) < size) {
+               i = eytzinger1_right_child(i);
+
+               i <<= __fls(size) - __fls(i);
+               i >>= i >= size;
+       } else {
+               i >>= ffz(i) + 1;
+       }
+
+       return i;
+}
+
+static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
+{
+       EBUG_ON(i >= size);
+
+       if (eytzinger1_left_child(i) < size) {
+               i = eytzinger1_left_child(i) + 1;
+
+               i <<= __fls(size) - __fls(i);
+               i -= 1;
+               i >>= i >= size;
+       } else {
+               i >>= __ffs(i) + 1;
+       }
+
+       return i;
+}
+
+static inline unsigned eytzinger1_extra(unsigned size)
+{
+       return (size - rounddown_pow_of_two(size - 1)) << 1;
+}
+
+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
+                                             unsigned extra)
+{
+       unsigned b = __fls(i);
+       unsigned shift = __fls(size - 1) - b;
+       int s;
+
+       EBUG_ON(!i || i >= size);
+
+       i  ^= 1U << b;
+       i <<= 1;
+       i  |= 1;
+       i <<= shift;
+
+       /*
+        * sign bit trick:
+        *
+        * if (i > extra)
+        *      i -= (i - extra) >> 1;
+        */
+       s = extra - i;
+       i += (s >> 1) & (s >> 31);
+
+       return i;
+}
+
+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
+                                              unsigned extra)
+{
+       unsigned shift;
+       int s;
+
+       EBUG_ON(!i || i >= size);
+
+       /*
+        * sign bit trick:
+        *
+        * if (i > extra)
+        *      i += i - extra;
+        */
+       s = extra - i;
+       i -= s & (s >> 31);
+
+       shift = __ffs(i);
+
+       i >>= shift + 1;
+       i  |= 1U << (__fls(size - 1) - shift);
+
+       return i;
+}
+
+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
+{
+       return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
+{
+       return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
+}
+
+#define eytzinger1_for_each(_i, _size)                 \
+       for ((_i) = eytzinger1_first((_size));          \
+            (_i) != 0;                                 \
+            (_i) = eytzinger1_next((_i), (_size)))
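+
+/*
+ * Since eytzinger1_next() returns the inorder successor,
+ * eytzinger1_for_each() visits nodes in sorted order when the array was
+ * built with inorder_to_eytzinger1(). Minimal sketch, assuming @tree is an
+ * array of unsigned in eytzinger1 layout with index 0 unused:
+ *
+ *     unsigned i;
+ *
+ *     eytzinger1_for_each(i, size)
+ *             pr_info("%u\n", tree[i]);
+ */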
+
+/* Zero based indexing version: */
+
+static inline unsigned eytzinger0_child(unsigned i, unsigned child)
+{
+       EBUG_ON(child > 1);
+
+       return (i << 1) + 1 + child;
+}
+
+static inline unsigned eytzinger0_left_child(unsigned i)
+{
+       return eytzinger0_child(i, 0);
+}
+
+static inline unsigned eytzinger0_right_child(unsigned i)
+{
+       return eytzinger0_child(i, 1);
+}
+
+static inline unsigned eytzinger0_first(unsigned size)
+{
+       return eytzinger1_first(size + 1) - 1;
+}
+
+static inline unsigned eytzinger0_last(unsigned size)
+{
+       return eytzinger1_last(size + 1) - 1;
+}
+
+static inline unsigned eytzinger0_next(unsigned i, unsigned size)
+{
+       return eytzinger1_next(i + 1, size + 1) - 1;
+}
+
+static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
+{
+       return eytzinger1_prev(i + 1, size + 1) - 1;
+}
+
+static inline unsigned eytzinger0_extra(unsigned size)
+{
+       return eytzinger1_extra(size + 1);
+}
+
+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
+                                              unsigned extra)
+{
+       return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
+}
+
+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
+                                              unsigned extra)
+{
+       return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
+}
+
+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
+{
+       return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
+{
+       return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
+}
+
+#define eytzinger0_for_each(_i, _size)                 \
+       for ((_i) = eytzinger0_first((_size));          \
+            (_i) != -1;                                \
+            (_i) = eytzinger0_next((_i), (_size)))
+
+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
+
+/* return greatest node <= @search, or -1 if not found */
+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
+                                        eytzinger_cmp_fn cmp, const void *search)
+{
+       unsigned i, n = 0;
+
+       if (!nr)
+               return -1;
+
+       do {
+               i = n;
+               n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+       } while (n < nr);
+
+       if (n & 1) {
+               /* @i was greater than @search, return previous node: */
+
+               if (i == eytzinger0_first(nr))
+                       return -1;
+
+               return eytzinger0_prev(i, nr);
+       } else {
+               return i;
+       }
+}
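+
+/*
+ * Minimal usage sketch, assuming @tree is an eytzinger0 array of @nr u64s
+ * built with inorder_to_eytzinger0() from a sorted array:
+ *
+ *     static int cmp_u64(const void *l, const void *r, size_t size)
+ *     {
+ *             const u64 *a = l, *b = r;
+ *
+ *             return *a < *b ? -1 : *a > *b ? 1 : 0;
+ *     }
+ *
+ *     u64 search = 42;
+ *     ssize_t idx = eytzinger0_find_le(tree, nr, sizeof(tree[0]),
+ *                                      cmp_u64, &search);
+ *
+ * where idx is then the eytzinger index of the greatest element <= 42, or
+ * -1 if no such element exists.
+ */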
+
+static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
+                                    eytzinger_cmp_fn cmp, const void *search)
+{
+       size_t i = 0;
+       int res;
+
+       while (i < nr &&
+              (res = cmp(search, base + i * size, size)))
+               i = eytzinger0_child(i, res > 0);
+
+       return i;
+}
+
+void eytzinger0_sort(void *, size_t, size_t,
+                   int (*cmp_func)(const void *, const void *, size_t),
+                   void (*swap_func)(void *, void *, size_t));
+
+#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
new file mode 100644 (file)
index 0000000..bd1534e
--- /dev/null
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FIFO_H
+#define _BCACHEFS_FIFO_H
+
+#include "util.h"
+
+#define FIFO(type)                                                     \
+struct {                                                               \
+       size_t front, back, size, mask;                                 \
+       type *data;                                                     \
+}
+
+#define DECLARE_FIFO(type, name)       FIFO(type) name
+
+#define fifo_buf_size(fifo)                                            \
+       (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))
+
+#define init_fifo(fifo, _size, _gfp)                                   \
+({                                                                     \
+       (fifo)->front   = (fifo)->back = 0;                             \
+       (fifo)->size    = (_size);                                      \
+       (fifo)->mask    = (fifo)->size                                  \
+               ? roundup_pow_of_two((fifo)->size) - 1                  \
+               : 0;                                                    \
+       (fifo)->data    = kvpmalloc(fifo_buf_size(fifo), (_gfp));       \
+})
+
+#define free_fifo(fifo)                                                        \
+do {                                                                   \
+       kvpfree((fifo)->data, fifo_buf_size(fifo));                     \
+       (fifo)->data = NULL;                                            \
+} while (0)
+
+#define fifo_swap(l, r)                                                        \
+do {                                                                   \
+       swap((l)->front, (r)->front);                                   \
+       swap((l)->back, (r)->back);                                     \
+       swap((l)->size, (r)->size);                                     \
+       swap((l)->mask, (r)->mask);                                     \
+       swap((l)->data, (r)->data);                                     \
+} while (0)
+
+#define fifo_move(dest, src)                                           \
+do {                                                                   \
+       typeof(*((dest)->data)) _t;                                     \
+       while (!fifo_full(dest) &&                                      \
+              fifo_pop(src, _t))                                       \
+               fifo_push(dest, _t);                                    \
+} while (0)
+
+#define fifo_used(fifo)                (((fifo)->back - (fifo)->front))
+#define fifo_free(fifo)                ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo)       ((fifo)->front == (fifo)->back)
+#define fifo_full(fifo)                (fifo_used(fifo) == (fifo)->size)
+
+#define fifo_peek_front(fifo)  ((fifo)->data[(fifo)->front & (fifo)->mask])
+#define fifo_peek_back(fifo)   ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_entry_idx_abs(fifo, p)                                    \
+       ((((p) >= &fifo_peek_front(fifo)                                \
+          ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) +           \
+          (((p) - (fifo)->data)))
+
+#define fifo_entry_idx(fifo, p)        (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+#define fifo_idx_entry(fifo, i)        (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
+
+#define fifo_push_back_ref(f)                                          \
+       (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
+
+#define fifo_push_front_ref(f)                                         \
+       (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
+
+#define fifo_push_back(fifo, new)                                      \
+({                                                                     \
+       typeof((fifo)->data) _r = fifo_push_back_ref(fifo);             \
+       if (_r)                                                         \
+               *_r = (new);                                            \
+       _r != NULL;                                                     \
+})
+
+#define fifo_push_front(fifo, new)                                     \
+({                                                                     \
+       typeof((fifo)->data) _r = fifo_push_front_ref(fifo);            \
+       if (_r)                                                         \
+               *_r = (new);                                            \
+       _r != NULL;                                                     \
+})
+
+#define fifo_pop_front(fifo, i)                                                \
+({                                                                     \
+       bool _r = !fifo_empty((fifo));                                  \
+       if (_r)                                                         \
+               (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask];     \
+       _r;                                                             \
+})
+
+#define fifo_pop_back(fifo, i)                                         \
+({                                                                     \
+       bool _r = !fifo_empty((fifo));                                  \
+       if (_r)                                                         \
+       (i) = (fifo)->data[--(fifo)->back & (fifo)->mask];      \
+       _r;                                                             \
+})
+
+#define fifo_push_ref(fifo)    fifo_push_back_ref(fifo)
+#define fifo_push(fifo, i)     fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i)      fifo_pop_front(fifo, (i))
+#define fifo_peek(fifo)                fifo_peek_front(fifo)
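+
+/*
+ * Minimal usage sketch:
+ *
+ *     DECLARE_FIFO(int, fifo);
+ *     int i;
+ *
+ *     if (!init_fifo(&fifo, 16, GFP_KERNEL))
+ *             return -ENOMEM;
+ *
+ *     fifo_push(&fifo, 1);
+ *     fifo_push(&fifo, 2);
+ *
+ *     while (fifo_pop(&fifo, i))
+ *             pr_info("popped %i\n", i);
+ *
+ *     free_fifo(&fifo);
+ */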
+
+#define fifo_for_each_entry(_entry, _fifo, _iter)                      \
+       for (((void) (&(_iter) == &(_fifo)->front)),                    \
+            _iter = (_fifo)->front;                                    \
+            ((_iter != (_fifo)->back) &&                               \
+             (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+            _iter++)
+
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)                    \
+       for (((void) (&(_iter) == &(_fifo)->front)),                    \
+            _iter = (_fifo)->front;                                    \
+            ((_iter != (_fifo)->back) &&                               \
+             (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true));  \
+            _iter++)
+
+#endif /* _BCACHEFS_FIFO_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
new file mode 100644 (file)
index 0000000..56d2117
--- /dev/null
@@ -0,0 +1,2862 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "io.h"
+#include "keylist.h"
+#include "quota.h"
+#include "trace.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/pagevec.h>
+#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+
+#include <trace/events/writeback.h>
+
+struct quota_res {
+       u64                             sectors;
+};
+
+struct i_sectors_hook {
+       struct extent_insert_hook       hook;
+       struct bch_inode_info           *inode;
+       struct quota_res                quota_res;
+       s64                             sectors;
+       u64                             new_i_size;
+       unsigned                        flags;
+       unsigned                        appending:1;
+};
+
+struct bchfs_write_op {
+       struct bch_inode_info           *inode;
+       s64                             sectors_added;
+       bool                            is_dio;
+       bool                            unalloc;
+       u64                             new_i_size;
+
+       /* must be last: */
+       struct bch_write_op             op;
+};
+
+struct bch_writepage_io {
+       struct closure                  cl;
+       u64                             new_sectors;
+
+       /* must be last: */
+       struct bchfs_write_op           op;
+};
+
+struct dio_write {
+       struct closure                  cl;
+       struct kiocb                    *req;
+       struct task_struct              *task;
+       unsigned                        loop:1,
+                                       sync:1,
+                                       free_iov:1;
+       struct quota_res                quota_res;
+
+       struct iov_iter                 iter;
+       struct iovec                    inline_vecs[2];
+
+       /* must be last: */
+       struct bchfs_write_op           iop;
+};
+
+struct dio_read {
+       struct closure                  cl;
+       struct kiocb                    *req;
+       long                            ret;
+       struct bch_read_bio             rbio;
+};
+
+/* pagecache_block must be held */
+static int write_invalidate_inode_pages_range(struct address_space *mapping,
+                                             loff_t start, loff_t end)
+{
+       int ret;
+
+       /*
+        * XXX: the way this is currently implemented, we can spin if a process
+        * is continually redirtying a specific page
+        */
+       do {
+               if (!mapping->nrpages)
+                       return 0;
+
+               ret = filemap_write_and_wait_range(mapping, start, end);
+               if (ret)
+                       break;
+
+               if (!mapping->nrpages)
+                       return 0;
+
+               ret = invalidate_inode_pages2_range(mapping,
+                               start >> PAGE_SHIFT,
+                               end >> PAGE_SHIFT);
+       } while (ret == -EBUSY);
+
+       return ret;
+}
+
+/* quotas */
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static void bch2_quota_reservation_put(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      struct quota_res *res)
+{
+       if (!res->sectors)
+               return;
+
+       mutex_lock(&inode->ei_quota_lock);
+       BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+       bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+                       -((s64) res->sectors), BCH_QUOTA_PREALLOC);
+       inode->ei_quota_reserved -= res->sectors;
+       mutex_unlock(&inode->ei_quota_lock);
+
+       res->sectors = 0;
+}
+
+static int bch2_quota_reservation_add(struct bch_fs *c,
+                                     struct bch_inode_info *inode,
+                                     struct quota_res *res,
+                                     unsigned sectors,
+                                     bool check_enospc)
+{
+       int ret;
+
+       mutex_lock(&inode->ei_quota_lock);
+       ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+                             check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK);
+       if (likely(!ret)) {
+               inode->ei_quota_reserved += sectors;
+               res->sectors += sectors;
+       }
+       mutex_unlock(&inode->ei_quota_lock);
+
+       return ret;
+}
+
+#else
+
+static void bch2_quota_reservation_put(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      struct quota_res *res)
+{
+}
+
+static int bch2_quota_reservation_add(struct bch_fs *c,
+                                     struct bch_inode_info *inode,
+                                     struct quota_res *res,
+                                     unsigned sectors,
+                                     bool check_enospc)
+{
+       return 0;
+}
+
+#endif
+
+/* i_size updates: */
+
+static int inode_set_size(struct bch_inode_info *inode,
+                         struct bch_inode_unpacked *bi,
+                         void *p)
+{
+       loff_t *new_i_size = p;
+
+       lockdep_assert_held(&inode->ei_update_lock);
+
+       bi->bi_size = *new_i_size;
+       return 0;
+}
+
+static int __must_check bch2_write_inode_size(struct bch_fs *c,
+                                             struct bch_inode_info *inode,
+                                             loff_t new_size)
+{
+       return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0);
+}
+
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+                          struct quota_res *quota_res, int sectors)
+{
+       mutex_lock(&inode->ei_quota_lock);
+#ifdef CONFIG_BCACHEFS_QUOTA
+       if (quota_res && sectors > 0) {
+               BUG_ON(sectors > quota_res->sectors);
+               BUG_ON(sectors > inode->ei_quota_reserved);
+
+               quota_res->sectors -= sectors;
+               inode->ei_quota_reserved -= sectors;
+       } else {
+               bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN);
+       }
+#endif
+       inode->v.i_blocks += sectors;
+       mutex_unlock(&inode->ei_quota_lock);
+}
+
+/* i_sectors accounting: */
+
+static enum btree_insert_ret
+i_sectors_hook_fn(struct extent_insert_hook *hook,
+                 struct bpos committed_pos,
+                 struct bpos next_pos,
+                 struct bkey_s_c k,
+                 const struct bkey_i *insert)
+{
+       struct i_sectors_hook *h = container_of(hook,
+                               struct i_sectors_hook, hook);
+       s64 sectors = next_pos.offset - committed_pos.offset;
+       int sign = bkey_extent_is_allocation(&insert->k) -
+               (k.k && bkey_extent_is_allocation(k.k));
+
+       EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY));
+
+       h->sectors += sectors * sign;
+
+       return BTREE_INSERT_OK;
+}
+
+static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode,
+                                    struct bch_inode_unpacked *bi,
+                                    void *p)
+{
+       struct i_sectors_hook *h = p;
+
+       if (h->new_i_size != U64_MAX &&
+           (!h->appending ||
+            h->new_i_size > bi->bi_size))
+               bi->bi_size = h->new_i_size;
+       bi->bi_sectors  += h->sectors;
+       bi->bi_flags    &= ~h->flags;
+       return 0;
+}
+
+static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
+{
+       int ret;
+
+       mutex_lock(&h->inode->ei_update_lock);
+       i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
+
+       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0);
+
+       if (!ret && h->new_i_size != U64_MAX)
+               i_size_write(&h->inode->v, h->new_i_size);
+       mutex_unlock(&h->inode->ei_update_lock);
+
+       bch2_quota_reservation_put(c, h->inode, &h->quota_res);
+
+       h->sectors = 0;
+
+       return ret;
+}
+
+static int i_sectors_dirty_start_fn(struct bch_inode_info *inode,
+                                   struct bch_inode_unpacked *bi, void *p)
+{
+       struct i_sectors_hook *h = p;
+
+       if (h->flags & BCH_INODE_I_SIZE_DIRTY)
+               bi->bi_size = h->new_i_size;
+
+       bi->bi_flags |= h->flags;
+       return 0;
+}
+
+static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
+{
+       int ret;
+
+       mutex_lock(&h->inode->ei_update_lock);
+       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0);
+       mutex_unlock(&h->inode->ei_update_lock);
+
+       return ret;
+}
+
+static inline struct i_sectors_hook
+i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags)
+{
+       return (struct i_sectors_hook) {
+               .hook.fn        = i_sectors_hook_fn,
+               .inode          = inode,
+               .sectors        = 0,
+               .new_i_size     = U64_MAX,
+               .flags          = flags|BCH_INODE_I_SECTORS_DIRTY,
+       };
+}
+
+/* normal i_size/i_sectors update machinery: */
+
+struct bchfs_extent_trans_hook {
+       struct bchfs_write_op           *op;
+       struct extent_insert_hook       hook;
+
+       struct bch_inode_unpacked       inode_u;
+       struct bkey_inode_buf           inode_p;
+
+       bool                            need_inode_update;
+};
+
+static enum btree_insert_ret
+bchfs_extent_update_hook(struct extent_insert_hook *hook,
+                        struct bpos committed_pos,
+                        struct bpos next_pos,
+                        struct bkey_s_c k,
+                        const struct bkey_i *insert)
+{
+       struct bchfs_extent_trans_hook *h = container_of(hook,
+                               struct bchfs_extent_trans_hook, hook);
+       struct bch_inode_info *inode = h->op->inode;
+       int sign = bkey_extent_is_allocation(&insert->k) -
+               (k.k && bkey_extent_is_allocation(k.k));
+       s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
+       u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
+       bool do_pack = false;
+
+       if (h->op->unalloc &&
+           !bch2_extent_is_fully_allocated(k))
+               return BTREE_INSERT_ENOSPC;
+
+       BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
+
+       /* XXX: inode->i_size locking */
+       if (offset > inode->ei_inode.bi_size) {
+               if (!h->need_inode_update) {
+                       h->need_inode_update = true;
+                       return BTREE_INSERT_NEED_TRAVERSE;
+               }
+
+               /* truncate in progress? */
+               if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)
+                       goto no_i_size_update;
+
+               h->inode_u.bi_size = offset;
+               do_pack = true;
+
+               inode->ei_inode.bi_size = offset;
+
+               spin_lock(&inode->v.i_lock);
+               if (offset > inode->v.i_size) {
+                       if (h->op->is_dio)
+                               i_size_write(&inode->v, offset);
+                       else
+                               BUG();
+               }
+               spin_unlock(&inode->v.i_lock);
+       }
+no_i_size_update:
+       if (sectors) {
+               if (!h->need_inode_update) {
+                       h->need_inode_update = true;
+                       return BTREE_INSERT_NEED_TRAVERSE;
+               }
+
+               h->inode_u.bi_sectors += sectors;
+               do_pack = true;
+
+               h->op->sectors_added += sectors;
+       }
+
+       if (do_pack)
+               bch2_inode_pack(&h->inode_p, &h->inode_u);
+
+       return BTREE_INSERT_OK;
+}
+
+static int bchfs_write_index_update(struct bch_write_op *wop)
+{
+       struct bchfs_write_op *op = container_of(wop,
+                               struct bchfs_write_op, op);
+       struct keylist *keys = &op->op.insert_keys;
+       struct btree_iter extent_iter, inode_iter;
+       struct bchfs_extent_trans_hook hook;
+       struct bkey_i *k = bch2_keylist_front(keys);
+       s64 orig_sectors_added = op->sectors_added;
+       int ret;
+
+       BUG_ON(k->k.p.inode != op->inode->v.i_ino);
+
+       bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_INTENT);
+       bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES,
+                            POS(extent_iter.pos.inode, 0),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+       hook.op                 = op;
+       hook.hook.fn            = bchfs_extent_update_hook;
+       hook.need_inode_update  = false;
+
+       do {
+               /* XXX: inode->i_size locking */
+               k = bch2_keylist_front(keys);
+               if (min(k->k.p.offset << 9, op->new_i_size) >
+                   op->inode->ei_inode.bi_size)
+                       hook.need_inode_update = true;
+
+               /* optimization for fewer transaction restarts: */
+               ret = bch2_btree_iter_traverse(&extent_iter);
+               if (ret)
+                       goto err;
+
+               if (hook.need_inode_update) {
+                       struct bkey_s_c inode;
+
+                       if (!btree_iter_linked(&inode_iter))
+                               bch2_btree_iter_link(&extent_iter, &inode_iter);
+
+                       inode = bch2_btree_iter_peek_slot(&inode_iter);
+                       if ((ret = btree_iter_err(inode)))
+                               goto err;
+
+                       if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
+                                     "inode %llu not found when updating",
+                                     extent_iter.pos.inode)) {
+                               ret = -ENOENT;
+                               break;
+                       }
+
+                       if (WARN_ONCE(bkey_bytes(inode.k) >
+                                     sizeof(hook.inode_p),
+                                     "inode %llu too big (%zu bytes, buf %zu)",
+                                     extent_iter.pos.inode,
+                                     bkey_bytes(inode.k),
+                                     sizeof(hook.inode_p))) {
+                               ret = -ENOENT;
+                               break;
+                       }
+
+                       bkey_reassemble(&hook.inode_p.inode.k_i, inode);
+                       ret = bch2_inode_unpack(bkey_s_c_to_inode(inode),
+                                              &hook.inode_u);
+                       if (WARN_ONCE(ret,
+                                     "error %i unpacking inode %llu",
+                                     ret, extent_iter.pos.inode)) {
+                               ret = -ENOENT;
+                               break;
+                       }
+
+                       ret = bch2_btree_insert_at(wop->c, &wop->res,
+                                       &hook.hook, op_journal_seq(wop),
+                                       BTREE_INSERT_NOFAIL|
+                                       BTREE_INSERT_ATOMIC|
+                                       BTREE_INSERT_USE_RESERVE,
+                                       BTREE_INSERT_ENTRY(&extent_iter, k),
+                                       BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
+                                                       &hook.inode_p.inode.k_i, 2));
+               } else {
+                       ret = bch2_btree_insert_at(wop->c, &wop->res,
+                                       &hook.hook, op_journal_seq(wop),
+                                       BTREE_INSERT_NOFAIL|
+                                       BTREE_INSERT_ATOMIC|
+                                       BTREE_INSERT_USE_RESERVE,
+                                       BTREE_INSERT_ENTRY(&extent_iter, k));
+               }
+
+               BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
+
+               if (WARN_ONCE(!ret != !k->k.size,
+                             "ret %i k->size %u", ret, k->k.size))
+                       ret = k->k.size ? -EINTR : 0;
+err:
+               if (ret == -EINTR)
+                       continue;
+               if (ret)
+                       break;
+
+               BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0);
+               bch2_keylist_pop_front(keys);
+       } while (!bch2_keylist_empty(keys));
+
+       bch2_btree_iter_unlock(&extent_iter);
+       bch2_btree_iter_unlock(&inode_iter);
+
+       if (op->is_dio) {
+               struct dio_write *dio = container_of(op, struct dio_write, iop);
+
+               i_sectors_acct(wop->c, op->inode, &dio->quota_res,
+                              op->sectors_added - orig_sectors_added);
+       }
+
+       return ret;
+}
+
+static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
+                                       struct bch_fs *c,
+                                       struct bch_inode_info *inode,
+                                       struct bch_io_opts opts,
+                                       bool is_dio)
+{
+       op->inode               = inode;
+       op->sectors_added       = 0;
+       op->is_dio              = is_dio;
+       op->unalloc             = false;
+       op->new_i_size          = U64_MAX;
+
+       bch2_write_op_init(&op->op, c, opts);
+       op->op.target           = opts.foreground_target;
+       op->op.index_update_fn  = bchfs_write_index_update;
+       op_journal_seq_set(&op->op, &inode->ei_journal_seq);
+}
+
+static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode)
+{
+       struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+
+       bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));
+       return opts;
+}
+
+/* page state: */
+
+/* stored in page->private: */
+
+/*
+ * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could
+ * almost protect it with the page lock, except that bch2_writepage_io_done has
+ * to update the sector counts (and from interrupt/bottom half context).
+ */
+struct bch_page_state {
+union { struct {
+       /* existing data: */
+       unsigned                sectors:PAGE_SECTOR_SHIFT + 1;
+       unsigned                nr_replicas:4;
+       unsigned                compressed:1;
+
+       /* Owns PAGE_SECTORS sized reservation: */
+       unsigned                reserved:1;
+       unsigned                reservation_replicas:4;
+
+       /* Owns PAGE_SECTORS sized quota reservation: */
+       unsigned                quota_reserved:1;
+
+       /*
+        * Number of sectors on disk - for i_blocks
+        * Uncompressed size, not compressed size:
+        */
+       unsigned                dirty_sectors:PAGE_SECTOR_SHIFT + 1;
+};
+       /* for cmpxchg: */
+       unsigned long           v;
+};
+};
+
+#define page_state_cmpxchg(_ptr, _new, _expr)                          \
+({                                                                     \
+       unsigned long _v = READ_ONCE((_ptr)->v);                        \
+       struct bch_page_state _old;                                     \
+                                                                       \
+       do {                                                            \
+               _old.v = _new.v = _v;                                   \
+               _expr;                                                  \
+                                                                       \
+               EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\
+       } while (_old.v != _new.v &&                                    \
+                (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \
+                                                                       \
+       _old;                                                           \
+})
+
+static inline struct bch_page_state *page_state(struct page *page)
+{
+       struct bch_page_state *s = (void *) &page->private;
+
+       BUILD_BUG_ON(sizeof(*s) > sizeof(page->private));
+
+       if (!PagePrivate(page))
+               SetPagePrivate(page);
+
+       return s;
+}
+
+static inline unsigned page_res_sectors(struct bch_page_state s)
+{
+       return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0;
+}
+
+static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
+                                       struct bch_page_state s)
+{
+       struct disk_reservation res = { .sectors = page_res_sectors(s) };
+       struct quota_res quota_res = { .sectors = s.quota_reserved ? PAGE_SECTORS : 0 };
+
+       bch2_quota_reservation_put(c, inode, &quota_res);
+       bch2_disk_reservation_put(c, &res);
+}
+
+static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
+                                     struct page *page)
+{
+       struct bch_page_state s;
+
+       s = page_state_cmpxchg(page_state(page), s, {
+               s.reserved              = 0;
+               s.quota_reserved        = 0;
+       });
+
+       __bch2_put_page_reservation(c, inode, s);
+}
+
+static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
+                                    struct page *page, bool check_enospc)
+{
+       struct bch_page_state *s = page_state(page), new, old;
+
+       /* XXX: this should not be open coded */
+       unsigned nr_replicas = inode->ei_inode.bi_data_replicas
+               ? inode->ei_inode.bi_data_replicas - 1
+               : c->opts.data_replicas;
+
+       struct disk_reservation disk_res = bch2_disk_reservation_init(c,
+                                               nr_replicas);
+       struct quota_res quota_res = { 0 };
+       int ret = 0;
+
+       /*
+        * XXX: this could likely be quite a bit simpler, page reservations
+        * _should_ only be manipulated with page locked:
+        */
+
+       old = page_state_cmpxchg(s, new, {
+               if (new.reserved
+                   ? (new.reservation_replicas < disk_res.nr_replicas)
+                   : (new.sectors < PAGE_SECTORS ||
+                      new.nr_replicas < disk_res.nr_replicas ||
+                      new.compressed)) {
+                       int sectors = (disk_res.nr_replicas * PAGE_SECTORS -
+                                      page_res_sectors(new) -
+                                      disk_res.sectors);
+
+                       if (sectors > 0) {
+                               ret = bch2_disk_reservation_add(c, &disk_res, sectors,
+                                               !check_enospc
+                                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
+                               if (unlikely(ret))
+                                       goto err;
+                       }
+
+                       new.reserved = 1;
+                       new.reservation_replicas = disk_res.nr_replicas;
+               }
+
+               if (!new.quota_reserved &&
+                   new.sectors + new.dirty_sectors < PAGE_SECTORS) {
+                       ret = bch2_quota_reservation_add(c, inode, &quota_res,
+                                               PAGE_SECTORS - quota_res.sectors,
+                                               check_enospc);
+                       if (unlikely(ret))
+                               goto err;
+
+                       new.quota_reserved = 1;
+               }
+       });
+
+       quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS;
+       disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old);
+err:
+       bch2_quota_reservation_put(c, inode, &quota_res);
+       bch2_disk_reservation_put(c, &disk_res);
+       return ret;
+}
+
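+/*
+ * Called when a page is being dropped from the page cache: release its dirty
+ * sector accounting and any disk/quota reservation it still holds:
+ */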
+static void bch2_clear_page_bits(struct page *page)
+{
+       struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_page_state s;
+
+       if (!PagePrivate(page))
+               return;
+
+       s.v = xchg(&page_state(page)->v, 0);
+       ClearPagePrivate(page);
+
+       if (s.dirty_sectors)
+               i_sectors_acct(c, inode, NULL, -s.dirty_sectors);
+
+       __bch2_put_page_reservation(c, inode, s);
+}
+
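+/*
+ * .dirty_folio address space operation: mark all sectors not already backed by
+ * data as dirty, applying the page's quota reservation to the newly dirtied
+ * sectors, then hand off to filemap_dirty_folio():
+ */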
+bool bch2_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct quota_res quota_res = { 0 };
+       struct bch_page_state old, new;
+
+       old = page_state_cmpxchg(page_state(&folio->page), new,
+               new.dirty_sectors = PAGE_SECTORS - new.sectors;
+               new.quota_reserved = 0;
+       );
+
+       quota_res.sectors += old.quota_reserved * PAGE_SECTORS;
+
+       if (old.dirty_sectors != new.dirty_sectors)
+               i_sectors_acct(c, inode, &quota_res,
+                              new.dirty_sectors - old.dirty_sectors);
+       bch2_quota_reservation_put(c, inode, &quota_res);
+
+       return filemap_dirty_folio(mapping, folio);
+}
+
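+/*
+ * Faulting a page in adds it to the page cache, so take the pagecache add lock
+ * to exclude O_DIRECT writes, which hold it in block mode while invalidating
+ * the page cache:
+ */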
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+       struct file *file = vmf->vma->vm_file;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       vm_fault_t ret;
+
+       bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+       ret = filemap_fault(vmf);
+       bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+       return ret;
+}
+
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+       struct page *page = vmf->page;
+       struct file *file = vmf->vma->vm_file;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct address_space *mapping = file->f_mapping;
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       vm_fault_t ret = VM_FAULT_LOCKED;
+
+       sb_start_pagefault(inode->v.i_sb);
+       file_update_time(file);
+
+       /*
+        * Not strictly necessary, but helps avoid dio writes livelocking in
+        * write_invalidate_inode_pages_range() - can drop this if/when we get
+        * a write_invalidate_inode_pages_range() that works without dropping
+        * page lock before invalidating page
+        */
+       bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+       lock_page(page);
+       if (page->mapping != mapping ||
+           page_offset(page) > i_size_read(&inode->v)) {
+               unlock_page(page);
+               ret = VM_FAULT_NOPAGE;
+               goto out;
+       }
+
+       if (bch2_get_page_reservation(c, inode, page, true)) {
+               unlock_page(page);
+               ret = VM_FAULT_SIGBUS;
+               goto out;
+       }
+
+       if (!PageDirty(page))
+               set_page_dirty(page);
+       wait_for_stable_page(page);
+out:
+       bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+       sb_end_pagefault(inode->v.i_sb);
+       return ret;
+}
+
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+       EBUG_ON(!PageLocked(&folio->page));
+       EBUG_ON(folio_test_writeback(folio));
+
+       if (offset || length < folio_size(folio))
+               return;
+
+       bch2_clear_page_bits(&folio->page);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+       /* XXX: this can't take locks that are held while we allocate memory */
+       EBUG_ON(!PageLocked(&folio->page));
+       EBUG_ON(folio_test_writeback(folio));
+
+       if (folio_test_dirty(folio))
+               return false;
+
+       bch2_clear_page_bits(&folio->page);
+       return true;
+}
+
+/* readpages/writepages: */
+
+static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
+{
+       sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
+
+       return bio->bi_vcnt < bio->bi_max_vecs &&
+               bio_end_sector(bio) == offset;
+}
+
+static int bio_add_page_contig(struct bio *bio, struct page *page)
+{
+       sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
+
+       EBUG_ON(!bio->bi_max_vecs);
+
+       if (!bio->bi_vcnt)
+               bio->bi_iter.bi_sector = offset;
+       else if (!bio_can_add_page_contig(bio, page))
+               return -1;
+
+       __bio_add_page(bio, page, PAGE_SIZE, 0);
+       return 0;
+}
+
+/* readpage(s): */
+
+static void bch2_readpages_end_io(struct bio *bio)
+{
+       struct bvec_iter_all iter;
+       struct bio_vec *bv;
+
+       bio_for_each_segment_all(bv, bio, iter) {
+               struct page *page = bv->bv_page;
+
+               if (!bio->bi_status) {
+                       SetPageUptodate(page);
+               } else {
+                       ClearPageUptodate(page);
+                       SetPageError(page);
+               }
+               unlock_page(page);
+       }
+
+       bio_put(bio);
+}
+
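+/*
+ * A page about to be read resets its allocation state; it is refilled by
+ * bch2_add_page_sectors() as extents are read in:
+ */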
+static inline void page_state_init_for_read(struct page *page)
+{
+       struct bch_page_state *s = page_state(page);
+
+       BUG_ON(s->reserved);
+       s->sectors      = 0;
+       s->compressed   = 0;
+}
+
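+/* State for walking the batch of pages handed to us by readahead: */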
+struct readpages_iter {
+       struct address_space    *mapping;
+       struct page             **pages;
+       unsigned                nr_pages;
+       unsigned                idx;
+       pgoff_t                 offset;
+};
+
+static int readpages_iter_init(struct readpages_iter *iter,
+                              struct readahead_control *ractl)
+{
+       unsigned i, nr_pages = readahead_count(ractl);
+
+       memset(iter, 0, sizeof(*iter));
+
+       iter->mapping   = ractl->mapping;
+       iter->offset    = readahead_index(ractl);
+       iter->nr_pages  = nr_pages;
+
+       iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
+       if (!iter->pages)
+               return -ENOMEM;
+
+       __readahead_batch(ractl, iter->pages, nr_pages);
+       for (i = 0; i < nr_pages; i++)
+               put_page(iter->pages[i]);
+
+       return 0;
+}
+
+static inline struct page *readpage_iter_next(struct readpages_iter *iter)
+{
+       if (iter->idx >= iter->nr_pages)
+               return NULL;
+
+       EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
+
+       page_state_init_for_read(iter->pages[iter->idx]);
+       return iter->pages[iter->idx];
+}
+
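+/*
+ * Mark the sectors of each page in @bio that are backed by @k as allocated,
+ * tracking the minimum replica count seen and whether any of the data is
+ * compressed - this is what bch2_get_page_reservation() consults later:
+ */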
+static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
+{
+       struct bvec_iter iter;
+       struct bio_vec bv;
+       bool compressed = bch2_extent_is_compressed(k);
+       unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k);
+
+       bio_for_each_segment(bv, bio, iter) {
+               struct bch_page_state *s = page_state(bv.bv_page);
+
+               /* sectors in @k from the start of this page: */
+               unsigned k_sectors = k.k->p.offset - iter.bi_sector;
+
+               unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
+
+               s->nr_replicas = !s->sectors
+                       ? nr_ptrs
+                       : min_t(unsigned, s->nr_replicas, nr_ptrs);
+
+               BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
+               s->sectors += page_sectors;
+
+               s->compressed |= compressed;
+       }
+}
+
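+/*
+ * Opportunistically grow @bio towards @offset with more pages - either from
+ * the readahead batch or freshly allocated and inserted into the page cache -
+ * so a large (e.g. checksummed or compressed) extent can be read in one go:
+ */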
+static void readpage_bio_extend(struct readpages_iter *iter,
+                               struct bio *bio, u64 offset,
+                               bool get_more)
+{
+       while (bio_end_sector(bio) < offset &&
+              bio->bi_vcnt < bio->bi_max_vecs) {
+               pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
+               struct page *page = readpage_iter_next(iter);
+               int ret;
+
+               if (page) {
+                       if (iter->offset + iter->idx != page_offset)
+                               break;
+
+                       iter->idx++;
+               } else {
+                       if (!get_more)
+                               break;
+
+                       page = xa_load(&iter->mapping->i_pages, page_offset);
+                       if (page && !xa_is_value(page))
+                               break;
+
+                       page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
+                       if (!page)
+                               break;
+
+                       page_state_init_for_read(page);
+
+                       ret = add_to_page_cache_lru(page, iter->mapping,
+                                                   page_offset, GFP_NOFS);
+                       if (ret) {
+                               ClearPagePrivate(page);
+                               put_page(page);
+                               break;
+                       }
+
+                       put_page(page);
+               }
+
+               __bio_add_page(bio, page, PAGE_SIZE, 0);
+       }
+}
+
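+/*
+ * Main read path: walk the extents btree over the range covered by @bio,
+ * issuing a read for each extent (extending the bio via @readpages_iter when
+ * it's worth reading a full extent) until the whole bio has been submitted:
+ */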
+static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
+                      struct bch_read_bio *rbio, u64 inum,
+                      struct readpages_iter *readpages_iter)
+{
+       struct bio *bio = &rbio->bio;
+       int flags = BCH_READ_RETRY_IF_STALE|
+               BCH_READ_MAY_PROMOTE;
+
+       rbio->c = c;
+       rbio->start_time = local_clock();
+
+       while (1) {
+               BKEY_PADDED(k) tmp;
+               struct bkey_s_c k;
+               unsigned bytes;
+
+               bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
+
+               k = bch2_btree_iter_peek_slot(iter);
+               BUG_ON(!k.k);
+
+               if (IS_ERR(k.k)) {
+                       int ret = bch2_btree_iter_unlock(iter);
+                       BUG_ON(!ret);
+                       bcache_io_error(c, bio, "btree IO error %i", ret);
+                       bio_endio(bio);
+                       return;
+               }
+
+               bkey_reassemble(&tmp.k, k);
+               bch2_btree_iter_unlock(iter);
+               k = bkey_i_to_s_c(&tmp.k);
+
+               if (readpages_iter) {
+                       bool want_full_extent = false;
+
+                       if (bkey_extent_is_data(k.k)) {
+                               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+                               struct bch_extent_crc_unpacked crc;
+                               const union bch_extent_entry *i;
+
+                               extent_for_each_crc(e, crc, i)
+                                       want_full_extent |= ((crc.csum_type != 0) |
+                                                            (crc.compression_type != 0));
+                       }
+
+                       readpage_bio_extend(readpages_iter,
+                                           bio, k.k->p.offset,
+                                           want_full_extent);
+               }
+
+               bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+                        bio->bi_iter.bi_sector) << 9;
+               swap(bio->bi_iter.bi_size, bytes);
+
+               if (bytes == bio->bi_iter.bi_size)
+                       flags |= BCH_READ_LAST_FRAGMENT;
+
+               if (bkey_extent_is_allocation(k.k))
+                       bch2_add_page_sectors(bio, k);
+
+               bch2_read_extent(c, rbio, k, flags);
+
+               if (flags & BCH_READ_LAST_FRAGMENT)
+                       return;
+
+               swap(bio->bi_iter.bi_size, bytes);
+               bio_advance(bio, bytes);
+       }
+}
+
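+/*
+ * .readahead address space operation: pack the pages readahead handed us into
+ * read bios and feed them to bchfs_read():
+ */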
+void bch2_readahead(struct readahead_control *ractl)
+{
+       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts = io_opts(c, inode);
+       struct btree_iter iter;
+       struct page *page;
+       struct readpages_iter readpages_iter;
+       int ret;
+
+       ret = readpages_iter_init(&readpages_iter, ractl);
+       BUG_ON(ret);
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+                            BTREE_ITER_SLOTS);
+
+       bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+       while ((page = readpage_iter_next(&readpages_iter))) {
+               pgoff_t index = readpages_iter.offset + readpages_iter.idx;
+               unsigned n = min_t(unsigned,
+                                  readpages_iter.nr_pages -
+                                  readpages_iter.idx,
+                                  BIO_MAX_VECS);
+               struct bch_read_bio *rbio =
+                       rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+                                                  GFP_NOFS, &c->bio_read),
+                                 opts);
+
+               readpages_iter.idx++;
+
+               rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
+               rbio->bio.bi_end_io = bch2_readpages_end_io;
+               __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0);
+
+               bchfs_read(c, &iter, rbio, inode->v.i_ino, &readpages_iter);
+       }
+
+       bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+       kfree(readpages_iter.pages);
+}
+
+static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
+                            u64 inum, struct page *page)
+{
+       struct btree_iter iter;
+
+       page_state_init_for_read(page);
+
+       rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+       bio_add_page_contig(&rbio->bio, page);
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+                            BTREE_ITER_SLOTS);
+       bchfs_read(c, &iter, rbio, inum, NULL);
+}
+
+static void bch2_read_single_page_end_io(struct bio *bio)
+{
+       complete(bio->bi_private);
+}
+
+static int bch2_read_single_page(struct page *page,
+                                struct address_space *mapping)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_read_bio *rbio;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(done);
+
+       rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
+                        io_opts(c, inode));
+       rbio->bio.bi_private = &done;
+       rbio->bio.bi_end_io = bch2_read_single_page_end_io;
+
+       __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+       wait_for_completion(&done);
+
+       ret = blk_status_to_errno(rbio->bio.bi_status);
+       bio_put(&rbio->bio);
+
+       if (ret < 0)
+               return ret;
+
+       SetPageUptodate(page);
+       return 0;
+}
+
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+       struct page *page = &folio->page;
+       int ret;
+
+       ret = bch2_read_single_page(page, page->mapping);
+       folio_unlock(folio);
+       return ret;
+}
+
+/* writepages: */
+
+struct bch_writepage_state {
+       struct bch_writepage_io *io;
+       struct bch_io_opts      opts;
+};
+
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+                                                                 struct bch_inode_info *inode)
+{
+       return (struct bch_writepage_state) { .opts = io_opts(c, inode) };
+}
+
+static void bch2_writepage_io_free(struct closure *cl)
+{
+       struct bch_writepage_io *io = container_of(cl,
+                                       struct bch_writepage_io, cl);
+
+       bio_put(&io->op.op.wbio.bio);
+}
+
+static void bch2_writepage_io_done(struct closure *cl)
+{
+       struct bch_writepage_io *io = container_of(cl,
+                                       struct bch_writepage_io, cl);
+       struct bch_fs *c = io->op.op.c;
+       struct bio *bio = &io->op.op.wbio.bio;
+       struct bvec_iter_all iter;
+       struct bio_vec *bvec;
+
+       if (io->op.op.error) {
+               bio_for_each_segment_all(bvec, bio, iter)
+                       SetPageError(bvec->bv_page);
+               set_bit(AS_EIO, &io->op.inode->v.i_mapping->flags);
+       }
+
+       /*
+        * racing with fallocate can cause us to add fewer sectors than
+        * expected - but we shouldn't add more sectors than expected:
+        */
+       BUG_ON(io->op.sectors_added > (s64) io->new_sectors);
+
+       /*
+        * (error (due to going RO) halfway through a page can screw that up
+        * slightly)
+        * XXX wtf?
+          BUG_ON(io->op.sectors_added - io->new_sectors >= (s64) PAGE_SECTORS);
+        */
+
+       /*
+        * PageWriteback is effectively our ref on the inode - fixup i_blocks
+        * before calling end_page_writeback:
+        */
+       if (io->op.sectors_added != io->new_sectors)
+               i_sectors_acct(c, io->op.inode, NULL,
+                              io->op.sectors_added - (s64) io->new_sectors);
+
+       bio_for_each_segment_all(bvec, bio, iter)
+               end_page_writeback(bvec->bv_page);
+
+       closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
+}
+
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+       struct bch_writepage_io *io = w->io;
+
+       w->io = NULL;
+       closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
+       continue_at(&io->cl, bch2_writepage_io_done, NULL);
+}
+
+/*
+ * Get a bch_writepage_io and add @page to it - appending to an existing one if
+ * possible, else allocating a new one:
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+                                   struct bch_writepage_state *w,
+                                   struct bch_inode_info *inode,
+                                   struct page *page,
+                                   unsigned nr_replicas)
+{
+       struct bch_write_op *op;
+       u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
+
+       w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+                                             REQ_OP_WRITE,
+                                             GFP_NOFS,
+                                             &c->writepage_bioset),
+                            struct bch_writepage_io, op.op.wbio.bio);
+
+       closure_init(&w->io->cl, NULL);
+       w->io->new_sectors      = 0;
+       bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false);
+       op                      = &w->io->op.op;
+       op->nr_replicas         = nr_replicas;
+       op->res.nr_replicas     = nr_replicas;
+       op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
+       op->pos                 = POS(inode->v.i_ino, offset);
+       op->wbio.bio.bi_iter.bi_sector = offset;
+}
+
+static int __bch2_writepage(struct folio *folio,
+                           struct writeback_control *wbc,
+                           void *data)
+{
+       struct page *page = &folio->page;
+       struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_writepage_state *w = data;
+       struct bch_page_state new, old;
+       unsigned offset;
+       loff_t i_size = i_size_read(&inode->v);
+       pgoff_t end_index = i_size >> PAGE_SHIFT;
+
+       EBUG_ON(!PageUptodate(page));
+
+       /* Is the page fully inside i_size? */
+       if (page->index < end_index)
+               goto do_io;
+
+       /* Is the page fully outside i_size? (truncate in progress) */
+       offset = i_size & (PAGE_SIZE - 1);
+       if (page->index > end_index || !offset) {
+               unlock_page(page);
+               return 0;
+       }
+
+       /*
+        * The page straddles i_size.  It must be zeroed out on each and every
+        * writepage invocation because it may be mmapped.  "A file is mapped
+        * in multiples of the page size.  For a file that is not a multiple of
+        * the  page size, the remaining memory is zeroed when mapped, and
+        * writes to that region are not written out to the file."
+        */
+       zero_user_segment(page, offset, PAGE_SIZE);
+do_io:
+       /* Before unlocking the page, transfer reservation to w->io: */
+       old = page_state_cmpxchg(page_state(page), new, {
+               EBUG_ON(!new.reserved &&
+                       (new.sectors != PAGE_SECTORS ||
+                       new.compressed));
+
+               if (new.reserved)
+                       new.nr_replicas = new.reservation_replicas;
+               new.reserved = 0;
+
+               new.compressed |= w->opts.compression != 0;
+
+               new.sectors += new.dirty_sectors;
+               new.dirty_sectors = 0;
+       });
+
+       BUG_ON(PageWriteback(page));
+       set_page_writeback(page);
+       unlock_page(page);
+
+       if (w->io &&
+           (w->io->op.op.res.nr_replicas != new.nr_replicas ||
+            !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
+               bch2_writepage_do_io(w);
+
+       if (!w->io)
+               bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas);
+
+       w->io->new_sectors += new.sectors - old.sectors;
+
+       BUG_ON(inode != w->io->op.inode);
+       BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+
+       if (old.reserved)
+               w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS;
+
+       w->io->op.new_i_size = i_size;
+
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
+       return 0;
+}
+
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+       struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+       struct bch_writepage_state w =
+               bch_writepage_state_init(c, to_bch_ei(mapping->host));
+       struct blk_plug plug;
+       int ret;
+
+       blk_start_plug(&plug);
+       ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+       if (w.io)
+               bch2_writepage_do_io(&w);
+       blk_finish_plug(&plug);
+       return ret;
+}
+
+int bch2_writepage(struct page *page, struct writeback_control *wbc)
+{
+       struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
+       struct bch_writepage_state w =
+               bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
+       int ret;
+
+       ret = __bch2_writepage(page_folio(page), wbc, &w);
+       if (w.io)
+               bch2_writepage_do_io(&w);
+
+       return ret;
+}
+
+/* buffered writes: */
+
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+                    loff_t pos, unsigned len,
+                    struct page **pagep, void **fsdata)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       pgoff_t index = pos >> PAGE_SHIFT;
+       unsigned offset = pos & (PAGE_SIZE - 1);
+       struct page *page;
+       int ret = -ENOMEM;
+
+       BUG_ON(inode_unhashed(&inode->v));
+
+       bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+       page = grab_cache_page_write_begin(mapping, index);
+       if (!page)
+               goto err_unlock;
+
+       if (PageUptodate(page))
+               goto out;
+
+       /* If we're writing entire page, don't need to read it in first: */
+       if (len == PAGE_SIZE)
+               goto out;
+
+       if (!offset && pos + len >= inode->v.i_size) {
+               zero_user_segment(page, len, PAGE_SIZE);
+               flush_dcache_page(page);
+               goto out;
+       }
+
+       if (index > inode->v.i_size >> PAGE_SHIFT) {
+               zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
+               flush_dcache_page(page);
+               goto out;
+       }
+readpage:
+       ret = bch2_read_single_page(page, mapping);
+       if (ret)
+               goto err;
+out:
+       ret = bch2_get_page_reservation(c, inode, page, true);
+       if (ret) {
+               if (!PageUptodate(page)) {
+                       /*
+                        * If the page hasn't been read in, we won't know if we
+                        * actually need a reservation - we don't actually need
+                        * to read here, we just need to check if the page is
+                        * fully backed by uncompressed data:
+                        */
+                       goto readpage;
+               }
+
+               goto err;
+       }
+
+       *pagep = page;
+       return 0;
+err:
+       unlock_page(page);
+       put_page(page);
+       *pagep = NULL;
+err_unlock:
+       bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+       return ret;
+}
+
+int bch2_write_end(struct file *file, struct address_space *mapping,
+                  loff_t pos, unsigned len, unsigned copied,
+                  struct page *page, void *fsdata)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       lockdep_assert_held(&inode->v.i_rwsem);
+
+       if (unlikely(copied < len && !PageUptodate(page))) {
+               /*
+                * The page needs to be read in, but that would destroy
+                * our partial write - simplest thing is to just force
+                * userspace to redo the write:
+                */
+               zero_user(page, 0, PAGE_SIZE);
+               flush_dcache_page(page);
+               copied = 0;
+       }
+
+       spin_lock(&inode->v.i_lock);
+       if (pos + copied > inode->v.i_size)
+               i_size_write(&inode->v, pos + copied);
+       spin_unlock(&inode->v.i_lock);
+
+       if (copied) {
+               if (!PageUptodate(page))
+                       SetPageUptodate(page);
+               if (!PageDirty(page))
+                       set_page_dirty(page);
+
+               inode->ei_last_dirtied = (unsigned long) current;
+       } else {
+               bch2_put_page_reservation(c, inode, page);
+       }
+
+       unlock_page(page);
+       put_page(page);
+       bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+       return copied;
+}
+
+#define WRITE_BATCH_PAGES      32
+
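+/*
+ * Core of the buffered write path: lock up to WRITE_BATCH_PAGES pages, read in
+ * any partially overwritten pages, take page reservations, then copy from
+ * @iter and dirty whatever was copied:
+ */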
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+                                struct address_space *mapping,
+                                struct iov_iter *iter,
+                                loff_t pos, unsigned len)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct page *pages[WRITE_BATCH_PAGES];
+       unsigned long index = pos >> PAGE_SHIFT;
+       unsigned offset = pos & (PAGE_SIZE - 1);
+       unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+       unsigned i, copied = 0, nr_pages_copied = 0;
+       int ret = 0;
+
+       BUG_ON(!len);
+       BUG_ON(nr_pages > ARRAY_SIZE(pages));
+
+       for (i = 0; i < nr_pages; i++) {
+               pages[i] = grab_cache_page_write_begin(mapping, index + i);
+               if (!pages[i]) {
+                       nr_pages = i;
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       }
+
+       if (offset && !PageUptodate(pages[0])) {
+               ret = bch2_read_single_page(pages[0], mapping);
+               if (ret)
+                       goto out;
+       }
+
+       if ((pos + len) & (PAGE_SIZE - 1) &&
+           !PageUptodate(pages[nr_pages - 1])) {
+               if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
+                       zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
+               } else {
+                       ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
+                       if (ret)
+                               goto out;
+               }
+       }
+
+       for (i = 0; i < nr_pages; i++) {
+               ret = bch2_get_page_reservation(c, inode, pages[i], true);
+
+               if (ret && !PageUptodate(pages[i])) {
+                       ret = bch2_read_single_page(pages[i], mapping);
+                       if (ret)
+                               goto out;
+
+                       ret = bch2_get_page_reservation(c, inode, pages[i], true);
+               }
+
+               if (ret)
+                       goto out;
+       }
+
+       if (mapping_writably_mapped(mapping))
+               for (i = 0; i < nr_pages; i++)
+                       flush_dcache_page(pages[i]);
+
+       while (copied < len) {
+               struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
+               unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
+               unsigned pg_bytes = min_t(unsigned, len - copied,
+                                         PAGE_SIZE - pg_offset);
+               unsigned pg_copied = copy_page_from_iter_atomic(page,
+                                               pg_offset, pg_bytes, iter);
+
+               flush_dcache_page(page);
+               copied += pg_copied;
+
+               if (pg_copied != pg_bytes)
+                       break;
+       }
+
+       if (!copied)
+               goto out;
+
+       nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
+       inode->ei_last_dirtied = (unsigned long) current;
+
+       spin_lock(&inode->v.i_lock);
+       if (pos + copied > inode->v.i_size)
+               i_size_write(&inode->v, pos + copied);
+       spin_unlock(&inode->v.i_lock);
+
+       if (copied < len &&
+           ((offset + copied) & (PAGE_SIZE - 1))) {
+               struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
+
+               if (!PageUptodate(page)) {
+                       zero_user(page, 0, PAGE_SIZE);
+                       copied -= (offset + copied) & (PAGE_SIZE - 1);
+               }
+       }
+out:
+       for (i = 0; i < nr_pages_copied; i++) {
+               if (!PageUptodate(pages[i]))
+                       SetPageUptodate(pages[i]);
+               if (!PageDirty(pages[i]))
+                       set_page_dirty(pages[i]);
+               unlock_page(pages[i]);
+               put_page(pages[i]);
+       }
+
+       for (i = nr_pages_copied; i < nr_pages; i++) {
+               if (!PageDirty(pages[i]))
+                       bch2_put_page_reservation(c, inode, pages[i]);
+               unlock_page(pages[i]);
+               put_page(pages[i]);
+       }
+
+       return copied ?: ret;
+}
+
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+       struct address_space *mapping = file->f_mapping;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       loff_t pos = iocb->ki_pos;
+       ssize_t written = 0;
+       int ret = 0;
+
+       bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+       do {
+               unsigned offset = pos & (PAGE_SIZE - 1);
+               unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
+                             PAGE_SIZE * WRITE_BATCH_PAGES - offset);
+again:
+               /*
+                * Bring in the user page that we will copy from _first_.
+                * Otherwise there's a nasty deadlock on copying from the
+                * same page as we're writing to, without it being marked
+                * up-to-date.
+                *
+                * Not only is this an optimisation, but it is also required
+                * to check that the address is actually valid, when atomic
+                * usercopies are used, below.
+                */
+               if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+                       bytes = min_t(unsigned long, iov_iter_count(iter),
+                                     PAGE_SIZE - offset);
+
+                       if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+                               ret = -EFAULT;
+                               break;
+                       }
+               }
+
+               if (unlikely(fatal_signal_pending(current))) {
+                       ret = -EINTR;
+                       break;
+               }
+
+               ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+               if (unlikely(ret < 0))
+                       break;
+
+               cond_resched();
+
+               if (unlikely(ret == 0)) {
+                       /*
+                        * If we were unable to copy any data at all, we must
+                        * fall back to a single segment length write.
+                        *
+                        * If we didn't fallback here, we could livelock
+                        * because not all segments in the iov can be copied at
+                        * once without a pagefault.
+                        */
+                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                     iov_iter_single_seg_count(iter));
+                       goto again;
+               }
+               pos += ret;
+               written += ret;
+
+               balance_dirty_pages_ratelimited(mapping);
+       } while (iov_iter_count(iter));
+
+       bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+       return written ? written : ret;
+}
+
+/* O_DIRECT reads */
+
+static void bch2_dio_read_complete(struct closure *cl)
+{
+       struct dio_read *dio = container_of(cl, struct dio_read, cl);
+
+       dio->req->ki_complete(dio->req, dio->ret);
+       bio_check_pages_dirty(&dio->rbio.bio);  /* transfers ownership */
+}
+
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+       struct dio_read *dio = bio->bi_private;
+
+       if (bio->bi_status)
+               dio->ret = blk_status_to_errno(bio->bi_status);
+
+       closure_put(&dio->cl);
+}
+
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+       bch2_direct_IO_read_endio(bio);
+       bio_check_pages_dirty(bio);     /* transfers ownership */
+}
+
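+/*
+ * O_DIRECT reads may need more than one bio: the first is allocated from
+ * dio_read_bioset with the dio_read embedded in it, subsequent split bios come
+ * from bio_read and each take a ref on the parent closure:
+ */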
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+       struct file *file = req->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts = io_opts(c, inode);
+       struct dio_read *dio;
+       struct bio *bio;
+       loff_t offset = req->ki_pos;
+       bool sync = is_sync_kiocb(req);
+       size_t shorten;
+       ssize_t ret;
+
+       if ((offset|iter->count) & (block_bytes(c) - 1))
+               return -EINVAL;
+
+       ret = min_t(loff_t, iter->count,
+                   max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+       if (!ret)
+               return ret;
+
+       shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+       iter->count -= shorten;
+
+       bio = bio_alloc_bioset(NULL,
+                              iov_iter_npages(iter, BIO_MAX_VECS),
+                              REQ_OP_READ,
+                              GFP_KERNEL,
+                              &c->dio_read_bioset);
+
+       bio->bi_end_io = bch2_direct_IO_read_endio;
+
+       dio = container_of(bio, struct dio_read, rbio.bio);
+       closure_init(&dio->cl, NULL);
+
+       /*
+        * this is a _really_ horrible hack just to avoid an atomic sub at the
+        * end:
+        */
+       if (!sync) {
+               set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+               atomic_set(&dio->cl.remaining,
+                          CLOSURE_REMAINING_INITIALIZER -
+                          CLOSURE_RUNNING +
+                          CLOSURE_DESTRUCTOR);
+       } else {
+               atomic_set(&dio->cl.remaining,
+                          CLOSURE_REMAINING_INITIALIZER + 1);
+       }
+
+       dio->req        = req;
+       dio->ret        = ret;
+
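+       /*
+        * The first bio, embedded in the dio, was allocated above - jump into
+        * the middle of the loop to submit it before allocating any split bios:
+        */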
+       goto start;
+       while (iter->count) {
+               bio = bio_alloc_bioset(NULL,
+                                      iov_iter_npages(iter, BIO_MAX_VECS),
+                                      REQ_OP_READ,
+                                      GFP_KERNEL,
+                                      &c->bio_read);
+               bio->bi_end_io          = bch2_direct_IO_read_split_endio;
+start:
+               bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
+               bio->bi_iter.bi_sector  = offset >> 9;
+               bio->bi_private         = dio;
+
+               ret = bio_iov_iter_get_pages(bio, iter);
+               if (ret < 0) {
+                       /* XXX: fault inject this path */
+                       bio->bi_status = BLK_STS_RESOURCE;
+                       bio_endio(bio);
+                       break;
+               }
+
+               offset += bio->bi_iter.bi_size;
+               bio_set_pages_dirty(bio);
+
+               if (iter->count)
+                       closure_get(&dio->cl);
+
+               bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+       }
+
+       iter->count += shorten;
+
+       if (sync) {
+               closure_sync(&dio->cl);
+               closure_debug_destroy(&dio->cl);
+               ret = dio->ret;
+               bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+               return ret;
+       } else {
+               return -EIOCBQUEUED;
+       }
+}
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct address_space *mapping = file->f_mapping;
+       size_t count = iov_iter_count(iter);
+       ssize_t ret;
+
+       if (!count)
+               return 0; /* skip atime */
+
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               struct blk_plug plug;
+
+               ret = filemap_write_and_wait_range(mapping,
+                                       iocb->ki_pos,
+                                       iocb->ki_pos + count - 1);
+               if (ret < 0)
+                       return ret;
+
+               file_accessed(file);
+
+               blk_start_plug(&plug);
+               ret = bch2_direct_IO_read(iocb, iter);
+               blk_finish_plug(&plug);
+
+               if (ret >= 0)
+                       iocb->ki_pos += ret;
+       } else {
+               bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+               ret = generic_file_read_iter(iocb, iter);
+               bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+       }
+
+       return ret;
+}
+
+/* O_DIRECT writes */
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, we're not guaranteed that it will live for the duration of
+ * the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+       struct iovec *iov = dio->inline_vecs;
+
+       /*
+        * ITER_UBUF iters store the user pointer inline in the iov_iter -
+        * nothing to stash:
+        */
+       if (iter_is_ubuf(&dio->iter))
+               return 0;
+
+       /*
+        * We don't currently handle non-iovec iov_iters here - return an error,
+        * and we'll fall back to doing the IO synchronously:
+        */
+       if (!iter_is_iovec(&dio->iter))
+               return -1;
+
+       if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+               iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+                                   GFP_KERNEL);
+               if (unlikely(!iov))
+                       return -ENOMEM;
+
+               dio->free_iov = true;
+       }
+
+       memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+       dio->iter.__iov = iov;
+       return 0;
+}
+
+static void bch2_dio_write_loop_async(struct closure *);
+
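+/*
+ * O_DIRECT write loop: flush and invalidate the pagecache range being written,
+ * pin the user pages with bio_iov_iter_get_pages(), and issue bch2_write()
+ * calls until the iov_iter is drained. For async kiocbs the loop is re-entered
+ * from bch2_dio_write_loop_async() as each write completes:
+ */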
+static long bch2_dio_write_loop(struct dio_write *dio)
+{
+       struct kiocb *req = dio->req;
+       struct address_space *mapping = req->ki_filp->f_mapping;
+       struct bch_inode_info *inode = dio->iop.inode;
+       struct bio *bio = &dio->iop.op.wbio.bio;
+       struct bvec_iter_all iter;
+       struct bio_vec *bv;
+       bool sync;
+       long ret;
+
+       if (dio->loop)
+               goto loop;
+
+       inode_dio_begin(&inode->v);
+       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+       /* Write and invalidate pagecache range that we're writing to: */
+       ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
+                               req->ki_pos + iov_iter_count(&dio->iter) - 1);
+       if (unlikely(ret))
+               goto err;
+
+       while (1) {
+               if (current != dio->task)
+                       kthread_use_mm(dio->task->mm);
+               BUG_ON(current->faults_disabled_mapping);
+               current->faults_disabled_mapping = mapping;
+
+               ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+               current->faults_disabled_mapping = NULL;
+               if (current != dio->task)
+                       kthread_unuse_mm(dio->task->mm);
+
+               if (unlikely(ret < 0))
+                       goto err;
+
+               /* gup might have faulted pages back in: */
+               ret = write_invalidate_inode_pages_range(mapping,
+                               req->ki_pos + (dio->iop.op.written << 9),
+                               req->ki_pos + iov_iter_count(&dio->iter) - 1);
+               if (unlikely(ret))
+                       goto err;
+
+               dio->iop.op.pos = POS(inode->v.i_ino,
+                               (req->ki_pos >> 9) + dio->iop.op.written);
+
+               task_io_account_write(bio->bi_iter.bi_size);
+
+               closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl);
+
+               if (!dio->sync && !dio->loop && dio->iter.count) {
+                       if (bch2_dio_write_copy_iov(dio)) {
+                               dio->iop.op.error = -ENOMEM;
+                               goto err_wait_io;
+                       }
+               }
+err_wait_io:
+               dio->loop = true;
+
+               if (!dio->sync) {
+                       continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
+                       return -EIOCBQUEUED;
+               }
+
+               closure_sync(&dio->cl);
+loop:
+               bio_for_each_segment_all(bv, bio, iter)
+                       put_page(bv->bv_page);
+               if (!dio->iter.count || dio->iop.op.error)
+                       break;
+               bio_reset(bio, NULL, REQ_OP_WRITE);
+       }
+
+       ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
+err:
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res);
+       bch2_quota_reservation_put(dio->iop.op.c, inode, &dio->quota_res);
+
+       if (dio->free_iov)
+               kfree(dio->iter.__iov);
+
+       closure_debug_destroy(&dio->cl);
+
+       sync = dio->sync;
+       bio_put(bio);
+
+       /* inode->i_dio_count is our ref on inode and thus bch_fs */
+       inode_dio_end(&inode->v);
+
+       if (!sync) {
+               req->ki_complete(req, ret);
+               ret = -EIOCBQUEUED;
+       }
+       return ret;
+}
+
+static void bch2_dio_write_loop_async(struct closure *cl)
+{
+       struct dio_write *dio = container_of(cl, struct dio_write, cl);
+
+       bch2_dio_write_loop(dio);
+}
+
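+/*
+ * Set up a dio_write: take quota and disk reservations up front for the entire
+ * write; if the disk reservation fails, fall back to checking that the range
+ * being overwritten is already allocated (dio->iop.unalloc):
+ */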
+static noinline
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+       struct file *file = req->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct dio_write *dio;
+       struct bio *bio;
+       loff_t offset = req->ki_pos;
+       ssize_t ret;
+
+       lockdep_assert_held(&inode->v.i_rwsem);
+
+       if (unlikely(!iter->count))
+               return 0;
+
+       if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
+               return -EINVAL;
+
+       bio = bio_alloc_bioset(NULL,
+                              iov_iter_npages(iter, BIO_MAX_VECS),
+                              REQ_OP_WRITE,
+                              GFP_KERNEL,
+                              &c->dio_write_bioset);
+       dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
+       closure_init(&dio->cl, NULL);
+       dio->req                = req;
+       dio->task               = current;
+       dio->loop               = false;
+       dio->sync               = is_sync_kiocb(req) ||
+               offset + iter->count > inode->v.i_size;
+       dio->free_iov           = false;
+       dio->quota_res.sectors  = 0;
+       dio->iter               = *iter;
+       bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
+       dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task);
+       dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
+
+       if ((req->ki_flags & IOCB_DSYNC) &&
+           !c->opts.journal_flush_disabled)
+               dio->iop.op.flags |= BCH_WRITE_FLUSH;
+
+       ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+                                        iter->count >> 9, true);
+       if (unlikely(ret))
+               goto err;
+
+       ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
+                                       dio->iop.op.opts.data_replicas, 0);
+       if (unlikely(ret)) {
+               if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
+                                                     offset >> 9),
+                                              iter->count >> 9))
+                       goto err;
+
+               dio->iop.unalloc = true;
+       }
+
+       dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
+
+       return bch2_dio_write_loop(dio);
+err:
+       bch2_disk_reservation_put(c, &dio->iop.op.res);
+       bch2_quota_reservation_put(c, inode, &dio->quota_res);
+       closure_debug_destroy(&dio->cl);
+       bio_put(bio);
+       return ret;
+}
+
+static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       ssize_t ret;
+
+       ret = file_remove_privs(file);
+       if (ret)
+               return ret;
+
+       ret = file_update_time(file);
+       if (ret)
+               return ret;
+
+       ret = iocb->ki_flags & IOCB_DIRECT
+               ? bch2_direct_write(iocb, from)
+               : bch2_buffered_write(iocb, from);
+
+       if (likely(ret > 0))
+               iocb->ki_pos += ret;
+
+       return ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp);
+       bool direct = iocb->ki_flags & IOCB_DIRECT;
+       ssize_t ret;
+
+       inode_lock(&inode->v);
+       ret = generic_write_checks(iocb, from);
+       if (ret > 0)
+               ret = __bch2_write_iter(iocb, from);
+       inode_unlock(&inode->v);
+
+       if (ret > 0 && !direct)
+               ret = generic_write_sync(iocb, ret);
+
+       return ret;
+}
+
+/* fsync: */
+
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       int ret;
+
+       ret = file_write_and_wait_range(file, start, end);
+       if (ret)
+               return ret;
+
+       if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
+               goto out;
+
+       ret = sync_inode_metadata(&inode->v, 1);
+       if (ret)
+               return ret;
+out:
+       if (c->opts.journal_flush_disabled)
+               return 0;
+
+       return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
+}
+
+/* truncate: */
+
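+/*
+ * Returns 1 if any extent in [start, end) contains data, 0 if none do, or a
+ * negative error code:
+ */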
+static inline int range_has_data(struct bch_fs *c,
+                                 struct bpos start,
+                                 struct bpos end)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          start, 0, k) {
+               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+                       break;
+
+               if (bkey_extent_is_data(k.k)) {
+                       ret = 1;
+                       break;
+               }
+       }
+
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
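+/*
+ * Zero out the part of the page at @index that lies within [start, end),
+ * reading it in first if there's data on disk for it - used by truncate and
+ * hole punching for the partial pages at either end of the range:
+ */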
+static int __bch2_truncate_page(struct bch_inode_info *inode,
+                               pgoff_t index, loff_t start, loff_t end)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct address_space *mapping = inode->v.i_mapping;
+       unsigned start_offset = start & (PAGE_SIZE - 1);
+       unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+       struct page *page;
+       int ret = 0;
+
+       /* Page boundary? Nothing to do */
+       if (!((index == start >> PAGE_SHIFT && start_offset) ||
+             (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
+               return 0;
+
+       /* Above i_size? */
+       if (index << PAGE_SHIFT >= inode->v.i_size)
+               return 0;
+
+       page = find_lock_page(mapping, index);
+       if (!page) {
+               /*
+                * XXX: we're doing two index lookups when we end up reading the
+                * page
+                */
+               ret = range_has_data(c,
+                               POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
+                               POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
+               if (ret <= 0)
+                       return ret;
+
+               page = find_or_create_page(mapping, index, GFP_KERNEL);
+               if (unlikely(!page)) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       }
+
+       if (!PageUptodate(page)) {
+               ret = bch2_read_single_page(page, mapping);
+               if (ret)
+                       goto unlock;
+       }
+
+       /*
+        * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+        *
+        * XXX: because we aren't currently tracking whether the page has actual
+        * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+        */
+       ret = bch2_get_page_reservation(c, inode, page, false);
+       BUG_ON(ret);
+
+       if (index == start >> PAGE_SHIFT &&
+           index == end >> PAGE_SHIFT)
+               zero_user_segment(page, start_offset, end_offset);
+       else if (index == start >> PAGE_SHIFT)
+               zero_user_segment(page, start_offset, PAGE_SIZE);
+       else if (index == end >> PAGE_SHIFT)
+               zero_user_segment(page, 0, end_offset);
+
+       if (!PageDirty(page))
+               set_page_dirty(page);
+unlock:
+       unlock_page(page);
+       put_page(page);
+out:
+       return ret;
+}
+
+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
+{
+       return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
+                                   from, from + PAGE_SIZE);
+}
+
+static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct address_space *mapping = inode->v.i_mapping;
+       int ret;
+
+       ret = filemap_write_and_wait_range(mapping,
+                       inode->ei_inode.bi_size, S64_MAX);
+       if (ret)
+               return ret;
+
+       truncate_setsize(&inode->v, iattr->ia_size);
+       /* ATTR_MODE will never be set here, ns argument isn't needed: */
+       setattr_copy(NULL, &inode->v, iattr);
+
+       mutex_lock(&inode->ei_update_lock);
+       inode_set_ctime_current(&inode->v);
+       inode->v.i_mtime = inode_get_ctime(&inode->v);
+       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+       mutex_unlock(&inode->ei_update_lock);
+
+       return ret;
+}
+
+int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct address_space *mapping = inode->v.i_mapping;
+       struct i_sectors_hook i_sectors_hook =
+               i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
+       bool shrink;
+       int ret = 0;
+
+       inode_dio_wait(&inode->v);
+       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+       BUG_ON(inode->v.i_size < inode->ei_inode.bi_size);
+
+       shrink = iattr->ia_size <= inode->v.i_size;
+
+       if (!shrink) {
+               ret = bch2_extend(inode, iattr);
+               goto err_put_pagecache;
+       }
+
+       ret = bch2_truncate_page(inode, iattr->ia_size);
+       if (unlikely(ret))
+               goto err_put_pagecache;
+
+       if (iattr->ia_size > inode->ei_inode.bi_size)
+               ret = filemap_write_and_wait_range(mapping,
+                               inode->ei_inode.bi_size,
+                               iattr->ia_size - 1);
+       else if (iattr->ia_size & (PAGE_SIZE - 1))
+               ret = filemap_write_and_wait_range(mapping,
+                               round_down(iattr->ia_size, PAGE_SIZE),
+                               iattr->ia_size - 1);
+       if (ret)
+               goto err_put_pagecache;
+
+       i_sectors_hook.new_i_size = iattr->ia_size;
+
+       ret = i_sectors_dirty_start(c, &i_sectors_hook);
+       if (unlikely(ret))
+               goto err_put_pagecache;
+
+       truncate_setsize(&inode->v, iattr->ia_size);
+
+       ret = bch2_inode_truncate(c, inode->v.i_ino,
+                                 round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+                                 &i_sectors_hook.hook,
+                                 &inode->ei_journal_seq);
+       if (unlikely(ret))
+               goto err_put_sectors_dirty;
+
+       /* ATTR_MODE will never be set here, ns argument isn't needed: */
+       setattr_copy(NULL, &inode->v, iattr);
+       inode_set_ctime_current(&inode->v);
+       inode->v.i_mtime = inode_get_ctime(&inode->v);
+out:
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+err_put_pagecache:
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       return ret;
+err_put_sectors_dirty:
+       /*
+        * On error - in particular, bch2_truncate_page() error - don't clear
+        * I_SIZE_DIRTY, as we've left data above i_size!:
+        */
+       i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
+       goto out;
+}
+
+/* fallocate: */
+
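+/*
+ * Hole punch (fallocate FALLOC_FL_PUNCH_HOLE): zero the partial pages at
+ * either end of the range, drop the page cache, then delete the whole blocks
+ * in between from the extents btree:
+ */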
+static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       u64 ino = inode->v.i_ino;
+       u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
+       u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
+       int ret = 0;
+
+       inode_lock(&inode->v);
+       inode_dio_wait(&inode->v);
+       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+       ret = __bch2_truncate_page(inode,
+                                  offset >> PAGE_SHIFT,
+                                  offset, offset + len);
+       if (unlikely(ret))
+               goto err;
+
+       if (offset >> PAGE_SHIFT !=
+           (offset + len) >> PAGE_SHIFT) {
+               ret = __bch2_truncate_page(inode,
+                                          (offset + len) >> PAGE_SHIFT,
+                                          offset, offset + len);
+               if (unlikely(ret))
+                       goto err;
+       }
+
+       truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+
+       if (discard_start < discard_end) {
+               /*
+                * We need to pass in a disk reservation here because we might
+                * be splitting a compressed extent into two. This isn't a
+                * problem with truncate because truncate will never split an
+                * extent, only truncate it...
+                */
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(c, 0);
+               struct i_sectors_hook i_sectors_hook =
+                       i_sectors_hook_init(inode, 0);
+
+               ret = i_sectors_dirty_start(c, &i_sectors_hook);
+               if (unlikely(ret))
+                       goto err;
+
+               ret = bch2_btree_delete_range(c,
+                               BTREE_ID_EXTENTS,
+                               POS(ino, discard_start),
+                               POS(ino, discard_end),
+                               ZERO_VERSION,
+                               &disk_res,
+                               &i_sectors_hook.hook,
+                               &inode->ei_journal_seq);
+
+               ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+       }
+err:
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       inode_unlock(&inode->v);
+
+       return ret;
+}
+
+static long bch2_fcollapse(struct bch_inode_info *inode,
+                          loff_t offset, loff_t len)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct address_space *mapping = inode->v.i_mapping;
+       struct btree_iter src;
+       struct btree_iter dst;
+       BKEY_PADDED(k) copy;
+       struct bkey_s_c k;
+       struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
+       loff_t new_size;
+       int ret;
+
+       if ((offset | len) & (block_bytes(c) - 1))
+               return -EINVAL;
+
+       bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS,
+                            POS(inode->v.i_ino, offset >> 9),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       /* position will be set from dst iter's position: */
+       bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN,
+                            BTREE_ITER_SLOTS);
+       bch2_btree_iter_link(&src, &dst);
+
+       /*
+        * We need i_mutex to keep the page cache consistent with the extents
+        * btree, and the btree consistent with i_size - we don't need outside
+        * locking for the extents btree itself, because we're using linked
+        * iterators
+        */
+       inode_lock(&inode->v);
+       inode_dio_wait(&inode->v);
+       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+       ret = -EINVAL;
+       if (offset + len >= inode->v.i_size)
+               goto err;
+
+       if (inode->v.i_size < len)
+               goto err;
+
+       new_size = inode->v.i_size - len;
+
+       ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+       if (ret)
+               goto err;
+
+       ret = i_sectors_dirty_start(c, &i_sectors_hook);
+       if (ret)
+               goto err;
+
+       while (bkey_cmp(dst.pos,
+                       POS(inode->v.i_ino,
+                           round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
+               struct disk_reservation disk_res;
+
+               bch2_btree_iter_set_pos(&src,
+                       POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
+
+               k = bch2_btree_iter_peek_slot(&src);
+               if ((ret = btree_iter_err(k)))
+                       goto btree_iter_err;
+
+               bkey_reassemble(&copy.k, k);
+
+               bch2_cut_front(src.pos, &copy.k);
+               copy.k.k.p.offset -= len >> 9;
+
+               BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(&copy.k.k)));
+
+               ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
+                               bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
+                               BCH_DISK_RESERVATION_NOFAIL);
+               BUG_ON(ret);
+
+               ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
+                                          &inode->ei_journal_seq,
+                                          BTREE_INSERT_ATOMIC|
+                                          BTREE_INSERT_NOFAIL,
+                                          BTREE_INSERT_ENTRY(&dst, &copy.k));
+               bch2_disk_reservation_put(c, &disk_res);
+btree_iter_err:
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret) {
+                       bch2_btree_iter_unlock(&src);
+                       bch2_btree_iter_unlock(&dst);
+                       goto err_put_sectors_dirty;
+               }
+               /*
+                * XXX: if we error here we've left data with multiple
+                * pointers... which isn't a _super_ serious problem...
+                */
+
+               bch2_btree_iter_cond_resched(&src);
+       }
+
+       bch2_btree_iter_unlock(&src);
+       bch2_btree_iter_unlock(&dst);
+
+       ret = bch2_inode_truncate(c, inode->v.i_ino,
+                                round_up(new_size, block_bytes(c)) >> 9,
+                                &i_sectors_hook.hook,
+                                &inode->ei_journal_seq);
+       if (ret)
+               goto err_put_sectors_dirty;
+
+       i_sectors_hook.new_i_size = new_size;
+err_put_sectors_dirty:
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+err:
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       inode_unlock(&inode->v);
+       return ret;
+}
+
+static long bch2_fallocate(struct bch_inode_info *inode, int mode,
+                          loff_t offset, loff_t len)
+{
+       struct address_space *mapping = inode->v.i_mapping;
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
+       struct btree_iter iter;
+       struct bpos end_pos;
+       loff_t block_start, block_end;
+       loff_t end = offset + len;
+       unsigned sectors;
+       unsigned replicas = io_opts(c, inode).data_replicas;
+       int ret;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+       inode_lock(&inode->v);
+       inode_dio_wait(&inode->v);
+       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+               ret = inode_newsize_ok(&inode->v, end);
+               if (ret)
+                       goto err;
+       }
+
+       if (mode & FALLOC_FL_ZERO_RANGE) {
+               ret = __bch2_truncate_page(inode,
+                                          offset >> PAGE_SHIFT,
+                                          offset, end);
+
+               if (!ret &&
+                   offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+                       ret = __bch2_truncate_page(inode,
+                                                  end >> PAGE_SHIFT,
+                                                  offset, end);
+
+               if (unlikely(ret))
+                       goto err;
+
+               truncate_pagecache_range(&inode->v, offset, end - 1);
+
+               block_start     = round_up(offset, PAGE_SIZE);
+               block_end       = round_down(end, PAGE_SIZE);
+       } else {
+               block_start     = round_down(offset, PAGE_SIZE);
+               block_end       = round_up(end, PAGE_SIZE);
+       }
+
+       bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9));
+       end_pos = POS(inode->v.i_ino, block_end >> 9);
+
+       ret = i_sectors_dirty_start(c, &i_sectors_hook);
+       if (unlikely(ret))
+               goto err;
+
+       while (bkey_cmp(iter.pos, end_pos) < 0) {
+               struct disk_reservation disk_res = { 0 };
+               struct bkey_i_reservation reservation;
+               struct bkey_s_c k;
+
+               k = bch2_btree_iter_peek_slot(&iter);
+               if ((ret = btree_iter_err(k)))
+                       goto btree_iter_err;
+
+               /* already reserved */
+               if (k.k->type == BCH_RESERVATION &&
+                   bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
+                       bch2_btree_iter_next_slot(&iter);
+                       continue;
+               }
+
+               if (bkey_extent_is_data(k.k)) {
+                       if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+                               bch2_btree_iter_next_slot(&iter);
+                               continue;
+                       }
+               }
+
+               bkey_reservation_init(&reservation.k_i);
+               reservation.k.type      = BCH_RESERVATION;
+               reservation.k.p         = k.k->p;
+               reservation.k.size      = k.k->size;
+
+               bch2_cut_front(iter.pos, &reservation.k_i);
+               bch2_cut_back(end_pos, &reservation.k);
+
+               sectors = reservation.k.size;
+               reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
+
+               if (!bkey_extent_is_allocation(k.k)) {
+                       ret = bch2_quota_reservation_add(c, inode,
+                                       &i_sectors_hook.quota_res,
+                                       sectors, true);
+                       if (unlikely(ret))
+                               goto btree_iter_err;
+               }
+
+               if (reservation.v.nr_replicas < replicas ||
+                   bch2_extent_is_compressed(k)) {
+                       ret = bch2_disk_reservation_get(c, &disk_res, sectors,
+                                                       replicas, 0);
+                       if (unlikely(ret))
+                               goto btree_iter_err;
+
+                       reservation.v.nr_replicas = disk_res.nr_replicas;
+               }
+
+               ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
+                                         &inode->ei_journal_seq,
+                                         BTREE_INSERT_ATOMIC|
+                                         BTREE_INSERT_NOFAIL,
+                                         BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
+               bch2_disk_reservation_put(c, &disk_res);
+btree_iter_err:
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret) {
+                       bch2_btree_iter_unlock(&iter);
+                       goto err_put_sectors_dirty;
+               }
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+           end > inode->v.i_size) {
+               i_size_write(&inode->v, end);
+
+               mutex_lock(&inode->ei_update_lock);
+               ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+               mutex_unlock(&inode->ei_update_lock);
+       }
+
+       /* blech */
+       if ((mode & FALLOC_FL_KEEP_SIZE) &&
+           (mode & FALLOC_FL_ZERO_RANGE) &&
+           inode->ei_inode.bi_size != inode->v.i_size) {
+               /* sync appends.. */
+               ret = filemap_write_and_wait_range(mapping,
+                                       inode->ei_inode.bi_size, S64_MAX);
+               if (ret)
+                       goto err;
+
+               if (inode->ei_inode.bi_size != inode->v.i_size) {
+                       mutex_lock(&inode->ei_update_lock);
+                       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+                       mutex_unlock(&inode->ei_update_lock);
+               }
+       }
+
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       inode_unlock(&inode->v);
+
+       return ret;
+err_put_sectors_dirty:
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+err:
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       inode_unlock(&inode->v);
+       return ret;
+}
+
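+/*
+ * fallocate(2) entry point: modes consisting only of FALLOC_FL_KEEP_SIZE
+ * and/or FALLOC_FL_ZERO_RANGE (including plain fallocate) are handled by
+ * bch2_fallocate(), FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE by
+ * bch2_fpunch(), and FALLOC_FL_COLLAPSE_RANGE by bch2_fcollapse(); anything
+ * else is rejected with -EOPNOTSUPP.
+ */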
+long bch2_fallocate_dispatch(struct file *file, int mode,
+                            loff_t offset, loff_t len)
+{
+       struct bch_inode_info *inode = file_bch_inode(file);
+
+       if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+               return bch2_fallocate(inode, mode, offset, len);
+
+       if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+               return bch2_fpunch(inode, offset, len);
+
+       if (mode == FALLOC_FL_COLLAPSE_RANGE)
+               return bch2_fcollapse(inode, offset, len);
+
+       return -EOPNOTSUPP;
+}
+
+/* fseek: */
+
+static bool folio_is_data(struct folio *folio)
+{
+       /* XXX: should only have to check PageDirty */
+       return folio_test_private(folio) &&
+               (page_state(&folio->page)->sectors ||
+                page_state(&folio->page)->dirty_sectors);
+}
+
+static loff_t bch2_next_pagecache_data(struct inode *vinode,
+                                      loff_t start_offset,
+                                      loff_t end_offset)
+{
+       struct folio_batch fbatch;
+       pgoff_t start_index     = start_offset >> PAGE_SHIFT;
+       pgoff_t end_index       = end_offset >> PAGE_SHIFT;
+       pgoff_t index           = start_index;
+       unsigned i;
+
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(vinode->i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+
+                       folio_lock(folio);
+                       if (folio_is_data(folio)) {
+                               end_offset =
+                                       min(end_offset,
+                                           max(start_offset,
+                                               ((loff_t) folio->index) << PAGE_SHIFT));
+                               folio_unlock(folio);
+                               folio_batch_release(&fbatch);
+                               return end_offset;
+                       }
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+
+       return end_offset;
+}
+
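+/*
+ * SEEK_DATA has to consider data that exists only as dirty pages in the page
+ * cache and hasn't been flushed to the extents btree yet, so after the btree
+ * lookup below we also scan the page cache over the candidate range.
+ */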
+static loff_t bch2_seek_data(struct file *file, u64 offset)
+{
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 isize, next_data = MAX_LFS_FILESIZE;
+       int ret;
+
+       isize = i_size_read(&inode->v);
+       if (offset >= isize)
+               return -ENXIO;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(inode->v.i_ino, offset >> 9), 0, k) {
+               if (k.k->p.inode != inode->v.i_ino) {
+                       break;
+               } else if (bkey_extent_is_data(k.k)) {
+                       next_data = max(offset, bkey_start_offset(k.k) << 9);
+                       break;
+               } else if (k.k->p.offset << 9 > isize)
+                       break;
+       }
+
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               return ret;
+
+       if (next_data > offset)
+               next_data = bch2_next_pagecache_data(&inode->v,
+                                                    offset, next_data);
+
+       if (next_data > isize)
+               return -ENXIO;
+
+       return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
+
+static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
+{
+       struct page *page;
+       bool ret;
+
+       page = find_lock_page(mapping, index);
+       if (!page)
+               return false;
+
+       ret = folio_is_data(page_folio(page));
+       unlock_page(page);
+
+       return ret;
+}
+
+static loff_t bch2_next_pagecache_hole(struct inode *vinode,
+                                      loff_t start_offset,
+                                      loff_t end_offset)
+{
+       struct address_space *mapping = vinode->i_mapping;
+       pgoff_t index;
+
+       for (index = start_offset >> PAGE_SHIFT;
+            index < end_offset >> PAGE_SHIFT;
+            index++)
+               if (!page_slot_is_data(mapping, index))
+                       end_offset = max(start_offset,
+                                        ((loff_t) index) << PAGE_SHIFT);
+
+       return end_offset;
+}
+
+static loff_t bch2_seek_hole(struct file *file, u64 offset)
+{
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 isize, next_hole = MAX_LFS_FILESIZE;
+       int ret;
+
+       isize = i_size_read(&inode->v);
+       if (offset >= isize)
+               return -ENXIO;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(inode->v.i_ino, offset >> 9),
+                          BTREE_ITER_SLOTS, k) {
+               if (k.k->p.inode != inode->v.i_ino) {
+                       next_hole = bch2_next_pagecache_hole(&inode->v,
+                                       offset, MAX_LFS_FILESIZE);
+                       break;
+               } else if (!bkey_extent_is_data(k.k)) {
+                       next_hole = bch2_next_pagecache_hole(&inode->v,
+                                       max(offset, bkey_start_offset(k.k) << 9),
+                                       k.k->p.offset << 9);
+
+                       if (next_hole < k.k->p.offset << 9)
+                               break;
+               } else {
+                       offset = max(offset, bkey_start_offset(k.k) << 9);
+               }
+       }
+
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               return ret;
+
+       if (next_hole > isize)
+               next_hole = isize;
+
+       return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
+}
+
+loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
+{
+       switch (whence) {
+       case SEEK_SET:
+       case SEEK_CUR:
+       case SEEK_END:
+               return generic_file_llseek(file, offset, whence);
+       case SEEK_DATA:
+               return bch2_seek_data(file, offset);
+       case SEEK_HOLE:
+               return bch2_seek_hole(file, offset);
+       }
+
+       return -EINVAL;
+}
+
+void bch2_fs_fsio_exit(struct bch_fs *c)
+{
+       bioset_exit(&c->dio_write_bioset);
+       bioset_exit(&c->dio_read_bioset);
+       bioset_exit(&c->writepage_bioset);
+}
+
+int bch2_fs_fsio_init(struct bch_fs *c)
+{
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
+       if (bioset_init(&c->writepage_bioset,
+                       4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
+                       BIOSET_NEED_BVECS) ||
+           bioset_init(&c->dio_read_bioset,
+                       4, offsetof(struct dio_read, rbio.bio),
+                       BIOSET_NEED_BVECS) ||
+           bioset_init(&c->dio_write_bioset,
+                       4, offsetof(struct dio_write, iop.op.wbio.bio),
+                       BIOSET_NEED_BVECS))
+               ret = -ENOMEM;
+
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
new file mode 100644 (file)
index 0000000..2e4bfee
--- /dev/null
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_H
+#define _BCACHEFS_FS_IO_H
+
+#ifndef NO_BCACHEFS_FS
+
+#include "buckets.h"
+#include "io_types.h"
+
+#include <linux/uio.h>
+
+bool bch2_dirty_folio(struct address_space *, struct folio *);
+
+int bch2_writepage(struct page *, struct writeback_control *);
+int bch2_read_folio(struct file *, struct folio *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+void bch2_readahead(struct readahead_control *);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
+                    unsigned, struct page **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+                  unsigned, unsigned, struct page *, void *);
+
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+int bch2_fsync(struct file *, loff_t, loff_t, int);
+
+int bch2_truncate(struct bch_inode_info *, struct iattr *);
+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+
+loff_t bch2_llseek(struct file *, loff_t, int);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
+
+void bch2_fs_fsio_exit(struct bch_fs *);
+int bch2_fs_fsio_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
new file mode 100644 (file)
index 0000000..895ccc7
--- /dev/null
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "chardev.h"
+#include "fs.h"
+#include "fs-ioctl.h"
+#include "quota.h"
+
+#include <linux/compat.h>
+#include <linux/mount.h>
+
+#define FS_IOC_GOINGDOWN            _IOR('X', 125, __u32)
+
+/* Inode flags: */
+
+/* bcachefs inode flags -> vfs inode flags: */
+static const unsigned bch_flags_to_vfs[] = {
+       [__BCH_INODE_SYNC]      = S_SYNC,
+       [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
+       [__BCH_INODE_APPEND]    = S_APPEND,
+       [__BCH_INODE_NOATIME]   = S_NOATIME,
+};
+
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+static const unsigned bch_flags_to_uflags[] = {
+       [__BCH_INODE_SYNC]      = FS_SYNC_FL,
+       [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
+       [__BCH_INODE_APPEND]    = FS_APPEND_FL,
+       [__BCH_INODE_NODUMP]    = FS_NODUMP_FL,
+       [__BCH_INODE_NOATIME]   = FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+static const unsigned bch_flags_to_xflags[] = {
+       [__BCH_INODE_SYNC]      = FS_XFLAG_SYNC,
+       [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
+       [__BCH_INODE_APPEND]    = FS_XFLAG_APPEND,
+       [__BCH_INODE_NODUMP]    = FS_XFLAG_NODUMP,
+       [__BCH_INODE_NOATIME]   = FS_XFLAG_NOATIME,
+       //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
+};
+
+#define set_flags(_map, _in, _out)                                     \
+do {                                                                   \
+       unsigned _i;                                                    \
+                                                                       \
+       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
+               if ((_in) & (1 << _i))                                  \
+                       (_out) |= _map[_i];                             \
+               else                                                    \
+                       (_out) &= ~_map[_i];                            \
+} while (0)
+
+#define map_flags(_map, _in)                                           \
+({                                                                     \
+       unsigned _out = 0;                                              \
+                                                                       \
+       set_flags(_map, _in, _out);                                     \
+       _out;                                                           \
+})
+
+#define map_flags_rev(_map, _in)                                       \
+({                                                                     \
+       unsigned _i, _out = 0;                                          \
+                                                                       \
+       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
+               if ((_in) & _map[_i]) {                                 \
+                       (_out) |= 1 << _i;                              \
+                       (_in) &= ~_map[_i];                             \
+               }                                                       \
+       (_out);                                                         \
+})
+
+#define map_defined(_map)                                              \
+({                                                                     \
+       unsigned _in = ~0;                                              \
+                                                                       \
+       map_flags_rev(_map, _in);                                       \
+})
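+
+/*
+ * For illustration:
+ *
+ *   map_flags(bch_flags_to_uflags, BCH_INODE_SYNC|BCH_INODE_NOATIME)
+ *     evaluates to FS_SYNC_FL|FS_NOATIME_FL;
+ *
+ *   with uflags = FS_APPEND_FL|FS_NOCOW_FL (FS_NOCOW_FL standing in for any
+ *   flag not in the table), map_flags_rev(bch_flags_to_uflags, uflags)
+ *     evaluates to BCH_INODE_APPEND and leaves only FS_NOCOW_FL set in
+ *     uflags - which is how the SETFLAGS/FSSETXATTR handlers below detect
+ *     unsupported flags and return -EOPNOTSUPP.
+ */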
+
+/* Set VFS inode flags from bcachefs inode: */
+void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+{
+       set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+}
+
+struct flags_set {
+       unsigned                mask;
+       unsigned                flags;
+
+       unsigned                projid;
+};
+
+static int bch2_inode_flags_set(struct bch_inode_info *inode,
+                               struct bch_inode_unpacked *bi,
+                               void *p)
+{
+       /*
+        * We're relying on btree locking here for exclusion with other ioctl
+        * calls - use the flags in the btree (@bi), not inode->i_flags:
+        */
+       struct flags_set *s = p;
+       unsigned newflags = s->flags;
+       unsigned oldflags = bi->bi_flags & s->mask;
+
+       if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
+           !capable(CAP_LINUX_IMMUTABLE))
+               return -EPERM;
+
+       if (!S_ISREG(inode->v.i_mode) &&
+           !S_ISDIR(inode->v.i_mode) &&
+           (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
+               return -EINVAL;
+
+       bi->bi_flags &= ~s->mask;
+       bi->bi_flags |= newflags;
+       inode_set_ctime_current(&inode->v);
+       return 0;
+}
+
+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
+{
+       unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
+
+       return put_user(flags, arg);
+}
+
+static int bch2_ioc_setflags(struct bch_fs *c,
+                            struct file *file,
+                            struct bch_inode_info *inode,
+                            void __user *arg)
+{
+       struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
+       unsigned uflags;
+       int ret;
+
+       if (get_user(uflags, (int __user *) arg))
+               return -EFAULT;
+
+       s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
+       if (uflags)
+               return -EOPNOTSUPP;
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
+       inode_lock(&inode->v);
+       if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+               ret = -EACCES;
+               goto setflags_out;
+       }
+
+       mutex_lock(&inode->ei_update_lock);
+       ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);
+
+       if (!ret)
+               bch2_inode_flags_to_vfs(inode);
+       mutex_unlock(&inode->ei_update_lock);
+
+setflags_out:
+       inode_unlock(&inode->v);
+       mnt_drop_write_file(file);
+       return ret;
+}
+
+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
+                              struct fsxattr __user *arg)
+{
+       struct fsxattr fa = { 0 };
+
+       fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+       fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+
+       return copy_to_user(arg, &fa, sizeof(fa)) ? -EFAULT : 0;
+}
+
+static int bch2_set_projid(struct bch_fs *c,
+                          struct bch_inode_info *inode,
+                          u32 projid)
+{
+       struct bch_qid qid = inode->ei_qid;
+       int ret;
+
+       if (projid == inode->ei_qid.q[QTYP_PRJ])
+               return 0;
+
+       qid.q[QTYP_PRJ] = projid;
+
+       ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
+                                 inode->v.i_blocks +
+                                 inode->ei_quota_reserved);
+       if (ret)
+               return ret;
+
+       inode->ei_qid.q[QTYP_PRJ] = projid;
+       return 0;
+}
+
+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct flags_set *s = p;
+
+       bi->bi_project = s->projid;
+
+       return bch2_inode_flags_set(inode, bi, p);
+}
+
+static int bch2_ioc_fssetxattr(struct bch_fs *c,
+                              struct file *file,
+                              struct bch_inode_info *inode,
+                              struct fsxattr __user *arg)
+{
+       struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
+       struct fsxattr fa;
+       int ret;
+
+       if (copy_from_user(&fa, arg, sizeof(fa)))
+               return -EFAULT;
+
+       s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
+       if (fa.fsx_xflags)
+               return -EOPNOTSUPP;
+
+       s.projid = fa.fsx_projid;
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
+       inode_lock(&inode->v);
+       if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+               ret = -EACCES;
+               goto err;
+       }
+
+       mutex_lock(&inode->ei_update_lock);
+       ret = bch2_set_projid(c, inode, fa.fsx_projid);
+       if (ret)
+               goto err_unlock;
+
+       ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
+       if (!ret)
+               bch2_inode_flags_to_vfs(inode);
+err_unlock:
+       mutex_unlock(&inode->ei_update_lock);
+err:
+       inode_unlock(&inode->v);
+       mnt_drop_write_file(file);
+       return ret;
+}
+
+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct super_block *sb = inode->v.i_sb;
+       struct bch_fs *c = sb->s_fs_info;
+
+       switch (cmd) {
+       case FS_IOC_GETFLAGS:
+               return bch2_ioc_getflags(inode, (int __user *) arg);
+
+       case FS_IOC_SETFLAGS:
+               return bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+
+       case FS_IOC_FSGETXATTR:
+               return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+       case FS_IOC_FSSETXATTR:
+               return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg);
+
+       case FS_IOC_GETVERSION:
+               return -ENOTTY;
+       case FS_IOC_SETVERSION:
+               return -ENOTTY;
+
+       case FS_IOC_GOINGDOWN:
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               down_write(&sb->s_umount);
+               sb->s_flags |= SB_RDONLY;
+               bch2_fs_emergency_read_only(c);
+               up_write(&sb->s_umount);
+               return 0;
+
+       default:
+               return bch2_fs_ioctl(c, cmd, (void __user *) arg);
+       }
+}
+
+#ifdef CONFIG_COMPAT
+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+       /* These are just misnamed, they actually get/put from/to user an int */
+       switch (cmd) {
+       case FS_IOC32_GETFLAGS:
+               cmd = FS_IOC_GETFLAGS;
+               break;
+       case FS_IOC32_SETFLAGS:
+               cmd = FS_IOC_SETFLAGS;
+               break;
+       default:
+               return -ENOIOCTLCMD;
+       }
+       return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
new file mode 100644 (file)
index 0000000..2d117ef
--- /dev/null
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IOCTL_H
+#define _BCACHEFS_FS_IOCTL_H
+
+void bch2_inode_flags_to_vfs(struct bch_inode_info *);
+
+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
+
+#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
new file mode 100644 (file)
index 0000000..3f3d916
--- /dev/null
@@ -0,0 +1,1773 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-ioctl.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "quota.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/exportfs.h>
+#include <linux/fiemap.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/xattr.h>
+
+static struct kmem_cache *bch2_inode_cache;
+
+static void bch2_vfs_inode_init(struct bch_fs *,
+                               struct bch_inode_info *,
+                               struct bch_inode_unpacked *);
+
+static void journal_seq_copy(struct bch_inode_info *dst,
+                            u64 journal_seq)
+{
+       u64 old, v = READ_ONCE(dst->ei_journal_seq);
+
+       do {
+               old = v;
+
+               if (old >= journal_seq)
+                       break;
+       } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+}
+
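+/*
+ * pagecache_lock is a signed shared/shared exclusion count: "add" holders
+ * (paths that may add pages to the page cache) take it with +1, "block"
+ * holders (truncate, fpunch, fallocate - paths that must keep new pages out
+ * of the page cache) take it with -1. Any number of holders of the same sign
+ * can hold it at once, but the two sides exclude each other and sleep on
+ * lock->wait.
+ */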
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
+{
+       BUG_ON(atomic_long_read(&lock->v) == 0);
+
+       if (atomic_long_sub_return_release(i, &lock->v) == 0)
+               wake_up_all(&lock->wait);
+}
+
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
+{
+       long v = atomic_long_read(&lock->v), old;
+
+       do {
+               old = v;
+
+               if (i > 0 ? v < 0 : v > 0)
+                       return false;
+       } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+                                       old, old + i)) != old);
+       return true;
+}
+
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
+{
+       wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
+}
+
+void bch2_pagecache_add_put(struct pagecache_lock *lock)
+{
+       __pagecache_lock_put(lock, 1);
+}
+
+void bch2_pagecache_add_get(struct pagecache_lock *lock)
+{
+       __pagecache_lock_get(lock, 1);
+}
+
+void bch2_pagecache_block_put(struct pagecache_lock *lock)
+{
+       __pagecache_lock_put(lock, -1);
+}
+
+void bch2_pagecache_block_get(struct pagecache_lock *lock)
+{
+       __pagecache_lock_get(lock, -1);
+}
+
+/*
+ * I_SIZE_DIRTY requires special handling:
+ *
+ * To the recovery code, the flag means that there is stale data past i_size
+ * that needs to be deleted; it's used for implementing atomic appends and
+ * truncates.
+ *
+ * On append, we set I_SIZE_DIRTY before doing the write, then after the write
+ * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
+ * that exposes the data we just wrote.
+ *
+ * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
+ * i_size to the new smaller size, then we delete the data that we just made
+ * invisible, and then we clear I_SIZE_DIRTY.
+ *
+ * Because there can be multiple appends in flight at a time, we need a refcount
+ * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
+ * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
+ *
+ * Because write_inode() can be called at any time, i_size_dirty_count means
+ * something different to the runtime code - it means to write_inode() "don't
+ * update i_size yet".
+ *
+ * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
+ * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
+ * be set explicitly.
+ */
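+
+/*
+ * A rough sketch of the ordering described above:
+ *
+ *   append:
+ *     bump i_size_dirty_count                 (I_SIZE_DIRTY now set)
+ *     write the new data past i_size
+ *     update i_size to expose the new data
+ *     drop i_size_dirty_count                 (write_inode() may now clear
+ *                                              the flag)
+ *
+ *   truncate (shrinking):
+ *     set I_SIZE_DIRTY and the smaller i_size in one atomic update
+ *     delete the extents past the new i_size
+ *     clear I_SIZE_DIRTY
+ */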
+
+void bch2_inode_update_after_write(struct bch_fs *c,
+                                  struct bch_inode_info *inode,
+                                  struct bch_inode_unpacked *bi,
+                                  unsigned fields)
+{
+       set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED
+                 ? 0
+                 : bi->bi_nlink + nlink_bias(inode->v.i_mode));
+       i_uid_write(&inode->v, bi->bi_uid);
+       i_gid_write(&inode->v, bi->bi_gid);
+       inode->v.i_mode = bi->bi_mode;
+
+       if (fields & ATTR_ATIME)
+               inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+       if (fields & ATTR_MTIME)
+               inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+       if (fields & ATTR_CTIME)
+               inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
+
+       inode->ei_inode         = *bi;
+       inode->ei_qid           = bch_qid(bi);
+}
+
+int __must_check bch2_write_inode_trans(struct btree_trans *trans,
+                               struct bch_inode_info *inode,
+                               struct bch_inode_unpacked *inode_u,
+                               inode_set_fn set,
+                               void *p)
+{
+       struct btree_iter *iter;
+       struct bkey_inode_buf *inode_p;
+       struct bkey_s_c k;
+       u64 inum = inode->v.i_ino;
+       int ret;
+
+       lockdep_assert_held(&inode->ei_update_lock);
+
+       iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
+                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+
+       k = bch2_btree_iter_peek_slot(iter);
+       if ((ret = btree_iter_err(k)))
+               return ret;
+
+       if (WARN_ONCE(k.k->type != BCH_INODE_FS,
+                     "inode %llu not found when updating", inum))
+               return -ENOENT;
+
+       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u);
+       if (WARN_ONCE(ret,
+                     "error %i unpacking inode %llu", ret, inum))
+               return -ENOENT;
+
+       BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size);
+
+       BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size &&
+              !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+              inode_u->bi_size > i_size_read(&inode->v));
+
+       if (set) {
+               ret = set(inode, inode_u, p);
+               if (ret)
+                       return ret;
+       }
+
+       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+       if (IS_ERR(inode_p))
+               return PTR_ERR(inode_p);
+
+       bch2_inode_pack(inode_p, inode_u);
+       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+       return 0;
+}
+
+int __must_check __bch2_write_inode(struct bch_fs *c,
+                                   struct bch_inode_info *inode,
+                                   inode_set_fn set,
+                                   void *p, unsigned fields)
+{
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
+
+       /*
+        * the btree node lock protects inode->ei_inode, not ei_update_lock;
+        * this is important for inode updates via bchfs_write_index_update
+        */
+       if (!ret)
+               bch2_inode_update_after_write(c, inode, &inode_u, fields);
+
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
+}
+
+static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+{
+       struct bch_inode_unpacked inode_u;
+       struct bch_inode_info *inode;
+       int ret;
+
+       inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
+       if (unlikely(!inode))
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->v.i_state & I_NEW))
+               return &inode->v;
+
+       ret = bch2_inode_find_by_inum(c, inum, &inode_u);
+       if (ret) {
+               iget_failed(&inode->v);
+               return ERR_PTR(ret);
+       }
+
+       bch2_vfs_inode_init(c, inode, &inode_u);
+
+       inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
+
+       unlock_new_inode(&inode->v);
+
+       return &inode->v;
+}
+
+static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u,
+                                 const struct inode *dir, umode_t mode)
+{
+       kuid_t uid = current_fsuid();
+       kgid_t gid;
+
+       if (dir && dir->i_mode & S_ISGID) {
+               gid = dir->i_gid;
+               if (S_ISDIR(mode))
+                       mode |= S_ISGID;
+       } else
+               gid = current_fsgid();
+
+       inode_u->bi_uid         = from_kuid(i_user_ns(dir), uid);
+       inode_u->bi_gid         = from_kgid(i_user_ns(dir), gid);
+       inode_u->bi_mode        = mode;
+}
+
+static int inode_update_for_create_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_inode_unpacked *new_inode = p;
+       struct timespec64 now = current_time(&inode->v);
+
+       bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
+
+       if (S_ISDIR(new_inode->bi_mode))
+               bi->bi_nlink++;
+
+       return 0;
+}
+
+static int inum_test(struct inode *inode, void *p)
+{
+       unsigned long *ino = p;
+
+       return *ino == inode->i_ino;
+}
+
+static struct bch_inode_info *
+__bch2_create(struct mnt_idmap *idmap,
+             struct bch_inode_info *dir, struct dentry *dentry,
+             umode_t mode, dev_t rdev, bool tmpfile)
+{
+       struct bch_fs *c = dir->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct bch_inode_unpacked dir_u;
+       struct bch_inode_info *inode, *old;
+       struct bch_inode_unpacked inode_u;
+       struct bch_hash_info hash_info;
+       struct posix_acl *default_acl = NULL, *acl = NULL;
+       int ret;
+
+       bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
+       bch2_inode_init_owner(&inode_u, &dir->v, mode);
+
+       inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
+
+       hash_info = bch2_hash_info_init(c, &inode_u);
+
+       if (tmpfile)
+               inode_u.bi_flags |= BCH_INODE_UNLINKED;
+
+       ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
+       if (ret)
+               return ERR_PTR(ret);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+       ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl);
+       if (ret)
+               goto err;
+#endif
+
+       /*
+        * preallocate vfs inode before btree transaction, so that nothing can
+        * fail after the transaction succeeds:
+        */
+       inode = to_bch_ei(new_inode(c->vfs_sb));
+       if (unlikely(!inode)) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       if (!tmpfile)
+               mutex_lock(&dir->ei_update_lock);
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_inode_create(&trans, &inode_u,
+                                   BLOCKDEV_INODE_MAX, 0,
+                                   &c->unused_inode_hint) ?:
+               (default_acl
+                ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+                                     default_acl, ACL_TYPE_DEFAULT)
+                : 0) ?:
+               (acl
+                ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+                                     acl, ACL_TYPE_ACCESS)
+                : 0) ?:
+               (!tmpfile
+                ? __bch2_dirent_create(&trans, dir->v.i_ino,
+                                       &dir->ei_str_hash,
+                                       mode_to_type(mode),
+                                       &dentry->d_name,
+                                       inode_u.bi_inum,
+                                       BCH_HASH_SET_MUST_CREATE)
+               : 0) ?:
+               (!tmpfile
+                ? bch2_write_inode_trans(&trans, dir, &dir_u,
+                                         inode_update_for_create_fn,
+                                         &inode_u)
+                : 0) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err_trans;
+
+       atomic_long_inc(&c->nr_inodes);
+
+       if (!tmpfile) {
+               bch2_inode_update_after_write(c, dir, &dir_u,
+                                             ATTR_MTIME|ATTR_CTIME);
+               journal_seq_copy(dir, inode->ei_journal_seq);
+               mutex_unlock(&dir->ei_update_lock);
+       }
+
+       bch2_vfs_inode_init(c, inode, &inode_u);
+
+       set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+       set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+       /*
+        * we must insert the new inode into the inode cache before calling
+        * bch2_trans_exit() and dropping locks, else we could race with another
+        * thread pulling the inode in and modifying it:
+        */
+
+       inode->v.i_state |= I_CREATING;
+       old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
+                                     inum_test, NULL, &inode->v.i_ino));
+       BUG_ON(!old);
+
+       if (unlikely(old != inode)) {
+               /*
+                * We raced, another process pulled the new inode into cache
+                * before us:
+                */
+               old->ei_journal_seq = inode->ei_journal_seq;
+               make_bad_inode(&inode->v);
+               iput(&inode->v);
+
+               inode = old;
+       } else {
+               /*
+                * we really don't want insert_inode_locked2() to be setting
+                * I_NEW...
+                */
+               unlock_new_inode(&inode->v);
+       }
+
+       bch2_trans_exit(&trans);
+out:
+       posix_acl_release(default_acl);
+       posix_acl_release(acl);
+       return inode;
+err_trans:
+       if (!tmpfile)
+               mutex_unlock(&dir->ei_update_lock);
+
+       bch2_trans_exit(&trans);
+       make_bad_inode(&inode->v);
+       iput(&inode->v);
+err:
+       bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
+       inode = ERR_PTR(ret);
+       goto out;
+}
+
+/* methods */
+
+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
+                                 unsigned int flags)
+{
+       struct bch_fs *c = vdir->i_sb->s_fs_info;
+       struct bch_inode_info *dir = to_bch_ei(vdir);
+       struct inode *vinode = NULL;
+       u64 inum;
+
+       inum = bch2_dirent_lookup(c, dir->v.i_ino,
+                                 &dir->ei_str_hash,
+                                 &dentry->d_name);
+
+       if (inum)
+               vinode = bch2_vfs_inode_get(c, inum);
+
+       return d_splice_alias(vinode, dentry);
+}
+
+static int bch2_create(struct mnt_idmap *idmap,
+                      struct inode *vdir, struct dentry *dentry,
+                      umode_t mode, bool excl)
+{
+       struct bch_inode_info *inode =
+               __bch2_create(idmap, to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false);
+
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
+
+static int inode_update_for_link_fn(struct bch_inode_info *inode,
+                                   struct bch_inode_unpacked *bi,
+                                   void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct timespec64 now = current_time(&inode->v);
+
+       bi->bi_ctime = timespec_to_bch2_time(c, now);
+
+       if (bi->bi_flags & BCH_INODE_UNLINKED)
+               bi->bi_flags &= ~BCH_INODE_UNLINKED;
+       else
+               bi->bi_nlink++;
+
+       return 0;
+}
+
+static int __bch2_link(struct bch_fs *c,
+                      struct bch_inode_info *inode,
+                      struct bch_inode_info *dir,
+                      struct dentry *dentry)
+{
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       lockdep_assert_held(&inode->v.i_rwsem);
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_dirent_create(&trans, dir->v.i_ino,
+                                    &dir->ei_str_hash,
+                                    mode_to_type(inode->v.i_mode),
+                                    &dentry->d_name,
+                                    inode->v.i_ino,
+                                    BCH_HASH_SET_MUST_CREATE) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_link_fn,
+                                      NULL) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+
+       if (ret == -EINTR)
+               goto retry;
+
+       if (likely(!ret))
+               bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
+                    struct dentry *dentry)
+{
+       struct bch_fs *c = vdir->i_sb->s_fs_info;
+       struct bch_inode_info *dir = to_bch_ei(vdir);
+       struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
+       int ret;
+
+       ret = __bch2_link(c, inode, dir, dentry);
+       if (unlikely(ret))
+               return ret;
+
+       ihold(&inode->v);
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
+
+static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
+                                         struct bch_inode_unpacked *bi,
+                                         void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_inode_info *unlink_inode = p;
+       struct timespec64 now = current_time(&inode->v);
+
+       bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
+
+       bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode);
+
+       return 0;
+}
+
+static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct timespec64 now = current_time(&inode->v);
+
+       bi->bi_ctime = timespec_to_bch2_time(c, now);
+       if (bi->bi_nlink)
+               bi->bi_nlink--;
+       else
+               bi->bi_flags |= BCH_INODE_UNLINKED;
+
+       return 0;
+}
+
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+       struct bch_fs *c = vdir->i_sb->s_fs_info;
+       struct bch_inode_info *dir = to_bch_ei(vdir);
+       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+       struct bch_inode_unpacked dir_u, inode_u;
+       struct btree_trans trans;
+       int ret;
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_dirent_delete(&trans, dir->v.i_ino,
+                                    &dir->ei_str_hash,
+                                    &dentry->d_name) ?:
+               bch2_write_inode_trans(&trans, dir, &dir_u,
+                                      inode_update_dir_for_unlink_fn,
+                                      inode) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_unlink_fn,
+                                      NULL) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &dir->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
+       if (ret)
+               goto err;
+
+       if (dir->ei_journal_seq > inode->ei_journal_seq)
+               inode->ei_journal_seq = dir->ei_journal_seq;
+
+       bch2_inode_update_after_write(c, dir, &dir_u,
+                                     ATTR_MTIME|ATTR_CTIME);
+       bch2_inode_update_after_write(c, inode, &inode_u,
+                                     ATTR_CTIME);
+err:
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
+static int bch2_symlink(struct mnt_idmap *idmap,
+                       struct inode *vdir, struct dentry *dentry,
+                       const char *symname)
+{
+       struct bch_fs *c = vdir->i_sb->s_fs_info;
+       struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
+       int ret;
+
+       inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
+       if (unlikely(IS_ERR(inode)))
+               return PTR_ERR(inode);
+
+       inode_lock(&inode->v);
+       ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
+       inode_unlock(&inode->v);
+
+       if (unlikely(ret))
+               goto err;
+
+       ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
+       if (unlikely(ret))
+               goto err;
+
+       journal_seq_copy(dir, inode->ei_journal_seq);
+
+       ret = __bch2_link(c, inode, dir, dentry);
+       if (unlikely(ret))
+               goto err;
+
+       d_instantiate(dentry, &inode->v);
+       return 0;
+err:
+       iput(&inode->v);
+       return ret;
+}
+
+static int bch2_mkdir(struct mnt_idmap *idmap,
+                     struct inode *vdir, struct dentry *dentry, umode_t mode)
+{
+       struct bch_inode_info *inode =
+               __bch2_create(idmap, to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false);
+
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
+
+static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
+{
+       struct bch_fs *c = vdir->i_sb->s_fs_info;
+
+       if (bch2_empty_dir(c, dentry->d_inode->i_ino))
+               return -ENOTEMPTY;
+
+       return bch2_unlink(vdir, dentry);
+}
+
+static int bch2_mknod(struct mnt_idmap *idmap,
+                     struct inode *vdir, struct dentry *dentry,
+                     umode_t mode, dev_t rdev)
+{
+       struct bch_inode_info *inode =
+               __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, false);
+
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
+
+struct rename_info {
+       u64                     now;
+       struct bch_inode_info   *src_dir;
+       struct bch_inode_info   *dst_dir;
+       struct bch_inode_info   *src_inode;
+       struct bch_inode_info   *dst_inode;
+       enum bch_rename_mode    mode;
+};
+
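+/*
+ * Link count fixups for rename: a directory contributes a link (its ".."
+ * entry) to the parent it lives in, so moving one out of src_dir drops
+ * src_dir's nlink and moving one into dst_dir raises it; an exchange that
+ * moves a directory the other way compensates, and a directory victim no
+ * longer counts against dst_dir. An overwritten victim inode loses a link,
+ * or is flagged BCH_INODE_UNLINKED when that was its last one. mtime/ctime
+ * of the directories and ctime of everything touched are set to "now".
+ */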
+static int inode_update_for_rename_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct rename_info *info = p;
+
+       if (inode == info->src_dir) {
+               bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode);
+               bi->bi_nlink += info->dst_inode &&
+                       S_ISDIR(info->dst_inode->v.i_mode) &&
+                       info->mode == BCH_RENAME_EXCHANGE;
+       }
+
+       if (inode == info->dst_dir) {
+               bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode);
+               bi->bi_nlink -= info->dst_inode &&
+                       S_ISDIR(info->dst_inode->v.i_mode);
+       }
+
+       if (inode == info->dst_inode &&
+           info->mode == BCH_RENAME_OVERWRITE) {
+               BUG_ON(bi->bi_nlink &&
+                      S_ISDIR(info->dst_inode->v.i_mode));
+
+               if (bi->bi_nlink)
+                       bi->bi_nlink--;
+               else
+                       bi->bi_flags |= BCH_INODE_UNLINKED;
+       }
+
+       if (inode == info->src_dir ||
+           inode == info->dst_dir)
+               bi->bi_mtime = info->now;
+       bi->bi_ctime = info->now;
+
+       return 0;
+}
+
+static int bch2_rename2(struct mnt_idmap *idmap,
+                       struct inode *src_vdir, struct dentry *src_dentry,
+                       struct inode *dst_vdir, struct dentry *dst_dentry,
+                       unsigned flags)
+{
+       struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+       struct rename_info i = {
+               .now            = timespec_to_bch2_time(c,
+                                               current_time(src_vdir)),
+               .src_dir        = to_bch_ei(src_vdir),
+               .dst_dir        = to_bch_ei(dst_vdir),
+               .src_inode      = to_bch_ei(src_dentry->d_inode),
+               .dst_inode      = to_bch_ei(dst_dentry->d_inode),
+               .mode           = flags & RENAME_EXCHANGE
+                               ? BCH_RENAME_EXCHANGE
+                       : dst_dentry->d_inode
+                               ? BCH_RENAME_OVERWRITE : BCH_RENAME,
+       };
+       struct btree_trans trans;
+       struct bch_inode_unpacked dst_dir_u, src_dir_u;
+       struct bch_inode_unpacked src_inode_u, dst_inode_u;
+       u64 journal_seq = 0;
+       int ret;
+
+       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+               return -EINVAL;
+
+       if (i.mode == BCH_RENAME_OVERWRITE) {
+               if (S_ISDIR(i.src_inode->v.i_mode) !=
+                   S_ISDIR(i.dst_inode->v.i_mode))
+                       return -ENOTDIR;
+
+               if (S_ISDIR(i.src_inode->v.i_mode) &&
+                   bch2_empty_dir(c, i.dst_inode->v.i_ino))
+                       return -ENOTEMPTY;
+
+               ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping,
+                                                  0, LLONG_MAX);
+               if (ret)
+                       return ret;
+       }
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+       i.now = timespec_to_bch2_time(c, current_time(src_vdir));
+
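+       /*
+        * The dirent rename and every affected inode update are committed as a
+        * single atomic transaction; -EINTR means a lock was dropped and the
+        * whole sequence is retried from the top:
+        */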
+       ret   = bch2_dirent_rename(&trans,
+                                  i.src_dir, &src_dentry->d_name,
+                                  i.dst_dir, &dst_dentry->d_name,
+                                  i.mode) ?:
+               bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u,
+                                      inode_update_for_rename_fn, &i) ?:
+               (i.src_dir != i.dst_dir
+                ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u,
+                                      inode_update_for_rename_fn, &i)
+                : 0) ?:
+               bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u,
+                                      inode_update_for_rename_fn, &i) ?:
+               (i.dst_inode
+                ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
+                                      inode_update_for_rename_fn, &i)
+                : 0) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err;
+
+       bch2_inode_update_after_write(c, i.src_dir, &src_dir_u,
+                                     ATTR_MTIME|ATTR_CTIME);
+       journal_seq_copy(i.src_dir, journal_seq);
+
+       if (i.src_dir != i.dst_dir) {
+               bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u,
+                                             ATTR_MTIME|ATTR_CTIME);
+               journal_seq_copy(i.dst_dir, journal_seq);
+       }
+
+       bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
+                                     ATTR_CTIME);
+       if (i.dst_inode)
+               bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u,
+                                             ATTR_CTIME);
+err:
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
+struct inode_write_setattr {
+       struct iattr            *attr;
+       struct mnt_idmap        *idmap;
+};
+
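+/*
+ * Apply the validated iattr to the unpacked btree inode - mirrors
+ * setattr_copy(), including clearing the setgid bit when the caller isn't
+ * allowed to keep it:
+ */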
+static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
+                                      struct bch_inode_unpacked *bi,
+                                      void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct inode_write_setattr *s = p;
+       unsigned int ia_valid = s->attr->ia_valid;
+
+       if (ia_valid & ATTR_UID)
+               bi->bi_uid = from_kuid(i_user_ns(&inode->v), s->attr->ia_uid);
+       if (ia_valid & ATTR_GID)
+               bi->bi_gid = from_kgid(i_user_ns(&inode->v), s->attr->ia_gid);
+
+       if (ia_valid & ATTR_ATIME)
+               bi->bi_atime = timespec_to_bch2_time(c, s->attr->ia_atime);
+       if (ia_valid & ATTR_MTIME)
+               bi->bi_mtime = timespec_to_bch2_time(c, s->attr->ia_mtime);
+       if (ia_valid & ATTR_CTIME)
+               bi->bi_ctime = timespec_to_bch2_time(c, s->attr->ia_ctime);
+
+       if (ia_valid & ATTR_MODE) {
+               umode_t mode = s->attr->ia_mode;
+               kgid_t gid = ia_valid & ATTR_GID
+                       ? s->attr->ia_gid
+                       : inode->v.i_gid;
+
+               if (!in_group_p(gid) &&
+                   !capable_wrt_inode_uidgid(s->idmap, &inode->v, CAP_FSETID))
+                       mode &= ~S_ISGID;
+               bi->bi_mode = mode;
+       }
+
+       return 0;
+}
+
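+/*
+ * Attribute changes other than size: transfer quota if the owner changes,
+ * then update the inode (and, for chmod, its ACL) in one transaction:
+ */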
+static int bch2_setattr_nonsize(struct mnt_idmap *idmap,
+                               struct bch_inode_info *inode,
+                               struct iattr *iattr)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_qid qid = inode->ei_qid;
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       struct posix_acl *acl = NULL;
+       struct inode_write_setattr s = { iattr, idmap };
+       unsigned qtypes = 0;
+       int ret;
+
+       mutex_lock(&inode->ei_update_lock);
+
+       if (c->opts.usrquota &&
+           (iattr->ia_valid & ATTR_UID) &&
+           !uid_eq(iattr->ia_uid, inode->v.i_uid)) {
+               qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), iattr->ia_uid);
+               qtypes |= 1 << QTYP_USR;
+       }
+
+       if (c->opts.grpquota &&
+           (iattr->ia_valid & ATTR_GID) &&
+           !gid_eq(iattr->ia_gid, inode->v.i_gid)) {
+               qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), iattr->ia_gid);
+               qtypes |= 1 << QTYP_GRP;
+       }
+
+       if (qtypes) {
+               ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid,
+                                         inode->v.i_blocks +
+                                         inode->ei_quota_reserved);
+               if (ret)
+                       goto err;
+       }
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+       kfree(acl);
+       acl = NULL;
+
+       ret = bch2_write_inode_trans(&trans, inode, &inode_u,
+                               inode_update_for_setattr_fn, &s) ?:
+               (iattr->ia_valid & ATTR_MODE
+                ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
+                : 0) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err_trans;
+
+       bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid);
+
+       if (acl)
+               set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+       bch2_trans_exit(&trans);
+err:
+       mutex_unlock(&inode->ei_update_lock);
+
+       return ret;
+}
+
+static int bch2_getattr(struct mnt_idmap *idmap,
+                       const struct path *path, struct kstat *stat,
+                       u32 request_mask, unsigned query_flags)
+{
+       struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       stat->dev       = inode->v.i_sb->s_dev;
+       stat->ino       = inode->v.i_ino;
+       stat->mode      = inode->v.i_mode;
+       stat->nlink     = inode->v.i_nlink;
+       stat->uid       = inode->v.i_uid;
+       stat->gid       = inode->v.i_gid;
+       stat->rdev      = inode->v.i_rdev;
+       stat->size      = i_size_read(&inode->v);
+       stat->atime     = inode->v.i_atime;
+       stat->mtime     = inode->v.i_mtime;
+       stat->ctime     = inode_get_ctime(&inode->v);
+       stat->blksize   = block_bytes(c);
+       stat->blocks    = inode->v.i_blocks;
+
+       if (request_mask & STATX_BTIME) {
+               stat->result_mask |= STATX_BTIME;
+               stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
+       }
+
+       if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+               stat->attributes |= STATX_ATTR_IMMUTABLE;
+       if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+               stat->attributes |= STATX_ATTR_APPEND;
+       if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+               stat->attributes |= STATX_ATTR_NODUMP;
+
+       return 0;
+}
+
+static int bch2_setattr(struct mnt_idmap *idmap,
+                       struct dentry *dentry, struct iattr *iattr)
+{
+       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+       int ret;
+
+       lockdep_assert_held(&inode->v.i_rwsem);
+
+       ret = setattr_prepare(idmap, dentry, iattr);
+       if (ret)
+               return ret;
+
+       return iattr->ia_valid & ATTR_SIZE
+               ? bch2_truncate(inode, iattr)
+               : bch2_setattr_nonsize(idmap, inode, iattr);
+}
+
+static int bch2_tmpfile(struct mnt_idmap *idmap,
+                       struct inode *vdir, struct file *file, umode_t mode)
+{
+       struct bch_inode_info *inode =
+               __bch2_create(idmap, to_bch_ei(vdir),
+                             file->f_path.dentry, mode, 0, true);
+
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       d_mark_tmpfile(file, &inode->v);
+       d_instantiate(file->f_path.dentry, &inode->v);
+       return finish_open_simple(file, 0);
+}
+
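+/*
+ * Report a single bcachefs key to fiemap: data extents generate one fiemap
+ * extent per replica pointer, reservations are reported as
+ * delalloc/unwritten space:
+ */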
+static int bch2_fill_extent(struct fiemap_extent_info *info,
+                           const struct bkey_i *k, unsigned flags)
+{
+       if (bkey_extent_is_data(&k->k)) {
+               struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+               const struct bch_extent_ptr *ptr;
+               struct bch_extent_crc_unpacked crc;
+               int ret;
+
+               extent_for_each_ptr_crc(e, ptr, crc) {
+                       int flags2 = 0;
+                       u64 offset = ptr->offset;
+
+                       if (crc.compression_type)
+                               flags2 |= FIEMAP_EXTENT_ENCODED;
+                       else
+                               offset += crc.offset;
+
+                       if ((offset & (PAGE_SECTORS - 1)) ||
+                           (e.k->size & (PAGE_SECTORS - 1)))
+                               flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+                       ret = fiemap_fill_next_extent(info,
+                                                     bkey_start_offset(e.k) << 9,
+                                                     offset << 9,
+                                                     e.k->size << 9, flags|flags2);
+                       if (ret)
+                               return ret;
+               }
+
+               return 0;
+       } else if (k->k.type == BCH_RESERVATION) {
+               return fiemap_fill_next_extent(info,
+                                              bkey_start_offset(&k->k) << 9,
+                                              0, k->k.size << 9,
+                                              flags|
+                                              FIEMAP_EXTENT_DELALLOC|
+                                              FIEMAP_EXTENT_UNWRITTEN);
+       } else {
+               BUG();
+       }
+}
+
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+                      u64 start, u64 len)
+{
+       struct bch_fs *c = vinode->i_sb->s_fs_info;
+       struct bch_inode_info *ei = to_bch_ei(vinode);
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       BKEY_PADDED(k) tmp;
+       bool have_extent = false;
+       int ret = 0;
+
+       ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+       if (ret)
+               return ret;
+
+       if (start + len < start)
+               return -EINVAL;
+
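+       /*
+        * Extents are buffered one behind in @tmp so that the final one can be
+        * reported with FIEMAP_EXTENT_LAST:
+        */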
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(ei->v.i_ino, start >> 9), 0, k)
+               if (bkey_extent_is_data(k.k) ||
+                   k.k->type == BCH_RESERVATION) {
+                       if (bkey_cmp(bkey_start_pos(k.k),
+                                    POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
+                               break;
+
+                       if (have_extent) {
+                               ret = bch2_fill_extent(info, &tmp.k, 0);
+                               if (ret)
+                                       goto out;
+                       }
+
+                       bkey_reassemble(&tmp.k, k);
+                       have_extent = true;
+               }
+
+       if (have_extent)
+               ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
+out:
+       bch2_btree_iter_unlock(&iter);
+       return ret < 0 ? ret : 0;
+}
+
+static const struct vm_operations_struct bch_vm_ops = {
+       .fault          = bch2_page_fault,
+       .map_pages      = filemap_map_pages,
+       .page_mkwrite   = bch2_page_mkwrite,
+};
+
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       file_accessed(file);
+
+       vma->vm_ops = &bch_vm_ops;
+       return 0;
+}
+
+/* Directories: */
+
+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+       return generic_file_llseek_size(file, offset, whence,
+                                       S64_MAX, S64_MAX);
+}
+
+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+       struct bch_fs *c = file_inode(file)->i_sb->s_fs_info;
+
+       return bch2_readdir(c, file, ctx);
+}
+
+static const struct file_operations bch_file_operations = {
+       .llseek         = bch2_llseek,
+       .read_iter      = bch2_read_iter,
+       .write_iter     = bch2_write_iter,
+       .mmap           = bch2_mmap,
+       .open           = generic_file_open,
+       .fsync          = bch2_fsync,
+       .splice_read    = filemap_splice_read,
+       .splice_write   = iter_file_splice_write,
+       .fallocate      = bch2_fallocate_dispatch,
+       .unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = bch2_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_file_inode_operations = {
+       .getattr        = bch2_getattr,
+       .setattr        = bch2_setattr,
+       .fiemap         = bch2_fiemap,
+       .listxattr      = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+       .get_acl        = bch2_get_acl,
+       .set_acl        = bch2_set_acl,
+#endif
+};
+
+static const struct inode_operations bch_dir_inode_operations = {
+       .lookup         = bch2_lookup,
+       .create         = bch2_create,
+       .link           = bch2_link,
+       .unlink         = bch2_unlink,
+       .symlink        = bch2_symlink,
+       .mkdir          = bch2_mkdir,
+       .rmdir          = bch2_rmdir,
+       .mknod          = bch2_mknod,
+       .rename         = bch2_rename2,
+       .getattr        = bch2_getattr,
+       .setattr        = bch2_setattr,
+       .tmpfile        = bch2_tmpfile,
+       .listxattr      = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+       .get_acl        = bch2_get_acl,
+       .set_acl        = bch2_set_acl,
+#endif
+};
+
+static const struct file_operations bch_dir_file_operations = {
+       .llseek         = bch2_dir_llseek,
+       .read           = generic_read_dir,
+       .iterate_shared = bch2_vfs_readdir,
+       .fsync          = bch2_fsync,
+       .unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = bch2_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_symlink_inode_operations = {
+       .get_link       = page_get_link,
+       .getattr        = bch2_getattr,
+       .setattr        = bch2_setattr,
+       .listxattr      = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+       .get_acl        = bch2_get_acl,
+       .set_acl        = bch2_set_acl,
+#endif
+};
+
+static const struct inode_operations bch_special_inode_operations = {
+       .getattr        = bch2_getattr,
+       .setattr        = bch2_setattr,
+       .listxattr      = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+       .get_acl        = bch2_get_acl,
+       .set_acl        = bch2_set_acl,
+#endif
+};
+
+static const struct address_space_operations bch_address_space_operations = {
+       .writepage      = bch2_writepage,
+       .read_folio     = bch2_read_folio,
+       .writepages     = bch2_writepages,
+       .readahead      = bch2_readahead,
+       .dirty_folio    = bch2_dirty_folio,
+       .write_begin    = bch2_write_begin,
+       .write_end      = bch2_write_end,
+       .invalidate_folio = bch2_invalidate_folio,
+       .release_folio  = bch2_release_folio,
+       .direct_IO      = noop_direct_IO,
+#ifdef CONFIG_MIGRATION
+       .migrate_folio  = filemap_migrate_folio,
+#endif
+       .error_remove_page = generic_error_remove_page,
+};
+
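+/* NFS export: file handles identify inodes by inode number and generation: */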
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+               u64 ino, u32 generation)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       struct inode *vinode;
+
+       if (ino < BCACHEFS_ROOT_INO)
+               return ERR_PTR(-ESTALE);
+
+       vinode = bch2_vfs_inode_get(c, ino);
+       if (IS_ERR(vinode))
+               return ERR_CAST(vinode);
+       if (generation && vinode->i_generation != generation) {
+               /* we didn't find the right inode */
+               iput(vinode);
+               return ERR_PTR(-ESTALE);
+       }
+       return vinode;
+}
+
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+               int fh_len, int fh_type)
+{
+       return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                   bch2_nfs_get_inode);
+}
+
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+               int fh_len, int fh_type)
+{
+       return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                   bch2_nfs_get_inode);
+}
+
+static const struct export_operations bch_export_ops = {
+       .fh_to_dentry   = bch2_fh_to_dentry,
+       .fh_to_parent   = bch2_fh_to_parent,
+       //.get_parent   = bch2_get_parent,
+};
+
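+/*
+ * Initialize the VFS inode from the unpacked btree inode and wire up the
+ * operations appropriate for its file type:
+ */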
+static void bch2_vfs_inode_init(struct bch_fs *c,
+                               struct bch_inode_info *inode,
+                               struct bch_inode_unpacked *bi)
+{
+       bch2_inode_update_after_write(c, inode, bi, ~0);
+
+       inode->v.i_blocks       = bi->bi_sectors;
+       inode->v.i_ino          = bi->bi_inum;
+       inode->v.i_rdev         = bi->bi_dev;
+       inode->v.i_generation   = bi->bi_generation;
+       inode->v.i_size         = bi->bi_size;
+
+       inode->ei_journal_seq   = 0;
+       inode->ei_quota_reserved = 0;
+       inode->ei_str_hash      = bch2_hash_info_init(c, bi);
+
+       bch2_inode_flags_to_vfs(inode);
+
+       inode->v.i_mapping->a_ops = &bch_address_space_operations;
+
+       switch (inode->v.i_mode & S_IFMT) {
+       case S_IFREG:
+               inode->v.i_op   = &bch_file_inode_operations;
+               inode->v.i_fop  = &bch_file_operations;
+               break;
+       case S_IFDIR:
+               inode->v.i_op   = &bch_dir_inode_operations;
+               inode->v.i_fop  = &bch_dir_file_operations;
+               break;
+       case S_IFLNK:
+               inode_nohighmem(&inode->v);
+               inode->v.i_op   = &bch_symlink_inode_operations;
+               break;
+       default:
+               init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
+               inode->v.i_op   = &bch_special_inode_operations;
+               break;
+       }
+}
+
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+       struct bch_inode_info *inode;
+
+       inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+       if (!inode)
+               return NULL;
+
+       inode_init_once(&inode->v);
+       mutex_init(&inode->ei_update_lock);
+       pagecache_lock_init(&inode->ei_pagecache_lock);
+       mutex_init(&inode->ei_quota_lock);
+       inode->ei_journal_seq = 0;
+
+       return &inode->v;
+}
+
+static void bch2_i_callback(struct rcu_head *head)
+{
+       struct inode *vinode = container_of(head, struct inode, i_rcu);
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+
+       kmem_cache_free(bch2_inode_cache, inode);
+}
+
+static void bch2_destroy_inode(struct inode *vinode)
+{
+       call_rcu(&vinode->i_rcu, bch2_i_callback);
+}
+
+static int inode_update_times_fn(struct bch_inode_info *inode,
+                                struct bch_inode_unpacked *bi,
+                                void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
+       bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
+       bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
+
+       return 0;
+}
+
+static int bch2_vfs_write_inode(struct inode *vinode,
+                               struct writeback_control *wbc)
+{
+       struct bch_fs *c = vinode->i_sb->s_fs_info;
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       int ret;
+
+       mutex_lock(&inode->ei_update_lock);
+       ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
+       mutex_unlock(&inode->ei_update_lock);
+
+       if (c->opts.journal_flush_disabled)
+               return ret;
+
+       if (!ret && wbc->sync_mode == WB_SYNC_ALL)
+               ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
+
+       return ret;
+}
+
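+/*
+ * Final iput: if the inode is no longer linked, drop its quota charges and
+ * delete it from the btree:
+ */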
+static void bch2_evict_inode(struct inode *vinode)
+{
+       struct bch_fs *c = vinode->i_sb->s_fs_info;
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+
+       truncate_inode_pages_final(&inode->v.i_data);
+
+       clear_inode(&inode->v);
+
+       BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
+
+       if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+               bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
+                               BCH_QUOTA_WARN);
+               bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
+                               BCH_QUOTA_WARN);
+               bch2_inode_rm(c, inode->v.i_ino);
+
+               WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0,
+                         "nr_inodes < 0");
+       }
+}
+
+static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+       struct super_block *sb = dentry->d_sb;
+       struct bch_fs *c = sb->s_fs_info;
+       u64 fsid;
+
+       buf->f_type     = BCACHEFS_STATFS_MAGIC;
+       buf->f_bsize    = sb->s_blocksize;
+       buf->f_blocks   = c->capacity >> PAGE_SECTOR_SHIFT;
+       buf->f_bfree    = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
+                          PAGE_SECTOR_SHIFT;
+       buf->f_bavail   = buf->f_bfree;
+       buf->f_files    = atomic_long_read(&c->nr_inodes);
+       buf->f_ffree    = U64_MAX;
+
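+       /* f_fsid: xor of the two 64 bit halves of the filesystem UUID: */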
+       fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
+              le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
+       buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+       buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+       buf->f_namelen  = BCH_NAME_MAX;
+
+       return 0;
+}
+
+static int bch2_sync_fs(struct super_block *sb, int wait)
+{
+       struct bch_fs *c = sb->s_fs_info;
+
+       if (!wait) {
+               bch2_journal_flush_async(&c->journal, NULL);
+               return 0;
+       }
+
+       return bch2_journal_flush(&c->journal);
+}
+
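+/*
+ * Map a block device path to the (already open) filesystem it belongs to;
+ * on success a closure ref is held that the caller must drop:
+ */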
+static struct bch_fs *bch2_path_to_fs(const char *path)
+{
+       struct bch_fs *c;
+       dev_t dev;
+       int ret;
+
+       ret = lookup_bdev(path, &dev);
+       if (ret)
+               return ERR_PTR(ret);
+
+       c = bch2_dev_to_fs(dev);
+       return c ?: ERR_PTR(-ENOENT);
+}
+
+static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
+                                              unsigned nr_devs, struct bch_opts opts)
+{
+       struct bch_fs *c, *c1, *c2;
+       size_t i;
+
+       if (!nr_devs)
+               return ERR_PTR(-EINVAL);
+
+       c = bch2_fs_open(devs, nr_devs, opts);
+
+       if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
+               /*
+                * Already open?
+                * Look up each block device, make sure they all belong to a
+                * filesystem and they all belong to the _same_ filesystem
+                */
+
+               c1 = bch2_path_to_fs(devs[0]);
+               if (IS_ERR(c1))
+                       return c;
+
+               for (i = 1; i < nr_devs; i++) {
+                       c2 = bch2_path_to_fs(devs[i]);
+                       if (!IS_ERR(c2))
+                               closure_put(&c2->cl);
+
+                       if (c1 != c2) {
+                               closure_put(&c1->cl);
+                               return c;
+                       }
+               }
+
+               c = c1;
+       }
+
+       if (IS_ERR(c))
+               return c;
+
+       mutex_lock(&c->state_lock);
+
+       if (!bch2_fs_running(c)) {
+               mutex_unlock(&c->state_lock);
+               closure_put(&c->cl);
+               pr_err("error mounting %s: incomplete filesystem", dev_name);
+               return ERR_PTR(-EINVAL);
+       }
+
+       mutex_unlock(&c->state_lock);
+
+       set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
+       return c;
+}
+
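+/*
+ * The device name may be a colon separated list of devices making up one
+ * filesystem - e.g., illustratively, "/dev/sda:/dev/sdb" - split it up and
+ * open them as a unit:
+ */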
+static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
+                                            struct bch_opts opts)
+{
+       char *dev_name = NULL, **devs = NULL, *s;
+       struct bch_fs *c = ERR_PTR(-ENOMEM);
+       size_t i, nr_devs = 0;
+
+       dev_name = kstrdup(_dev_name, GFP_KERNEL);
+       if (!dev_name)
+               goto err;
+
+       for (s = dev_name; s; s = strchr(s + 1, ':'))
+               nr_devs++;
+
+       devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
+       if (!devs)
+               goto err;
+
+       for (i = 0, s = dev_name;
+            s;
+            (s = strchr(s, ':')) && (*s++ = '\0'))
+               devs[i++] = s;
+
+       c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
+err:
+       kfree(devs);
+       kfree(dev_name);
+       return c;
+}
+
+static int bch2_remount(struct super_block *sb, int *flags, char *data)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       struct bch_opts opts = bch2_opts_empty();
+       int ret;
+
+       opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
+       ret = bch2_parse_mount_opts(&opts, data);
+       if (ret)
+               return ret;
+
+       if (opts.read_only != c->opts.read_only) {
+               const char *err = NULL;
+
+               mutex_lock(&c->state_lock);
+
+               if (opts.read_only) {
+                       bch2_fs_read_only(c);
+
+                       sb->s_flags |= SB_RDONLY;
+               } else {
+                       err = bch2_fs_read_write(c);
+                       if (err) {
+                               bch_err(c, "error going rw: %s", err);
+                               return -EINVAL;
+                       }
+
+                       sb->s_flags &= ~SB_RDONLY;
+               }
+
+               c->opts.read_only = opts.read_only;
+
+               mutex_unlock(&c->state_lock);
+       }
+
+       if (opts.errors >= 0)
+               c->opts.errors = opts.errors;
+
+       return ret;
+}
+
+static int bch2_show_options(struct seq_file *seq, struct dentry *root)
+{
+       struct bch_fs *c = root->d_sb->s_fs_info;
+       enum bch_opt_id i;
+       char buf[512];
+
+       for (i = 0; i < bch2_opts_nr; i++) {
+               const struct bch_option *opt = &bch2_opt_table[i];
+               u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+               if (opt->mode < OPT_MOUNT)
+                       continue;
+
+               if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+                       continue;
+
+               bch2_opt_to_text(c, buf, sizeof(buf), opt, v,
+                                OPT_SHOW_MOUNT_STYLE);
+               seq_putc(seq, ',');
+               seq_puts(seq, buf);
+       }
+
+       return 0;
+}
+
+static const struct super_operations bch_super_operations = {
+       .alloc_inode    = bch2_alloc_inode,
+       .destroy_inode  = bch2_destroy_inode,
+       .write_inode    = bch2_vfs_write_inode,
+       .evict_inode    = bch2_evict_inode,
+       .sync_fs        = bch2_sync_fs,
+       .statfs         = bch2_statfs,
+       .show_options   = bch2_show_options,
+       .remount_fs     = bch2_remount,
+#if 0
+       .put_super      = bch2_put_super,
+       .freeze_fs      = bch2_freeze,
+       .unfreeze_fs    = bch2_unfreeze,
+#endif
+};
+
+static int bch2_test_super(struct super_block *s, void *data)
+{
+       return s->s_fs_info == data;
+}
+
+static int bch2_set_super(struct super_block *s, void *data)
+{
+       s->s_fs_info = data;
+       return 0;
+}
+
+static struct dentry *bch2_mount(struct file_system_type *fs_type,
+                                int flags, const char *dev_name, void *data)
+{
+       struct bch_fs *c;
+       struct bch_dev *ca;
+       struct super_block *sb;
+       struct inode *vinode;
+       struct bch_opts opts = bch2_opts_empty();
+       unsigned i;
+       int ret;
+
+       opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
+
+       ret = bch2_parse_mount_opts(&opts, data);
+       if (ret)
+               return ERR_PTR(ret);
+
+       c = bch2_open_as_blockdevs(dev_name, opts);
+       if (IS_ERR(c))
+               return ERR_CAST(c);
+
+       sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c);
+       if (IS_ERR(sb)) {
+               closure_put(&c->cl);
+               return ERR_CAST(sb);
+       }
+
+       BUG_ON(sb->s_fs_info != c);
+
+       if (sb->s_root) {
+               closure_put(&c->cl);
+
+               if ((flags ^ sb->s_flags) & SB_RDONLY) {
+                       ret = -EBUSY;
+                       goto err_put_super;
+               }
+               goto out;
+       }
+
+       /* XXX: blocksize */
+       sb->s_blocksize         = PAGE_SIZE;
+       sb->s_blocksize_bits    = PAGE_SHIFT;
+       sb->s_maxbytes          = MAX_LFS_FILESIZE;
+       sb->s_op                = &bch_super_operations;
+       sb->s_export_op         = &bch_export_ops;
+#ifdef CONFIG_BCACHEFS_QUOTA
+       sb->s_qcop              = &bch2_quotactl_operations;
+       sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
+#endif
+       sb->s_xattr             = bch2_xattr_handlers;
+       sb->s_magic             = BCACHEFS_STATFS_MAGIC;
+       sb->s_time_gran         = c->sb.time_precision;
+       c->vfs_sb               = sb;
+       strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+
+       ret = super_setup_bdi(sb);
+       if (ret)
+               goto err_put_super;
+
+       sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
+
+       for_each_online_member(ca, c, i) {
+               struct block_device *bdev = ca->disk_sb.bdev;
+
+               /* XXX: create an anonymous device for multi device filesystems */
+               sb->s_bdev      = bdev;
+               sb->s_dev       = bdev->bd_dev;
+               percpu_ref_put(&ca->io_ref);
+               break;
+       }
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+       if (c->opts.acl)
+               sb->s_flags     |= SB_POSIXACL;
+#endif
+
+       vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
+       if (IS_ERR(vinode)) {
+               ret = PTR_ERR(vinode);
+               goto err_put_super;
+       }
+
+       sb->s_root = d_make_root(vinode);
+       if (!sb->s_root) {
+               ret = -ENOMEM;
+               goto err_put_super;
+       }
+
+       sb->s_flags |= SB_ACTIVE;
+out:
+       return dget(sb->s_root);
+
+err_put_super:
+       deactivate_locked_super(sb);
+       return ERR_PTR(ret);
+}
+
+static void bch2_kill_sb(struct super_block *sb)
+{
+       struct bch_fs *c = sb->s_fs_info;
+
+       generic_shutdown_super(sb);
+
+       if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
+               bch2_fs_stop(c);
+       else
+               closure_put(&c->cl);
+}
+
+static struct file_system_type bcache_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "bcachefs",
+       .mount          = bch2_mount,
+       .kill_sb        = bch2_kill_sb,
+       .fs_flags       = FS_REQUIRES_DEV,
+};
+
+MODULE_ALIAS_FS("bcachefs");
+
+void bch2_vfs_exit(void)
+{
+       unregister_filesystem(&bcache_fs_type);
+       if (bch2_inode_cache)
+               kmem_cache_destroy(bch2_inode_cache);
+}
+
+int __init bch2_vfs_init(void)
+{
+       int ret = -ENOMEM;
+
+       bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
+       if (!bch2_inode_cache)
+               goto err;
+
+       ret = register_filesystem(&bcache_fs_type);
+       if (ret)
+               goto err;
+
+       return 0;
+err:
+       bch2_vfs_exit();
+       return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
new file mode 100644 (file)
index 0000000..e8dd566
--- /dev/null
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_H
+#define _BCACHEFS_FS_H
+
+#include "opts.h"
+#include "str_hash.h"
+#include "quota_types.h"
+
+#include <linux/seqlock.h>
+#include <linux/stat.h>
+
+/*
+ * Two-state lock - can be taken for add or block - each state is shared,
+ * like the read side of an rwsem, but the two states conflict with each other:
+ */
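+/*
+ * Rough usage sketch: callers adding pages to the page cache take the lock in
+ * "add" mode, while callers that need the page cache to stay consistent with
+ * the btree take it in "block" mode to hold off new page cache additions.
+ */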
+struct pagecache_lock {
+       atomic_long_t           v;
+       wait_queue_head_t       wait;
+};
+
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{
+       atomic_long_set(&lock->v, 0);
+       init_waitqueue_head(&lock->wait);
+}
+
+void bch2_pagecache_add_put(struct pagecache_lock *);
+void bch2_pagecache_add_get(struct pagecache_lock *);
+void bch2_pagecache_block_put(struct pagecache_lock *);
+void bch2_pagecache_block_get(struct pagecache_lock *);
+
+struct bch_inode_info {
+       struct inode            v;
+
+       struct mutex            ei_update_lock;
+       u64                     ei_journal_seq;
+       u64                     ei_quota_reserved;
+       unsigned long           ei_last_dirtied;
+       struct pagecache_lock   ei_pagecache_lock;
+
+       struct mutex            ei_quota_lock;
+       struct bch_qid          ei_qid;
+
+       struct bch_hash_info    ei_str_hash;
+
+       /* copy of inode in btree: */
+       struct bch_inode_unpacked ei_inode;
+};
+
+#define to_bch_ei(_inode)                                      \
+       container_of_or_null(_inode, struct bch_inode_info, v)
+
+static inline struct bch_inode_info *file_bch_inode(struct file *file)
+{
+       return to_bch_ei(file_inode(file));
+}
+
+static inline u8 mode_to_type(umode_t mode)
+{
+       return (mode >> 12) & 15;
+}
+
+static inline unsigned nlink_bias(umode_t mode)
+{
+       return S_ISDIR(mode) ? 2 : 1;
+}
+
+struct bch_inode_unpacked;
+
+#ifndef NO_BCACHEFS_FS
+
+/* returns 0 if we want to do the update; otherwise the error is passed up */
+typedef int (*inode_set_fn)(struct bch_inode_info *,
+                           struct bch_inode_unpacked *, void *);
+
+void bch2_inode_update_after_write(struct bch_fs *,
+                                  struct bch_inode_info *,
+                                  struct bch_inode_unpacked *,
+                                  unsigned);
+int __must_check bch2_write_inode_trans(struct btree_trans *,
+                               struct bch_inode_info *,
+                               struct bch_inode_unpacked *,
+                               inode_set_fn, void *);
+int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
+                                   inode_set_fn, void *, unsigned);
+int __must_check bch2_write_inode(struct bch_fs *,
+                                 struct bch_inode_info *);
+
+void bch2_vfs_exit(void);
+int bch2_vfs_init(void);
+
+#else
+
+static inline void bch2_vfs_exit(void) {}
+static inline int bch2_vfs_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
new file mode 100644 (file)
index 0000000..eb01284
--- /dev/null
@@ -0,0 +1,1306 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "error.h"
+#include "fs.h"
+#include "fsck.h"
+#include "inode.h"
+#include "keylist.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/dcache.h> /* struct qstr */
+#include <linux/generic-radix-tree.h>
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
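+/*
+ * Delete a dirent found by fsck; the name is copied to a temporary buffer
+ * first, because the iterator has to be unlocked before doing further btree
+ * operations:
+ */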
+static int remove_dirent(struct bch_fs *c, struct btree_iter *iter,
+                        struct bkey_s_c_dirent dirent)
+{
+       struct qstr name;
+       struct bch_inode_unpacked dir_inode;
+       struct bch_hash_info dir_hash_info;
+       u64 dir_inum = dirent.k->p.inode;
+       int ret;
+       char *buf;
+
+       name.len = bch2_dirent_name_bytes(dirent);
+       buf = kmalloc(name.len + 1, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       memcpy(buf, dirent.v->d_name, name.len);
+       buf[name.len] = '\0';
+       name.name = buf;
+
+       /* Unlock iter after copying the name, so we don't deadlock: */
+       bch2_btree_iter_unlock(iter);
+
+       ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
+       if (ret) {
+               bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
+               goto err;
+       }
+
+       dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+       ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
+       if (ret)
+               bch_err(c, "remove_dirent: err %i deleting dirent", ret);
+err:
+       kfree(buf);
+       return ret;
+}
+
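+/*
+ * Link an otherwise unreachable inode into lost+found, named after its
+ * inode number:
+ */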
+static int reattach_inode(struct bch_fs *c,
+                         struct bch_inode_unpacked *lostfound_inode,
+                         u64 inum)
+{
+       struct bch_hash_info lostfound_hash_info =
+               bch2_hash_info_init(c, lostfound_inode);
+       struct bkey_inode_buf packed;
+       char name_buf[20];
+       struct qstr name;
+       int ret;
+
+       snprintf(name_buf, sizeof(name_buf), "%llu", inum);
+       name = (struct qstr) QSTR(name_buf);
+
+       lostfound_inode->bi_nlink++;
+
+       bch2_inode_pack(&packed, lostfound_inode);
+
+       ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+                              NULL, NULL, NULL,
+                              BTREE_INSERT_NOFAIL);
+       if (ret) {
+               bch_err(c, "error %i reattaching inode %llu while updating lost+found",
+                       ret, inum);
+               return ret;
+       }
+
+       ret = bch2_dirent_create(c, lostfound_inode->bi_inum,
+                                &lostfound_hash_info,
+                                DT_DIR, &name, inum, NULL,
+                                BTREE_INSERT_NOFAIL);
+       if (ret) {
+               bch_err(c, "error %i reattaching inode %llu while creating new dirent",
+                       ret, inum);
+               return ret;
+       }
+       return ret;
+}
+
+struct inode_walker {
+       bool                    first_this_inode;
+       bool                    have_inode;
+       u64                     cur_inum;
+       struct bch_inode_unpacked inode;
+};
+
+static struct inode_walker inode_walker_init(void)
+{
+       return (struct inode_walker) {
+               .cur_inum       = -1,
+               .have_inode     = false,
+       };
+}
+
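+/*
+ * Look up the inode that the current btree key belongs to, caching the result
+ * across consecutive keys with the same inode number:
+ */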
+static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
+{
+       w->first_this_inode     = inum != w->cur_inum;
+       w->cur_inum             = inum;
+
+       if (w->first_this_inode) {
+               int ret = bch2_inode_find_by_inum(c, inum, &w->inode);
+
+               if (ret && ret != -ENOENT)
+                       return ret;
+
+               w->have_inode = !ret;
+       }
+
+       return 0;
+}
+
+struct hash_check {
+       struct bch_hash_info    info;
+       struct btree_iter       chain;
+       struct btree_iter       iter;
+       u64                     next;
+};
+
+static void hash_check_init(const struct bch_hash_desc desc,
+                           struct hash_check *h, struct bch_fs *c)
+{
+       bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0);
+       bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0);
+}
+
+static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
+                                const struct bch_inode_unpacked *bi)
+{
+       h->info = bch2_hash_info_init(c, bi);
+       h->next = -1;
+}
+
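+/*
+ * Re-create a hash table key found at the wrong offset: delete it, then
+ * reinsert it through the normal hashing path:
+ */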
+static int hash_redo_key(const struct bch_hash_desc desc,
+                        struct hash_check *h, struct bch_fs *c,
+                        struct btree_iter *k_iter, struct bkey_s_c k,
+                        u64 hashed)
+{
+       struct bkey_i *tmp;
+       int ret = 0;
+
+       tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       bkey_reassemble(tmp, k);
+
+       ret = bch2_btree_delete_at(k_iter, 0);
+       if (ret)
+               goto err;
+
+       bch2_btree_iter_unlock(k_iter);
+
+       ret = bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL, tmp,
+                           BTREE_INSERT_NOFAIL|
+                           BCH_HASH_SET_MUST_CREATE);
+err:
+       kfree(tmp);
+       return ret;
+}
+
+/* fsck hasn't been converted to new transactions yet: */
+static int fsck_hash_delete_at(const struct bch_hash_desc desc,
+                              struct bch_hash_info *info,
+                              struct btree_iter *orig_iter)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       int ret;
+
+       bch2_btree_iter_unlock(orig_iter);
+
+       bch2_trans_init(&trans, orig_iter->c);
+retry:
+       bch2_trans_begin(&trans);
+
+       iter = bch2_trans_copy_iter(&trans, orig_iter);
+       if (IS_ERR(iter)) {
+               ret = PTR_ERR(iter);
+               goto err;
+       }
+
+       ret   = bch2_hash_delete_at(&trans, desc, info, iter) ?:
+               bch2_trans_commit(&trans, NULL, NULL, NULL,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOFAIL);
+err:
+       if (ret == -EINTR)
+               goto retry;
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
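+/*
+ * Verify that a dirent/xattr key lies within the hash chain implied by its
+ * hash: misplaced keys are re-hashed and duplicates deleted. Returns > 0 if
+ * the key was moved or deleted and the caller should skip it.
+ */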
+static int hash_check_key(const struct bch_hash_desc desc,
+                         struct hash_check *h, struct bch_fs *c,
+                         struct btree_iter *k_iter, struct bkey_s_c k)
+{
+       char buf[200];
+       u64 hashed;
+       int ret = 0;
+
+       if (k.k->type != desc.whiteout_type &&
+           k.k->type != desc.key_type)
+               return 0;
+
+       if (k.k->p.offset != h->next) {
+               if (!btree_iter_linked(&h->chain)) {
+                       bch2_btree_iter_link(k_iter, &h->chain);
+                       bch2_btree_iter_link(k_iter, &h->iter);
+               }
+               bch2_btree_iter_copy(&h->chain, k_iter);
+       }
+       h->next = k.k->p.offset + 1;
+
+       if (k.k->type != desc.key_type)
+               return 0;
+
+       hashed = desc.hash_bkey(&h->info, k);
+
+       if (fsck_err_on(hashed < h->chain.pos.offset ||
+                       hashed > k.k->p.offset, c,
+                       "hash table key at wrong offset: %llu, "
+                       "hashed to %llu, chain starts at %llu\n%s",
+                       k.k->p.offset, hashed, h->chain.pos.offset,
+                       (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
+                                              buf, sizeof(buf), k), buf))) {
+               ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
+               if (ret) {
+                       bch_err(c, "hash_redo_key err %i", ret);
+                       return ret;
+               }
+               return 1;
+       }
+
+       if (!bkey_cmp(h->chain.pos, k_iter->pos))
+               return 0;
+
+       bch2_btree_iter_copy(&h->iter, &h->chain);
+       while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) {
+               struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter);
+
+               if (fsck_err_on(k2.k->type == desc.key_type &&
+                               !desc.cmp_bkey(k, k2), c,
+                               "duplicate hash table keys:\n%s",
+                               (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
+                                                      buf, sizeof(buf), k), buf))) {
+                       ret = fsck_hash_delete_at(desc, &h->info, &h->iter);
+                       if (ret)
+                               return ret;
+                       return 1;
+               }
+               bch2_btree_iter_next(&h->iter);
+       }
+fsck_err:
+       return ret;
+}
+
+/*
+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and
+ * that i_size and i_sectors are consistent
+ */
+noinline_for_stack
+static int check_extents(struct bch_fs *c)
+{
+       struct inode_walker w = inode_walker_init();
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 i_sectors;
+       int ret = 0;
+
+       bch_verbose(c, "checking extents");
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(BCACHEFS_ROOT_INO, 0), 0, k) {
+               ret = walk_inode(c, &w, k.k->p.inode);
+               if (ret)
+                       break;
+
+               if (fsck_err_on(!w.have_inode, c,
+                       "extent type %u for missing inode %llu",
+                       k.k->type, k.k->p.inode) ||
+                   fsck_err_on(w.have_inode &&
+                       !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
+                       "extent type %u for non-regular file, inode %llu mode %o",
+                       k.k->type, k.k->p.inode, w.inode.bi_mode)) {
+                       bch2_btree_iter_unlock(&iter);
+
+                       ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               if (fsck_err_on(w.first_this_inode &&
+                       w.have_inode &&
+                       !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
+                       w.inode.bi_sectors !=
+                       (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)),
+                       c, "i_sectors wrong: got %llu, should be %llu",
+                       w.inode.bi_sectors, i_sectors)) {
+                       struct bkey_inode_buf p;
+
+                       w.inode.bi_sectors = i_sectors;
+
+                       bch2_btree_iter_unlock(&iter);
+
+                       bch2_inode_pack(&p, &w.inode);
+
+                       ret = bch2_btree_insert(c, BTREE_ID_INODES,
+                                               &p.inode.k_i,
+                                               NULL,
+                                               NULL,
+                                               NULL,
+                                               BTREE_INSERT_NOFAIL);
+                       if (ret) {
+                               bch_err(c, "error in fsck: error %i "
+                                       "updating inode", ret);
+                               goto err;
+                       }
+
+                       /* revalidate iterator: */
+                       k = bch2_btree_iter_peek(&iter);
+               }
+
+               if (fsck_err_on(w.have_inode &&
+                       !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                       k.k->type != BCH_RESERVATION &&
+                       k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c,
+                       "extent type %u offset %llu past end of inode %llu, i_size %llu",
+                       k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
+                       bch2_btree_iter_unlock(&iter);
+
+                       ret = bch2_inode_truncate(c, k.k->p.inode,
+                                       round_up(w.inode.bi_size, PAGE_SIZE) >> 9,
+                                       NULL, NULL);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+       }
+err:
+fsck_err:
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/*
+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
+ * validate d_type
+ */
+noinline_for_stack
+static int check_dirents(struct bch_fs *c)
+{
+       struct inode_walker w = inode_walker_init();
+       struct hash_check h;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       unsigned name_len;
+       char buf[200];
+       int ret = 0;
+
+       bch_verbose(c, "checking dirents");
+
+       hash_check_init(bch2_dirent_hash_desc, &h, c);
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+                          POS(BCACHEFS_ROOT_INO, 0), 0, k) {
+               struct bkey_s_c_dirent d;
+               struct bch_inode_unpacked target;
+               bool have_target;
+               u64 d_inum;
+
+               ret = walk_inode(c, &w, k.k->p.inode);
+               if (ret)
+                       break;
+
+               if (fsck_err_on(!w.have_inode, c,
+                               "dirent in nonexistent directory:\n%s",
+                               (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS,
+                                                      buf, sizeof(buf), k), buf)) ||
+                   fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
+                               "dirent in non-directory inode type %u:\n%s",
+                               mode_to_type(w.inode.bi_mode),
+                               (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS,
+                                                      buf, sizeof(buf), k), buf))) {
+                       ret = bch2_btree_delete_at(&iter, 0);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               if (w.first_this_inode && w.have_inode)
+                       hash_check_set_inode(&h, c, &w.inode);
+
+               ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k);
+               if (ret > 0) {
+                       ret = 0;
+                       continue;
+               }
+
+               if (ret)
+                       goto fsck_err;
+
+               if (k.k->type != BCH_DIRENT)
+                       continue;
+
+               d = bkey_s_c_to_dirent(k);
+               d_inum = le64_to_cpu(d.v->d_inum);
+
+               name_len = bch2_dirent_name_bytes(d);
+
+               if (fsck_err_on(!name_len, c, "empty dirent") ||
+                   fsck_err_on(name_len == 1 &&
+                               !memcmp(d.v->d_name, ".", 1), c,
+                               ". dirent") ||
+                   fsck_err_on(name_len == 2 &&
+                               !memcmp(d.v->d_name, "..", 2), c,
+                               ".. dirent")) {
+                       ret = remove_dirent(c, &iter, d);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               if (fsck_err_on(d_inum == d.k->p.inode, c,
+                               "dirent points to own directory:\n%s",
+                               (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS,
+                                                      buf, sizeof(buf), k), buf))) {
+                       ret = remove_dirent(c, &iter, d);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               ret = bch2_inode_find_by_inum(c, d_inum, &target);
+               if (ret && ret != -ENOENT)
+                       break;
+
+               have_target = !ret;
+               ret = 0;
+
+               if (fsck_err_on(!have_target, c,
+                               "dirent points to missing inode:\n%s",
+                               (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS,
+                                                      buf, sizeof(buf), k), buf))) {
+                       ret = remove_dirent(c, &iter, d);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               if (fsck_err_on(have_target &&
+                               d.v->d_type !=
+                               mode_to_type(target.bi_mode), c,
+                               "incorrect d_type: should be %u:\n%s",
+                               mode_to_type(target.bi_mode),
+                               (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS,
+                                                      buf, sizeof(buf), k), buf))) {
+                       struct bkey_i_dirent *n;
+
+                       n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
+                       if (!n) {
+                               ret = -ENOMEM;
+                               goto err;
+                       }
+
+                       bkey_reassemble(&n->k_i, d.s_c);
+                       n->v.d_type = mode_to_type(target.bi_mode);
+
+                       ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+                                       BTREE_INSERT_NOFAIL,
+                                       BTREE_INSERT_ENTRY(&iter, &n->k_i));
+                       kfree(n);
+                       if (ret)
+                               goto err;
+               }
+       }
+err:
+fsck_err:
+       bch2_btree_iter_unlock(&h.chain);
+       bch2_btree_iter_unlock(&h.iter);
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/*
+ * Walk xattrs: verify that they all have a corresponding inode
+ */
+noinline_for_stack
+static int check_xattrs(struct bch_fs *c)
+{
+       struct inode_walker w = inode_walker_init();
+       struct hash_check h;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch_verbose(c, "checking xattrs");
+
+       hash_check_init(bch2_xattr_hash_desc, &h, c);
+
+       for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
+                          POS(BCACHEFS_ROOT_INO, 0), 0, k) {
+               ret = walk_inode(c, &w, k.k->p.inode);
+               if (ret)
+                       break;
+
+               if (fsck_err_on(!w.have_inode, c,
+                               "xattr for missing inode %llu",
+                               k.k->p.inode)) {
+                       ret = bch2_btree_delete_at(&iter, 0);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               if (w.first_this_inode && w.have_inode)
+                       hash_check_set_inode(&h, c, &w.inode);
+
+               ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k);
+               if (ret)
+                       goto fsck_err;
+       }
+err:
+fsck_err:
+       bch2_btree_iter_unlock(&h.chain);
+       bch2_btree_iter_unlock(&h.iter);
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/* Get root directory, create if it doesn't exist: */
+static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+{
+       struct bkey_inode_buf packed;
+       int ret;
+
+       bch_verbose(c, "checking root directory");
+
+       ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
+       if (ret && ret != -ENOENT)
+               return ret;
+
+       if (fsck_err_on(ret, c, "root directory missing"))
+               goto create_root;
+
+       if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
+                       "root inode not a directory"))
+               goto create_root;
+
+       return 0;
+fsck_err:
+       return ret;
+create_root:
+       bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+                       0, NULL);
+       root_inode->bi_inum = BCACHEFS_ROOT_INO;
+
+       bch2_inode_pack(&packed, root_inode);
+
+       return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+                                NULL, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+/* Get lost+found, create if it doesn't exist: */
+static int check_lostfound(struct bch_fs *c,
+                          struct bch_inode_unpacked *root_inode,
+                          struct bch_inode_unpacked *lostfound_inode)
+{
+       struct qstr lostfound = QSTR("lost+found");
+       struct bch_hash_info root_hash_info =
+               bch2_hash_info_init(c, root_inode);
+       struct bkey_inode_buf packed;
+       u64 inum;
+       int ret;
+
+       bch_verbose(c, "checking lost+found");
+
+       inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
+                                &lostfound);
+       if (!inum) {
+               bch_notice(c, "creating lost+found");
+               goto create_lostfound;
+       }
+
+       ret = bch2_inode_find_by_inum(c, inum, lostfound_inode);
+       if (ret && ret != -ENOENT)
+               return ret;
+
+       if (fsck_err_on(ret, c, "lost+found missing"))
+               goto create_lostfound;
+
+       if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c,
+                       "lost+found inode not a directory"))
+               goto create_lostfound;
+
+       return 0;
+fsck_err:
+       return ret;
+create_lostfound:
+       root_inode->bi_nlink++;
+
+       bch2_inode_pack(&packed, root_inode);
+
+       ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+                               NULL, NULL, NULL, BTREE_INSERT_NOFAIL);
+       if (ret)
+               return ret;
+
+       bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+                       0, root_inode);
+
+       ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
+                              &c->unused_inode_hint);
+       if (ret)
+               return ret;
+
+       ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
+                                &lostfound, lostfound_inode->bi_inum, NULL,
+                                BTREE_INSERT_NOFAIL);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
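+/*
+ * Resizable bitmap of inode numbers, used by the directory structure pass to
+ * track which directories have already been visited:
+ */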
+struct inode_bitmap {
+       unsigned long   *bits;
+       size_t          size;
+};
+
+static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
+{
+       return nr < b->size ? test_bit(nr, b->bits) : false;
+}
+
+static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+{
+       if (nr >= b->size) {
+               size_t new_size = max_t(size_t, max_t(size_t,
+                                       PAGE_SIZE * 8,
+                                       b->size * 2),
+                                       nr + 1);
+               void *n;
+
+               new_size = roundup_pow_of_two(new_size);
+               n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
+               if (!n)
+                       return -ENOMEM;
+
+               b->bits = n;
+               b->size = new_size;
+       }
+
+       __set_bit(nr, b->bits);
+       return 0;
+}
+
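+/*
+ * Explicit DFS stack: one entry per directory we're currently descended into,
+ * recording the offset of the last dirent seen so the scan of that directory
+ * can resume after we pop back up to it:
+ */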
+struct pathbuf {
+       size_t          nr;
+       size_t          size;
+
+       struct pathbuf_entry {
+               u64     inum;
+               u64     offset;
+       }               *entries;
+};
+
+static int path_down(struct pathbuf *p, u64 inum)
+{
+       if (p->nr == p->size) {
+               size_t new_size = max_t(size_t, 256UL, p->size * 2);
+               void *n = krealloc(p->entries,
+                                  new_size * sizeof(p->entries[0]),
+                                  GFP_KERNEL);
+               if (!n)
+                       return -ENOMEM;
+
+               p->entries = n;
+               p->size = new_size;
+       }
+
+       p->entries[p->nr++] = (struct pathbuf_entry) {
+               .inum = inum,
+               .offset = 0,
+       };
+       return 0;
+}
+
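+/*
+ * Verify that every directory is reachable from the root, with an iterative
+ * DFS over the dirents btree using an explicit stack (struct pathbuf) and a
+ * bitmap of directories already visited.  Unreachable directories are
+ * reattached under lost+found and the pass is restarted, so that any loops
+ * they form are then caught by the multiple hardlinks check and broken:
+ */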
+noinline_for_stack
+static int check_directory_structure(struct bch_fs *c,
+                                    struct bch_inode_unpacked *lostfound_inode)
+{
+       struct inode_bitmap dirs_done = { NULL, 0 };
+       struct pathbuf path = { 0, 0, NULL };
+       struct pathbuf_entry *e;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent dirent;
+       bool had_unreachable;
+       u64 d_inum;
+       int ret = 0;
+
+       bch_verbose(c, "checking directory structure");
+
+       /* DFS: */
+restart_dfs:
+       had_unreachable = false;
+
+       ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
+       if (ret) {
+               bch_err(c, "memory allocation failure in inode_bitmap_set()");
+               goto err;
+       }
+
+       ret = path_down(&path, BCACHEFS_ROOT_INO);
+       if (ret)
+               return ret;
+
+       while (path.nr) {
+next:
+               e = &path.entries[path.nr - 1];
+
+               if (e->offset == U64_MAX)
+                       goto up;
+
+               for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+                                  POS(e->inum, e->offset + 1), 0, k) {
+                       if (k.k->p.inode != e->inum)
+                               break;
+
+                       e->offset = k.k->p.offset;
+
+                       if (k.k->type != BCH_DIRENT)
+                               continue;
+
+                       dirent = bkey_s_c_to_dirent(k);
+
+                       if (dirent.v->d_type != DT_DIR)
+                               continue;
+
+                       d_inum = le64_to_cpu(dirent.v->d_inum);
+
+                       if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
+                                       "directory %llu has multiple hardlinks",
+                                       d_inum)) {
+                               ret = remove_dirent(c, &iter, dirent);
+                               if (ret)
+                                       goto err;
+                               continue;
+                       }
+
+                       ret = inode_bitmap_set(&dirs_done, d_inum);
+                       if (ret) {
+                               bch_err(c, "memory allocation failure in inode_bitmap_set()");
+                               goto err;
+                       }
+
+                       ret = path_down(&path, d_inum);
+                       if (ret)
+                               goto err;
+
+                       bch2_btree_iter_unlock(&iter);
+                       goto next;
+               }
+               ret = bch2_btree_iter_unlock(&iter);
+               if (ret) {
+                       bch_err(c, "btree error %i in fsck", ret);
+                       goto err;
+               }
+up:
+               path.nr--;
+       }
+
+       for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
+               if (k.k->type != BCH_INODE_FS)
+                       continue;
+
+               if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
+                       continue;
+
+               if (!bch2_empty_dir(c, k.k->p.inode))
+                       continue;
+
+               if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
+                               "unreachable directory found (inum %llu)",
+                               k.k->p.inode)) {
+                       bch2_btree_iter_unlock(&iter);
+
+                       ret = reattach_inode(c, lostfound_inode, k.k->p.inode);
+                       if (ret)
+                               goto err;
+
+                       had_unreachable = true;
+               }
+       }
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               goto err;
+
+       if (had_unreachable) {
+               bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
+               kfree(dirs_done.bits);
+               kfree(path.entries);
+               memset(&dirs_done, 0, sizeof(dirs_done));
+               memset(&path, 0, sizeof(path));
+               goto restart_dfs;
+       }
+
+out:
+       kfree(dirs_done.bits);
+       kfree(path.entries);
+       return ret;
+err:
+fsck_err:
+       ret = bch2_btree_iter_unlock(&iter) ?: ret;
+       goto out;
+}
+
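+/*
+ * Per inode link counts accumulated by the dirents pass: count is the number
+ * of dirents pointing at the inode, dir_count the number of subdirectories
+ * it contains:
+ */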
+struct nlink {
+       u32     count;
+       u32     dir_count;
+};
+
+typedef GENRADIX(struct nlink) nlink_table;
+
+static void inc_link(struct bch_fs *c, nlink_table *links,
+                    u64 range_start, u64 *range_end,
+                    u64 inum, bool dir)
+{
+       struct nlink *link;
+
+       if (inum < range_start || inum >= *range_end)
+               return;
+
+       link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
+       if (!link) {
+               bch_verbose(c, "allocation failed during fs gc - will need another pass");
+               *range_end = inum;
+               return;
+       }
+
+       if (dir)
+               link->dir_count++;
+       else
+               link->count++;
+}
+
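+/*
+ * First half of the nlink check: walk every dirent and accumulate link counts
+ * for inodes in [range_start, *range_end); on allocation failure *range_end
+ * is pulled back so the caller can retry the remainder in a later chunk:
+ */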
+noinline_for_stack
+static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
+                              u64 range_start, u64 *range_end)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent d;
+       u64 d_inum;
+       int ret;
+
+       inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) {
+               switch (k.k->type) {
+               case BCH_DIRENT:
+                       d = bkey_s_c_to_dirent(k);
+                       d_inum = le64_to_cpu(d.v->d_inum);
+
+                       if (d.v->d_type == DT_DIR)
+                               inc_link(c, links, range_start, range_end,
+                                        d.k->p.inode, true);
+
+                       inc_link(c, links, range_start, range_end,
+                                d_inum, false);
+
+                       break;
+               }
+
+               bch2_btree_iter_cond_resched(&iter);
+       }
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               bch_err(c, "error in fs gc: btree error %i while walking dirents", ret);
+
+       return ret;
+}
+
+s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 sectors = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) {
+               if (k.k->p.inode != inum)
+                       break;
+
+               if (bkey_extent_is_allocation(k.k))
+                       sectors += k.k->size;
+       }
+
+       return bch2_btree_iter_unlock(&iter) ?: sectors;
+}
+
+static int check_inode_nlink(struct bch_fs *c,
+                            struct bch_inode_unpacked *lostfound_inode,
+                            struct bch_inode_unpacked *u,
+                            struct nlink *link,
+                            bool *do_update)
+{
+       u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED
+               ? 0
+               : u->bi_nlink + nlink_bias(u->bi_mode);
+       u32 real_i_nlink =
+               link->count * nlink_bias(u->bi_mode) +
+               link->dir_count;
+       int ret = 0;
+
+       /*
+        * These should have been caught/fixed by earlier passes, we don't
+        * repair them here:
+        */
+       if (S_ISDIR(u->bi_mode) && link->count > 1) {
+               need_fsck_err(c, "directory %llu with multiple hardlinks: %u",
+                             u->bi_inum, link->count);
+               return 0;
+       }
+
+       if (S_ISDIR(u->bi_mode) && !link->count) {
+               need_fsck_err(c, "unreachable directory found (inum %llu)",
+                             u->bi_inum);
+               return 0;
+       }
+
+       if (!S_ISDIR(u->bi_mode) && link->dir_count) {
+               need_fsck_err(c, "non directory with subdirectories (inum %llu)",
+                             u->bi_inum);
+               return 0;
+       }
+
+       if (i_nlink < link->count) {
+               if (fsck_err(c, "inode %llu i_nlink too small (%u < %u, type %i)",
+                            u->bi_inum, i_nlink, link->count,
+                            mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE)
+                       return 0;
+               goto set_i_nlink;
+       }
+
+       if (i_nlink != real_i_nlink &&
+           c->sb.clean) {
+               if (fsck_err(c, "filesystem marked clean, "
+                            "but inode %llu has wrong i_nlink "
+                            "(type %u i_nlink %u, should be %u)",
+                            u->bi_inum, mode_to_type(u->bi_mode),
+                            i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
+                       return 0;
+               goto set_i_nlink;
+       }
+
+       if (real_i_nlink && i_nlink != real_i_nlink)
+               bch_verbose(c, "setting inode %llu nlink from %u to %u",
+                           u->bi_inum, i_nlink, real_i_nlink);
+set_i_nlink:
+       if (i_nlink != real_i_nlink) {
+               if (real_i_nlink) {
+                       u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode);
+                       u->bi_flags &= ~BCH_INODE_UNLINKED;
+               } else {
+                       u->bi_nlink = 0;
+                       u->bi_flags |= BCH_INODE_UNLINKED;
+               }
+
+               *do_update = true;
+       }
+fsck_err:
+       return ret;
+}
+
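+/*
+ * Check/repair a single inode: fix up i_nlink if link counts were supplied,
+ * delete inodes flagged as unlinked, and finish interrupted truncates and
+ * i_sectors updates (BCH_INODE_I_SIZE_DIRTY/I_SECTORS_DIRTY):
+ */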
+static int check_inode(struct bch_fs *c,
+                      struct bch_inode_unpacked *lostfound_inode,
+                      struct btree_iter *iter,
+                      struct bkey_s_c_inode inode,
+                      struct nlink *link)
+{
+       struct bch_inode_unpacked u;
+       bool do_update = false;
+       int ret = 0;
+
+       ret = bch2_inode_unpack(inode, &u);
+       if (bch2_fs_inconsistent_on(ret, c,
+                        "error unpacking inode %llu in fsck",
+                        inode.k->p.inode))
+               return ret;
+
+       if (link) {
+               ret = check_inode_nlink(c, lostfound_inode, &u, link,
+                                       &do_update);
+               if (ret)
+                       return ret;
+       }
+
+       if (u.bi_flags & BCH_INODE_UNLINKED) {
+               bch_verbose(c, "deleting inode %llu", u.bi_inum);
+
+               ret = bch2_inode_rm(c, u.bi_inum);
+               if (ret)
+                       bch_err(c, "error in fs gc: error %i "
+                               "while deleting inode", ret);
+               return ret;
+       }
+
+       if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY) {
+               fsck_err_on(c->sb.clean, c,
+                           "filesystem marked clean, "
+                           "but inode %llu has i_size dirty",
+                           u.bi_inum);
+
+               bch_verbose(c, "truncating inode %llu", u.bi_inum);
+
+               /*
+                * XXX: need to truncate partial blocks too here - or ideally
+                * just switch units to bytes and that issue goes away
+                */
+
+               ret = bch2_inode_truncate(c, u.bi_inum,
+                               round_up(u.bi_size, PAGE_SIZE) >> 9,
+                               NULL, NULL);
+               if (ret) {
+                       bch_err(c, "error in fs gc: error %i "
+                               "truncating inode", ret);
+                       return ret;
+               }
+
+               /*
+                * We truncated without our normal sector accounting hook, just
+                * make sure we recalculate it:
+                */
+               u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+
+               u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+               do_update = true;
+       }
+
+       if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY) {
+               s64 sectors;
+
+               fsck_err_on(c->sb.clean, c,
+                           "filesystem marked clean, "
+                           "but inode %llu has i_sectors dirty",
+                           u.bi_inum);
+
+               bch_verbose(c, "recounting sectors for inode %llu",
+                           u.bi_inum);
+
+               sectors = bch2_count_inode_sectors(c, u.bi_inum);
+               if (sectors < 0) {
+                       bch_err(c, "error in fs gc: error %i "
+                               "recounting inode sectors",
+                               (int) sectors);
+                       return sectors;
+               }
+
+               u.bi_sectors = sectors;
+               u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+               do_update = true;
+       }
+
+       if (do_update) {
+               struct bkey_inode_buf p;
+
+               bch2_inode_pack(&p, &u);
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+                                         BTREE_INSERT_NOFAIL,
+                                         BTREE_INSERT_ENTRY(iter, &p.inode.k_i));
+               if (ret && ret != -EINTR)
+                       bch_err(c, "error in fs gc: error %i "
+                               "updating inode", ret);
+       }
+fsck_err:
+       return ret;
+}
+
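+/*
+ * Second half of the nlink check: walk the inodes btree and the link table
+ * built by the dirents pass in lockstep, fixing up each inode via
+ * check_inode():
+ */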
+noinline_for_stack
+static int bch2_gc_walk_inodes(struct bch_fs *c,
+                              struct bch_inode_unpacked *lostfound_inode,
+                              nlink_table *links,
+                              u64 range_start, u64 range_end)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct nlink *link, zero_links = { 0, 0 };
+       struct genradix_iter nlinks_iter;
+       int ret = 0, ret2 = 0;
+       u64 nlinks_pos;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0);
+       nlinks_iter = genradix_iter_init(links, 0);
+
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
+              !btree_iter_err(k)) {
+peek_nlinks:   link = genradix_iter_peek(&nlinks_iter, links);
+
+               if (!link && (!k.k || iter.pos.inode >= range_end))
+                       break;
+
+               nlinks_pos = range_start + nlinks_iter.pos;
+               if (iter.pos.inode > nlinks_pos) {
+                       /* Should have been caught by dirents pass: */
+                       need_fsck_err_on(link && link->count, c,
+                               "missing inode %llu (nlink %u)",
+                               nlinks_pos, link->count);
+                       genradix_iter_advance(&nlinks_iter, links);
+                       goto peek_nlinks;
+               }
+
+               if (iter.pos.inode < nlinks_pos || !link)
+                       link = &zero_links;
+
+               if (k.k && k.k->type == BCH_INODE_FS) {
+                       /*
+                        * Avoid potential deadlocks with iter for
+                        * truncate/rm/etc.:
+                        */
+                       bch2_btree_iter_unlock(&iter);
+
+                       ret = check_inode(c, lostfound_inode, &iter,
+                                         bkey_s_c_to_inode(k), link);
+                       BUG_ON(ret == -EINTR);
+                       if (ret)
+                               break;
+
+                       if (link->count)
+                               atomic_long_inc(&c->nr_inodes);
+               } else {
+                       /* Should have been caught by dirents pass: */
+                       need_fsck_err_on(link->count, c,
+                               "missing inode %llu (nlink %u)",
+                               nlinks_pos, link->count);
+               }
+
+               if (nlinks_pos == iter.pos.inode)
+                       genradix_iter_advance(&nlinks_iter, links);
+
+               bch2_btree_iter_next(&iter);
+               bch2_btree_iter_cond_resched(&iter);
+       }
+fsck_err:
+       ret2 = bch2_btree_iter_unlock(&iter);
+       if (ret2)
+               bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2);
+
+       return ret ?: ret2;
+}
+
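+/*
+ * Check/repair i_nlink for all inodes, in chunks of the inode number space:
+ * the dirents pass shrinks next_iter_range_start if it can't allocate link
+ * table entries, and the loop repeats until the whole space is covered:
+ */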
+noinline_for_stack
+static int check_inode_nlinks(struct bch_fs *c,
+                             struct bch_inode_unpacked *lostfound_inode)
+{
+       nlink_table links;
+       u64 this_iter_range_start, next_iter_range_start = 0;
+       int ret = 0;
+
+       bch_verbose(c, "checking inode nlinks");
+
+       genradix_init(&links);
+
+       do {
+               this_iter_range_start = next_iter_range_start;
+               next_iter_range_start = U64_MAX;
+
+               ret = bch2_gc_walk_dirents(c, &links,
+                                         this_iter_range_start,
+                                         &next_iter_range_start);
+               if (ret)
+                       break;
+
+               ret = bch2_gc_walk_inodes(c, lostfound_inode, &links,
+                                        this_iter_range_start,
+                                        next_iter_range_start);
+               if (ret)
+                       break;
+
+               genradix_free(&links);
+       } while (next_iter_range_start != U64_MAX);
+
+       genradix_free(&links);
+
+       return ret;
+}
+
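+/*
+ * Cheap inode pass for when full fsck isn't being run: recount c->nr_inodes,
+ * and only feed inodes with work left over from an unclean shutdown
+ * (unlinked, i_size or i_sectors dirty) to check_inode():
+ */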
+noinline_for_stack
+static int check_inodes_fast(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_inode inode;
+       unsigned long nr_inodes = 0;
+       int ret = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
+               if (k.k->type != BCH_INODE_FS)
+                       continue;
+
+               inode = bkey_s_c_to_inode(k);
+
+               if (!(inode.v->bi_flags & BCH_INODE_UNLINKED))
+                       nr_inodes++;
+
+               if (inode.v->bi_flags &
+                   (BCH_INODE_I_SIZE_DIRTY|
+                    BCH_INODE_I_SECTORS_DIRTY|
+                    BCH_INODE_UNLINKED)) {
+                       fsck_err_on(c->sb.clean, c,
+                               "filesystem marked clean but found inode %llu with flags %x",
+                               inode.k->p.inode, inode.v->bi_flags);
+                       ret = check_inode(c, NULL, &iter, inode, NULL);
+                       BUG_ON(ret == -EINTR);
+                       if (ret)
+                               break;
+               }
+       }
+       atomic_long_set(&c->nr_inodes, nr_inodes);
+fsck_err:
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/*
+ * Checks for inconsistencies that shouldn't happen, unless we have a bug.
+ * Doesn't fix them yet, mainly because they haven't yet been observed:
+ */
+static int bch2_fsck_full(struct bch_fs *c)
+{
+       struct bch_inode_unpacked root_inode, lostfound_inode;
+       int ret;
+
+       bch_verbose(c, "starting fsck:");
+       ret =   check_extents(c) ?:
+               check_dirents(c) ?:
+               check_xattrs(c) ?:
+               check_root(c, &root_inode) ?:
+               check_lostfound(c, &root_inode, &lostfound_inode) ?:
+               check_directory_structure(c, &lostfound_inode) ?:
+               check_inode_nlinks(c, &lostfound_inode);
+
+       bch2_flush_fsck_errs(c);
+       bch_verbose(c, "fsck done");
+
+       return ret;
+}
+
+static int bch2_fsck_inode_nlink(struct bch_fs *c)
+{
+       struct bch_inode_unpacked root_inode, lostfound_inode;
+       int ret;
+
+       bch_verbose(c, "checking inode link counts:");
+       ret =   check_root(c, &root_inode) ?:
+               check_lostfound(c, &root_inode, &lostfound_inode) ?:
+               check_inode_nlinks(c, &lostfound_inode);
+
+       bch2_flush_fsck_errs(c);
+       bch_verbose(c, "done");
+
+       return ret;
+}
+
+static int bch2_fsck_walk_inodes_only(struct bch_fs *c)
+{
+       int ret;
+
+       bch_verbose(c, "walking inodes:");
+       ret = check_inodes_fast(c);
+
+       bch2_flush_fsck_errs(c);
+       bch_verbose(c, "done");
+
+       return ret;
+}
+
+int bch2_fsck(struct bch_fs *c)
+{
+       if (!c->opts.nofsck)
+               return bch2_fsck_full(c);
+
+       if (!c->sb.clean)
+               return bch2_fsck_inode_nlink(c);
+
+       return bch2_fsck_walk_inodes_only(c);
+}
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
new file mode 100644 (file)
index 0000000..88da067
--- /dev/null
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FSCK_H
+#define _BCACHEFS_FSCK_H
+
+s64 bch2_count_inode_sectors(struct bch_fs *, u64);
+int bch2_fsck(struct bch_fs *);
+
+#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
new file mode 100644 (file)
index 0000000..2d63555
--- /dev/null
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "io.h"
+#include "keylist.h"
+
+#include <linux/random.h>
+
+#include <asm/unaligned.h>
+
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
+static const u8 bits_table[8] = {
+       1  * 8 - 1,
+       2  * 8 - 2,
+       3  * 8 - 3,
+       4  * 8 - 4,
+       6  * 8 - 5,
+       8  * 8 - 6,
+       10 * 8 - 7,
+       13 * 8 - 8,
+};
+
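+/*
+ * Variable length encoding of inode fields: the position of the highest set
+ * bit in the first byte says how many bytes the field takes (via byte_table),
+ * and the remaining bits hold the value itself, big endian.
+ *
+ * Per bits_table, a value below 64 is packed into a single byte with 0x80
+ * or'd in as the length marker, a value below 8192 into two bytes with 0x40
+ * set in the first byte, and so on up to 13 bytes for the largest values:
+ */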
+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
+{
+       __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
+       unsigned shift, bytes, bits = likely(!hi)
+               ? fls64(lo)
+               : fls64(hi) + 64;
+
+       for (shift = 1; shift <= 8; shift++)
+               if (bits < bits_table[shift - 1])
+                       goto got_shift;
+
+       BUG();
+got_shift:
+       bytes = byte_table[shift - 1];
+
+       BUG_ON(out + bytes > end);
+
+       memcpy(out, (u8 *) in + 16 - bytes, bytes);
+       *out |= (1 << 8) >> shift;
+
+       return bytes;
+}
+
+static int inode_decode_field(const u8 *in, const u8 *end,
+                             u64 out[2], unsigned *out_bits)
+{
+       __be64 be[2] = { 0, 0 };
+       unsigned bytes, shift;
+       u8 *p;
+
+       if (in >= end)
+               return -1;
+
+       if (!*in)
+               return -1;
+
+       /*
+        * position of highest set bit indicates number of bytes:
+        * shift = number of bits to remove in high byte:
+        */
+       shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
+       bytes   = byte_table[shift - 1];
+
+       if (in + bytes > end)
+               return -1;
+
+       p = (u8 *) be + 16 - bytes;
+       memcpy(p, in, bytes);
+       *p ^= (1 << 8) >> shift;
+
+       out[0] = be64_to_cpu(be[0]);
+       out[1] = be64_to_cpu(be[1]);
+       *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
+
+       return bytes;
+}
+
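+/*
+ * Pack an inode for the btree: fixed size fields (hash seed, flags, mode)
+ * followed by the variable length fields, in BCH_INODE_FIELDS() order, with
+ * trailing all-zero fields omitted; INODE_NR_FIELDS records how many fields
+ * were written so unpacking knows where to stop:
+ */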
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+                    const struct bch_inode_unpacked *inode)
+{
+       u8 *out = packed->inode.v.fields;
+       u8 *end = (void *) &packed[1];
+       u8 *last_nonzero_field = out;
+       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+
+       bkey_inode_init(&packed->inode.k_i);
+       packed->inode.k.p.inode         = inode->bi_inum;
+       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
+       packed->inode.v.bi_flags        = cpu_to_le32(inode->bi_flags);
+       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
+
+#define BCH_INODE_FIELD(_name, _bits)                                  \
+       out += inode_encode_field(out, end, 0, inode->_name);           \
+       nr_fields++;                                                    \
+                                                                       \
+       if (inode->_name) {                                             \
+               last_nonzero_field = out;                               \
+               last_nonzero_fieldnr = nr_fields;                       \
+       }
+
+       BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+
+       out = last_nonzero_field;
+       nr_fields = last_nonzero_fieldnr;
+
+       set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
+       memset(out, 0,
+              (u8 *) &packed->inode.v +
+              bkey_val_bytes(&packed->inode.k) - out);
+
+       SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+               struct bch_inode_unpacked unpacked;
+
+               int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
+                                          &unpacked);
+               BUG_ON(ret);
+               BUG_ON(unpacked.bi_inum         != inode->bi_inum);
+               BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
+               BUG_ON(unpacked.bi_mode         != inode->bi_mode);
+
+#define BCH_INODE_FIELD(_name, _bits)  BUG_ON(unpacked._name != inode->_name);
+               BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+       }
+}
+
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+                     struct bch_inode_unpacked *unpacked)
+{
+       const u8 *in = inode.v->fields;
+       const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+       u64 field[2];
+       unsigned fieldnr = 0, field_bits;
+       int ret;
+
+       unpacked->bi_inum       = inode.k->p.inode;
+       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+       unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
+       unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+
+#define BCH_INODE_FIELD(_name, _bits)                                  \
+       if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {                    \
+               unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+               memset((void *) unpacked + offset, 0,                   \
+                      sizeof(*unpacked) - offset);                     \
+               return 0;                                               \
+       }                                                               \
+                                                                       \
+       ret = inode_decode_field(in, end, field, &field_bits);          \
+       if (ret < 0)                                                    \
+               return ret;                                             \
+                                                                       \
+       if (field_bits > sizeof(unpacked->_name) * 8)                   \
+               return -1;                                              \
+                                                                       \
+       unpacked->_name = field[1];                                     \
+       in += ret;
+
+       BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+
+       /* XXX: signal if there were more fields than expected? */
+
+       return 0;
+}
+
+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (k.k->p.offset)
+               return "nonzero offset";
+
+       switch (k.k->type) {
+       case BCH_INODE_FS: {
+               struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+               struct bch_inode_unpacked unpacked;
+
+               if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
+                       return "incorrect value size";
+
+               if (k.k->p.inode < BLOCKDEV_INODE_MAX)
+                       return "fs inode in blockdev range";
+
+               if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+                       return "invalid str hash type";
+
+               if (bch2_inode_unpack(inode, &unpacked))
+                       return "invalid variable length fields";
+
+               if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+                       return "invalid data checksum type";
+
+               if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+                       return "invalid compression type";
+
+               if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+                   unpacked.bi_nlink != 0)
+                       return "flagged as unlinked but bi_nlink != 0";
+
+               return NULL;
+       }
+       case BCH_INODE_BLOCKDEV:
+               if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
+                       return "incorrect value size";
+
+               if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+                       return "blockdev inode in fs range";
+
+               return NULL;
+       case BCH_INODE_GENERATION:
+               if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
+                       return "incorrect value size";
+
+               return NULL;
+       default:
+               return "invalid type";
+       }
+}
+
+void bch2_inode_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
+{
+       char *out = buf, *end = out + size;
+       struct bkey_s_c_inode inode;
+       struct bch_inode_unpacked unpacked;
+
+       switch (k.k->type) {
+       case BCH_INODE_FS:
+               inode = bkey_s_c_to_inode(k);
+               if (bch2_inode_unpack(inode, &unpacked)) {
+                       out += scnprintf(out, end - out, "(unpack error)");
+                       break;
+               }
+
+#define BCH_INODE_FIELD(_name, _bits)                                          \
+               out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
+               BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+               break;
+       }
+}
+
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+                    uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+                    struct bch_inode_unpacked *parent)
+{
+       s64 now = bch2_current_time(c);
+
+       memset(inode_u, 0, sizeof(*inode_u));
+
+       /* ick */
+       inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET;
+       get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
+
+       inode_u->bi_mode        = mode;
+       inode_u->bi_uid         = uid;
+       inode_u->bi_gid         = gid;
+       inode_u->bi_dev         = rdev;
+       inode_u->bi_atime       = now;
+       inode_u->bi_mtime       = now;
+       inode_u->bi_ctime       = now;
+       inode_u->bi_otime       = now;
+
+       if (parent) {
+#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
+               BCH_INODE_FIELDS_INHERIT()
+#undef BCH_INODE_FIELD
+       }
+}
+
+static inline u32 bkey_generation(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_INODE_BLOCKDEV:
+       case BCH_INODE_FS:
+               BUG();
+       case BCH_INODE_GENERATION:
+               return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+       default:
+               return 0;
+       }
+}
+
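+/*
+ * Allocate a new inode number as part of @trans: scan the inodes btree for an
+ * empty slot starting from *hint, wrapping around to @min and rescanning once
+ * before giving up with -ENOSPC; on success *hint is updated so the next
+ * allocation starts its search nearby:
+ */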
+int __bch2_inode_create(struct btree_trans *trans,
+                       struct bch_inode_unpacked *inode_u,
+                       u64 min, u64 max, u64 *hint)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_inode_buf *inode_p;
+       struct btree_iter *iter;
+       u64 start;
+       int ret;
+
+       if (!max)
+               max = ULLONG_MAX;
+
+       if (c->opts.inodes_32bit)
+               max = min_t(u64, max, U32_MAX);
+
+       start = READ_ONCE(*hint);
+
+       if (start >= max || start < min)
+               start = min;
+
+       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+       if (IS_ERR(inode_p))
+               return PTR_ERR(inode_p);
+
+       iter = bch2_trans_get_iter(trans,
+                       BTREE_ID_INODES, POS(start, 0),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+again:
+       while (1) {
+               struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+
+               ret = btree_iter_err(k);
+               if (ret)
+                       return ret;
+
+               switch (k.k->type) {
+               case BCH_INODE_BLOCKDEV:
+               case BCH_INODE_FS:
+                       /* slot used */
+                       if (iter->pos.inode >= max)
+                               goto out;
+
+                       bch2_btree_iter_next_slot(iter);
+                       break;
+
+               default:
+                       *hint                   = k.k->p.inode;
+                       inode_u->bi_inum        = k.k->p.inode;
+                       inode_u->bi_generation  = bkey_generation(k);
+
+                       bch2_inode_pack(inode_p, inode_u);
+                       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+                       return 0;
+               }
+       }
+out:
+       if (start != min) {
+               /* Retry from start */
+               start = min;
+               bch2_btree_iter_set_pos(iter, POS(start, 0));
+               goto again;
+       }
+
+       return -ENOSPC;
+}
+
+int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+                     u64 min, u64 max, u64 *hint)
+{
+       return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+                       __bch2_inode_create(&trans, inode_u, min, max, hint));
+}
+
+int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
+                       struct extent_insert_hook *hook, u64 *journal_seq)
+{
+       return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+                                      POS(inode_nr, new_size),
+                                      POS(inode_nr + 1, 0),
+                                      ZERO_VERSION, NULL, hook,
+                                      journal_seq);
+}
+
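+/*
+ * Delete an inode: first delete its extents, xattrs and dirents (only
+ * whiteouts from hash collisions should be left in the dirents btree), then
+ * replace the inode key itself with either a deleted key or, if the inode had
+ * a generation, a BCH_INODE_GENERATION key recording the generation number
+ * for the next inode to use this number:
+ */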
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+{
+       struct btree_iter iter;
+       struct bkey_i_inode_generation delete;
+       int ret;
+
+       ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+                                    POS(inode_nr, 0),
+                                    POS(inode_nr + 1, 0),
+                                    ZERO_VERSION, NULL, NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * If this was a directory, there shouldn't be any real dirents left -
+        * but there could be whiteouts (from hash collisions) that we should
+        * delete:
+        *
+        * XXX: the dirent code could ideally delete whiteouts when they're no
+        * longer needed
+        */
+       ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+                                    POS(inode_nr, 0),
+                                    POS(inode_nr + 1, 0),
+                                    ZERO_VERSION, NULL, NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       do {
+               struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+               u32 bi_generation = 0;
+
+               ret = btree_iter_err(k);
+               if (ret) {
+                       bch2_btree_iter_unlock(&iter);
+                       return ret;
+               }
+
+               bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c,
+                                       "inode %llu not found when deleting",
+                                       inode_nr);
+
+               switch (k.k->type) {
+               case BCH_INODE_FS: {
+                       struct bch_inode_unpacked inode_u;
+
+                       if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
+                               bi_generation = inode_u.bi_generation + 1;
+                       break;
+               }
+               case BCH_INODE_GENERATION: {
+                       struct bkey_s_c_inode_generation g =
+                               bkey_s_c_to_inode_generation(k);
+                       bi_generation = le32_to_cpu(g.v->bi_generation);
+                       break;
+               }
+               }
+
+               if (!bi_generation) {
+                       bkey_init(&delete.k);
+                       delete.k.p.inode = inode_nr;
+               } else {
+                       bkey_inode_generation_init(&delete.k_i);
+                       delete.k.p.inode = inode_nr;
+                       delete.v.bi_generation = cpu_to_le32(bi_generation);
+               }
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+                               BTREE_INSERT_ATOMIC|
+                               BTREE_INSERT_NOFAIL,
+                               BTREE_INSERT_ENTRY(&iter, &delete.k_i));
+       } while (ret == -EINTR);
+
+       bch2_btree_iter_unlock(&iter);
+       return ret;
+}
+
+int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+                           struct bch_inode_unpacked *inode)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = -ENOENT;
+
+       for_each_btree_key(&iter, c, BTREE_ID_INODES,
+                          POS(inode_nr, 0),
+                          BTREE_ITER_SLOTS, k) {
+               switch (k.k->type) {
+               case BCH_INODE_FS:
+                       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+                       break;
+               default:
+                       /* hole, not found */
+                       break;
+               }
+
+               break;
+       }
+
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void)
+{
+       struct bch_inode_unpacked *u, test_inodes[] = {
+               {
+                       .bi_atime       = U64_MAX,
+                       .bi_ctime       = U64_MAX,
+                       .bi_mtime       = U64_MAX,
+                       .bi_otime       = U64_MAX,
+                       .bi_size        = U64_MAX,
+                       .bi_sectors     = U64_MAX,
+                       .bi_uid         = U32_MAX,
+                       .bi_gid         = U32_MAX,
+                       .bi_nlink       = U32_MAX,
+                       .bi_generation  = U32_MAX,
+                       .bi_dev         = U32_MAX,
+               },
+       };
+
+       for (u = test_inodes;
+            u < test_inodes + ARRAY_SIZE(test_inodes);
+            u++) {
+               struct bkey_inode_buf p;
+
+               bch2_inode_pack(&p, u);
+       }
+}
+#endif
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
new file mode 100644 (file)
index 0000000..bd6166c
--- /dev/null
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_H
+#define _BCACHEFS_INODE_H
+
+#include "opts.h"
+
+#include <linux/math64.h>
+
+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_inode_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_inode_invalid,           \
+       .val_to_text    = bch2_inode_to_text,           \
+}
+
+struct bch_inode_unpacked {
+       u64                     bi_inum;
+       __le64                  bi_hash_seed;
+       u32                     bi_flags;
+       u16                     bi_mode;
+
+#define BCH_INODE_FIELD(_name, _bits)  u##_bits _name;
+       BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+};
+
+struct bkey_inode_buf {
+       struct bkey_i_inode     inode;
+
+#define BCH_INODE_FIELD(_name, _bits)          + 8 + _bits / 8
+       u8              _pad[0 + BCH_INODE_FIELDS()];
+#undef  BCH_INODE_FIELD
+} __attribute__((packed, aligned(8)));
+
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+
+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
+                    uid_t, gid_t, umode_t, dev_t,
+                    struct bch_inode_unpacked *);
+
+int __bch2_inode_create(struct btree_trans *,
+                       struct bch_inode_unpacked *,
+                       u64, u64, u64 *);
+int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
+                     u64, u64, u64 *);
+
+int bch2_inode_truncate(struct bch_fs *, u64, u64,
+                      struct extent_insert_hook *, u64 *);
+int bch2_inode_rm(struct bch_fs *, u64);
+
+int bch2_inode_find_by_inum(struct bch_fs *, u64,
+                          struct bch_inode_unpacked *);
+
+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
+{
+       struct bch_io_opts ret = { 0 };
+
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (inode->bi_##_name)                                          \
+               opt_set(ret, _name, inode->bi_##_name - 1);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       return ret;
+}
+
+static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+                                       enum bch_opt_id id, u64 v)
+{
+       switch (id) {
+#define BCH_INODE_OPT(_name, ...)                                      \
+       case Opt_##_name:                                               \
+               inode->bi_##_name = v;                                  \
+               break;
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       default:
+               BUG();
+       }
+}
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+                                     enum bch_opt_id id, u64 v)
+{
+       return __bch2_inode_opt_set(inode, id, v + 1);
+}
+
+static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
+                                       enum bch_opt_id id)
+{
+       return __bch2_inode_opt_set(inode, id, 0);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void);
+#else
+static inline void bch2_inode_pack_test(void) {}
+#endif
+
+#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
new file mode 100644 (file)
index 0000000..d1935ef
--- /dev/null
@@ -0,0 +1,1875 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "compress.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "rebalance.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/blkdev.h>
+#include <linux/random.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
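+/*
+ * Decide whether to avoid @target because its devices are congested: sum the
+ * decaying per device congestion counters for every device in the target and
+ * return true with probability proportional to that total:
+ */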
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       const struct bch_devs_mask *devs;
+       unsigned d, nr = 0, total = 0;
+       u64 now = local_clock(), last;
+       s64 congested;
+       struct bch_dev *ca;
+
+       if (!target)
+               return false;
+
+       rcu_read_lock();
+       devs = bch2_target_to_mask(c, target);
+       for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+               ca = rcu_dereference(c->devs[d]);
+               if (!ca)
+                       continue;
+
+               congested = atomic_read(&ca->congested);
+               last = READ_ONCE(ca->congested_last);
+               if (time_after64(now, last))
+                       congested -= (now - last) >> 12;
+
+               total += max(congested, 0LL);
+               nr++;
+       }
+       rcu_read_unlock();
+
+       return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+                                      u64 now, int rw)
+{
+       u64 latency_capable =
+               ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+       /* ideally we'd be taking into account the device's variance here: */
+       u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+       s64 latency_over = io_latency - latency_threshold;
+
+       if (latency_threshold && latency_over > 0) {
+               /*
+                * bump up congested by approximately latency_over * 4 /
+                * latency_threshold - we don't need much accuracy here so don't
+                * bother with the divide:
+                */
+               if (atomic_read(&ca->congested) < CONGESTED_MAX)
+                       atomic_add(latency_over >>
+                                  max_t(int, ilog2(latency_threshold) - 2, 0),
+                                  &ca->congested);
+
+               ca->congested_last = now;
+       } else if (atomic_read(&ca->congested) > 0) {
+               atomic_dec(&ca->congested);
+       }
+}
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
+{
+       atomic64_t *latency = &ca->cur_latency[rw];
+       u64 now = local_clock();
+       u64 io_latency = time_after64(now, submit_time)
+               ? now - submit_time
+               : 0;
+       u64 old, new, v = atomic64_read(latency);
+
+       do {
+               old = v;
+
+               /*
+                * If the io latency was reasonably close to the current
+                * latency, skip doing the update and atomic operation - most of
+                * the time:
+                */
+               if (abs((int) (old - io_latency)) < (old >> 1) &&
+                   now & ~(~0 << 5))
+                       break;
+
+               new = ewma_add(old, io_latency, 5);
+       } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+       bch2_congested_acct(ca, io_latency, now, rw);
+
+       __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+}
+
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       return false;
+}
+
+#endif
+
+/* Allocate, free from mempool: */
+
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
+{
+       struct bvec_iter_all iter;
+       struct bio_vec *bv;
+
+       bio_for_each_segment_all(bv, bio, iter)
+               if (bv->bv_page != ZERO_PAGE(0))
+                       mempool_free(bv->bv_page, &c->bio_bounce_pages);
+       bio->bi_vcnt = 0;
+}
+
+static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
+                                   bool *using_mempool)
+{
+       struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
+
+       if (likely(!*using_mempool)) {
+               bv->bv_page = alloc_page(GFP_NOIO);
+               if (unlikely(!bv->bv_page)) {
+                       mutex_lock(&c->bio_bounce_pages_lock);
+                       *using_mempool = true;
+                       goto pool_alloc;
+               }
+       } else {
+pool_alloc:
+               bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+       }
+
+       bv->bv_len = PAGE_SIZE;
+       bv->bv_offset = 0;
+}
+
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
+                              size_t bytes)
+{
+       bool using_mempool = false;
+
+       BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
+
+       bio->bi_iter.bi_size = bytes;
+
+       while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
+               bch2_bio_alloc_page_pool(c, bio, &using_mempool);
+
+       if (using_mempool)
+               mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
+                                   size_t bytes)
+{
+       while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
+               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+               BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
+
+               bv->bv_page = alloc_page(GFP_NOIO);
+               if (!bv->bv_page) {
+                       /*
+                        * We already allocated from the mempool; we can't
+                        * allocate from it again without freeing the pages we
+                        * already allocated, or else we could deadlock:
+                        */
+                       bch2_bio_free_pages_pool(c, bio);
+                       bch2_bio_alloc_pages_pool(c, bio, bytes);
+                       return;
+               }
+
+               bv->bv_len = PAGE_SIZE;
+               bv->bv_offset = 0;
+               bio->bi_vcnt++;
+       }
+
+       bio->bi_iter.bi_size = bytes;
+}
+
+/* Writes */
+
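+/*
+ * Submit a write to every device the key points to: the last pointer reuses
+ * @wbio itself, while earlier pointers get clones allocated from that
+ * device's replica_set bioset:
+ */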
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+                              enum bch_data_type type,
+                              const struct bkey_i *k)
+{
+       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+       const struct bch_extent_ptr *ptr;
+       struct bch_write_bio *n;
+       struct bch_dev *ca;
+
+       BUG_ON(c->opts.nochanges);
+
+       extent_for_each_ptr(e, ptr) {
+               BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+                      !c->devs[ptr->dev]);
+
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (ptr + 1 < &extent_entry_last(e)->ptr) {
+                       n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+                                               GFP_NOIO, &ca->replica_set));
+
+                       n->bio.bi_end_io        = wbio->bio.bi_end_io;
+                       n->bio.bi_private       = wbio->bio.bi_private;
+                       n->parent               = wbio;
+                       n->split                = true;
+                       n->bounce               = false;
+                       n->put_bio              = true;
+                       n->bio.bi_opf           = wbio->bio.bi_opf;
+                       bio_inc_remaining(&wbio->bio);
+               } else {
+                       n = wbio;
+                       n->split                = false;
+               }
+
+               n->c                    = c;
+               n->dev                  = ptr->dev;
+               n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
+               n->submit_time          = local_clock();
+               n->bio.bi_iter.bi_sector = ptr->offset;
+
+               if (!journal_flushes_device(ca))
+                       n->bio.bi_opf |= REQ_FUA;
+
+               if (likely(n->have_ioref)) {
+                       this_cpu_add(ca->io_done->sectors[WRITE][type],
+                                    bio_sectors(&n->bio));
+
+                       bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+                       if (type != BCH_DATA_BTREE && unlikely(c->opts.no_data_io)) {
+                               bio_endio(&n->bio);
+                               continue;
+                       }
+
+                       submit_bio(&n->bio);
+               } else {
+                       n->bio.bi_status        = BLK_STS_REMOVED;
+                       bio_endio(&n->bio);
+               }
+       }
+}
+
+static void __bch2_write(struct closure *);
+
+static void bch2_write_done(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct bch_fs *c = op->c;
+
+       if (!op->error && (op->flags & BCH_WRITE_FLUSH))
+               op->error = bch2_journal_error(&c->journal);
+
+       if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+               bch2_disk_reservation_put(c, &op->res);
+       percpu_ref_put(&c->writes);
+       bch2_keylist_free(&op->insert_keys, op->inline_keys);
+
+       bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+
+       closure_return(cl);
+}
+
+int bch2_write_index_default(struct bch_write_op *op)
+{
+       struct keylist *keys = &op->insert_keys;
+       struct btree_iter iter;
+       int ret;
+
+       bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_INTENT);
+
+       ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
+                                       NULL, op_journal_seq(op),
+                                       BTREE_INSERT_NOFAIL|
+                                       BTREE_INSERT_USE_RESERVE);
+       bch2_btree_iter_unlock(&iter);
+
+       return ret;
+}
+
+/**
+ * __bch2_write_index - after a write, update the index to point to the new data
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct keylist *keys = &op->insert_keys;
+       struct bkey_s_extent e;
+       struct bch_extent_ptr *ptr;
+       struct bkey_i *src, *dst = keys->keys, *n, *k;
+       int ret;
+
+       for (src = keys->keys; src != keys->top; src = n) {
+               n = bkey_next(src);
+               bkey_copy(dst, src);
+
+               e = bkey_i_to_s_extent(dst);
+               extent_for_each_ptr_backwards(e, ptr)
+                       if (test_bit(ptr->dev, op->failed.d))
+                               bch2_extent_drop_ptr(e, ptr);
+
+               if (!bch2_extent_nr_ptrs(e.c)) {
+                       ret = -EIO;
+                       goto err;
+               }
+
+               if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
+                       ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c);
+                       if (ret)
+                               goto err;
+               }
+
+               dst = bkey_next(dst);
+       }
+
+       keys->top = dst;
+
+       /*
+        * probably not the ideal place to hook this in, but I don't
+        * particularly want to plumb io_opts all the way through the btree
+        * update stack right now
+        */
+       for_each_keylist_key(keys, k)
+               bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
+
+       if (!bch2_keylist_empty(keys)) {
+               u64 sectors_start = keylist_sectors(keys);
+               int ret = op->index_update_fn(op);
+
+               BUG_ON(keylist_sectors(keys) && !ret);
+
+               op->written += sectors_start - keylist_sectors(keys);
+
+               if (ret) {
+                       __bcache_io_error(c, "btree IO error %i", ret);
+                       op->error = ret;
+               }
+       }
+out:
+       bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
+       return;
+err:
+       keys->top = keys->keys;
+       op->error = ret;
+       goto out;
+}
+
+static void bch2_write_index(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct bch_fs *c = op->c;
+
+       __bch2_write_index(op);
+
+       if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
+               bch2_journal_flush_seq_async(&c->journal,
+                                            *op_journal_seq(op),
+                                            cl);
+               continue_at(cl, bch2_write_done, index_update_wq(op));
+       } else {
+               continue_at_nobarrier(cl, bch2_write_done, NULL);
+       }
+}
+
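+/*
+ * Completion path for a single replica write: mark the device as failed on IO
+ * error, account latency, free bounce pages, and then either complete the
+ * parent bio (for splits) or drop our ref on the write op's closure.
+ */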
+static void bch2_write_endio(struct bio *bio)
+{
+       struct closure *cl              = bio->bi_private;
+       struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
+
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
+               set_bit(wbio->dev, op->failed.d);
+
+       if (wbio->have_ioref) {
+               bch2_latency_acct(ca, wbio->submit_time, WRITE);
+               percpu_ref_put(&ca->io_ref);
+       }
+
+       if (wbio->bounce)
+               bch2_bio_free_pages_pool(c, bio);
+
+       if (wbio->put_bio)
+               bio_put(bio);
+
+       if (parent)
+               bio_endio(&parent->bio);
+       else
+               closure_put(cl);
+}
+
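+/*
+ * Append a new extent key for the sectors we just allocated and wrote to
+ * op->insert_keys, advancing op->pos past the data covered by @crc.
+ */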
+static void init_append_extent(struct bch_write_op *op,
+                              struct write_point *wp,
+                              struct bversion version,
+                              struct bch_extent_crc_unpacked crc)
+{
+       struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
+
+       op->pos.offset += crc.uncompressed_size;
+       e->k.p = op->pos;
+       e->k.size = crc.uncompressed_size;
+       e->k.version = version;
+       bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
+
+       bch2_extent_crc_append(e, crc);
+       bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
+
+       bch2_keylist_push(&op->insert_keys);
+}
+
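+/*
+ * Allocate a bounce bio sized to what the write point has free (capped by the
+ * size of the source bio); pages come from alloc_page(), falling back to the
+ * bounce mempool (up to encoded_extent_max) if page allocation fails.
+ */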
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+                                       struct write_point *wp,
+                                       struct bio *src,
+                                       bool *page_alloc_failed)
+{
+       struct bch_write_bio *wbio;
+       struct bio *bio;
+       unsigned output_available =
+               min(wp->sectors_free << 9, src->bi_iter.bi_size);
+       unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
+
+       bio = bio_alloc_bioset(NULL, pages, 0,
+                              GFP_NOIO, &c->bio_write);
+       wbio                    = wbio_init(bio);
+       wbio->bounce            = true;
+       wbio->put_bio           = true;
+       /* copy WRITE_SYNC flag */
+       wbio->bio.bi_opf        = src->bi_opf;
+
+       /*
+        * We can't use mempool for more than c->sb.encoded_extent_max
+        * worth of pages, but we'd like to allocate more if we can:
+        */
+       while (bio->bi_iter.bi_size < output_available) {
+               unsigned len = min_t(unsigned, PAGE_SIZE,
+                                    output_available - bio->bi_iter.bi_size);
+               struct page *p;
+
+               p = alloc_page(GFP_NOIO);
+               if (!p) {
+                       unsigned pool_max =
+                               min_t(unsigned, output_available,
+                                     c->sb.encoded_extent_max << 9);
+
+                       if (bio_sectors(bio) < pool_max)
+                               bch2_bio_alloc_pages_pool(c, bio, pool_max);
+                       break;
+               }
+
+               bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
+                       .bv_page        = p,
+                       .bv_len         = len,
+                       .bv_offset      = 0,
+               };
+               bio->bi_iter.bi_size += len;
+       }
+
+       *page_alloc_failed = bio->bi_vcnt < pages;
+       return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+                                struct bch_write_op *op,
+                                unsigned new_csum_type)
+{
+       struct bio *bio = &op->wbio.bio;
+       struct bch_extent_crc_unpacked new_crc;
+       int ret;
+
+       /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+       if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+           bch2_csum_type_is_encryption(new_csum_type))
+               new_csum_type = op->crc.csum_type;
+
+       ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+                                 NULL, &new_crc,
+                                 op->crc.offset, op->crc.live_size,
+                                 new_csum_type);
+       if (ret)
+               return ret;
+
+       bio_advance(bio, op->crc.offset << 9);
+       bio->bi_iter.bi_size = op->crc.live_size << 9;
+       op->crc = new_crc;
+       return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct nonce nonce = extent_nonce(op->version, op->crc);
+       struct bch_csum csum;
+
+       if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+               return 0;
+
+       /*
+        * If we need to decrypt data in the write path, we'll no longer be able
+        * to verify the existing checksum (poly1305 mac, in this case) after
+        * it's decrypted - this is the last point we'll be able to reverify the
+        * checksum:
+        */
+       csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+       if (bch2_crc_cmp(op->crc.csum, csum))
+               return -EIO;
+
+       bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+       op->crc.csum_type = 0;
+       op->crc.csum = (struct bch_csum) { 0, 0 };
+       return 0;
+}
+
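+/*
+ * For writes of data that is already checksummed/compressed
+ * (BCH_WRITE_DATA_ENCODED): decide whether the extent can be written out as
+ * is, or whether it first has to be decompressed, rechecksummed and/or
+ * decrypted to match the write's target checksum/compression settings.
+ */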
+static enum prep_encoded_ret {
+       PREP_ENCODED_OK,
+       PREP_ENCODED_ERR,
+       PREP_ENCODED_CHECKSUM_ERR,
+       PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+       struct bch_fs *c = op->c;
+       struct bio *bio = &op->wbio.bio;
+
+       if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+               return PREP_ENCODED_OK;
+
+       BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
+
+       /* Can we just write the entire extent as is? */
+       if (op->crc.uncompressed_size == op->crc.live_size &&
+           op->crc.compressed_size <= wp->sectors_free &&
+           op->crc.compression_type == op->compression_type) {
+               if (!op->crc.compression_type &&
+                   op->csum_type != op->crc.csum_type &&
+                   bch2_write_rechecksum(c, op, op->csum_type))
+                       return PREP_ENCODED_CHECKSUM_ERR;
+
+               return PREP_ENCODED_DO_WRITE;
+       }
+
+       /*
+        * If the data is compressed and we couldn't write the entire extent as
+        * is, we have to decompress it:
+        */
+       if (op->crc.compression_type) {
+               struct bch_csum csum;
+
+               if (bch2_write_decrypt(op))
+                       return PREP_ENCODED_CHECKSUM_ERR;
+
+               /* Last point we can still verify checksum: */
+               csum = bch2_checksum_bio(c, op->crc.csum_type,
+                                        extent_nonce(op->version, op->crc),
+                                        bio);
+               if (bch2_crc_cmp(op->crc.csum, csum))
+                       return PREP_ENCODED_CHECKSUM_ERR;
+
+               if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+                       return PREP_ENCODED_ERR;
+       }
+
+       /*
+        * No longer have compressed data after this point - data might be
+        * encrypted:
+        */
+
+       /*
+        * If the data is checksummed and we're only writing a subset,
+        * rechecksum and adjust bio to point to currently live data:
+        */
+       if ((op->crc.live_size != op->crc.uncompressed_size ||
+            op->crc.csum_type != op->csum_type) &&
+           bch2_write_rechecksum(c, op, op->csum_type))
+               return PREP_ENCODED_CHECKSUM_ERR;
+
+       /*
+        * If we want to compress the data, it has to be decrypted:
+        */
+       if ((op->compression_type ||
+            bch2_csum_type_is_encryption(op->crc.csum_type) !=
+            bch2_csum_type_is_encryption(op->csum_type)) &&
+           bch2_write_decrypt(op))
+               return PREP_ENCODED_CHECKSUM_ERR;
+
+       return PREP_ENCODED_OK;
+}
+
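+/*
+ * Write out as much of the source bio as the write point has room for,
+ * compressing/checksumming/encrypting each fragment into the (possibly
+ * bounced) destination bio and appending one extent key per fragment.
+ * Returns > 0 if there is more input left to write, 0 when it has all been
+ * consumed, or a negative error code.
+ */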
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
+{
+       struct bch_fs *c = op->c;
+       struct bio *src = &op->wbio.bio, *dst = src;
+       struct bvec_iter saved_iter;
+       struct bkey_i *key_to_write;
+       unsigned key_to_write_offset = op->insert_keys.top_p -
+               op->insert_keys.keys_p;
+       unsigned total_output = 0;
+       bool bounce = false, page_alloc_failed = false;
+       int ret, more = 0;
+
+       BUG_ON(!bio_sectors(src));
+
+       switch (bch2_write_prep_encoded_data(op, wp)) {
+       case PREP_ENCODED_OK:
+               break;
+       case PREP_ENCODED_ERR:
+               ret = -EIO;
+               goto err;
+       case PREP_ENCODED_CHECKSUM_ERR:
+               goto csum_err;
+       case PREP_ENCODED_DO_WRITE:
+               init_append_extent(op, wp, op->version, op->crc);
+               goto do_write;
+       }
+
+       if (op->compression_type ||
+           (op->csum_type &&
+            !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+           (bch2_csum_type_is_encryption(op->csum_type) &&
+            !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+               dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
+               bounce = true;
+       }
+
+       saved_iter = dst->bi_iter;
+
+       do {
+               struct bch_extent_crc_unpacked crc =
+                       (struct bch_extent_crc_unpacked) { 0 };
+               struct bversion version = op->version;
+               size_t dst_len, src_len;
+
+               if (page_alloc_failed &&
+                   bio_sectors(dst) < wp->sectors_free &&
+                   bio_sectors(dst) < c->sb.encoded_extent_max)
+                       break;
+
+               BUG_ON(op->compression_type &&
+                      (op->flags & BCH_WRITE_DATA_ENCODED) &&
+                      bch2_csum_type_is_encryption(op->crc.csum_type));
+               BUG_ON(op->compression_type && !bounce);
+
+               crc.compression_type = op->compression_type
+                       ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+                                            op->compression_type)
+                       : 0;
+               if (!crc.compression_type) {
+                       dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+                       dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+                       if (op->csum_type)
+                               dst_len = min_t(unsigned, dst_len,
+                                               c->sb.encoded_extent_max << 9);
+
+                       if (bounce) {
+                               swap(dst->bi_iter.bi_size, dst_len);
+                               bio_copy_data(dst, src);
+                               swap(dst->bi_iter.bi_size, dst_len);
+                       }
+
+                       src_len = dst_len;
+               }
+
+               BUG_ON(!src_len || !dst_len);
+
+               if (bch2_csum_type_is_encryption(op->csum_type)) {
+                       if (bversion_zero(version)) {
+                               version.lo = atomic64_inc_return(&c->key_version) + 1;
+                       } else {
+                               crc.nonce = op->nonce;
+                               op->nonce += src_len >> 9;
+                       }
+               }
+
+               if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+                   !crc.compression_type &&
+                   bch2_csum_type_is_encryption(op->crc.csum_type) ==
+                   bch2_csum_type_is_encryption(op->csum_type)) {
+                       /*
+                        * Note: when we're using rechecksum(), we need to be
+                        * checksumming @src because it has all the data our
+                        * existing checksum covers - if we bounced (because we
+                        * were trying to compress), @dst will only have the
+                        * part of the data the new checksum will cover.
+                        *
+                        * But normally we want to be checksumming post bounce,
+                        * because part of the reason for bouncing is so the
+                        * data can't be modified (by userspace) while it's in
+                        * flight.
+                        */
+                       if (bch2_rechecksum_bio(c, src, version, op->crc,
+                                       &crc, &op->crc,
+                                       src_len >> 9,
+                                       bio_sectors(src) - (src_len >> 9),
+                                       op->csum_type))
+                               goto csum_err;
+               } else {
+                       if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+                           bch2_rechecksum_bio(c, src, version, op->crc,
+                                       NULL, &op->crc,
+                                       src_len >> 9,
+                                       bio_sectors(src) - (src_len >> 9),
+                                       op->crc.csum_type))
+                               goto csum_err;
+
+                       crc.compressed_size     = dst_len >> 9;
+                       crc.uncompressed_size   = src_len >> 9;
+                       crc.live_size           = src_len >> 9;
+
+                       swap(dst->bi_iter.bi_size, dst_len);
+                       bch2_encrypt_bio(c, op->csum_type,
+                                        extent_nonce(version, crc), dst);
+                       crc.csum = bch2_checksum_bio(c, op->csum_type,
+                                        extent_nonce(version, crc), dst);
+                       crc.csum_type = op->csum_type;
+                       swap(dst->bi_iter.bi_size, dst_len);
+               }
+
+               init_append_extent(op, wp, version, crc);
+
+               if (dst != src)
+                       bio_advance(dst, dst_len);
+               bio_advance(src, src_len);
+               total_output += dst_len;
+       } while (dst->bi_iter.bi_size &&
+                src->bi_iter.bi_size &&
+                wp->sectors_free &&
+                !bch2_keylist_realloc(&op->insert_keys,
+                                     op->inline_keys,
+                                     ARRAY_SIZE(op->inline_keys),
+                                     BKEY_EXTENT_U64s_MAX));
+
+       more = src->bi_iter.bi_size != 0;
+
+       dst->bi_iter = saved_iter;
+
+       if (!bounce && more) {
+               dst = bio_split(src, total_output >> 9,
+                               GFP_NOIO, &c->bio_write);
+               wbio_init(dst)->put_bio = true;
+       }
+
+       dst->bi_iter.bi_size = total_output;
+
+       /* Free unneeded pages after compressing: */
+       if (bounce)
+               while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
+                       mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
+                                    &c->bio_bounce_pages);
+do_write:
+       /* might have done a realloc... */
+
+       key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+       dst->bi_end_io  = bch2_write_endio;
+       dst->bi_private = &op->cl;
+       dst->bi_opf     = REQ_OP_WRITE;
+
+       closure_get(dst->bi_private);
+
+       bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
+                                 key_to_write);
+       return more;
+csum_err:
+       bch_err(c, "error verifying existing checksum while "
+               "rewriting existing data (memory corruption?)");
+       ret = -EIO;
+err:
+       if (bounce) {
+               bch2_bio_free_pages_pool(c, dst);
+               bio_put(dst);
+       }
+
+       return ret;
+}
+
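+/*
+ * Main write loop: repeatedly allocate sectors from a write point and write
+ * out the next chunk of data until the source bio is exhausted, then punt to
+ * bch2_write_index() to update the extents btree. If we run out of open
+ * buckets, keylist space or free sectors, flush the keys we have so far and
+ * retry.
+ */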
+static void __bch2_write(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct bch_fs *c = op->c;
+       struct write_point *wp;
+       int ret;
+again:
+       do {
+               /* +1 for possible cache device: */
+               if (op->open_buckets_nr + op->nr_replicas + 1 >
+                   ARRAY_SIZE(op->open_buckets))
+                       goto flush_io;
+
+               if (bch2_keylist_realloc(&op->insert_keys,
+                                       op->inline_keys,
+                                       ARRAY_SIZE(op->inline_keys),
+                                       BKEY_EXTENT_U64s_MAX))
+                       goto flush_io;
+
+               wp = bch2_alloc_sectors_start(c,
+                       op->target,
+                       op->write_point,
+                       &op->devs_have,
+                       op->nr_replicas,
+                       op->nr_replicas_required,
+                       op->alloc_reserve,
+                       op->flags,
+                       (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+               EBUG_ON(!wp);
+
+               if (unlikely(IS_ERR(wp))) {
+                       if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+                               ret = PTR_ERR(wp);
+                               goto err;
+                       }
+
+                       goto flush_io;
+               }
+
+               ret = bch2_write_extent(op, wp);
+
+               BUG_ON(op->open_buckets_nr + wp->nr_ptrs - wp->first_ptr >
+                      ARRAY_SIZE(op->open_buckets));
+               bch2_open_bucket_get(c, wp,
+                                    &op->open_buckets_nr,
+                                    op->open_buckets);
+               bch2_alloc_sectors_done(c, wp);
+
+               if (ret < 0)
+                       goto err;
+       } while (ret);
+
+       continue_at(cl, bch2_write_index, index_update_wq(op));
+       return;
+err:
+       op->error = ret;
+
+       continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
+                   ? bch2_write_index
+                   : bch2_write_done, index_update_wq(op));
+       return;
+flush_io:
+       closure_sync(cl);
+
+       if (!bch2_keylist_empty(&op->insert_keys)) {
+               __bch2_write_index(op);
+
+               if (op->error) {
+                       continue_at_nobarrier(cl, bch2_write_done, NULL);
+                       return;
+               }
+       }
+
+       goto again;
+}
+
+/**
+ * bch2_write - handle a write to a cache device or flash-only volume
+ *
+ * This is the starting point for any data to end up in a cache device; it
+ * could be from a normal write, or a writeback write, or a write to a
+ * flash-only volume - it's also used by the moving garbage collector to
+ * compact data in mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be
+ * inserted (if the data won't fit in a single open bucket, there will be
+ * multiple keys); after the data is written the keys are inserted into the
+ * extents btree, which also journals them.
+ *
+ * If BCH_WRITE_FLUSH is set, the write doesn't complete until the relevant
+ * journal entries have been flushed to disk.
+ */
+void bch2_write(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct bch_fs *c = op->c;
+
+       BUG_ON(!op->nr_replicas);
+       BUG_ON(!op->write_point.v);
+       BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+       BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
+
+       op->start_time = local_clock();
+
+       memset(&op->failed, 0, sizeof(op->failed));
+
+       bch2_keylist_init(&op->insert_keys, op->inline_keys);
+       wbio_init(&op->wbio.bio)->put_bio = false;
+
+       if (c->opts.nochanges ||
+           !percpu_ref_tryget(&c->writes)) {
+               __bcache_io_error(c, "read only");
+               op->error = -EROFS;
+               if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+                       bch2_disk_reservation_put(c, &op->res);
+               closure_return(cl);
+               return;
+       }
+
+       bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
+
+       continue_at_nobarrier(cl, __bch2_write, NULL);
+}
+
+/* Cache promotion on read */
+
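+/*
+ * An in-flight promotion (copying data we just read to the promote target),
+ * tracked in c->promote_table and keyed by position so we don't issue
+ * duplicate promotes for the same extent.
+ */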
+struct promote_op {
+       struct closure          cl;
+       u64                     start_time;
+
+       struct rhash_head       hash;
+       struct bpos             pos;
+
+       struct migrate_write    write;
+       struct bio_vec          bi_inline_vecs[0]; /* must be last */
+};
+
+static const struct rhashtable_params bch_promote_params = {
+       .head_offset    = offsetof(struct promote_op, hash),
+       .key_offset     = offsetof(struct promote_op, pos),
+       .key_len        = sizeof(struct bpos),
+};
+
+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
+                                 struct bpos pos,
+                                 struct bch_io_opts opts,
+                                 unsigned flags)
+{
+       if (!opts.promote_target)
+               return false;
+
+       if (!(flags & BCH_READ_MAY_PROMOTE))
+               return false;
+
+       if (percpu_ref_is_dying(&c->writes))
+               return false;
+
+       if (!bkey_extent_is_data(k.k))
+               return false;
+
+       if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
+               return false;
+
+       if (bch2_target_congested(c, opts.promote_target))
+               return false;
+
+       if (rhashtable_lookup_fast(&c->promote_table, &pos,
+                                  bch_promote_params))
+               return false;
+
+       return true;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+       int ret;
+
+       ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+                                    bch_promote_params);
+       BUG_ON(ret);
+       percpu_ref_put(&c->writes);
+       kfree(op);
+}
+
+static void promote_done(struct closure *cl)
+{
+       struct promote_op *op =
+               container_of(cl, struct promote_op, cl);
+       struct bch_fs *c = op->write.op.c;
+
+       bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+                              op->start_time);
+
+       bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
+       promote_free(c, op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
+       struct closure *cl = &op->cl;
+       struct bio *bio = &op->write.op.wbio.bio;
+
+       trace_promote(&rbio->bio);
+
+       /* we now own pages: */
+       BUG_ON(!rbio->bounce);
+       BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+       memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+              sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+       swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+
+       bch2_migrate_read_done(&op->write, rbio);
+
+       closure_init(cl, NULL);
+       closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+       closure_return_with_destructor(cl, promote_done);
+}
+
+noinline
+static struct promote_op *__promote_alloc(struct bch_fs *c,
+                                         struct bpos pos,
+                                         struct extent_pick_ptr *pick,
+                                         struct bch_io_opts opts,
+                                         unsigned rbio_sectors,
+                                         struct bch_read_bio **rbio)
+{
+       struct promote_op *op = NULL;
+       struct bio *bio;
+       unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
+       /* data might have to be decompressed in the write path: */
+       unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
+                                          PAGE_SECTORS);
+       int ret;
+
+       if (!percpu_ref_tryget(&c->writes))
+               return NULL;
+
+       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
+                    GFP_NOIO);
+       if (!op)
+               goto err;
+
+       op->start_time = local_clock();
+       op->pos = pos;
+
+       /*
+        * promotes require bouncing, but if the extent isn't
+        * checksummed/compressed it might be too big for the mempool:
+        */
+       if (rbio_sectors > c->sb.encoded_extent_max) {
+               *rbio = kzalloc(sizeof(struct bch_read_bio) +
+                               sizeof(struct bio_vec) * rbio_pages,
+                               GFP_NOIO);
+               if (!*rbio)
+                       goto err;
+
+               rbio_init(&(*rbio)->bio, opts);
+               bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, rbio_pages, 0);
+
+               if (bch2_bio_alloc_pages(&(*rbio)->bio, rbio_sectors << 9,
+                                        GFP_NOIO))
+                       goto err;
+
+               (*rbio)->bounce         = true;
+               (*rbio)->split          = true;
+               (*rbio)->kmalloc        = true;
+       }
+
+       if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+                                         bch_promote_params))
+               goto err;
+
+       bio = &op->write.op.wbio.bio;
+       bio_init(bio, NULL, bio->bi_inline_vecs, wbio_pages, 0);
+
+       ret = bch2_migrate_write_init(c, &op->write,
+                       writepoint_hashed((unsigned long) current),
+                       opts,
+                       DATA_PROMOTE,
+                       (struct data_opts) {
+                               .target = opts.promote_target
+                       },
+                       bkey_s_c_null);
+       BUG_ON(ret);
+
+       return op;
+err:
+       if (*rbio)
+               bio_free_pages(&(*rbio)->bio);
+       kfree(*rbio);
+       *rbio = NULL;
+       kfree(op);
+       percpu_ref_put(&c->writes);
+       return NULL;
+}
+
+static inline struct promote_op *promote_alloc(struct bch_fs *c,
+                                              struct bvec_iter iter,
+                                              struct bkey_s_c k,
+                                              struct extent_pick_ptr *pick,
+                                              struct bch_io_opts opts,
+                                              unsigned flags,
+                                              struct bch_read_bio **rbio,
+                                              bool *bounce,
+                                              bool *read_full)
+{
+       bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+       unsigned sectors = promote_full
+               ? pick->crc.compressed_size
+               : bvec_iter_sectors(iter);
+       struct bpos pos = promote_full
+               ? bkey_start_pos(k.k)
+               : POS(k.k->p.inode, iter.bi_sector);
+       struct promote_op *promote;
+
+       if (!should_promote(c, k, pos, opts, flags))
+               return NULL;
+
+       promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
+       if (!promote)
+               return NULL;
+
+       *bounce         = true;
+       *read_full      = promote_full;
+       return promote;
+}
+
+/* Read */
+
+#define READ_RETRY_AVOID       1
+#define READ_RETRY             2
+#define READ_ERR               3
+
+enum rbio_context {
+       RBIO_CONTEXT_NULL,
+       RBIO_CONTEXT_HIGHPRI,
+       RBIO_CONTEXT_UNBOUND,
+};
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+       return rbio->split ? rbio->parent : rbio;
+}
+
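+/*
+ * Run @fn directly if the context we're currently in is at least as
+ * unrestricted as the required @context; otherwise punt it to @wq.
+ */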
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+                          enum rbio_context context,
+                          struct workqueue_struct *wq)
+{
+       if (context <= rbio->context) {
+               fn(&rbio->work);
+       } else {
+               rbio->work.func         = fn;
+               rbio->context           = context;
+               queue_work(wq, &rbio->work);
+       }
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+       BUG_ON(rbio->bounce && !rbio->split);
+
+       if (rbio->promote)
+               promote_free(rbio->c, rbio->promote);
+       rbio->promote = NULL;
+
+       if (rbio->bounce)
+               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+       if (rbio->split) {
+               struct bch_read_bio *parent = rbio->parent;
+
+               if (rbio->kmalloc)
+                       kfree(rbio);
+               else
+                       bio_put(&rbio->bio);
+
+               rbio = parent;
+       }
+
+       return rbio;
+}
+
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+       bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+                              rbio->start_time);
+       bio_endio(&rbio->bio);
+}
+
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+                                    struct bvec_iter bvec_iter, u64 inode,
+                                    struct bch_devs_mask *avoid, unsigned flags)
+{
+       struct btree_iter iter;
+       BKEY_PADDED(k) tmp;
+       struct bkey_s_c k;
+       int ret;
+
+       flags &= ~BCH_READ_LAST_FRAGMENT;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+                            rbio->pos, BTREE_ITER_SLOTS);
+retry:
+       rbio->bio.bi_status = 0;
+
+       k = bch2_btree_iter_peek_slot(&iter);
+       if (btree_iter_err(k)) {
+               bch2_btree_iter_unlock(&iter);
+               goto err;
+       }
+
+       bkey_reassemble(&tmp.k, k);
+       k = bkey_i_to_s_c(&tmp.k);
+       bch2_btree_iter_unlock(&iter);
+
+       if (!bkey_extent_is_data(k.k) ||
+           !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
+                                    rbio->pick.ptr,
+                                    rbio->pos.offset -
+                                    rbio->pick.crc.offset)) {
+               /* extent we wanted to read no longer exists: */
+               rbio->hole = true;
+               goto out;
+       }
+
+       ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+       if (ret == READ_RETRY)
+               goto retry;
+       if (ret)
+               goto err;
+       goto out;
+err:
+       rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+       bch2_rbio_done(rbio);
+}
+
+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
+                           struct bvec_iter bvec_iter, u64 inode,
+                           struct bch_devs_mask *avoid, unsigned flags)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       flags &= ~BCH_READ_LAST_FRAGMENT;
+       flags |= BCH_READ_MUST_CLONE;
+retry:
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(inode, bvec_iter.bi_sector),
+                          BTREE_ITER_SLOTS, k) {
+               BKEY_PADDED(k) tmp;
+               unsigned bytes;
+
+               bkey_reassemble(&tmp.k, k);
+               k = bkey_i_to_s_c(&tmp.k);
+               bch2_btree_iter_unlock(&iter);
+
+               bytes = min_t(unsigned, bvec_iter.bi_size,
+                             (k.k->p.offset - bvec_iter.bi_sector) << 9);
+               swap(bvec_iter.bi_size, bytes);
+
+               ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+               switch (ret) {
+               case READ_RETRY:
+                       goto retry;
+               case READ_ERR:
+                       goto err;
+               }
+
+               if (bytes == bvec_iter.bi_size)
+                       goto out;
+
+               swap(bvec_iter.bi_size, bytes);
+               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+       }
+
+       /*
+        * If we get here, it better have been because there was an error
+        * reading a btree node
+        */
+       ret = bch2_btree_iter_unlock(&iter);
+       BUG_ON(!ret);
+       __bcache_io_error(c, "btree IO error %i", ret);
+err:
+       rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+       bch2_rbio_done(rbio);
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c        = rbio->c;
+       struct bvec_iter iter   = rbio->bvec_iter;
+       unsigned flags          = rbio->flags;
+       u64 inode               = rbio->pos.inode;
+       struct bch_devs_mask avoid;
+
+       trace_read_retry(&rbio->bio);
+
+       memset(&avoid, 0, sizeof(avoid));
+
+       if (rbio->retry == READ_RETRY_AVOID)
+               __set_bit(rbio->pick.ptr.dev, avoid.d);
+
+       rbio->bio.bi_status = 0;
+
+       rbio = bch2_rbio_free(rbio);
+
+       flags |= BCH_READ_IN_RETRY;
+       flags &= ~BCH_READ_MAY_PROMOTE;
+
+       if (flags & BCH_READ_NODECODE)
+               bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags);
+       else
+               bch2_read_retry(c, rbio, iter, inode, &avoid, flags);
+}
+
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+                           blk_status_t error)
+{
+       rbio->retry = retry;
+
+       if (rbio->flags & BCH_READ_IN_RETRY)
+               return;
+
+       if (retry == READ_ERR) {
+               rbio = bch2_rbio_free(rbio);
+
+               rbio->bio.bi_status = error;
+               bch2_rbio_done(rbio);
+       } else {
+               bch2_rbio_punt(rbio, bch2_rbio_retry,
+                              RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+       }
+}
+
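+/*
+ * We just read and checksum-verified this extent; if the checksum covered
+ * more data than is currently live (e.g. the extent was partially
+ * overwritten), try to update the extent in the btree with a checksum
+ * covering only the live range, so future reads don't have to read the
+ * dead portions.
+ */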
+static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_extent *e;
+       BKEY_PADDED(k) new;
+       struct bch_extent_crc_unpacked new_crc;
+       unsigned offset;
+       int ret;
+
+       if (rbio->pick.crc.compression_type)
+               return;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
+                            BTREE_ITER_INTENT);
+retry:
+       k = bch2_btree_iter_peek(&iter);
+       if (IS_ERR_OR_NULL(k.k))
+               goto out;
+
+       if (!bkey_extent_is_data(k.k))
+               goto out;
+
+       bkey_reassemble(&new.k, k);
+       e = bkey_i_to_extent(&new.k);
+
+       if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
+                                    rbio->pick.ptr,
+                                    rbio->pos.offset -
+                                    rbio->pick.crc.offset) ||
+           bversion_cmp(e->k.version, rbio->version))
+               goto out;
+
+       /* Extent was merged? */
+       if (bkey_start_offset(&e->k) < rbio->pos.offset ||
+           e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
+               goto out;
+
+       /* The extent might have been partially overwritten since we read it: */
+       offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
+
+       if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+                               rbio->pick.crc, NULL, &new_crc,
+                               offset, e->k.size,
+                               rbio->pick.crc.csum_type)) {
+               bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+               goto out;
+       }
+
+       if (!bch2_extent_narrow_crcs(e, new_crc))
+               goto out;
+
+       ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+                                  BTREE_INSERT_ATOMIC|
+                                  BTREE_INSERT_NOFAIL|
+                                  BTREE_INSERT_NOWAIT,
+                                  BTREE_INSERT_ENTRY(&iter, &e->k_i));
+       if (ret == -EINTR)
+               goto retry;
+out:
+       bch2_btree_iter_unlock(&iter);
+}
+
+static bool should_narrow_crcs(struct bkey_s_c k,
+                              struct extent_pick_ptr *pick,
+                              unsigned flags)
+{
+       return !(flags & BCH_READ_IN_RETRY) &&
+               bkey_extent_is_data(k.k) &&
+               bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
+}
+
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c        = rbio->c;
+       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+       struct bio *src         = &rbio->bio;
+       struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
+       struct bvec_iter dst_iter = rbio->bvec_iter;
+       struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+       struct nonce nonce = extent_nonce(rbio->version, crc);
+       struct bch_csum csum;
+
+       /* Reset iterator for checksumming and copying bounced data: */
+       if (rbio->bounce) {
+               src->bi_iter.bi_size            = crc.compressed_size << 9;
+               src->bi_iter.bi_idx             = 0;
+               src->bi_iter.bi_bvec_done       = 0;
+       } else {
+               src->bi_iter                    = rbio->bvec_iter;
+       }
+
+       csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+       if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+               goto csum_err;
+
+       if (unlikely(rbio->narrow_crcs))
+               bch2_rbio_narrow_crcs(rbio);
+
+       if (rbio->flags & BCH_READ_NODECODE)
+               goto nodecode;
+
+       /* Adjust crc to point to subset of data we want: */
+       crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
+       crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
+
+       if (crc.compression_type != BCH_COMPRESSION_NONE) {
+               bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
+                       goto decompression_err;
+       } else {
+               /* don't need to decrypt the entire bio: */
+               nonce = nonce_add(nonce, crc.offset << 9);
+               bio_advance(src, crc.offset << 9);
+
+               BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+               src->bi_iter.bi_size = dst_iter.bi_size;
+
+               bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+
+               if (rbio->bounce) {
+                       struct bvec_iter src_iter = src->bi_iter;
+                       bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+               }
+       }
+
+       if (rbio->promote) {
+               /*
+                * Re-encrypt the data we decrypted, so it's consistent with
+                * rbio->crc:
+                */
+               bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               promote_start(rbio->promote, rbio);
+               rbio->promote = NULL;
+       }
+nodecode:
+       if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+               rbio = bch2_rbio_free(rbio);
+               bch2_rbio_done(rbio);
+       }
+       return;
+csum_err:
+       /*
+        * Checksum error: if the bio wasn't bounced, we may have been
+        * reading into buffers owned by userspace (that userspace can
+        * scribble over) - retry the read, bouncing it this time:
+        */
+       if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+               rbio->flags |= BCH_READ_MUST_BOUNCE;
+               bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+               return;
+       }
+
+       bch2_dev_io_error(ca,
+               "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
+               rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+               rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+               csum.hi, csum.lo, crc.csum_type);
+       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+       return;
+decompression_err:
+       __bcache_io_error(c, "decompression error, inode %llu offset %llu",
+                         rbio->pos.inode,
+                         (u64) rbio->bvec_iter.bi_sector);
+       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+       return;
+}
+
+static void bch2_read_endio(struct bio *bio)
+{
+       struct bch_read_bio *rbio =
+               container_of(bio, struct bch_read_bio, bio);
+       struct bch_fs *c        = rbio->c;
+       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+       struct workqueue_struct *wq = NULL;
+       enum rbio_context context = RBIO_CONTEXT_NULL;
+
+       if (rbio->have_ioref) {
+               bch2_latency_acct(ca, rbio->submit_time, READ);
+               percpu_ref_put(&ca->io_ref);
+       }
+
+       if (!rbio->split)
+               rbio->bio.bi_end_io = rbio->end_io;
+
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+               return;
+       }
+
+       if (rbio->pick.ptr.cached &&
+           (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+            ptr_stale(ca, &rbio->pick.ptr))) {
+               atomic_long_inc(&c->read_realloc_races);
+
+               if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+                       bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+               else
+                       bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+               return;
+       }
+
+       if (rbio->narrow_crcs ||
+           rbio->pick.crc.compression_type ||
+           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+               context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
+       else if (rbio->pick.crc.csum_type)
+               context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
+
+       bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
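+/*
+ * Core of the read path: pick a replica to read from, decide whether we need
+ * to bounce (for decompression/decryption/checksum verification) or clone the
+ * bio, then set up the bch_read_bio and submit it. When called with
+ * BCH_READ_IN_RETRY the read is done synchronously and a READ_* retry code is
+ * returned.
+ */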
+int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+                      struct bvec_iter iter, struct bkey_s_c k,
+                      struct bch_devs_mask *avoid, unsigned flags)
+{
+       struct extent_pick_ptr pick;
+       struct bch_read_bio *rbio = NULL;
+       struct bch_dev *ca;
+       struct promote_op *promote = NULL;
+       bool bounce = false, read_full = false, narrow_crcs = false;
+       struct bpos pos = bkey_start_pos(k.k);
+       int pick_ret;
+
+       pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick);
+
+       /* hole or reservation - just zero fill: */
+       if (!pick_ret)
+               goto hole;
+
+       if (pick_ret < 0)
+               goto no_device;
+
+       if (pick_ret > 0)
+               ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+       if (flags & BCH_READ_NODECODE) {
+               /*
+                * can happen if we retry, and the extent we were going to read
+                * has been merged in the meantime:
+                */
+               if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+                       goto hole;
+
+               iter.bi_sector  = pos.offset;
+               iter.bi_size    = pick.crc.compressed_size << 9;
+               goto noclone;
+       }
+
+       if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+           bio_flagged(&orig->bio, BIO_CHAIN))
+               flags |= BCH_READ_MUST_CLONE;
+
+       narrow_crcs = should_narrow_crcs(k, &pick, flags);
+
+       if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+               flags |= BCH_READ_MUST_BOUNCE;
+
+       EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
+               k.k->p.offset < bvec_iter_end_sector(iter));
+
+       if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
+           (pick.crc.csum_type != BCH_CSUM_NONE &&
+            (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+             (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+              (flags & BCH_READ_USER_MAPPED)) ||
+             (flags & BCH_READ_MUST_BOUNCE)))) {
+               read_full = true;
+               bounce = true;
+       }
+
+       promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+                               &rbio, &bounce, &read_full);
+
+       if (!read_full) {
+               EBUG_ON(pick.crc.compression_type);
+               EBUG_ON(pick.crc.csum_type &&
+                       (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+                        bvec_iter_sectors(iter) != pick.crc.live_size ||
+                        pick.crc.offset ||
+                        iter.bi_sector != pos.offset));
+
+               pick.ptr.offset += pick.crc.offset +
+                       (iter.bi_sector - pos.offset);
+               pick.crc.compressed_size        = bvec_iter_sectors(iter);
+               pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
+               pick.crc.offset                 = 0;
+               pick.crc.live_size              = bvec_iter_sectors(iter);
+               pos.offset                      = iter.bi_sector;
+       }
+
+       if (rbio) {
+               /* promote already allocated bounce rbio */
+       } else if (bounce) {
+               unsigned sectors = pick.crc.compressed_size;
+
+               rbio = rbio_init(bio_alloc_bioset(NULL,
+                                                 DIV_ROUND_UP(sectors, PAGE_SECTORS),
+                                                 0,
+                                                 GFP_NOIO,
+                                                 &c->bio_read_split),
+                                orig->opts);
+
+               bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+               rbio->bounce    = true;
+               rbio->split     = true;
+       } else if (flags & BCH_READ_MUST_CLONE) {
+               /*
+                * We have to clone if there were any splits, because of error
+                * reporting: if a split errored and retrying didn't work, then
+                * when it reports the error to its parent (us) we can't tell
+                * whether the error came from our part of the bio (in which
+                * case we should retry) or from the whole bio (in which case
+                * retrying would lose the error).
+                */
+               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+                                                &c->bio_read_split),
+                                orig->opts);
+               rbio->bio.bi_iter = iter;
+               rbio->split     = true;
+       } else {
+noclone:
+               rbio = orig;
+               rbio->bio.bi_iter = iter;
+               BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+       }
+
+       BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+       rbio->c                 = c;
+       rbio->submit_time       = local_clock();
+       if (rbio->split)
+               rbio->parent    = orig;
+       else
+               rbio->end_io    = orig->bio.bi_end_io;
+       rbio->bvec_iter         = iter;
+       rbio->flags             = flags;
+       rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+       rbio->narrow_crcs       = narrow_crcs;
+       rbio->hole              = 0;
+       rbio->retry             = 0;
+       rbio->context           = 0;
+       rbio->devs_have         = bch2_bkey_devs(k);
+       rbio->pick              = pick;
+       rbio->pos               = pos;
+       rbio->version           = k.k->version;
+       rbio->promote           = promote;
+       INIT_WORK(&rbio->work, NULL);
+
+       rbio->bio.bi_opf        = orig->bio.bi_opf;
+       rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+       rbio->bio.bi_end_io     = bch2_read_endio;
+
+       if (rbio->bounce)
+               trace_read_bounce(&rbio->bio);
+
+       bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+       if (!rbio->have_ioref)
+               goto no_device_postclone;
+
+       percpu_down_read(&c->usage_lock);
+       bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
+       percpu_up_read(&c->usage_lock);
+
+       this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+                    bio_sectors(&rbio->bio));
+
+       bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+       if (likely(!(flags & BCH_READ_IN_RETRY))) {
+               if (!(flags & BCH_READ_LAST_FRAGMENT)) {
+                       bio_inc_remaining(&orig->bio);
+                       trace_read_split(&orig->bio);
+               }
+
+               if (unlikely(c->opts.no_data_io)) {
+                       bio_endio(&rbio->bio);
+                       return 0;
+               }
+
+               submit_bio(&rbio->bio);
+               return 0;
+       } else {
+               int ret;
+
+               submit_bio_wait(&rbio->bio);
+
+               rbio->context = RBIO_CONTEXT_UNBOUND;
+               bch2_read_endio(&rbio->bio);
+
+               ret = rbio->retry;
+               rbio = bch2_rbio_free(rbio);
+
+               if (ret == READ_RETRY_AVOID) {
+                       __set_bit(pick.ptr.dev, avoid->d);
+                       ret = READ_RETRY;
+               }
+
+               return ret;
+       }
+
+no_device_postclone:
+       if (!rbio->split)
+               rbio->bio.bi_end_io = rbio->end_io;
+       bch2_rbio_free(rbio);
+no_device:
+       __bcache_io_error(c, "no device to read from");
+
+       if (likely(!(flags & BCH_READ_IN_RETRY))) {
+               orig->bio.bi_status = BLK_STS_IOERR;
+
+               if (flags & BCH_READ_LAST_FRAGMENT)
+                       bch2_rbio_done(orig);
+               return 0;
+       } else {
+               return READ_ERR;
+       }
+
+hole:
+       /*
+        * won't normally happen in the BCH_READ_NODECODE
+        * (bch2_move_extent()) path, but if we retry and the extent we wanted
+        * to read no longer exists we have to signal that:
+        */
+       if (flags & BCH_READ_NODECODE)
+               orig->hole = true;
+
+       zero_fill_bio_iter(&orig->bio, iter);
+
+       if (flags & BCH_READ_LAST_FRAGMENT)
+               bch2_rbio_done(orig);
+       return 0;
+}
+
+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       unsigned flags = BCH_READ_RETRY_IF_STALE|
+               BCH_READ_MAY_PROMOTE|
+               BCH_READ_USER_MAPPED;
+       int ret;
+
+       BUG_ON(rbio->_state);
+       BUG_ON(flags & BCH_READ_NODECODE);
+       BUG_ON(flags & BCH_READ_IN_RETRY);
+
+       rbio->c = c;
+       rbio->start_time = local_clock();
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(inode, rbio->bio.bi_iter.bi_sector),
+                          BTREE_ITER_SLOTS, k) {
+               BKEY_PADDED(k) tmp;
+               unsigned bytes;
+
+               /*
+                * Unlock the iterator while the btree node's lock is still in
+                * cache, before doing the IO:
+                */
+               bkey_reassemble(&tmp.k, k);
+               k = bkey_i_to_s_c(&tmp.k);
+               bch2_btree_iter_unlock(&iter);
+
+               bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
+                             (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
+               swap(rbio->bio.bi_iter.bi_size, bytes);
+
+               if (rbio->bio.bi_iter.bi_size == bytes)
+                       flags |= BCH_READ_LAST_FRAGMENT;
+
+               bch2_read_extent(c, rbio, k, flags);
+
+               if (flags & BCH_READ_LAST_FRAGMENT)
+                       return;
+
+               swap(rbio->bio.bi_iter.bi_size, bytes);
+               bio_advance(&rbio->bio, bytes);
+       }
+
+       /*
+        * If we get here, it better have been because there was an error
+        * reading a btree node
+        */
+       ret = bch2_btree_iter_unlock(&iter);
+       BUG_ON(!ret);
+       bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+       bch2_rbio_done(rbio);
+}
+
+void bch2_fs_io_exit(struct bch_fs *c)
+{
+       if (c->promote_table.tbl)
+               rhashtable_destroy(&c->promote_table);
+       mempool_exit(&c->bio_bounce_pages);
+       bioset_exit(&c->bio_write);
+       bioset_exit(&c->bio_read_split);
+       bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS) ||
+           bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS) ||
+           bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+                       BIOSET_NEED_BVECS) ||
+           mempool_init_page_pool(&c->bio_bounce_pages,
+                                  max_t(unsigned,
+                                        c->opts.btree_node_size,
+                                        c->sb.encoded_extent_max) /
+                                  PAGE_SECTORS, 0) ||
+           rhashtable_init(&c->promote_table, &bch_promote_params))
+               return -ENOMEM;
+
+       return 0;
+}
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
new file mode 100644 (file)
index 0000000..f814226
--- /dev/null
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_H
+#define _BCACHEFS_IO_H
+
+#include "alloc.h"
+#include "checksum.h"
+#include "io_types.h"
+
+#define to_wbio(_bio)                  \
+       container_of((_bio), struct bch_write_bio, bio)
+
+#define to_rbio(_bio)                  \
+       container_of((_bio), struct bch_read_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+                              enum bch_data_type, const struct bkey_i *);
+
+#define BLK_STS_REMOVED                ((__force blk_status_t)128)
+
+enum bch_write_flags {
+       BCH_WRITE_ALLOC_NOWAIT          = (1 << 0),
+       BCH_WRITE_CACHED                = (1 << 1),
+       BCH_WRITE_FLUSH                 = (1 << 2),
+       BCH_WRITE_DATA_ENCODED          = (1 << 3),
+       BCH_WRITE_PAGES_STABLE          = (1 << 4),
+       BCH_WRITE_PAGES_OWNED           = (1 << 5),
+       BCH_WRITE_ONLY_SPECIFIED_DEVS   = (1 << 6),
+       BCH_WRITE_NOPUT_RESERVATION     = (1 << 7),
+       BCH_WRITE_NOMARK_REPLICAS       = (1 << 8),
+
+       /* Internal: */
+       BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 9),
+};
+
+static inline u64 *op_journal_seq(struct bch_write_op *op)
+{
+       return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
+               ? op->journal_seq_p : &op->journal_seq;
+}
+
+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
+{
+       op->journal_seq_p = journal_seq;
+       op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+}
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+       return op->alloc_reserve == RESERVE_MOVINGGC
+               ? op->c->copygc_wq
+               : op->c->wq;
+}
+
+int bch2_write_index_default(struct bch_write_op *);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+                                     struct bch_io_opts opts)
+{
+       op->c                   = c;
+       op->io_wq               = index_update_wq(op);
+       op->flags               = 0;
+       op->written             = 0;
+       op->error               = 0;
+       op->csum_type           = bch2_data_checksum_type(c, opts.data_checksum);
+       op->compression_type    = bch2_compression_opt_to_type[opts.compression];
+       op->nr_replicas         = 0;
+       op->nr_replicas_required = c->opts.data_replicas_required;
+       op->alloc_reserve       = RESERVE_NONE;
+       op->open_buckets_nr     = 0;
+       op->devs_have.nr        = 0;
+       op->target              = 0;
+       op->opts                = opts;
+       op->pos                 = POS_MAX;
+       op->version             = ZERO_VERSION;
+       op->write_point         = (struct write_point_specifier) { 0 };
+       op->res                 = (struct disk_reservation) { 0 };
+       op->journal_seq         = 0;
+       op->index_update_fn     = bch2_write_index_default;
+}
+
+void bch2_write(struct closure *);
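+
+/*
+ * Illustrative sketch of how a write is typically kicked off (a hypothetical
+ * caller; names like inum/sector/nr_replicas/parent_cl are placeholders, not
+ * taken from any real call site):
+ *
+ *        bch2_write_op_init(op, c, io_opts);
+ *        op->pos         = POS(inum, sector);
+ *        op->nr_replicas = nr_replicas;
+ *        (the bio holding the data to be written lives in op->wbio.bio)
+ *
+ *        closure_call(&op->cl, bch2_write, NULL, &parent_cl);
+ */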
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+       struct bch_write_bio *wbio = to_wbio(bio);
+
+       memset(&wbio->wbio, 0, sizeof(wbio->wbio));
+       return wbio;
+}
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_pick_ptr;
+
+int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+                      struct bkey_s_c, struct bch_devs_mask *, unsigned);
+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+
+enum bch_read_flags {
+       BCH_READ_RETRY_IF_STALE         = 1 << 0,
+       BCH_READ_MAY_PROMOTE            = 1 << 1,
+       BCH_READ_USER_MAPPED            = 1 << 2,
+       BCH_READ_NODECODE               = 1 << 3,
+       BCH_READ_LAST_FRAGMENT          = 1 << 4,
+
+       /* internal: */
+       BCH_READ_MUST_BOUNCE            = 1 << 5,
+       BCH_READ_MUST_CLONE             = 1 << 6,
+       BCH_READ_IN_RETRY               = 1 << 7,
+};
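+
+/*
+ * e.g. the normal read path (bch2_read()) passes
+ * BCH_READ_RETRY_IF_STALE|BCH_READ_MAY_PROMOTE|BCH_READ_USER_MAPPED, and
+ * additionally sets BCH_READ_LAST_FRAGMENT on the final extent of a request.
+ */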
+
+static inline void bch2_read_extent(struct bch_fs *c,
+                                   struct bch_read_bio *rbio,
+                                   struct bkey_s_c k,
+                                   unsigned flags)
+{
+       __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
+}
+
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+                                            struct bch_io_opts opts)
+{
+       struct bch_read_bio *rbio = to_rbio(bio);
+
+       rbio->_state    = 0;
+       rbio->promote   = NULL;
+       rbio->opts      = opts;
+       return rbio;
+}
+
+void bch2_fs_io_exit(struct bch_fs *);
+int bch2_fs_io_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_H */
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
new file mode 100644 (file)
index 0000000..b313128
--- /dev/null
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_TYPES_H
+#define _BCACHEFS_IO_TYPES_H
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+#include "keylist_types.h"
+#include "opts.h"
+#include "super_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bch_read_bio {
+       struct bch_fs           *c;
+       u64                     start_time;
+       u64                     submit_time;
+
+       /*
+        * Reads will often have to be split, and if the extent being read from
+        * was checksummed or compressed we'll also have to allocate bounce
+        * buffers and copy the data back into the original bio.
+        *
+        * If we didn't have to split, we have to save and restore the original
+        * bi_end_io - @split below indicates which:
+        */
+       union {
+       struct bch_read_bio     *parent;
+       bio_end_io_t            *end_io;
+       };
+
+       /*
+        * Saved copy of bio->bi_iter, from submission time - allows us to
+        * resubmit on IO error, and also to copy data back to the original bio
+        * when we're bouncing:
+        */
+       struct bvec_iter        bvec_iter;
+
+       u16                     flags;
+       union {
+       struct {
+       u16                     bounce:1,
+                               split:1,
+                               kmalloc:1,
+                               have_ioref:1,
+                               narrow_crcs:1,
+                               hole:1,
+                               retry:2,
+                               context:2;
+       };
+       u16                     _state;
+       };
+
+       struct bch_devs_list    devs_have;
+
+       struct extent_pick_ptr  pick;
+       /* start pos of data we read (may not be pos of data we want) */
+       struct bpos             pos;
+       struct bversion         version;
+
+       struct promote_op       *promote;
+
+       struct bch_io_opts      opts;
+
+       struct work_struct      work;
+
+       struct bio              bio;
+};
+
+struct bch_write_bio {
+       struct_group(wbio,
+       struct bch_fs           *c;
+       struct bch_write_bio    *parent;
+
+       u64                     submit_time;
+
+       struct bch_devs_list    failed;
+       u8                      order;
+       u8                      dev;
+
+       unsigned                split:1,
+                               bounce:1,
+                               put_bio:1,
+                               have_ioref:1,
+                               used_mempool:1;
+       );
+
+       struct bio              bio;
+};
+
+struct bch_write_op {
+       struct closure          cl;
+       struct bch_fs           *c;
+       struct workqueue_struct *io_wq;
+       u64                     start_time;
+
+       unsigned                written; /* sectors */
+       u16                     flags;
+       s16                     error; /* dio write path expects it to hold -ERESTARTSYS... */
+
+       unsigned                csum_type:4;
+       unsigned                compression_type:4;
+       unsigned                nr_replicas:4;
+       unsigned                nr_replicas_required:4;
+       unsigned                alloc_reserve:4;
+
+       u8                      open_buckets_nr;
+       struct bch_devs_list    devs_have;
+       u16                     target;
+       u16                     nonce;
+
+       struct bch_io_opts      opts;
+
+       struct bpos             pos;
+       struct bversion         version;
+
+       /* For BCH_WRITE_DATA_ENCODED: */
+       struct bch_extent_crc_unpacked crc;
+
+       struct write_point_specifier write_point;
+
+       struct disk_reservation res;
+
+       u8                      open_buckets[16];
+
+       /*
+        * If caller wants to flush but hasn't passed us a journal_seq ptr, we
+        * still need to stash the journal_seq somewhere:
+        */
+       union {
+               u64                     *journal_seq_p;
+               u64                     journal_seq;
+       };
+
+       int                     (*index_update_fn)(struct bch_write_op *);
+
+       struct bch_devs_mask    failed;
+
+       struct keylist          insert_keys;
+       u64                     inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+       /* Must be last: */
+       struct bch_write_bio    wbio;
+};
+
+#endif /* _BCACHEFS_IO_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
new file mode 100644 (file)
index 0000000..697f601
--- /dev/null
@@ -0,0 +1,1140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+#include "trace.h"
+
+static bool journal_entry_is_open(struct journal *j)
+{
+       return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+{
+       struct journal_buf *w = journal_prev_buf(j);
+
+       atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
+
+       if (!need_write_just_set &&
+           test_bit(JOURNAL_NEED_WRITE, &j->flags))
+               bch2_time_stats_update(j->delay_time,
+                                      j->need_write_time);
+#if 0
+       closure_call(&j->io, bch2_journal_write, NULL, NULL);
+#else
+       /* Shut sparse up: */
+       closure_init(&j->io, NULL);
+       set_closure_fn(&j->io, bch2_journal_write, NULL);
+       bch2_journal_write(&j->io);
+#endif
+}
+
+static void journal_pin_new_entry(struct journal *j, int count)
+{
+       struct journal_entry_pin_list *p;
+
+       /*
+        * The fifo_push() needs to happen at the same time as j->seq is
+        * incremented for journal_last_seq() to be calculated correctly
+        */
+       atomic64_inc(&j->seq);
+       p = fifo_push_ref(&j->pin);
+
+       INIT_LIST_HEAD(&p->list);
+       INIT_LIST_HEAD(&p->flushed);
+       atomic_set(&p->count, count);
+       p->devs.nr = 0;
+}
+
+static void bch2_journal_buf_init(struct journal *j)
+{
+       struct journal_buf *buf = journal_cur_buf(j);
+
+       memset(buf->has_inode, 0, sizeof(buf->has_inode));
+
+       memset(buf->data, 0, sizeof(*buf->data));
+       buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
+       buf->data->u64s = 0;
+}
+
+static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
+{
+       return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
+}
+
+static inline bool journal_entry_empty(struct jset *j)
+{
+       struct jset_entry *i;
+
+       if (j->seq != j->last_seq)
+               return false;
+
+       vstruct_for_each(j, i)
+               if (i->type || i->u64s)
+                       return false;
+       return true;
+}
+
+static enum {
+       JOURNAL_ENTRY_ERROR,
+       JOURNAL_ENTRY_INUSE,
+       JOURNAL_ENTRY_CLOSED,
+       JOURNAL_UNLOCKED,
+} journal_buf_switch(struct journal *j, bool need_write_just_set)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_buf *buf;
+       union journal_res_state old, new;
+       u64 v = atomic64_read(&j->reservations.counter);
+
+       lockdep_assert_held(&j->lock);
+
+       do {
+               old.v = new.v = v;
+               if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
+                       return JOURNAL_ENTRY_CLOSED;
+
+               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+                       return JOURNAL_ENTRY_ERROR;
+
+               if (new.prev_buf_unwritten)
+                       return JOURNAL_ENTRY_INUSE;
+
+               /*
+                * avoid race between setting buf->data->u64s and
+                * journal_res_put starting write:
+                */
+               journal_state_inc(&new);
+
+               new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
+               new.idx++;
+               new.prev_buf_unwritten = 1;
+
+               BUG_ON(journal_state_count(new, new.idx));
+       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+                                      old.v, new.v)) != old.v);
+
+       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+       buf = &j->buf[old.idx];
+       buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
+
+       j->prev_buf_sectors =
+               vstruct_blocks_plus(buf->data, c->block_bits,
+                                   journal_entry_u64s_reserve(buf)) *
+               c->opts.block_size;
+       BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+
+       bch2_journal_reclaim_fast(j);
+       /* XXX: why set this here, and not in bch2_journal_write()? */
+       buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
+
+       if (journal_entry_empty(buf->data))
+               clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
+       else
+               set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+
+       journal_pin_new_entry(j, 1);
+
+       bch2_journal_buf_init(j);
+
+       cancel_delayed_work(&j->write_work);
+       spin_unlock(&j->lock);
+
+       if (c->bucket_journal_seq > 1 << 14) {
+               c->bucket_journal_seq = 0;
+               bch2_bucket_seq_cleanup(c);
+       }
+
+       c->bucket_journal_seq++;
+
+       /* ugh - might be called from __journal_res_get() under wait_event() */
+       __set_current_state(TASK_RUNNING);
+       bch2_journal_buf_put(j, old.idx, need_write_just_set);
+
+       return JOURNAL_UNLOCKED;
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+       union journal_res_state old, new;
+       u64 v = atomic64_read(&j->reservations.counter);
+
+       do {
+               old.v = new.v = v;
+               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+                       return;
+
+               new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+                                      old.v, new.v)) != old.v);
+
+       journal_wake(j);
+       closure_wake_up(&journal_cur_buf(j)->wait);
+       closure_wake_up(&journal_prev_buf(j)->wait);
+}
+
+/*
+ * should _only_ be called from journal_res_get() - when we actually want a
+ * journal reservation - an open journal entry means the journal is dirty:
+ *
+ * returns:
+ * 1:          success
+ * 0:          journal currently full (must wait)
+ * -EROFS:     insufficient rw devices
+ * -EIO:       journal error
+ */
+static int journal_entry_open(struct journal *j)
+{
+       struct journal_buf *buf = journal_cur_buf(j);
+       union journal_res_state old, new;
+       ssize_t u64s;
+       int sectors;
+       u64 v;
+
+       lockdep_assert_held(&j->lock);
+       BUG_ON(journal_entry_is_open(j));
+
+       if (!fifo_free(&j->pin))
+               return 0;
+
+       sectors = bch2_journal_entry_sectors(j);
+       if (sectors <= 0)
+               return sectors;
+
+       buf->disk_sectors       = sectors;
+
+       sectors = min_t(unsigned, sectors, buf->size >> 9);
+       j->cur_buf_sectors      = sectors;
+
+       u64s = (sectors << 9) / sizeof(u64);
+
+       /* Subtract the journal header */
+       u64s -= sizeof(struct jset) / sizeof(u64);
+       /*
+        * Btree roots, prio pointers don't get added until right before we do
+        * the write:
+        */
+       u64s -= journal_entry_u64s_reserve(buf);
+       u64s  = max_t(ssize_t, 0L, u64s);
+
+       BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+
+       if (u64s <= le32_to_cpu(buf->data->u64s))
+               return 0;
+
+       /*
+        * Must be set before marking the journal entry as open:
+        */
+       j->cur_entry_u64s = u64s;
+
+       v = atomic64_read(&j->reservations.counter);
+       do {
+               old.v = new.v = v;
+
+               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+                       return -EIO;
+
+               /* Handle any already added entries */
+               new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+                                      old.v, new.v)) != old.v);
+
+       if (j->res_get_blocked_start)
+               bch2_time_stats_update(j->blocked_time,
+                                      j->res_get_blocked_start);
+       j->res_get_blocked_start = 0;
+
+       mod_delayed_work(system_freezable_wq,
+                        &j->write_work,
+                        msecs_to_jiffies(j->write_delay_ms));
+       journal_wake(j);
+       return 1;
+}
+
+/*
+ * returns true if there's nothing to flush and no journal write still in flight
+ */
+static bool journal_flush_write(struct journal *j)
+{
+       bool ret;
+
+       spin_lock(&j->lock);
+       ret = !j->reservations.prev_buf_unwritten;
+
+       if (!journal_entry_is_open(j)) {
+               spin_unlock(&j->lock);
+               return ret;
+       }
+
+       set_bit(JOURNAL_NEED_WRITE, &j->flags);
+       if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED)
+               ret = false;
+       else
+               spin_unlock(&j->lock);
+       return ret;
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+       struct journal *j = container_of(work, struct journal, write_work.work);
+
+       journal_flush_write(j);
+}
+
+/*
+ * Given an inode number, if that inode number has data in the journal that
+ * hasn't yet been flushed, return the journal sequence number that needs to be
+ * flushed:
+ */
+u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
+{
+       size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
+       u64 seq = 0;
+
+       if (!test_bit(h, j->buf[0].has_inode) &&
+           !test_bit(h, j->buf[1].has_inode))
+               return 0;
+
+       spin_lock(&j->lock);
+       if (test_bit(h, journal_cur_buf(j)->has_inode))
+               seq = journal_cur_seq(j);
+       else if (test_bit(h, journal_prev_buf(j)->has_inode))
+               seq = journal_cur_seq(j) - 1;
+       spin_unlock(&j->lock);
+
+       return seq;
+}
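+
+/*
+ * Illustrative sketch of the intended fsync-style usage (hypothetical caller,
+ * not code from this file):
+ *
+ *        u64 seq = bch2_inode_journal_seq(&c->journal, inum);
+ *        if (seq)
+ *                ret = bch2_journal_flush_seq(&c->journal, seq);
+ */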
+
+static int __journal_res_get(struct journal *j, struct journal_res *res,
+                             unsigned u64s_min, unsigned u64s_max)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_buf *buf;
+       int ret;
+retry:
+       ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
+       if (ret)
+               return ret;
+
+       spin_lock(&j->lock);
+       /*
+        * Recheck after taking the lock, so we don't race with another thread
+        * that just did journal_entry_open(), and end up calling
+        * journal_entry_close() unnecessarily
+        */
+       ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
+       if (ret) {
+               spin_unlock(&j->lock);
+               return 1;
+       }
+
+       /*
+        * If we couldn't get a reservation because the current buf filled up,
+        * and we had room for a bigger entry on disk, signal that we want to
+        * realloc the journal bufs:
+        */
+       buf = journal_cur_buf(j);
+       if (journal_entry_is_open(j) &&
+           buf->size >> 9 < buf->disk_sectors &&
+           buf->size < JOURNAL_ENTRY_SIZE_MAX)
+               j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+
+       /*
+        * Close the current journal entry if necessary, then try to start a new
+        * one:
+        */
+       switch (journal_buf_switch(j, false)) {
+       case JOURNAL_ENTRY_ERROR:
+               spin_unlock(&j->lock);
+               return -EROFS;
+       case JOURNAL_ENTRY_INUSE:
+               /* haven't finished writing out the previous one: */
+               spin_unlock(&j->lock);
+               trace_journal_entry_full(c);
+               goto blocked;
+       case JOURNAL_ENTRY_CLOSED:
+               break;
+       case JOURNAL_UNLOCKED:
+               goto retry;
+       }
+
+       /* We now have a new, closed journal buf - see if we can open it: */
+       ret = journal_entry_open(j);
+       spin_unlock(&j->lock);
+
+       if (ret < 0)
+               return ret;
+       if (ret)
+               goto retry;
+
+       /* Journal's full, we have to wait */
+
+       /*
+        * Direct reclaim - can't rely on reclaim from work item
+        * due to freezing..
+        * due to freezing.
+       bch2_journal_reclaim_work(&j->reclaim_work.work);
+
+       trace_journal_full(c);
+blocked:
+       if (!j->res_get_blocked_start)
+               j->res_get_blocked_start = local_clock() ?: 1;
+       return 0;
+}
+
+/*
+ * Essentially the entry function to the journalling code. When bcachefs is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * Journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the next
+ * write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ */
+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+                                unsigned u64s_min, unsigned u64s_max)
+{
+       int ret;
+
+       wait_event(j->wait,
+                  (ret = __journal_res_get(j, res, u64s_min,
+                                           u64s_max)));
+       return ret < 0 ? ret : 0;
+}
+
+u64 bch2_journal_last_unwritten_seq(struct journal *j)
+{
+       u64 seq;
+
+       spin_lock(&j->lock);
+       seq = journal_cur_seq(j);
+       if (j->reservations.prev_buf_unwritten)
+               seq--;
+       spin_unlock(&j->lock);
+
+       return seq;
+}
+
+/**
+ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
+ * open yet, or wait if we cannot
+ *
+ * used by the btree interior update machinery, when it needs to write a new
+ * btree root - every journal entry contains the roots of all the btrees, so it
+ * doesn't need to bother with getting a journal reservation
+ */
+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
+{
+       int ret;
+
+       spin_lock(&j->lock);
+       BUG_ON(seq > journal_cur_seq(j));
+
+       if (seq < journal_cur_seq(j) ||
+           journal_entry_is_open(j)) {
+               spin_unlock(&j->lock);
+               return 1;
+       }
+
+       ret = journal_entry_open(j);
+       if (!ret)
+               closure_wait(&j->async_wait, parent);
+       spin_unlock(&j->lock);
+
+       if (!ret)
+               bch2_journal_reclaim_work(&j->reclaim_work.work);
+
+       return ret;
+}
+
+/**
+ * bch2_journal_wait_on_seq - wait for a journal entry to be written
+ *
+ * does _not_ cause @seq to be written immediately - if there is no other
+ * activity to cause the relevant journal entry to be filled up or flushed it
+ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
+ * configurable).
+ */
+void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
+{
+       spin_lock(&j->lock);
+
+       BUG_ON(seq > journal_cur_seq(j));
+
+       if (bch2_journal_error(j)) {
+               spin_unlock(&j->lock);
+               return;
+       }
+
+       if (seq == journal_cur_seq(j)) {
+               if (!closure_wait(&journal_cur_buf(j)->wait, parent))
+                       BUG();
+       } else if (seq + 1 == journal_cur_seq(j) &&
+                  j->reservations.prev_buf_unwritten) {
+               if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+                       BUG();
+
+               smp_mb();
+
+               /* check if raced with write completion (or failure) */
+               if (!j->reservations.prev_buf_unwritten ||
+                   bch2_journal_error(j))
+                       closure_wake_up(&journal_prev_buf(j)->wait);
+       }
+
+       spin_unlock(&j->lock);
+}
+
+/**
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ *
+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * necessary
+ */
+void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
+{
+       struct journal_buf *buf;
+
+       spin_lock(&j->lock);
+
+       BUG_ON(seq > journal_cur_seq(j));
+
+       if (bch2_journal_error(j)) {
+               spin_unlock(&j->lock);
+               return;
+       }
+
+       if (seq == journal_cur_seq(j)) {
+               bool set_need_write = false;
+
+               buf = journal_cur_buf(j);
+
+               if (parent && !closure_wait(&buf->wait, parent))
+                       BUG();
+
+               if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+                       j->need_write_time = local_clock();
+                       set_need_write = true;
+               }
+
+               switch (journal_buf_switch(j, set_need_write)) {
+               case JOURNAL_ENTRY_ERROR:
+                       if (parent)
+                               closure_wake_up(&buf->wait);
+                       break;
+               case JOURNAL_ENTRY_CLOSED:
+                       /*
+                        * Journal entry hasn't been opened yet, but caller
+                        * claims it has something
+                        */
+                       BUG();
+               case JOURNAL_ENTRY_INUSE:
+                       break;
+               case JOURNAL_UNLOCKED:
+                       return;
+               }
+       } else if (parent &&
+                  seq + 1 == journal_cur_seq(j) &&
+                  j->reservations.prev_buf_unwritten) {
+               buf = journal_prev_buf(j);
+
+               if (!closure_wait(&buf->wait, parent))
+                       BUG();
+
+               smp_mb();
+
+               /* check if raced with write completion (or failure) */
+               if (!j->reservations.prev_buf_unwritten ||
+                   bch2_journal_error(j))
+                       closure_wake_up(&buf->wait);
+       }
+
+       spin_unlock(&j->lock);
+}
+
+static int journal_seq_flushed(struct journal *j, u64 seq)
+{
+       struct journal_buf *buf;
+       int ret = 1;
+
+       spin_lock(&j->lock);
+       BUG_ON(seq > journal_cur_seq(j));
+
+       if (seq == journal_cur_seq(j)) {
+               bool set_need_write = false;
+
+               ret = 0;
+
+               buf = journal_cur_buf(j);
+
+               if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+                       j->need_write_time = local_clock();
+                       set_need_write = true;
+               }
+
+               switch (journal_buf_switch(j, set_need_write)) {
+               case JOURNAL_ENTRY_ERROR:
+                       ret = -EIO;
+                       break;
+               case JOURNAL_ENTRY_CLOSED:
+                       /*
+                        * Journal entry hasn't been opened yet, but caller
+                        * claims it has something
+                        */
+                       BUG();
+               case JOURNAL_ENTRY_INUSE:
+                       break;
+               case JOURNAL_UNLOCKED:
+                       return 0;
+               }
+       } else if (seq + 1 == journal_cur_seq(j) &&
+                  j->reservations.prev_buf_unwritten) {
+               ret = bch2_journal_error(j);
+       }
+
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+int bch2_journal_flush_seq(struct journal *j, u64 seq)
+{
+       u64 start_time = local_clock();
+       int ret, ret2;
+
+       ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+
+       bch2_time_stats_update(j->flush_seq_time, start_time);
+
+       return ret ?: ret2 < 0 ? ret2 : 0;
+}
+
+/**
+ * bch2_journal_meta_async - force a journal entry to be written
+ */
+void bch2_journal_meta_async(struct journal *j, struct closure *parent)
+{
+       struct journal_res res;
+       unsigned u64s = jset_u64s(0);
+
+       memset(&res, 0, sizeof(res));
+
+       bch2_journal_res_get(j, &res, u64s, u64s);
+       bch2_journal_res_put(j, &res);
+
+       bch2_journal_flush_seq_async(j, res.seq, parent);
+}
+
+int bch2_journal_meta(struct journal *j)
+{
+       struct journal_res res;
+       unsigned u64s = jset_u64s(0);
+       int ret;
+
+       memset(&res, 0, sizeof(res));
+
+       ret = bch2_journal_res_get(j, &res, u64s, u64s);
+       if (ret)
+               return ret;
+
+       bch2_journal_res_put(j, &res);
+
+       return bch2_journal_flush_seq(j, res.seq);
+}
+
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * write still in flight, write it and wait for the write to complete
+ */
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+       u64 seq, journal_seq;
+
+       spin_lock(&j->lock);
+       journal_seq = journal_cur_seq(j);
+
+       if (journal_entry_is_open(j)) {
+               seq = journal_seq;
+       } else if (journal_seq) {
+               seq = journal_seq - 1;
+       } else {
+               spin_unlock(&j->lock);
+               return;
+       }
+       spin_unlock(&j->lock);
+
+       bch2_journal_flush_seq_async(j, seq, parent);
+}
+
+int bch2_journal_flush(struct journal *j)
+{
+       u64 seq, journal_seq;
+
+       spin_lock(&j->lock);
+       journal_seq = journal_cur_seq(j);
+
+       if (journal_entry_is_open(j)) {
+               seq = journal_seq;
+       } else if (journal_seq) {
+               seq = journal_seq - 1;
+       } else {
+               spin_unlock(&j->lock);
+               return 0;
+       }
+       spin_unlock(&j->lock);
+
+       return bch2_journal_flush_seq(j, seq);
+}
+
+/* allocate journal on a device: */
+
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+                                        bool new_fs, struct closure *cl)
+{
+       struct bch_fs *c = ca->fs;
+       struct journal_device *ja = &ca->journal;
+       struct bch_sb_field_journal *journal_buckets;
+       u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+       int ret = 0;
+
+       /* don't handle reducing nr of buckets yet: */
+       if (nr <= ja->nr)
+               return 0;
+
+       ret = -ENOMEM;
+       new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+       new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+       if (!new_buckets || !new_bucket_seq)
+               goto err;
+
+       journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
+                               nr + sizeof(*journal_buckets) / sizeof(u64));
+       if (!journal_buckets)
+               goto err;
+
+       if (c)
+               spin_lock(&c->journal.lock);
+
+       memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
+       memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
+       swap(new_buckets,       ja->buckets);
+       swap(new_bucket_seq,    ja->bucket_seq);
+
+       if (c)
+               spin_unlock(&c->journal.lock);
+
+       while (ja->nr < nr) {
+               struct open_bucket *ob = NULL;
+               long bucket;
+
+               if (new_fs) {
+                       percpu_down_read(&c->usage_lock);
+                       bucket = bch2_bucket_alloc_new_fs(ca);
+                       percpu_up_read(&c->usage_lock);
+
+                       if (bucket < 0) {
+                               ret = -ENOSPC;
+                               goto err;
+                       }
+               } else {
+                       int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
+                       if (ob_idx < 0) {
+                               ret = cl ? -EAGAIN : -ENOSPC;
+                               goto err;
+                       }
+
+                       ob = c->open_buckets + ob_idx;
+                       bucket = sector_to_bucket(ca, ob->ptr.offset);
+               }
+
+               if (c) {
+                       percpu_down_read(&c->usage_lock);
+                       spin_lock(&c->journal.lock);
+               }
+
+               __array_insert_item(ja->buckets,                ja->nr, ja->last_idx);
+               __array_insert_item(ja->bucket_seq,             ja->nr, ja->last_idx);
+               __array_insert_item(journal_buckets->buckets,   ja->nr, ja->last_idx);
+
+               ja->buckets[ja->last_idx] = bucket;
+               ja->bucket_seq[ja->last_idx] = 0;
+               journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
+
+               if (ja->last_idx < ja->nr) {
+                       if (ja->cur_idx >= ja->last_idx)
+                               ja->cur_idx++;
+                       ja->last_idx++;
+               }
+               ja->nr++;
+
+               bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
+                               ca->mi.bucket_size,
+                               gc_phase(GC_PHASE_SB),
+                               new_fs
+                               ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
+                               : 0);
+
+               if (c) {
+                       spin_unlock(&c->journal.lock);
+                       percpu_up_read(&c->usage_lock);
+               }
+
+               if (!new_fs)
+                       bch2_open_bucket_put(c, ob);
+       }
+
+       ret = 0;
+err:
+       kfree(new_bucket_seq);
+       kfree(new_buckets);
+
+       return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+                               unsigned nr)
+{
+       struct journal_device *ja = &ca->journal;
+       struct closure cl;
+       unsigned current_nr;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       do {
+               struct disk_reservation disk_res = { 0, 0 };
+
+               closure_sync(&cl);
+
+               mutex_lock(&c->sb_lock);
+               current_nr = ja->nr;
+
+               /*
+                * note: journal buckets aren't really counted as _sectors_ used yet, so
+                * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+                * when space used goes up without a reservation - but we do need the
+                * reservation to ensure we'll actually be able to allocate:
+                */
+
+               if (bch2_disk_reservation_get(c, &disk_res,
+                               bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+                       mutex_unlock(&c->sb_lock);
+                       return -ENOSPC;
+               }
+
+               ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+               bch2_disk_reservation_put(c, &disk_res);
+
+               if (ja->nr != current_nr)
+                       bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
+       } while (ret == -EAGAIN);
+
+       return ret;
+}
+
+int bch2_dev_journal_alloc(struct bch_dev *ca)
+{
+       unsigned nr;
+
+       if (dynamic_fault("bcachefs:add:journal_alloc"))
+               return -ENOMEM;
+
+       /*
+        * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+        * is smaller:
+        */
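+       /*
+        * e.g. (illustrative arithmetic): a device with 2^20 buckets of 2048
+        * sectors (1MiB each) would want 2^20 >> 8 = 4096 journal buckets,
+        * but is capped at min(1 << 10, (1 << 20) / 2048) = 512 buckets,
+        * i.e. 512MiB of journal.
+        */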
+       nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+                    BCH_JOURNAL_BUCKETS_MIN,
+                    min(1 << 10,
+                        (1 << 20) / ca->mi.bucket_size));
+
+       return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+}
+
+/* startup/shutdown: */
+
+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
+{
+       union journal_res_state state;
+       struct journal_buf *w;
+       bool ret;
+
+       spin_lock(&j->lock);
+       state = READ_ONCE(j->reservations);
+       w = j->buf + !state.idx;
+
+       ret = state.prev_buf_unwritten &&
+               bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx);
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
+{
+       spin_lock(&j->lock);
+       bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx);
+       spin_unlock(&j->lock);
+
+       wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
+}
+
+void bch2_fs_journal_stop(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+       wait_event(j->wait, journal_flush_write(j));
+
+       /* do we need to write another journal entry? */
+       if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
+           c->btree_roots_dirty)
+               bch2_journal_meta(j);
+
+       BUG_ON(!bch2_journal_error(j) &&
+              test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+
+       cancel_delayed_work_sync(&j->write_work);
+       cancel_delayed_work_sync(&j->reclaim_work);
+}
+
+void bch2_fs_journal_start(struct journal *j)
+{
+       struct journal_seq_blacklist *bl;
+       u64 blacklist = 0;
+
+       list_for_each_entry(bl, &j->seq_blacklist, list)
+               blacklist = max(blacklist, bl->end);
+
+       spin_lock(&j->lock);
+
+       set_bit(JOURNAL_STARTED, &j->flags);
+
+       while (journal_cur_seq(j) < blacklist)
+               journal_pin_new_entry(j, 0);
+
+       /*
+        * journal_buf_switch() only inits the next journal entry when it
+        * closes an open journal entry - the very first journal entry gets
+        * initialized here:
+        */
+       journal_pin_new_entry(j, 1);
+       bch2_journal_buf_init(j);
+
+       spin_unlock(&j->lock);
+
+       /*
+        * Adding entries to the next journal entry before allocating space on
+        * disk for the next journal entry - this is ok, because these entries
+        * only have to go down with the next journal entry we write:
+        */
+       bch2_journal_seq_blacklist_write(j);
+
+       queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+}
+
+/* init/exit: */
+
+void bch2_dev_journal_exit(struct bch_dev *ca)
+{
+       kfree(ca->journal.bio);
+       kfree(ca->journal.buckets);
+       kfree(ca->journal.bucket_seq);
+
+       ca->journal.bio         = NULL;
+       ca->journal.buckets     = NULL;
+       ca->journal.bucket_seq  = NULL;
+}
+
+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
+{
+       struct journal_device *ja = &ca->journal;
+       struct bch_sb_field_journal *journal_buckets =
+               bch2_sb_get_journal(sb);
+       unsigned i, nr_bvecs;
+
+       ja->nr = bch2_nr_journal_buckets(journal_buckets);
+
+       ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+       if (!ja->bucket_seq)
+               return -ENOMEM;
+
+       nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+
+       ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+       if (!ca->journal.bio)
+               return -ENOMEM;
+
+       bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+
+       ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+       if (!ja->buckets)
+               return -ENOMEM;
+
+       for (i = 0; i < ja->nr; i++)
+               ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+
+       return 0;
+}
+
+void bch2_fs_journal_exit(struct journal *j)
+{
+       kvpfree(j->buf[1].data, j->buf[1].size);
+       kvpfree(j->buf[0].data, j->buf[0].size);
+       free_fifo(&j->pin);
+}
+
+int bch2_fs_journal_init(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       static struct lock_class_key res_key;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
+       spin_lock_init(&j->lock);
+       spin_lock_init(&j->err_lock);
+       init_waitqueue_head(&j->wait);
+       INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+       INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
+       mutex_init(&j->blacklist_lock);
+       INIT_LIST_HEAD(&j->seq_blacklist);
+       mutex_init(&j->reclaim_lock);
+
+       lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+       j->buf[0].size          = JOURNAL_ENTRY_SIZE_MIN;
+       j->buf[1].size          = JOURNAL_ENTRY_SIZE_MIN;
+       j->write_delay_ms       = 1000;
+       j->reclaim_delay_ms     = 100;
+
+       bkey_extent_init(&j->key);
+
+       atomic64_set(&j->reservations.counter,
+               ((union journal_res_state)
+                { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+           !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
+           !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       j->pin.front = j->pin.back = 1;
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
+}
+
+/* debug: */
+
+ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       union journal_res_state *s = &j->reservations;
+       struct bch_dev *ca;
+       unsigned iter;
+       ssize_t ret = 0;
+
+       rcu_read_lock();
+       spin_lock(&j->lock);
+
+       ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                        "active journal entries:\t%llu\n"
+                        "seq:\t\t\t%llu\n"
+                        "last_seq:\t\t%llu\n"
+                        "last_seq_ondisk:\t%llu\n"
+                        "reservation count:\t%u\n"
+                        "reservation offset:\t%u\n"
+                        "current entry u64s:\t%u\n"
+                        "io in flight:\t\t%i\n"
+                        "need write:\t\t%i\n"
+                        "dirty:\t\t\t%i\n"
+                        "replay done:\t\t%i\n",
+                        fifo_used(&j->pin),
+                        journal_cur_seq(j),
+                        journal_last_seq(j),
+                        j->last_seq_ondisk,
+                        journal_state_count(*s, s->idx),
+                        s->cur_entry_offset,
+                        j->cur_entry_u64s,
+                        s->prev_buf_unwritten,
+                        test_bit(JOURNAL_NEED_WRITE,   &j->flags),
+                        journal_entry_is_open(j),
+                        test_bit(JOURNAL_REPLAY_DONE,  &j->flags));
+
+       for_each_member_device_rcu(ca, c, iter,
+                                  &c->rw_devs[BCH_DATA_JOURNAL]) {
+               struct journal_device *ja = &ca->journal;
+
+               if (!ja->nr)
+                       continue;
+
+               ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                                "dev %u:\n"
+                                "\tnr\t\t%u\n"
+                                "\tcur_idx\t\t%u (seq %llu)\n"
+                                "\tlast_idx\t%u (seq %llu)\n",
+                                iter, ja->nr,
+                                ja->cur_idx,   ja->bucket_seq[ja->cur_idx],
+                                ja->last_idx,  ja->bucket_seq[ja->last_idx]);
+       }
+
+       spin_unlock(&j->lock);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
+{
+       struct journal_entry_pin_list *pin_list;
+       struct journal_entry_pin *pin;
+       ssize_t ret = 0;
+       u64 i;
+
+       spin_lock(&j->lock);
+       fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
+               ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                                "%llu: count %u\n",
+                                i, atomic_read(&pin_list->count));
+
+               list_for_each_entry(pin, &pin_list->list, list)
+                       ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                                        "\t%p %pf\n",
+                                        pin, pin->flush);
+
+               if (!list_empty(&pin_list->flushed))
+                       ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                                        "flushed:\n");
+
+               list_for_each_entry(pin, &pin_list->flushed, list)
+                       ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                                        "\t%p %pf\n",
+                                        pin, pin->flush);
+       }
+       spin_unlock(&j->lock);
+
+       return ret;
+}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
new file mode 100644 (file)
index 0000000..f39b37e
--- /dev/null
@@ -0,0 +1,383 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_H
+#define _BCACHEFS_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch2_btree_insert() or bch_btree_insert_node(), which then passes that
+ * parameter down to the journalling code. That closure will wait on the journal
+ * write to complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after a short delay, 1000 ms by default (the write_delay_ms
+ * field in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
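+ *
+ * Schematically (an illustrative sketch, not an exact layout of the on disk
+ * format), a single journal entry might contain:
+ *
+ *        jset { seq = 42, last_seq = 40 }
+ *          jset_entry { btree_id = BTREE_ID_EXTENTS }:  extent bkeys
+ *          jset_entry { btree_id = BTREE_ID_DIRENTS }:  dirent bkeys
+ *          jset_entry (btree root, one per btree):      a single bkey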
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->bucket_seq) from each journal bucket to the highest
+ * sequence number of any journal entry it contains. Then, comparing that
+ * against last_seq, we
+ * can determine whether that journal bucket contains dirty journal entries or
+ * not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
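+ *
+ * (In this version that flushing is driven by bch2_journal_reclaim_work();
+ * __journal_res_get() also invokes it directly when it can't satisfy a
+ * reservation.)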
+ */
+
+#include <linux/hash.h>
+
+#include "journal_types.h"
+
+struct bch_fs;
+
+static inline void journal_wake(struct journal *j)
+{
+       wake_up(&j->wait);
+       closure_wake_up(&j->async_wait);
+}
+
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+       return j->buf + j->reservations.idx;
+}
+
+static inline struct journal_buf *journal_prev_buf(struct journal *j)
+{
+       return j->buf + !j->reservations.idx;
+}
+
+/* Sequence number of oldest dirty journal entry */
+
+static inline u64 journal_last_seq(struct journal *j)
+{
+       return j->pin.front;
+}
+
+static inline u64 journal_cur_seq(struct journal *j)
+{
+       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+       return j->pin.back - 1;
+}
+
+u64 bch2_inode_journal_seq(struct journal *, u64);
+
+static inline int journal_state_count(union journal_res_state s, int idx)
+{
+       return idx == 0 ? s.buf0_count : s.buf1_count;
+}
+
+static inline void journal_state_inc(union journal_res_state *s)
+{
+       s->buf0_count += s->idx == 0;
+       s->buf1_count += s->idx == 1;
+}
+
+static inline void bch2_journal_set_has_inode(struct journal *j,
+                                             struct journal_res *res,
+                                             u64 inum)
+{
+       struct journal_buf *buf = &j->buf[res->idx];
+       unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
+
+       /* avoid atomic op if possible */
+       if (unlikely(!test_bit(bit, buf->has_inode)))
+               set_bit(bit, buf->has_inode);
+}
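+
+/*
+ * Note: has_inode is effectively a tiny bloom filter - a hash collision can
+ * make bch2_inode_journal_seq() report a sequence number for an inode that
+ * has nothing in the journal, which merely causes an unnecessary flush.
+ */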
+
+/*
+ * Amount of space that will be taken up by some keys in the journal (i.e.
+ * including the jset header)
+ */
+static inline unsigned jset_u64s(unsigned u64s)
+{
+       return u64s + sizeof(struct jset_entry) / sizeof(u64);
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
+{
+       struct jset *jset = buf->data;
+       struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
+
+       memset(entry, 0, sizeof(*entry));
+       entry->u64s = cpu_to_le16(u64s);
+
+       le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+
+       return entry;
+}
+
+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+                                         unsigned type, enum btree_id id,
+                                         unsigned level,
+                                         const void *data, unsigned u64s)
+{
+       struct journal_buf *buf = &j->buf[res->idx];
+       struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
+       unsigned actual = jset_u64s(u64s);
+
+       EBUG_ON(!res->ref);
+       EBUG_ON(actual > res->u64s);
+
+       res->offset     += actual;
+       res->u64s       -= actual;
+
+       entry->u64s     = cpu_to_le16(u64s);
+       entry->btree_id = id;
+       entry->level    = level;
+       entry->type     = type;
+       entry->pad[0]   = 0;
+       entry->pad[1]   = 0;
+       entry->pad[2]   = 0;
+       memcpy_u64s(entry->_data, data, u64s);
+}
+
+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
+                                       enum btree_id id, const struct bkey_i *k)
+{
+       bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
+                              id, 0, k, k->k.u64s);
+}
+
+void bch2_journal_buf_put_slowpath(struct journal *, bool);
+
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
+                                      bool need_write_just_set)
+{
+       union journal_res_state s;
+
+       s.v = atomic64_sub_return(((union journal_res_state) {
+                                   .buf0_count = idx == 0,
+                                   .buf1_count = idx == 1,
+                                   }).v, &j->reservations.counter);
+
+       EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
+
+       /*
+        * Do not initiate a journal write if the journal is in an error state
+        * (previous journal entry write may have failed)
+        */
+       if (s.idx != idx &&
+           !journal_state_count(s, idx) &&
+           s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
+               bch2_journal_buf_put_slowpath(j, need_write_just_set);
+}
+
+/*
+ * This function releases the journal write structure so other threads can
+ * then proceed to add their keys as well.
+ */
+static inline void bch2_journal_res_put(struct journal *j,
+                                      struct journal_res *res)
+{
+       if (!res->ref)
+               return;
+
+       lock_release(&j->res_map, _RET_IP_);
+
+       while (res->u64s)
+               bch2_journal_add_entry(j, res,
+                                      BCH_JSET_ENTRY_btree_keys,
+                                      0, 0, NULL, 0);
+
+       bch2_journal_buf_put(j, res->idx, false);
+
+       res->ref = 0;
+}
+
+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
+                                unsigned, unsigned);
+
+static inline int journal_res_get_fast(struct journal *j,
+                                      struct journal_res *res,
+                                      unsigned u64s_min,
+                                      unsigned u64s_max)
+{
+       union journal_res_state old, new;
+       u64 v = atomic64_read(&j->reservations.counter);
+
+       do {
+               old.v = new.v = v;
+
+               /*
+                * Check if there is still room in the current journal
+                * entry:
+                */
+               if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+                       return 0;
+
+               res->offset     = old.cur_entry_offset;
+               res->u64s       = min(u64s_max, j->cur_entry_u64s -
+                                     old.cur_entry_offset);
+
+               journal_state_inc(&new);
+               new.cur_entry_offset += res->u64s;
+       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+                                      old.v, new.v)) != old.v);
+
+       res->ref = true;
+       res->idx = new.idx;
+       res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+       return 1;
+}
+
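+/*
+ * Typical usage, as a sketch (error handling elided; @j, @btree_id and @k are
+ * the caller's): get a reservation, add the keys being journalled, then
+ * release the reservation:
+ *
+ *     struct journal_res res = { 0 };
+ *     unsigned u64s = jset_u64s(k->k.u64s);
+ *     int ret;
+ *
+ *     ret = bch2_journal_res_get(j, &res, u64s, u64s);
+ *     if (ret)
+ *             return ret;
+ *
+ *     bch2_journal_add_keys(j, &res, btree_id, k);
+ *     bch2_journal_res_put(j, &res);
+ */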
+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
+                                     unsigned u64s_min, unsigned u64s_max)
+{
+       int ret;
+
+       EBUG_ON(res->ref);
+       EBUG_ON(u64s_max < u64s_min);
+       EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+       if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+               goto out;
+
+       ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+       if (ret)
+               return ret;
+out:
+       lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
+       EBUG_ON(!res->ref);
+       return 0;
+}
+
+u64 bch2_journal_last_unwritten_seq(struct journal *);
+int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
+
+void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
+void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch2_journal_flush_async(struct journal *, struct closure *);
+void bch2_journal_meta_async(struct journal *, struct closure *);
+
+int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush(struct journal *);
+int bch2_journal_meta(struct journal *);
+
+void bch2_journal_halt(struct journal *);
+
+static inline int bch2_journal_error(struct journal *j)
+{
+       return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
+               ? -EIO : 0;
+}
+
+struct bch_dev;
+
+static inline bool journal_flushes_device(struct bch_dev *ca)
+{
+       return true;
+}
+
+int bch2_journal_mark(struct bch_fs *, struct list_head *);
+void bch2_journal_entries_free(struct list_head *);
+int bch2_journal_replay(struct bch_fs *, struct list_head *);
+
+static inline void bch2_journal_set_replay_done(struct journal *j)
+{
+       BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+       set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+}
+
+ssize_t bch2_journal_print_debug(struct journal *, char *);
+ssize_t bch2_journal_print_pins(struct journal *, char *);
+
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+                               unsigned nr);
+int bch2_dev_journal_alloc(struct bch_dev *);
+
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
+void bch2_fs_journal_stop(struct journal *);
+void bch2_fs_journal_start(struct journal *);
+void bch2_dev_journal_exit(struct bch_dev *);
+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
+void bch2_fs_journal_exit(struct journal *);
+int bch2_fs_journal_init(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
new file mode 100644 (file)
index 0000000..320f4f2
--- /dev/null
@@ -0,0 +1,1392 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "replicas.h"
+#include "trace.h"
+
+struct journal_list {
+       struct closure          cl;
+       struct mutex            lock;
+       struct list_head        *head;
+       int                     ret;
+};
+
+#define JOURNAL_ENTRY_ADD_OK           0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ */
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+                            struct journal_list *jlist, struct jset *j)
+{
+       struct journal_replay *i, *pos;
+       struct list_head *where;
+       size_t bytes = vstruct_bytes(j);
+       __le64 last_seq;
+       int ret;
+
+       last_seq = !list_empty(jlist->head)
+               ? list_last_entry(jlist->head, struct journal_replay,
+                                 list)->j.last_seq
+               : 0;
+
+       /* Is this entry older than the range we need? */
+       if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
+               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+               goto out;
+       }
+
+       /* Drop entries we don't need anymore */
+       list_for_each_entry_safe(i, pos, jlist->head, list) {
+               if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+                       break;
+               list_del(&i->list);
+               kvpfree(i, offsetof(struct journal_replay, j) +
+                       vstruct_bytes(&i->j));
+       }
+
+       list_for_each_entry_reverse(i, jlist->head, list) {
+               /* Duplicate? */
+               if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+                       fsck_err_on(bytes != vstruct_bytes(&i->j) ||
+                                   memcmp(j, &i->j, bytes), c,
+                                   "found duplicate but non identical journal entries (seq %llu)",
+                                   le64_to_cpu(j->seq));
+                       goto found;
+               }
+
+               if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+                       where = &i->list;
+                       goto add;
+               }
+       }
+
+       where = jlist->head;
+add:
+       i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+       if (!i) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       list_add(&i->list, where);
+       i->devs.nr = 0;
+       unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
+found:
+       if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
+               bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
+       else
+               fsck_err_on(1, c, "duplicate journal entries on same device");
+       ret = JOURNAL_ENTRY_ADD_OK;
+out:
+fsck_err:
+       return ret;
+}
+
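+/* journal entry nonces are derived from the entry's sequence number: */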
+static struct nonce journal_nonce(const struct jset *jset)
+{
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = ((__le32 *) &jset->seq)[0],
+               [2] = ((__le32 *) &jset->seq)[1],
+               [3] = BCH_NONCE_JOURNAL,
+       }};
+}
+
+/* this fills in a range with empty jset_entries: */
+static void journal_entry_null_range(void *start, void *end)
+{
+       struct jset_entry *entry;
+
+       for (entry = start; entry != end; entry = vstruct_next(entry))
+               memset(entry, 0, sizeof(*entry));
+}
+
+#define JOURNAL_ENTRY_REREAD   5
+#define JOURNAL_ENTRY_NONE     6
+#define JOURNAL_ENTRY_BAD      7
+
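+/*
+ * Validation errors found at read time are fixable fsck errors; hitting one
+ * at write time means we're about to write out corrupt metadata, which is
+ * treated as a filesystem inconsistency:
+ */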
+#define journal_entry_err(c, msg, ...)                                 \
+({                                                                     \
+       switch (write) {                                                \
+       case READ:                                                      \
+               mustfix_fsck_err(c, msg, ##__VA_ARGS__);                \
+               break;                                                  \
+       case WRITE:                                                     \
+               bch_err(c, "corrupt metadata before write:\n"           \
+                       msg, ##__VA_ARGS__);                            \
+               if (bch2_fs_inconsistent(c)) {                          \
+                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       goto fsck_err;                                  \
+               }                                                       \
+               break;                                                  \
+       }                                                               \
+       true;                                                           \
+})
+
+#define journal_entry_err_on(cond, c, msg, ...)                                \
+       ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+
+static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+                               struct jset_entry *entry,
+                               struct bkey_i *k, enum bkey_type key_type,
+                               const char *type, int write)
+{
+       void *next = vstruct_next(entry);
+       const char *invalid;
+       char buf[160];
+       int ret = 0;
+
+       if (journal_entry_err_on(!k->k.u64s, c,
+                       "invalid %s in journal: k->u64s 0", type)) {
+               entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+               journal_entry_null_range(vstruct_next(entry), next);
+               return 0;
+       }
+
+       if (journal_entry_err_on((void *) bkey_next(k) >
+                               (void *) vstruct_next(entry), c,
+                       "invalid %s in journal: extends past end of journal entry",
+                       type)) {
+               entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+               journal_entry_null_range(vstruct_next(entry), next);
+               return 0;
+       }
+
+       if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
+                       "invalid %s in journal: bad format %u",
+                       type, k->k.format)) {
+               le16_add_cpu(&entry->u64s, -k->k.u64s);
+               memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+               journal_entry_null_range(vstruct_next(entry), next);
+               return 0;
+       }
+
+       if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
+               bch2_bkey_swab(key_type, NULL, bkey_to_packed(k));
+
+       invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k));
+       if (invalid) {
+               bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
+                                    bkey_i_to_s_c(k));
+               mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
+                                type, invalid, buf);
+
+               le16_add_cpu(&entry->u64s, -k->k.u64s);
+               memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+               journal_entry_null_range(vstruct_next(entry), next);
+               return 0;
+       }
+fsck_err:
+       return ret;
+}
+
+static int journal_entry_validate_btree_keys(struct bch_fs *c,
+                                            struct jset *jset,
+                                            struct jset_entry *entry,
+                                            int write)
+{
+       struct bkey_i *k;
+
+       vstruct_for_each(entry, k) {
+               int ret = journal_validate_key(c, jset, entry, k,
+                               bkey_type(entry->level,
+                                         entry->btree_id),
+                               "key", write);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int journal_entry_validate_btree_root(struct bch_fs *c,
+                                            struct jset *jset,
+                                            struct jset_entry *entry,
+                                            int write)
+{
+       struct bkey_i *k = entry->start;
+       int ret = 0;
+
+       if (journal_entry_err_on(!entry->u64s ||
+                                le16_to_cpu(entry->u64s) != k->k.u64s, c,
+                                "invalid btree root journal entry: wrong number of keys")) {
+               void *next = vstruct_next(entry);
+               /*
+                * we don't want to null out this jset_entry,
+                * just the contents, so that later we can tell
+                * we were _supposed_ to have a btree root
+                */
+               entry->u64s = 0;
+               journal_entry_null_range(vstruct_next(entry), next);
+               return 0;
+       }
+
+       return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE,
+                                   "btree root", write);
+fsck_err:
+       return ret;
+}
+
+static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
+                                           struct jset *jset,
+                                           struct jset_entry *entry,
+                                           int write)
+{
+       /* obsolete, don't care: */
+       return 0;
+}
+
+static int journal_entry_validate_blacklist(struct bch_fs *c,
+                                           struct jset *jset,
+                                           struct jset_entry *entry,
+                                           int write)
+{
+       int ret = 0;
+
+       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
+               "invalid journal seq blacklist entry: bad size")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+       }
+fsck_err:
+       return ret;
+}
+
+static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
+                                              struct jset *jset,
+                                              struct jset_entry *entry,
+                                              int write)
+{
+       struct jset_entry_blacklist_v2 *bl_entry;
+       int ret = 0;
+
+       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
+               "invalid journal seq blacklist entry: bad size")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+       }
+
+       bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+       if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
+                                le64_to_cpu(bl_entry->end), c,
+               "invalid journal seq blacklist entry: start > end")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+       }
+
+fsck_err:
+       return ret;
+}
+
+struct jset_entry_ops {
+       int (*validate)(struct bch_fs *, struct jset *,
+                       struct jset_entry *, int);
+};
+
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {
+#define x(f, nr)                                               \
+       [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
+               .validate       = journal_entry_validate_##f,   \
+       },
+       BCH_JSET_ENTRY_TYPES()
+#undef x
+};
+
+static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
+                                 struct jset_entry *entry, int write)
+{
+       int ret = 0;
+
+       if (entry->type >= BCH_JSET_ENTRY_NR) {
+               journal_entry_err(c, "invalid journal entry type %u",
+                                 entry->type);
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return 0;
+       }
+
+       ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write);
+fsck_err:
+       return ret;
+}
+
+static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
+                                int write)
+{
+       struct jset_entry *entry;
+       int ret = 0;
+
+       vstruct_for_each(jset, entry) {
+               if (journal_entry_err_on(vstruct_next(entry) >
+                                        vstruct_last(jset), c,
+                               "journal entry extends past end of jset")) {
+                       jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
+                       break;
+               }
+
+               ret = journal_entry_validate(c, jset, entry, write);
+               if (ret)
+                       break;
+       }
+fsck_err:
+       return ret;
+}
+
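+/*
+ * Validate a single jset read from disk: check magic, version and size, then
+ * verify the checksum, decrypt the payload and sanity check last_seq:
+ */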
+static int jset_validate(struct bch_fs *c,
+                        struct jset *jset, u64 sector,
+                        unsigned bucket_sectors_left,
+                        unsigned sectors_read,
+                        int write)
+{
+       size_t bytes = vstruct_bytes(jset);
+       struct bch_csum csum;
+       int ret = 0;
+
+       if (le64_to_cpu(jset->magic) != jset_magic(c))
+               return JOURNAL_ENTRY_NONE;
+
+       if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) {
+               bch_err(c, "unknown journal entry version %u",
+                       le32_to_cpu(jset->version));
+               return BCH_FSCK_UNKNOWN_VERSION;
+       }
+
+       if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
+                                "journal entry too big (%zu bytes), sector %llu",
+                                bytes, sector)) {
+               /* XXX: note we might have missing journal entries */
+               return JOURNAL_ENTRY_BAD;
+       }
+
+       if (bytes > sectors_read << 9)
+               return JOURNAL_ENTRY_REREAD;
+
+       if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+                       "journal entry with unknown csum type %llu sector %llu",
+                       JSET_CSUM_TYPE(jset), sector))
+               return JOURNAL_ENTRY_BAD;
+
+       csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
+       if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
+                                "journal checksum bad, sector %llu", sector)) {
+               /* XXX: retry IO, when we start retrying checksum errors */
+               /* XXX: note we might have missing journal entries */
+               return JOURNAL_ENTRY_BAD;
+       }
+
+       bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+                    jset->encrypted_start,
+                    vstruct_end(jset) - (void *) jset->encrypted_start);
+
+       if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+                                "invalid journal entry: last_seq > seq"))
+               jset->last_seq = jset->seq;
+
+       return 0;
+fsck_err:
+       return ret;
+}
+
+struct journal_read_buf {
+       void            *data;
+       size_t          size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+                                   size_t new_size)
+{
+       void *n;
+
+       /* the bios are sized for this many pages, max: */
+       if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+               return -ENOMEM;
+
+       new_size = roundup_pow_of_two(new_size);
+       n = kvpmalloc(new_size, GFP_KERNEL);
+       if (!n)
+               return -ENOMEM;
+
+       kvpfree(b->data, b->size);
+       b->data = n;
+       b->size = new_size;
+       return 0;
+}
+
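+/*
+ * Read all the journal entries in a single journal bucket, rereading with a
+ * larger buffer if an entry turns out to be bigger than what we read so far:
+ */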
+static int journal_read_bucket(struct bch_dev *ca,
+                              struct journal_read_buf *buf,
+                              struct journal_list *jlist,
+                              unsigned bucket, u64 *seq, bool *entries_found)
+{
+       struct bch_fs *c = ca->fs;
+       struct journal_device *ja = &ca->journal;
+       struct bio *bio = ja->bio;
+       struct jset *j = NULL;
+       unsigned sectors, sectors_read = 0;
+       u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+           end = offset + ca->mi.bucket_size;
+       bool saw_bad = false;
+       int ret = 0;
+
+       pr_debug("reading %u", bucket);
+
+       while (offset < end) {
+               if (!sectors_read) {
+reread:                        sectors_read = min_t(unsigned,
+                               end - offset, buf->size >> 9);
+
+                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+                       bio->bi_iter.bi_sector  = offset;
+                       bio->bi_iter.bi_size    = sectors_read << 9;
+                       bch2_bio_map(bio, buf->data);
+
+                       ret = submit_bio_wait(bio);
+
+                       if (bch2_dev_io_err_on(ret, ca,
+                                              "journal read from sector %llu",
+                                              offset) ||
+                           bch2_meta_read_fault("journal"))
+                               return -EIO;
+
+                       j = buf->data;
+               }
+
+               ret = jset_validate(c, j, offset,
+                                   end - offset, sectors_read,
+                                   READ);
+               switch (ret) {
+               case BCH_FSCK_OK:
+                       break;
+               case JOURNAL_ENTRY_REREAD:
+                       if (vstruct_bytes(j) > buf->size) {
+                               ret = journal_read_buf_realloc(buf,
+                                                       vstruct_bytes(j));
+                               if (ret)
+                                       return ret;
+                       }
+                       goto reread;
+               case JOURNAL_ENTRY_NONE:
+                       if (!saw_bad)
+                               return 0;
+                       sectors = c->opts.block_size;
+                       goto next_block;
+               case JOURNAL_ENTRY_BAD:
+                       saw_bad = true;
+                       sectors = c->opts.block_size;
+                       goto next_block;
+               default:
+                       return ret;
+               }
+
+               /*
+                * This happens sometimes if we don't have discards on -
+                * when we've partially overwritten a bucket with new
+                * journal entries. We don't need the rest of the
+                * bucket:
+                */
+               if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+                       return 0;
+
+               ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+               mutex_lock(&jlist->lock);
+               ret = journal_entry_add(c, ca, jlist, j);
+               mutex_unlock(&jlist->lock);
+
+               switch (ret) {
+               case JOURNAL_ENTRY_ADD_OK:
+                       *entries_found = true;
+                       break;
+               case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+                       break;
+               default:
+                       return ret;
+               }
+
+               if (le64_to_cpu(j->seq) > *seq)
+                       *seq = le64_to_cpu(j->seq);
+
+               sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+               pr_debug("next");
+               offset          += sectors;
+               sectors_read    -= sectors;
+               j = ((void *) j) + (sectors << 9);
+       }
+
+       return 0;
+}
+
+static void bch2_journal_read_device(struct closure *cl)
+{
+#define read_bucket(b)                                                 \
+       ({                                                              \
+               bool entries_found = false;                             \
+               ret = journal_read_bucket(ca, &buf, jlist, b, &seq,     \
+                                         &entries_found);              \
+               if (ret)                                                \
+                       goto err;                                       \
+               __set_bit(b, bitmap);                                   \
+               entries_found;                                          \
+        })
+
+       struct journal_device *ja =
+               container_of(cl, struct journal_device, read);
+       struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+       struct journal_list *jlist =
+               container_of(cl->parent, struct journal_list, cl);
+       struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+       struct journal_read_buf buf = { NULL, 0 };
+       unsigned long *bitmap;
+       unsigned i, l, r;
+       u64 seq = 0;
+       int ret;
+
+       if (!ja->nr)
+               goto out;
+
+       bitmap = kcalloc(BITS_TO_LONGS(ja->nr), sizeof(unsigned long),
+                        GFP_KERNEL);
+       if (!bitmap) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+       if (ret)
+               goto err;
+
+       pr_debug("%u journal buckets", ja->nr);
+
+       /*
+        * The fibonacci hash/binary search below is currently bypassed: when
+        * discard is in use, the live journal entries might not form a
+        * contiguous range, so just read every bucket linearly:
+        */
+       for (i = 0; i < ja->nr; i++)
+               read_bucket(i);
+       goto search_done;
+
+       if (!blk_queue_nonrot(q))
+               goto linear_scan;
+
+       /*
+        * Read journal buckets ordered by golden ratio hash to quickly
+        * find a sequence of buckets with valid journal entries
+        */
+       for (i = 0; i < ja->nr; i++) {
+               l = (i * 2654435769U) % ja->nr;
+
+               if (test_bit(l, bitmap))
+                       break;
+
+               if (read_bucket(l))
+                       goto bsearch;
+       }
+
+       /*
+        * If that fails, check all the buckets we haven't checked
+        * already
+        */
+       pr_debug("falling back to linear search");
+linear_scan:
+       for (l = find_first_zero_bit(bitmap, ja->nr);
+            l < ja->nr;
+            l = find_next_zero_bit(bitmap, ja->nr, l + 1))
+               if (read_bucket(l))
+                       goto bsearch;
+
+       /* no journal entries on this device? */
+       if (l == ja->nr)
+               goto out;
+bsearch:
+       /* Binary search */
+       r = find_next_bit(bitmap, ja->nr, l + 1);
+       pr_debug("starting binary search, l %u r %u", l, r);
+
+       while (l + 1 < r) {
+               unsigned m = (l + r) >> 1;
+               u64 cur_seq = seq;
+
+               read_bucket(m);
+
+               if (cur_seq != seq)
+                       l = m;
+               else
+                       r = m;
+       }
+
+search_done:
+       /*
+        * Find the journal bucket with the highest sequence number:
+        *
+        * If there are duplicate journal entries in multiple buckets (which
+        * definitely isn't supposed to happen, but...) - make sure to start
+        * cur_idx at the last of those buckets, so we don't deadlock trying to
+        * allocate
+        */
+       seq = 0;
+
+       for (i = 0; i < ja->nr; i++)
+               if (ja->bucket_seq[i] >= seq &&
+                   ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
+                       /*
+                        * When journal_next_bucket() goes to allocate for
+                        * the first time, it'll use the bucket after
+                        * ja->cur_idx
+                        */
+                       ja->cur_idx = i;
+                       seq = ja->bucket_seq[i];
+               }
+
+       /*
+        * Set last_idx to indicate the entire journal is full and needs to be
+        * reclaimed - journal reclaim will immediately reclaim whatever isn't
+        * pinned when it first runs:
+        */
+       ja->last_idx = (ja->cur_idx + 1) % ja->nr;
+
+       /*
+        * Read buckets in reverse order until we stop finding more journal
+        * entries:
+        */
+       for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
+            i != ja->cur_idx;
+            i = (i + ja->nr - 1) % ja->nr)
+               if (!test_bit(i, bitmap) &&
+                   !read_bucket(i))
+                       break;
+out:
+       kvpfree(buf.data, buf.size);
+       kfree(bitmap);
+       percpu_ref_put(&ca->io_ref);
+       closure_return(cl);
+       return;
+err:
+       mutex_lock(&jlist->lock);
+       jlist->ret = ret;
+       mutex_unlock(&jlist->lock);
+       goto out;
+#undef read_bucket
+}
+
+void bch2_journal_entries_free(struct list_head *list)
+{
+       while (!list_empty(list)) {
+               struct journal_replay *i =
+                       list_first_entry(list, struct journal_replay, list);
+               list_del(&i->list);
+               kvpfree(i, offsetof(struct journal_replay, j) +
+                       vstruct_bytes(&i->j));
+       }
+}
+
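+/*
+ * Size the journal pin fifo for the range of sequence numbers we're about to
+ * replay, and initialize the pin list for each open entry:
+ */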
+int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
+{
+       struct journal *j = &c->journal;
+       struct journal_entry_pin_list *p;
+       u64 seq, nr = end_seq - last_seq + 1;
+
+       if (nr > j->pin.size) {
+               free_fifo(&j->pin);
+               init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
+               if (!j->pin.data) {
+                       bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+                       return -ENOMEM;
+               }
+       }
+
+       atomic64_set(&j->seq, end_seq);
+       j->last_seq_ondisk = last_seq;
+
+       j->pin.front    = last_seq;
+       j->pin.back     = end_seq + 1;
+
+       fifo_for_each_entry_ptr(p, &j->pin, seq) {
+               INIT_LIST_HEAD(&p->list);
+               INIT_LIST_HEAD(&p->flushed);
+               atomic_set(&p->count, 0);
+               p->devs.nr = 0;
+       }
+
+       return 0;
+}
+
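+/*
+ * Read the journal from every member device, validate what we found, check
+ * that the replicas info is consistent with where the entries were found, and
+ * set up the pin fifo and blacklist state for replay:
+ */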
+int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+{
+       struct journal *j = &c->journal;
+       struct journal_list jlist;
+       struct journal_replay *i;
+       struct journal_entry_pin_list *p;
+       struct bch_dev *ca;
+       u64 cur_seq, end_seq;
+       unsigned iter;
+       size_t keys = 0, entries = 0;
+       bool degraded = false;
+       int ret = 0;
+
+       closure_init_stack(&jlist.cl);
+       mutex_init(&jlist.lock);
+       jlist.head = list;
+       jlist.ret = 0;
+
+       for_each_member_device(ca, c, iter) {
+               if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
+                       continue;
+
+               if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
+                    ca->mi.state == BCH_MEMBER_STATE_RO) &&
+                   percpu_ref_tryget(&ca->io_ref))
+                       closure_call(&ca->journal.read,
+                                    bch2_journal_read_device,
+                                    system_unbound_wq,
+                                    &jlist.cl);
+               else
+                       degraded = true;
+       }
+
+       closure_sync(&jlist.cl);
+
+       if (jlist.ret)
+               return jlist.ret;
+
+       if (list_empty(list)) {
+               bch_err(c, "no journal entries found");
+               return BCH_FSCK_REPAIR_IMPOSSIBLE;
+       }
+
+       list_for_each_entry(i, list, list) {
+               ret = jset_validate_entries(c, &i->j, READ);
+               if (ret)
+                       goto fsck_err;
+
+               /*
+                * If we're mounting in degraded mode - if we didn't read all
+                * the devices - then this entry's device list may be
+                * incomplete, so don't use it to update the replicas info:
+                */
+
+               if (!degraded &&
+                   (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+                    fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
+                                                      i->devs), c,
+                                "superblock not marked as containing replicas (type %u)",
+                                BCH_DATA_JOURNAL))) {
+                       ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       i = list_last_entry(list, struct journal_replay, list);
+
+       ret = bch2_journal_set_seq(c,
+                                  le64_to_cpu(i->j.last_seq),
+                                  le64_to_cpu(i->j.seq));
+       if (ret)
+               return ret;
+
+       mutex_lock(&j->blacklist_lock);
+
+       list_for_each_entry(i, list, list) {
+               p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
+
+               atomic_set(&p->count, 1);
+               p->devs = i->devs;
+
+               if (bch2_journal_seq_blacklist_read(j, i)) {
+                       mutex_unlock(&j->blacklist_lock);
+                       return -ENOMEM;
+               }
+       }
+
+       mutex_unlock(&j->blacklist_lock);
+
+       cur_seq = journal_last_seq(j);
+       end_seq = le64_to_cpu(list_last_entry(list,
+                               struct journal_replay, list)->j.seq);
+
+       list_for_each_entry(i, list, list) {
+               struct jset_entry *entry;
+               struct bkey_i *k, *_n;
+               bool blacklisted;
+
+               mutex_lock(&j->blacklist_lock);
+               while (cur_seq < le64_to_cpu(i->j.seq) &&
+                      bch2_journal_seq_blacklist_find(j, cur_seq))
+                       cur_seq++;
+
+               blacklisted = bch2_journal_seq_blacklist_find(j,
+                                                        le64_to_cpu(i->j.seq));
+               mutex_unlock(&j->blacklist_lock);
+
+               fsck_err_on(blacklisted, c,
+                           "found blacklisted journal entry %llu",
+                           le64_to_cpu(i->j.seq));
+
+               fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
+                       "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                       cur_seq, le64_to_cpu(i->j.seq) - 1,
+                       journal_last_seq(j), end_seq);
+
+               cur_seq = le64_to_cpu(i->j.seq) + 1;
+
+               for_each_jset_key(k, _n, entry, &i->j)
+                       keys++;
+               entries++;
+       }
+
+       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+                keys, entries, journal_cur_seq(j));
+fsck_err:
+       return ret;
+}
+
+/* journal replay: */
+
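+/*
+ * Mark the keys we're about to replay so that initial GC accounts for the
+ * buckets they point to before the keys themselves are reinserted:
+ */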
+int bch2_journal_mark(struct bch_fs *c, struct list_head *list)
+{
+       struct bkey_i *k, *n;
+       struct jset_entry *j;
+       struct journal_replay *r;
+       int ret;
+
+       list_for_each_entry(r, list, list)
+               for_each_jset_key(k, n, j, &r->j) {
+                       enum bkey_type type = bkey_type(j->level, j->btree_id);
+                       struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
+
+                       if (btree_type_has_ptrs(type)) {
+                               ret = bch2_btree_mark_key_initial(c, type, k_s_c);
+                               if (ret)
+                                       return ret;
+                       }
+               }
+
+       return 0;
+}
+
+int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
+{
+       struct journal *j = &c->journal;
+       struct journal_entry_pin_list *pin_list;
+       struct bkey_i *k, *_n;
+       struct jset_entry *entry;
+       struct journal_replay *i, *n;
+       int ret = 0;
+
+       list_for_each_entry_safe(i, n, list, list) {
+
+               j->replay_journal_seq = le64_to_cpu(i->j.seq);
+
+               for_each_jset_key(k, _n, entry, &i->j) {
+
+                       if (entry->btree_id == BTREE_ID_ALLOC) {
+                               /*
+                                * allocation code handles replay for
+                                * BTREE_ID_ALLOC keys:
+                                */
+                               ret = bch2_alloc_replay_key(c, k->k.p);
+                       } else {
+                               /*
+                                * We might cause compressed extents to be
+                                * split, so we need to pass in a
+                                * disk_reservation:
+                                */
+                               struct disk_reservation disk_res =
+                                       bch2_disk_reservation_init(c, 0);
+
+                               ret = bch2_btree_insert(c, entry->btree_id, k,
+                                                       &disk_res, NULL, NULL,
+                                                       BTREE_INSERT_NOFAIL|
+                                                       BTREE_INSERT_JOURNAL_REPLAY);
+                       }
+
+                       if (ret) {
+                               bch_err(c, "journal replay: error %d while replaying key",
+                                       ret);
+                               goto err;
+                       }
+
+                       cond_resched();
+               }
+
+               pin_list = journal_seq_pin(j, j->replay_journal_seq);
+
+               if (atomic_dec_and_test(&pin_list->count))
+                       journal_wake(j);
+       }
+
+       j->replay_journal_seq = 0;
+
+       bch2_journal_set_replay_done(j);
+       bch2_journal_flush_all_pins(j);
+       ret = bch2_journal_error(j);
+err:
+       bch2_journal_entries_free(list);
+       return ret;
+}
+
+/* journal write: */
+
+static void bch2_journal_add_btree_root(struct journal_buf *buf,
+                                      enum btree_id id, struct bkey_i *k,
+                                      unsigned level)
+{
+       struct jset_entry *entry;
+
+       entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s);
+       entry->type     = BCH_JSET_ENTRY_btree_root;
+       entry->btree_id = id;
+       entry->level    = level;
+       memcpy_u64s(entry->_data, k, k->k.u64s);
+}
+
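+/*
+ * Number of buckets on @ca still available for journal writes - the gap
+ * between the current write position (cur_idx) and the reclaim position
+ * (last_idx), minus buckets reserved to avoid deadlocking:
+ */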
+static unsigned journal_dev_buckets_available(struct journal *j,
+                                             struct bch_dev *ca)
+{
+       struct journal_device *ja = &ca->journal;
+       unsigned next = (ja->cur_idx + 1) % ja->nr;
+       unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
+
+       /*
+        * Hack to avoid a deadlock during journal replay:
+        * journal replay might require setting a new btree
+        * root, which requires writing another journal entry -
+        * thus, if the journal is full (and this happens when
+        * replaying the first journal bucket's entries) we're
+        * screwed.
+        *
+        * So don't let the journal fill up unless we're in
+        * replay:
+        */
+       if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
+               available = max((int) available - 2, 0);
+
+       /*
+        * Don't use the last bucket unless writing the new last_seq
+        * will make another bucket available:
+        */
+       if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
+               available = max((int) available - 1, 0);
+
+       return available;
+}
+
+/* returns number of sectors available for next journal entry: */
+int bch2_journal_entry_sectors(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+       unsigned sectors_available = UINT_MAX;
+       unsigned i, nr_online = 0, nr_devs = 0;
+
+       lockdep_assert_held(&j->lock);
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i,
+                                  &c->rw_devs[BCH_DATA_JOURNAL]) {
+               struct journal_device *ja = &ca->journal;
+               unsigned buckets_required = 0;
+
+               if (!ja->nr)
+                       continue;
+
+               sectors_available = min_t(unsigned, sectors_available,
+                                         ca->mi.bucket_size);
+
+               /*
+                * Note that we don't allocate the space for a journal entry
+                * until we write it out - thus, if we haven't started the write
+                * for the previous entry we have to make sure we have space for
+                * it too:
+                */
+               if (bch2_extent_has_device(e.c, ca->dev_idx)) {
+                       if (j->prev_buf_sectors > ja->sectors_free)
+                               buckets_required++;
+
+                       if (j->prev_buf_sectors + sectors_available >
+                           ja->sectors_free)
+                               buckets_required++;
+               } else {
+                       if (j->prev_buf_sectors + sectors_available >
+                           ca->mi.bucket_size)
+                               buckets_required++;
+
+                       buckets_required++;
+               }
+
+               if (journal_dev_buckets_available(j, ca) >= buckets_required)
+                       nr_devs++;
+               nr_online++;
+       }
+       rcu_read_unlock();
+
+       if (nr_online < c->opts.metadata_replicas_required)
+               return -EROFS;
+
+       if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
+               return 0;
+
+       return sectors_available;
+}
+
+/**
+ * journal_write_alloc - pick the devices, and the journal bucket on each
+ * device, that the next journal entry will be written to
+ */
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
+                              unsigned sectors)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bkey_s_extent e;
+       struct bch_extent_ptr *ptr;
+       struct journal_device *ja;
+       struct bch_dev *ca;
+       struct dev_alloc_list devs_sorted;
+       unsigned i, replicas, replicas_want =
+               READ_ONCE(c->opts.metadata_replicas);
+
+       spin_lock(&j->lock);
+       e = bkey_i_to_s_extent(&j->key);
+
+       /*
+        * Drop any pointers to devices that have been removed, are no longer
+        * empty, or filled up their current journal bucket:
+        *
+        * Note that a device may have had a small amount of free space (perhaps
+        * one sector) that wasn't enough for the smallest possible journal
+        * entry - that's why we drop pointers to devices <= current free space,
+        * i.e. whichever device was limiting the current journal entry size.
+        */
+       extent_for_each_ptr_backwards(e, ptr) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (ca->mi.state != BCH_MEMBER_STATE_RW ||
+                   ca->journal.sectors_free <= sectors)
+                       __bch2_extent_drop_ptr(e, ptr);
+               else
+                       ca->journal.sectors_free -= sectors;
+       }
+
+       replicas = bch2_extent_nr_ptrs(e.c);
+
+       rcu_read_lock();
+       devs_sorted = bch2_wp_alloc_list(c, &j->wp,
+                                        &c->rw_devs[BCH_DATA_JOURNAL]);
+
+       for (i = 0; i < devs_sorted.nr; i++) {
+               ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+               if (!ca)
+                       continue;
+
+               if (!ca->mi.durability)
+                       continue;
+
+               ja = &ca->journal;
+               if (!ja->nr)
+                       continue;
+
+               if (replicas >= replicas_want)
+                       break;
+
+               /*
+                * Check that we can use this device, and aren't already using
+                * it:
+                */
+               if (bch2_extent_has_device(e.c, ca->dev_idx) ||
+                   !journal_dev_buckets_available(j, ca) ||
+                   sectors > ca->mi.bucket_size)
+                       continue;
+
+               j->wp.next_alloc[ca->dev_idx] += U32_MAX;
+               bch2_wp_rescale(c, ca, &j->wp);
+
+               ja->sectors_free = ca->mi.bucket_size - sectors;
+               ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+               ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+
+               extent_ptr_append(bkey_i_to_extent(&j->key),
+                       (struct bch_extent_ptr) {
+                                 .offset = bucket_to_sector(ca,
+                                       ja->buckets[ja->cur_idx]),
+                                 .dev = ca->dev_idx,
+               });
+
+               replicas += ca->mi.durability;
+       }
+       rcu_read_unlock();
+
+       j->prev_buf_sectors = 0;
+
+       bkey_copy(&w->key, &j->key);
+       spin_unlock(&j->lock);
+
+       if (replicas < c->opts.metadata_replicas_required)
+               return -EROFS;
+
+       BUG_ON(!replicas);
+
+       return 0;
+}
+
+static void journal_write_compact(struct jset *jset)
+{
+       struct jset_entry *i, *next, *prev = NULL;
+
+       /*
+        * Simple compaction, dropping empty jset_entries (from journal
+        * reservations that weren't fully used) and merging jset_entries that
+        * can be.
+        *
+        * If we wanted to be really fancy here, we could sort all the keys in
+        * the jset and drop keys that were overwritten - probably not worth it:
+        */
+       vstruct_for_each_safe(jset, i, next) {
+               unsigned u64s = le16_to_cpu(i->u64s);
+
+               /* Empty entry: */
+               if (!u64s)
+                       continue;
+
+               /* Can we merge with previous entry? */
+               if (prev &&
+                   i->btree_id == prev->btree_id &&
+                   i->level    == prev->level &&
+                   i->type     == prev->type &&
+                   i->type     == BCH_JSET_ENTRY_btree_keys &&
+                   le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+                       memmove_u64s_down(vstruct_next(prev),
+                                         i->_data,
+                                         u64s);
+                       le16_add_cpu(&prev->u64s, u64s);
+                       continue;
+               }
+
+               /* Couldn't merge, move i into new position (after prev): */
+               prev = prev ? vstruct_next(prev) : jset->start;
+               if (i != prev)
+                       memmove_u64s_down(prev, i, jset_u64s(u64s));
+       }
+
+       prev = prev ? vstruct_next(prev) : jset->start;
+       jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
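+/*
+ * If a larger journal buffer has been requested (buf_size_want), reallocate
+ * and copy before starting the write:
+ */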
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+       /* we aren't holding j->lock: */
+       unsigned new_size = READ_ONCE(j->buf_size_want);
+       void *new_buf;
+
+       if (buf->size >= new_size)
+               return;
+
+       new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+       if (!new_buf)
+               return;
+
+       memcpy(new_buf, buf->data, buf->size);
+       kvpfree(buf->data, buf->size);
+       buf->data       = new_buf;
+       buf->size       = new_size;
+}
+
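+/*
+ * Journal write completion: mark the devices the entry was written to in the
+ * replicas info, update last_seq_ondisk, release the previous buffer and
+ * schedule reclaim:
+ */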
+static void journal_write_done(struct closure *cl)
+{
+       struct journal *j = container_of(cl, struct journal, io);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_buf *w = journal_prev_buf(j);
+       struct bch_devs_list devs =
+               bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
+       u64 seq = le64_to_cpu(w->data->seq);
+
+       if (!devs.nr) {
+               bch_err(c, "unable to write journal to sufficient devices");
+               goto err;
+       }
+
+       if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+               goto err;
+out:
+       bch2_time_stats_update(j->write_time, j->write_start_time);
+
+       spin_lock(&j->lock);
+       j->last_seq_ondisk = seq;
+       if (seq >= j->pin.front)
+               journal_seq_pin(j, seq)->devs = devs;
+
+       /*
+        * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+        * more buckets:
+        *
+        * Must come before signaling write completion, for
+        * bch2_fs_journal_stop():
+        */
+       mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+
+       /* also must come before signalling write completion: */
+       closure_debug_destroy(cl);
+
+       BUG_ON(!j->reservations.prev_buf_unwritten);
+       atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
+                    &j->reservations.counter);
+
+       closure_wake_up(&w->wait);
+       journal_wake(j);
+
+       if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
+               mod_delayed_work(system_freezable_wq, &j->write_work, 0);
+       spin_unlock(&j->lock);
+       return;
+err:
+       bch2_fatal_error(c);
+       bch2_journal_halt(j);
+       goto out;
+}
+
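+/*
+ * Per-device write completion: on error, drop the device from the entry's
+ * key so journal_write_done() doesn't count it as a valid replica:
+ */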
+static void journal_write_endio(struct bio *bio)
+{
+       struct bch_dev *ca = bio->bi_private;
+       struct journal *j = &ca->fs->journal;
+
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
+           bch2_meta_write_fault("journal")) {
+               struct journal_buf *w = journal_prev_buf(j);
+               unsigned long flags;
+
+               spin_lock_irqsave(&j->err_lock, flags);
+               bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx);
+               spin_unlock_irqrestore(&j->err_lock, flags);
+       }
+
+       closure_put(&j->io);
+       percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+       struct journal *j = container_of(cl, struct journal, io);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       struct journal_buf *w = journal_prev_buf(j);
+       struct jset *jset;
+       struct bio *bio;
+       struct bch_extent_ptr *ptr;
+       unsigned i, sectors, bytes;
+
+       journal_buf_realloc(j, w);
+       jset = w->data;
+
+       j->write_start_time = local_clock();
+       mutex_lock(&c->btree_root_lock);
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               struct btree_root *r = &c->btree_roots[i];
+
+               if (r->alive)
+                       bch2_journal_add_btree_root(w, i, &r->key, r->level);
+       }
+       c->btree_roots_dirty = false;
+       mutex_unlock(&c->btree_root_lock);
+
+       journal_write_compact(jset);
+
+       jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
+       jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
+       jset->magic             = cpu_to_le64(jset_magic(c));
+       jset->version           = cpu_to_le32(BCACHE_JSET_VERSION);
+
+       SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+       SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+
+       if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+           jset_validate_entries(c, jset, WRITE))
+               goto err;
+
+       bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+                   jset->encrypted_start,
+                   vstruct_end(jset) - (void *) jset->encrypted_start);
+
+       jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+                                 journal_nonce(jset), jset);
+
+       if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+           jset_validate_entries(c, jset, WRITE))
+               goto err;
+
+       sectors = vstruct_sectors(jset, c->block_bits);
+       BUG_ON(sectors > j->prev_buf_sectors);
+
+       bytes = vstruct_bytes(w->data);
+       memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+
+       if (journal_write_alloc(j, w, sectors)) {
+               bch2_journal_halt(j);
+               bch_err(c, "Unable to allocate journal write");
+               bch2_fatal_error(c);
+               continue_at(cl, journal_write_done, system_highpri_wq);
+               return;
+       }
+
+       /*
+        * XXX: we really should just disable the entire journal in nochanges
+        * mode
+        */
+       if (c->opts.nochanges)
+               goto no_io;
+
+       extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+               if (!percpu_ref_tryget(&ca->io_ref)) {
+                       /* XXX: fix this */
+                       bch_err(c, "missing device for journal write\n");
+                       continue;
+               }
+
+               this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
+                            sectors);
+
+               bio = ca->journal.bio;
+               bio_reset(bio, ca->disk_sb.bdev,
+                         REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+               bio->bi_iter.bi_sector  = ptr->offset;
+               bio->bi_iter.bi_size    = sectors << 9;
+               bio->bi_end_io          = journal_write_endio;
+               bio->bi_private         = ca;
+               bch2_bio_map(bio, jset);
+
+               trace_journal_write(bio);
+               closure_bio_submit(bio, cl);
+
+               ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+       }
+
+       for_each_rw_member(ca, c, i)
+               if (journal_flushes_device(ca) &&
+                   !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
+                       percpu_ref_get(&ca->io_ref);
+
+                       bio = ca->journal.bio;
+                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                       bio->bi_end_io          = journal_write_endio;
+                       bio->bi_private         = ca;
+                       closure_bio_submit(bio, cl);
+               }
+
+no_io:
+       extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
+               ptr->offset += sectors;
+
+       continue_at(cl, journal_write_done, system_highpri_wq);
+       return;
+err:
+       bch2_inconsistent_error(c);
+       continue_at(cl, journal_write_done, system_highpri_wq);
+}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
new file mode 100644 (file)
index 0000000..35f90c9
--- /dev/null
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_IO_H
+#define _BCACHEFS_JOURNAL_IO_H
+
+/*
+ * Only used for holding the journal entries we read in bch2_journal_read()
+ * during filesystem startup:
+ */
+struct journal_replay {
+       struct list_head        list;
+       struct bch_devs_list    devs;
+       /* must be last: */
+       struct jset             j;
+};
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+                                       struct jset_entry *entry, unsigned type)
+{
+       while (entry < vstruct_last(jset)) {
+               if (entry->type == type)
+                       return entry;
+
+               entry = vstruct_next(entry);
+       }
+
+       return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type)                    \
+       for (entry = (jset)->start;                                     \
+            (entry = __jset_entry_type_next(jset, entry, type));       \
+            entry = vstruct_next(entry))
+
+#define for_each_jset_key(k, _n, entry, jset)                          \
+       for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
+               vstruct_for_each_safe(entry, k, _n)
+
+int bch2_journal_set_seq(struct bch_fs *, u64, u64);
+int bch2_journal_read(struct bch_fs *, struct list_head *);
+
+int bch2_journal_entry_sectors(struct journal *);
+void bch2_journal_write(struct closure *);
+
+#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
new file mode 100644 (file)
index 0000000..e5b8666
--- /dev/null
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "super.h"
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, holding it open to ensure it gets replayed during recovery:
+ */
+
+static inline u64 journal_pin_seq(struct journal *j,
+                                 struct journal_entry_pin_list *pin_list)
+{
+       return fifo_entry_idx_abs(&j->pin, pin_list);
+}
+
+u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
+{
+       u64 ret = 0;
+
+       spin_lock(&j->lock);
+       if (journal_pin_active(pin))
+               ret = journal_pin_seq(j, pin->pin_list);
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+static inline void __journal_pin_add(struct journal *j,
+                                    struct journal_entry_pin_list *pin_list,
+                                    struct journal_entry_pin *pin,
+                                    journal_pin_flush_fn flush_fn)
+{
+       BUG_ON(journal_pin_active(pin));
+       BUG_ON(!atomic_read(&pin_list->count));
+
+       atomic_inc(&pin_list->count);
+       pin->pin_list   = pin_list;
+       pin->flush      = flush_fn;
+
+       if (flush_fn)
+               list_add(&pin->list, &pin_list->list);
+       else
+               INIT_LIST_HEAD(&pin->list);
+
+       /*
+        * If the journal is currently full, we might want to call flush_fn
+        * immediately:
+        */
+       journal_wake(j);
+}
+
+void bch2_journal_pin_add(struct journal *j, u64 seq,
+                         struct journal_entry_pin *pin,
+                         journal_pin_flush_fn flush_fn)
+{
+       spin_lock(&j->lock);
+       __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
+       spin_unlock(&j->lock);
+}
+
+static inline void __journal_pin_drop(struct journal *j,
+                                     struct journal_entry_pin *pin)
+{
+       struct journal_entry_pin_list *pin_list = pin->pin_list;
+
+       if (!journal_pin_active(pin))
+               return;
+
+       pin->pin_list = NULL;
+       list_del_init(&pin->list);
+
+       /*
+	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
+	 * writing a new last_seq will now make another bucket available:
+        */
+       if (atomic_dec_and_test(&pin_list->count) &&
+           pin_list == &fifo_peek_front(&j->pin))
+               bch2_journal_reclaim_fast(j);
+}
+
+void bch2_journal_pin_drop(struct journal *j,
+                         struct journal_entry_pin *pin)
+{
+       spin_lock(&j->lock);
+       __journal_pin_drop(j, pin);
+       spin_unlock(&j->lock);
+}
+
+void bch2_journal_pin_add_if_older(struct journal *j,
+                                 struct journal_entry_pin *src_pin,
+                                 struct journal_entry_pin *pin,
+                                 journal_pin_flush_fn flush_fn)
+{
+       spin_lock(&j->lock);
+
+       if (journal_pin_active(src_pin) &&
+           (!journal_pin_active(pin) ||
+            journal_pin_seq(j, src_pin->pin_list) <
+            journal_pin_seq(j, pin->pin_list))) {
+               __journal_pin_drop(j, pin);
+               __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
+       }
+
+       spin_unlock(&j->lock);
+}
+
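+/*
+ * Example (sketch): how a subsystem that dirties a journal entry might use the
+ * pin API above. example_flush() and example_mark_dirty() are hypothetical
+ * names; the real users are e.g. btree node writes and blacklist entries.
+ */
+static void __maybe_unused example_flush(struct journal *j,
+					 struct journal_entry_pin *pin, u64 seq)
+{
+	/* write out whatever made journal entry @seq dirty, then release it: */
+	bch2_journal_pin_drop(j, pin);
+}
+
+static void __maybe_unused example_mark_dirty(struct journal *j, u64 seq,
+					      struct journal_entry_pin *pin)
+{
+	/* hold @seq open until example_flush() is called during reclaim: */
+	bch2_journal_pin_add(j, seq, pin, example_flush);
+}
+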
+/*
+ * Journal reclaim: flush references to open journal entries to reclaim space in
+ * the journal
+ *
+ * May be done by the journal code in the background as needed to free up space
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
+ * data off of a specific device:
+ */
+
+/**
+ * bch2_journal_reclaim_fast - do the fast part of journal reclaim
+ *
+ * Called from IO submission context, does not block. Cleans up after btree
+ * write completions by advancing the journal pin and each cache's last_idx,
+ * kicking off discards and background reclaim as necessary.
+ */
+void bch2_journal_reclaim_fast(struct journal *j)
+{
+       struct journal_entry_pin_list temp;
+       bool popped = false;
+
+       lockdep_assert_held(&j->lock);
+
+       /*
+        * Unpin journal entries whose reference counts reached zero, meaning
+        * all btree nodes got written out
+        */
+       while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+               BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+               BUG_ON(!fifo_pop(&j->pin, temp));
+               popped = true;
+       }
+
+       if (popped)
+               journal_wake(j);
+}
+
+static struct journal_entry_pin *
+__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+{
+       struct journal_entry_pin_list *pin_list;
+       struct journal_entry_pin *ret;
+       u64 iter;
+
+       /* no need to iterate over empty fifo entries: */
+       bch2_journal_reclaim_fast(j);
+
+       fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
+               if (iter > seq_to_flush)
+                       break;
+
+               ret = list_first_entry_or_null(&pin_list->list,
+                               struct journal_entry_pin, list);
+               if (ret) {
+                       /* must be list_del_init(), see bch2_journal_pin_drop() */
+                       list_move(&ret->list, &pin_list->flushed);
+                       *seq = iter;
+                       return ret;
+               }
+       }
+
+       return NULL;
+}
+
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+{
+       struct journal_entry_pin *ret;
+
+       spin_lock(&j->lock);
+       ret = __journal_get_next_pin(j, seq_to_flush, seq);
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+       bool ret;
+
+       spin_lock(&j->lock);
+       ret = ja->nr &&
+               (ja->last_idx != ja->cur_idx &&
+                ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+/**
+ * bch2_journal_reclaim_work - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+void bch2_journal_reclaim_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(to_delayed_work(work),
+                               struct bch_fs, journal.reclaim_work);
+       struct journal *j = &c->journal;
+       struct bch_dev *ca;
+       struct journal_entry_pin *pin;
+       u64 seq, seq_to_flush = 0;
+       unsigned iter, bucket_to_flush;
+       unsigned long next_flush;
+       bool reclaim_lock_held = false, need_flush;
+
+       /*
+        * Advance last_idx to point to the oldest journal entry containing
+        * btree node updates that have not yet been written out
+        */
+       for_each_rw_member(ca, c, iter) {
+               struct journal_device *ja = &ca->journal;
+
+               if (!ja->nr)
+                       continue;
+
+               while (should_discard_bucket(j, ja)) {
+                       if (!reclaim_lock_held) {
+                               /*
+                                * ugh:
+                                * might be called from __journal_res_get()
+                                * under wait_event() - have to go back to
+                                * TASK_RUNNING before doing something that
+                                * would block, but only if we're doing work:
+                                */
+                               __set_current_state(TASK_RUNNING);
+
+                               mutex_lock(&j->reclaim_lock);
+                               reclaim_lock_held = true;
+                               /* recheck under reclaim_lock: */
+                               continue;
+                       }
+
+                       if (ca->mi.discard &&
+                           bdev_max_discard_sectors(ca->disk_sb.bdev))
+                               blkdev_issue_discard(ca->disk_sb.bdev,
+                                       bucket_to_sector(ca,
+                                               ja->buckets[ja->last_idx]),
+                                       ca->mi.bucket_size, GFP_NOIO);
+
+                       spin_lock(&j->lock);
+                       ja->last_idx = (ja->last_idx + 1) % ja->nr;
+                       spin_unlock(&j->lock);
+
+                       journal_wake(j);
+               }
+
+               /*
+		 * Write out enough btree nodes to free up 50% of the journal
+		 * buckets
+                */
+               spin_lock(&j->lock);
+               bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
+               seq_to_flush = max_t(u64, seq_to_flush,
+                                    ja->bucket_seq[bucket_to_flush]);
+               spin_unlock(&j->lock);
+       }
+
+       if (reclaim_lock_held)
+               mutex_unlock(&j->reclaim_lock);
+
+       /* Also flush if the pin fifo is more than half full */
+       spin_lock(&j->lock);
+       seq_to_flush = max_t(s64, seq_to_flush,
+                            (s64) journal_cur_seq(j) -
+                            (j->pin.size >> 1));
+       spin_unlock(&j->lock);
+
+       /*
+        * If it's been longer than j->reclaim_delay_ms since we last flushed,
+        * make sure to flush at least one journal pin:
+        */
+       next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+       need_flush = time_after(jiffies, next_flush);
+
+       while ((pin = journal_get_next_pin(j, need_flush
+                                          ? U64_MAX
+                                          : seq_to_flush, &seq))) {
+               __set_current_state(TASK_RUNNING);
+               pin->flush(j, pin, seq);
+               need_flush = false;
+
+               j->last_flushed = jiffies;
+       }
+
+       if (!test_bit(BCH_FS_RO, &c->flags))
+               queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+                                  msecs_to_jiffies(j->reclaim_delay_ms));
+}
+
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+                             struct journal_entry_pin **pin,
+                             u64 *pin_seq)
+{
+       int ret;
+
+       *pin = NULL;
+
+       ret = bch2_journal_error(j);
+       if (ret)
+               return ret;
+
+       spin_lock(&j->lock);
+       /*
+        * If journal replay hasn't completed, the unreplayed journal entries
+        * hold refs on their corresponding sequence numbers
+        */
+       ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
+               !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+               journal_last_seq(j) > seq_to_flush ||
+               (fifo_used(&j->pin) == 1 &&
+                atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+{
+       struct journal_entry_pin *pin;
+       u64 pin_seq;
+
+       if (!test_bit(JOURNAL_STARTED, &j->flags))
+               return;
+
+       while (1) {
+               wait_event(j->wait, journal_flush_done(j, seq_to_flush,
+                                                      &pin, &pin_seq));
+               if (!pin)
+                       break;
+
+               pin->flush(j, pin, pin_seq);
+       }
+}
+
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_entry_pin_list *p;
+       struct bch_devs_list devs;
+       u64 iter, seq = 0;
+       int ret = 0;
+
+       spin_lock(&j->lock);
+       fifo_for_each_entry_ptr(p, &j->pin, iter)
+               if (dev_idx >= 0
+                   ? bch2_dev_list_has_dev(p->devs, dev_idx)
+                   : p->devs.nr < c->opts.metadata_replicas)
+                       seq = iter;
+       spin_unlock(&j->lock);
+
+       bch2_journal_flush_pins(j, seq);
+
+       ret = bch2_journal_error(j);
+       if (ret)
+               return ret;
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+       seq = 0;
+
+       spin_lock(&j->lock);
+       while (!ret && seq < j->pin.back) {
+               seq = max(seq, journal_last_seq(j));
+               devs = journal_seq_pin(j, seq)->devs;
+               seq++;
+
+               spin_unlock(&j->lock);
+               ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+               spin_lock(&j->lock);
+       }
+       spin_unlock(&j->lock);
+
+       ret = bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
new file mode 100644 (file)
index 0000000..a93ed43
--- /dev/null
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
+#define _BCACHEFS_JOURNAL_RECLAIM_H
+
+#define JOURNAL_PIN    (32 * 1024)
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+       return pin->pin_list != NULL;
+}
+
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+       BUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
+       return &j->pin.data[seq & j->pin.mask];
+}
+
+u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
+                         journal_pin_flush_fn);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+void bch2_journal_pin_add_if_older(struct journal *,
+                                 struct journal_entry_pin *,
+                                 struct journal_entry_pin *,
+                                 journal_pin_flush_fn);
+
+void bch2_journal_reclaim_fast(struct journal *);
+void bch2_journal_reclaim_work(struct work_struct *);
+
+void bch2_journal_flush_pins(struct journal *, u64);
+
+static inline void bch2_journal_flush_all_pins(struct journal *j)
+{
+       bch2_journal_flush_pins(j, U64_MAX);
+}
+
+int bch2_journal_flush_device_pins(struct journal *, int);
+
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
new file mode 100644 (file)
index 0000000..c26f36d
--- /dev/null
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) so that the btree code continues to ignore
+ * that bset, until that btree node is rewritten.
+ *
+ * Blacklisted journal sequence numbers are themselves recorded as entries in
+ * the journal.
+ */
+
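+/*
+ * Example (sketch): the btree read path consults this machinery for every bset
+ * it finds in a btree node. example_should_use_bset() is a hypothetical
+ * wrapper shown for illustration; the real caller lives in the btree IO code.
+ */
+static int __maybe_unused example_should_use_bset(struct bch_fs *c,
+						  struct btree *b,
+						  struct bset *i)
+{
+	int ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
+
+	if (ret < 0)
+		return ret;
+
+	if (ret) {
+		/* bset is newer than the newest journal entry: ignore it */
+		return 0;
+	}
+
+	/* everything in this bset is known to also be in the journal */
+	return 1;
+}
+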
+/*
+ * Called when the journal needs to evict a blacklist entry to reclaim space:
+ * find any btree nodes that refer to the blacklisted journal sequence numbers,
+ * and rewrite them:
+ */
+static void journal_seq_blacklist_flush(struct journal *j,
+                                       struct journal_entry_pin *pin, u64 seq)
+{
+       struct bch_fs *c =
+               container_of(j, struct bch_fs, journal);
+       struct journal_seq_blacklist *bl =
+               container_of(pin, struct journal_seq_blacklist, pin);
+       struct blacklisted_node n;
+       struct closure cl;
+       unsigned i;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       for (i = 0;; i++) {
+               struct btree_iter iter;
+               struct btree *b;
+
+               mutex_lock(&j->blacklist_lock);
+               if (i >= bl->nr_entries) {
+                       mutex_unlock(&j->blacklist_lock);
+                       break;
+               }
+               n = bl->entries[i];
+               mutex_unlock(&j->blacklist_lock);
+
+               __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos,
+                                      0, 0, BTREE_ITER_NODES);
+
+               b = bch2_btree_iter_peek_node(&iter);
+
+               /* The node might have already been rewritten: */
+
+               if (b->data->keys.seq == n.seq) {
+                       ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
+                       if (ret) {
+                               bch2_btree_iter_unlock(&iter);
+                               bch2_fs_fatal_error(c,
+                                       "error %i rewriting btree node with blacklisted journal seq",
+                                       ret);
+                               bch2_journal_halt(j);
+                               return;
+                       }
+               }
+
+               bch2_btree_iter_unlock(&iter);
+       }
+
+       for (i = 0;; i++) {
+               struct btree_update *as;
+               struct pending_btree_node_free *d;
+
+               mutex_lock(&j->blacklist_lock);
+               if (i >= bl->nr_entries) {
+                       mutex_unlock(&j->blacklist_lock);
+                       break;
+               }
+               n = bl->entries[i];
+               mutex_unlock(&j->blacklist_lock);
+redo_wait:
+               mutex_lock(&c->btree_interior_update_lock);
+
+               /*
+                * Is the node on the list of pending interior node updates -
+                * being freed? If so, wait for that to finish:
+                */
+               for_each_pending_btree_node_free(c, as, d)
+                       if (n.seq       == d->seq &&
+                           n.btree_id  == d->btree_id &&
+                           !d->level &&
+                           !bkey_cmp(n.pos, d->key.k.p)) {
+                               closure_wait(&as->wait, &cl);
+                               mutex_unlock(&c->btree_interior_update_lock);
+                               closure_sync(&cl);
+                               goto redo_wait;
+                       }
+
+               mutex_unlock(&c->btree_interior_update_lock);
+       }
+
+       mutex_lock(&j->blacklist_lock);
+
+       bch2_journal_pin_drop(j, &bl->pin);
+       list_del(&bl->list);
+       kfree(bl->entries);
+       kfree(bl);
+
+       mutex_unlock(&j->blacklist_lock);
+}
+
+/*
+ * Determine if a particular sequence number is blacklisted - if so, return the
+ * blacklist entry:
+ */
+struct journal_seq_blacklist *
+bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
+{
+       struct journal_seq_blacklist *bl;
+
+       lockdep_assert_held(&j->blacklist_lock);
+
+       list_for_each_entry(bl, &j->seq_blacklist, list)
+               if (seq >= bl->start && seq <= bl->end)
+                       return bl;
+
+       return NULL;
+}
+
+/*
+ * Allocate a new, in-memory blacklist entry:
+ */
+static struct journal_seq_blacklist *
+bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
+{
+       struct journal_seq_blacklist *bl;
+
+       lockdep_assert_held(&j->blacklist_lock);
+
+       /*
+	 * When we start the journal, bch2_journal_start() will skip over the
+	 * blacklisted sequence numbers:
+        */
+
+       bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+       if (!bl)
+               return NULL;
+
+       bl->start       = start;
+       bl->end         = end;
+
+       list_add_tail(&bl->list, &j->seq_blacklist);
+       return bl;
+}
+
+/*
+ * Returns 1 if @seq is newer than the most recent journal entry that got
+ * written (or was previously blacklisted), meaning data corresponding to @seq
+ * should be ignored - also marks @seq as blacklisted so that on future restarts
+ * the corresponding data will still be ignored. Returns 0 if the data should be
+ * used, or a negative error code on allocation failure:
+ */
+int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
+{
+       struct journal *j = &c->journal;
+       struct journal_seq_blacklist *bl = NULL;
+       struct blacklisted_node *n;
+       u64 journal_seq;
+       int ret = 0;
+
+       if (!seq)
+               return 0;
+
+       spin_lock(&j->lock);
+       journal_seq = journal_cur_seq(j);
+       spin_unlock(&j->lock);
+
+	/* Interior node updates aren't journalled: */
+       BUG_ON(b->level);
+       BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
+
+       /*
+        * Decrease this back to j->seq + 2 when we next rev the on disk format:
+        * increasing it temporarily to work around bug in old kernels
+        */
+       fsck_err_on(seq > journal_seq + 4, c,
+                   "bset journal seq too far in the future: %llu > %llu",
+                   seq, journal_seq);
+
+       if (seq <= journal_seq &&
+           list_empty_careful(&j->seq_blacklist))
+               return 0;
+
+       mutex_lock(&j->blacklist_lock);
+
+       if (seq <= journal_seq) {
+               bl = bch2_journal_seq_blacklist_find(j, seq);
+               if (!bl)
+                       goto out;
+       } else {
+               bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
+                           b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
+
+               if (!j->new_blacklist) {
+                       j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
+                                               journal_seq + 1,
+                                               journal_seq + 1);
+                       if (!j->new_blacklist) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+               }
+               bl = j->new_blacklist;
+               bl->end = max(bl->end, seq);
+       }
+
+       for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
+               if (b->data->keys.seq   == n->seq &&
+                   b->btree_id         == n->btree_id &&
+                   !bkey_cmp(b->key.k.p, n->pos))
+                       goto found_entry;
+
+       if (!bl->nr_entries ||
+           is_power_of_2(bl->nr_entries)) {
+               n = krealloc(bl->entries,
+                            max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
+                            GFP_KERNEL);
+               if (!n) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               bl->entries = n;
+       }
+
+       bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
+               .seq            = b->data->keys.seq,
+               .btree_id       = b->btree_id,
+               .pos            = b->key.k.p,
+       };
+found_entry:
+       ret = 1;
+out:
+fsck_err:
+       mutex_unlock(&j->blacklist_lock);
+       return ret;
+}
+
+static int __bch2_journal_seq_blacklist_read(struct journal *j,
+                                            struct journal_replay *i,
+                                            u64 start, u64 end)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_seq_blacklist *bl;
+
+       bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
+                   start, end);
+
+       bl = bch2_journal_seq_blacklisted_new(j, start, end);
+       if (!bl)
+               return -ENOMEM;
+
+       bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
+                            journal_seq_blacklist_flush);
+       return 0;
+}
+
+/*
+ * After reading the journal, find existing journal seq blacklist entries and
+ * read them into memory:
+ */
+int bch2_journal_seq_blacklist_read(struct journal *j,
+                                   struct journal_replay *i)
+{
+       struct jset_entry *entry;
+       int ret = 0;
+
+       vstruct_for_each(&i->j, entry) {
+               switch (entry->type) {
+               case BCH_JSET_ENTRY_blacklist: {
+                       struct jset_entry_blacklist *bl_entry =
+                               container_of(entry, struct jset_entry_blacklist, entry);
+
+                       ret = __bch2_journal_seq_blacklist_read(j, i,
+                                       le64_to_cpu(bl_entry->seq),
+                                       le64_to_cpu(bl_entry->seq));
+                       break;
+               }
+               case BCH_JSET_ENTRY_blacklist_v2: {
+                       struct jset_entry_blacklist_v2 *bl_entry =
+                               container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+                       ret = __bch2_journal_seq_blacklist_read(j, i,
+                                       le64_to_cpu(bl_entry->start),
+                                       le64_to_cpu(bl_entry->end));
+                       break;
+               }
+               }
+
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
+/*
+ * After reading the journal and walking the btree, we might have new journal
+ * sequence numbers to blacklist - add entries to the next journal entry to be
+ * written:
+ */
+void bch2_journal_seq_blacklist_write(struct journal *j)
+{
+       struct journal_seq_blacklist *bl = j->new_blacklist;
+       struct jset_entry_blacklist_v2 *bl_entry;
+       struct jset_entry *entry;
+
+       if (!bl)
+               return;
+
+       entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
+                       (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
+
+       bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+       bl_entry->entry.type    = BCH_JSET_ENTRY_blacklist_v2;
+       bl_entry->start         = cpu_to_le64(bl->start);
+       bl_entry->end           = cpu_to_le64(bl->end);
+
+       bch2_journal_pin_add(j,
+                            journal_cur_seq(j),
+                            &bl->pin,
+                            journal_seq_blacklist_flush);
+
+       j->new_blacklist = NULL;
+}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
new file mode 100644 (file)
index 0000000..b4a3b27
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+
+struct journal_replay;
+
+struct journal_seq_blacklist *
+bch2_journal_seq_blacklist_find(struct journal *, u64);
+int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
+int bch2_journal_seq_blacklist_read(struct journal *,
+                                   struct journal_replay *);
+void bch2_journal_seq_blacklist_write(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
new file mode 100644 (file)
index 0000000..cf29122
--- /dev/null
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_TYPES_H
+#define _BCACHEFS_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "super_types.h"
+#include "fifo.h"
+
+struct journal_res;
+
+/*
+ * We put two of these in struct journal; we use them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_buf {
+       struct jset             *data;
+
+       BKEY_PADDED(key);
+
+       struct closure_waitlist wait;
+
+       unsigned                size;
+       unsigned                disk_sectors;
+       /* bloom filter: */
+       unsigned long           has_inode[1024 / sizeof(unsigned long)];
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+struct journal_entry_pin_list {
+       struct list_head                list;
+       struct list_head                flushed;
+       atomic_t                        count;
+       struct bch_devs_list            devs;
+};
+
+struct journal;
+struct journal_entry_pin;
+typedef void (*journal_pin_flush_fn)(struct journal *j,
+                               struct journal_entry_pin *, u64);
+
+struct journal_entry_pin {
+       struct list_head                list;
+       journal_pin_flush_fn            flush;
+       struct journal_entry_pin_list   *pin_list;
+};
+
+/* corresponds to a btree node with a blacklisted bset: */
+struct blacklisted_node {
+       __le64                  seq;
+       enum btree_id           btree_id;
+       struct bpos             pos;
+};
+
+struct journal_seq_blacklist {
+       struct list_head        list;
+       u64                     start;
+       u64                     end;
+
+       struct journal_entry_pin pin;
+
+       struct blacklisted_node *entries;
+       size_t                  nr_entries;
+};
+
+struct journal_res {
+       bool                    ref;
+       u8                      idx;
+       u16                     u64s;
+       u32                     offset;
+       u64                     seq;
+};
+
+union journal_res_state {
+       struct {
+               atomic64_t      counter;
+       };
+
+       struct {
+               u64             v;
+       };
+
+       struct {
+               u64             cur_entry_offset:20,
+                               idx:1,
+                               prev_buf_unwritten:1,
+                               buf0_count:21,
+                               buf1_count:21;
+       };
+};
+
+/* bytes: */
+#define JOURNAL_ENTRY_SIZE_MIN         (64U << 10) /* 64k */
+#define JOURNAL_ENTRY_SIZE_MAX         (4U  << 20) /* 4M */
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ * note - cur_entry_offset is in units of u64s
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX       ((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL       (JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL                (JOURNAL_ENTRY_OFFSET_MAX)
+
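+/*
+ * Example (sketch): decoding those sentinel values from a snapshot of the
+ * reservation state; example_journal_entry_is_open() is a hypothetical helper
+ * mirroring the check the journal code performs internally.
+ */
+static inline bool example_journal_entry_is_open(union journal_res_state state)
+{
+	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+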
+/*
+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
+ * either because something's waiting on the write to complete or because it's
+ * been dirty too long and the timer's expired.
+ */
+
+enum {
+       JOURNAL_REPLAY_DONE,
+       JOURNAL_STARTED,
+       JOURNAL_NEED_WRITE,
+       JOURNAL_NOT_EMPTY,
+};
+
+/* Embedded in struct bch_fs */
+struct journal {
+       /* Fastpath stuff up front: */
+
+       unsigned long           flags;
+
+       union journal_res_state reservations;
+       unsigned                cur_entry_u64s;
+       unsigned                prev_buf_sectors;
+       unsigned                cur_buf_sectors;
+       unsigned                buf_size_want;
+
+       /*
+        * Two journal entries -- one is currently open for new entries, the
+        * other is possibly being written out.
+        */
+       struct journal_buf      buf[2];
+
+       spinlock_t              lock;
+
+       /* Used when waiting because the journal was full */
+       wait_queue_head_t       wait;
+       struct closure_waitlist async_wait;
+
+       struct closure          io;
+       struct delayed_work     write_work;
+
+       /* Sequence number of most recent journal entry (last entry in @pin) */
+       atomic64_t              seq;
+
+       /* last_seq from the most recent journal entry written */
+       u64                     last_seq_ondisk;
+
+       /*
+        * FIFO of journal entries whose btree updates have not yet been
+        * written out.
+        *
+        * Each entry is a reference count. The position in the FIFO is the
+        * entry's sequence number relative to @seq.
+        *
+        * The journal entry itself holds a reference count, put when the
+        * journal entry is written out. Each btree node modified by the journal
+        * entry also holds a reference count, put when the btree node is
+        * written.
+        *
+        * When a reference count reaches zero, the journal entry is no longer
+        * needed. When all journal entries in the oldest journal bucket are no
+        * longer needed, the bucket can be discarded and reused.
+        */
+       struct {
+               u64 front, back, size, mask;
+               struct journal_entry_pin_list *data;
+       }                       pin;
+       u64                     replay_journal_seq;
+
+       struct mutex            blacklist_lock;
+       struct list_head        seq_blacklist;
+       struct journal_seq_blacklist *new_blacklist;
+
+       BKEY_PADDED(key);
+       struct write_point      wp;
+       spinlock_t              err_lock;
+
+       struct delayed_work     reclaim_work;
+       unsigned long           last_flushed;
+
+       /* protects advancing ja->last_idx: */
+       struct mutex            reclaim_lock;
+       unsigned                write_delay_ms;
+       unsigned                reclaim_delay_ms;
+
+       u64                     res_get_blocked_start;
+       u64                     need_write_time;
+       u64                     write_start_time;
+
+       struct bch2_time_stats  *write_time;
+       struct bch2_time_stats  *delay_time;
+       struct bch2_time_stats  *blocked_time;
+       struct bch2_time_stats  *flush_seq_time;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lockdep_map      res_map;
+#endif
+};
+
+/*
+ * Embedded in struct bch_dev. First three fields refer to the array of journal
+ * buckets, in bch_sb.
+ */
+struct journal_device {
+       /*
+        * For each journal bucket, contains the max sequence number of the
+        * journal writes it contains - so we know when a bucket can be reused.
+        */
+       u64                     *bucket_seq;
+
+       unsigned                sectors_free;
+
+       /* Journal bucket we're currently writing to */
+       unsigned                cur_idx;
+
+	/*
+	 * Last journal bucket that still contains an open journal entry:
+	 * j->lock and j->reclaim_lock must both be held to modify, j->lock is
+	 * sufficient to read.
+	 */
+       unsigned                last_idx;
+       unsigned                nr;
+       u64                     *buckets;
+
+       /* Bio for journal reads/writes to this device */
+       struct bio              *bio;
+
+	/* for bch2_journal_read_device() */
+       struct closure          read;
+};
+
+#endif /* _BCACHEFS_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
new file mode 100644 (file)
index 0000000..5da54ce
--- /dev/null
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "keylist.h"
+
+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
+                       size_t nr_inline_u64s, size_t new_u64s)
+{
+       size_t oldsize = bch_keylist_u64s(l);
+       size_t newsize = oldsize + new_u64s;
+       u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
+       u64 *new_keys;
+
+       newsize = roundup_pow_of_two(newsize);
+
+       if (newsize <= nr_inline_u64s ||
+           (old_buf && roundup_pow_of_two(oldsize) == newsize))
+               return 0;
+
+       new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
+       if (!new_keys)
+               return -ENOMEM;
+
+       if (!old_buf)
+               memcpy_u64s(new_keys, inline_u64s, oldsize);
+
+       l->keys_p = new_keys;
+       l->top_p = new_keys + oldsize;
+
+       return 0;
+}
+
+void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
+{
+       struct bkey_i *where;
+
+       for_each_keylist_key(l, where)
+               if (bkey_cmp(insert->k.p, where->k.p) < 0)
+                       break;
+
+       memmove_u64s_up((u64 *) where + insert->k.u64s,
+                       where,
+                       ((u64 *) l->top) - ((u64 *) where));
+
+       l->top_p += insert->k.u64s;
+       bkey_copy(where, insert);
+}
+
+void bch2_keylist_pop_front(struct keylist *l)
+{
+       l->top_p -= bch2_keylist_front(l)->k.u64s;
+
+       memmove_u64s_down(l->keys,
+                         bkey_next(l->keys),
+                         bch_keylist_u64s(l));
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *l)
+{
+       struct bkey_i *k;
+
+       for_each_keylist_key(l, k)
+               BUG_ON(bkey_next(k) != l->top &&
+                      bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
+}
+#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
new file mode 100644 (file)
index 0000000..a7ff86b
--- /dev/null
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_H
+#define _BCACHEFS_KEYLIST_H
+
+#include "keylist_types.h"
+
+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
+void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
+void bch2_keylist_pop_front(struct keylist *);
+
+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
+{
+       l->top_p = l->keys_p = inline_keys;
+}
+
+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
+{
+       if (l->keys_p != inline_keys)
+               kfree(l->keys_p);
+       bch2_keylist_init(l, inline_keys);
+}
+
+static inline void bch2_keylist_push(struct keylist *l)
+{
+       l->top = bkey_next(l->top);
+}
+
+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
+{
+       bkey_copy(l->top, k);
+       bch2_keylist_push(l);
+}
+
+static inline bool bch2_keylist_empty(struct keylist *l)
+{
+       return l->top == l->keys;
+}
+
+static inline size_t bch_keylist_u64s(struct keylist *l)
+{
+       return l->top_p - l->keys_p;
+}
+
+static inline size_t bch2_keylist_bytes(struct keylist *l)
+{
+       return bch_keylist_u64s(l) * sizeof(u64);
+}
+
+static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
+{
+       return l->keys;
+}
+
+#define for_each_keylist_key(_keylist, _k)                     \
+       for (_k = (_keylist)->keys;                             \
+            _k != (_keylist)->top;                             \
+            _k = bkey_next(_k))
+
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+       struct bkey_i *k;
+       u64 ret = 0;
+
+       for_each_keylist_key(keys, k)
+               ret += k->k.size;
+
+       return ret;
+}
+
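+/*
+ * Example (sketch): growing a keylist before appending a key. The caller is
+ * assumed to have done bch2_keylist_init(l, inline_keys) and to call
+ * bch2_keylist_free() when done; example_keylist_add() is a hypothetical name.
+ */
+static inline int example_keylist_add(struct keylist *l, u64 *inline_keys,
+				      size_t nr_inline_u64s,
+				      const struct bkey_i *k)
+{
+	/* spills to a heap allocation if @k won't fit in the inline buffer: */
+	int ret = bch2_keylist_realloc(l, inline_keys, nr_inline_u64s, k->k.u64s);
+
+	if (ret)
+		return ret;
+
+	bch2_keylist_add(l, k);
+	return 0;
+}
+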
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *);
+#else
+static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
+#endif
+
+#endif /* _BCACHEFS_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
new file mode 100644 (file)
index 0000000..4b3ff7d
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_TYPES_H
+#define _BCACHEFS_KEYLIST_TYPES_H
+
+struct keylist {
+       union {
+               struct bkey_i           *keys;
+               u64                     *keys_p;
+       };
+       union {
+               struct bkey_i           *top;
+               u64                     *top_p;
+       };
+};
+
+#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
new file mode 100644 (file)
index 0000000..8f618dc
--- /dev/null
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for moving data off a device.
+ */
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+                        unsigned dev_idx, int flags, bool metadata)
+{
+       unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+       unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+       unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+       unsigned nr_good;
+
+       bch2_extent_drop_device(e, dev_idx);
+
+       nr_good = bch2_extent_durability(c, e.c);
+       if ((!nr_good && !(flags & lost)) ||
+           (nr_good < replicas && !(flags & degraded)))
+               return -EINVAL;
+
+       return 0;
+}
+
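+/*
+ * Worked example of the check above (illustrative): with data_replicas=2, an
+ * extent whose remaining durability would drop to 1 is only allowed with
+ * BCH_FORCE_IF_DATA_DEGRADED, and one whose durability would drop to 0
+ * additionally requires BCH_FORCE_IF_DATA_LOST; otherwise drop_dev_ptrs()
+ * returns -EINVAL.
+ */
+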
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+       struct bkey_s_c k;
+       struct bkey_s_extent e;
+       BKEY_PADDED(key) tmp;
+       struct btree_iter iter;
+       int ret = 0;
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+                            POS_MIN, BTREE_ITER_PREFETCH);
+
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
+              !(ret = btree_iter_err(k))) {
+               if (!bkey_extent_is_data(k.k) ||
+                   !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
+                       ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
+                       if (ret)
+                               break;
+                       bch2_btree_iter_next(&iter);
+                       continue;
+               }
+
+               bkey_reassemble(&tmp.key, k);
+               e = bkey_i_to_s_extent(&tmp.key);
+
+               ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+               if (ret)
+                       break;
+
+               /*
+                * If the new extent no longer has any pointers, bch2_extent_normalize()
+                * will do the appropriate thing with it (turning it into a
+                * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+                */
+               bch2_extent_normalize(c, e.s);
+
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+                                             bkey_i_to_s_c(&tmp.key));
+               if (ret)
+                       break;
+
+               iter.pos = bkey_start_pos(&tmp.key.k);
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+                                          BTREE_INSERT_ATOMIC|
+                                          BTREE_INSERT_NOFAIL,
+                                          BTREE_INSERT_ENTRY(&iter, &tmp.key));
+
+               /*
+                * don't want to leave ret == -EINTR, since if we raced and
+                * something else overwrote the key we could spuriously return
+                * -EINTR below:
+                */
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret)
+                       break;
+       }
+
+       bch2_btree_iter_unlock(&iter);
+
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+}
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+       struct btree_iter iter;
+       struct closure cl;
+       struct btree *b;
+       unsigned id;
+       int ret;
+
+       /* don't handle this yet: */
+       if (flags & BCH_FORCE_IF_METADATA_LOST)
+               return -EINVAL;
+
+       closure_init_stack(&cl);
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+                       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+                       struct bkey_i_extent *new_key;
+retry:
+                       if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+                                                   dev_idx)) {
+                               /*
+                                * we might have found a btree node key we
+                                * needed to update, and then tried to update it
+                                * but got -EINTR after upgrading the iter, but
+                                * then raced and the node is now gone:
+                                */
+                               bch2_btree_iter_downgrade(&iter);
+
+                               ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                                             bkey_i_to_s_c(&b->key));
+                               if (ret)
+                                       goto err;
+                       } else {
+                               bkey_copy(&tmp.k, &b->key);
+                               new_key = bkey_i_to_extent(&tmp.k);
+
+                               ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+                                                   dev_idx, flags, true);
+                               if (ret)
+                                       goto err;
+
+                               ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+                               if (ret == -EINTR) {
+                                       b = bch2_btree_iter_peek_node(&iter);
+                                       goto retry;
+                               }
+                               if (ret)
+                                       goto err;
+                       }
+               }
+               bch2_btree_iter_unlock(&iter);
+       }
+
+       ret = 0;
+out:
+       ret = bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+err:
+       bch2_btree_iter_unlock(&iter);
+       goto out;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+       return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+               bch2_dev_metadata_drop(c, dev_idx, flags);
+}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
new file mode 100644 (file)
index 0000000..027efaa
--- /dev/null
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MIGRATE_H
+#define _BCACHEFS_MIGRATE_H
+
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
+
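+/*
+ * Example (sketch): evacuating a device before removing it, allowing degraded
+ * data but refusing to drop the last copy. example_evacuate() is a
+ * hypothetical wrapper; see the device removal path for the real caller.
+ */
+static inline int example_evacuate(struct bch_fs *c, unsigned dev_idx)
+{
+	return bch2_dev_data_drop(c, dev_idx,
+				  BCH_FORCE_IF_DATA_DEGRADED|
+				  BCH_FORCE_IF_METADATA_DEGRADED);
+}
+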
+#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
new file mode 100644 (file)
index 0000000..b6310a6
--- /dev/null
@@ -0,0 +1,761 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "inode.h"
+#include "io.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/ioprio.h>
+#include <linux/kthread.h>
+
+#define SECTORS_IN_FLIGHT_PER_DEVICE   2048
+
+struct moving_io {
+       struct list_head        list;
+       struct closure          cl;
+       bool                    read_completed;
+
+       unsigned                read_sectors;
+       unsigned                write_sectors;
+
+       struct bch_read_bio     rbio;
+
+       struct migrate_write    write;
+       /* Must be last since it is variable size */
+       struct bio_vec          bi_inline_vecs[0];
+};
+
+struct moving_context {
+       /* Closure for waiting on all reads and writes to complete */
+       struct closure          cl;
+
+       struct bch_move_stats   *stats;
+
+       struct list_head        reads;
+
+       /* in flight sectors: */
+       atomic_t                read_sectors;
+       atomic_t                write_sectors;
+
+       wait_queue_head_t       wait;
+};
+
+static int bch2_migrate_index_update(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct migrate_write *m =
+               container_of(op, struct migrate_write, op);
+       struct keylist *keys = &op->insert_keys;
+       struct btree_iter iter;
+       int ret = 0;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+       while (1) {
+               struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+               struct bkey_i_extent *insert, *new =
+                       bkey_i_to_extent(bch2_keylist_front(keys));
+               BKEY_PADDED(k) _new, _insert;
+               struct bch_extent_ptr *ptr;
+               struct bch_extent_crc_unpacked crc;
+               bool did_work = false;
+               int nr;
+
+               if (btree_iter_err(k)) {
+                       ret = bch2_btree_iter_unlock(&iter);
+                       break;
+               }
+
+               if (bversion_cmp(k.k->version, new->k.version) ||
+                   !bkey_extent_is_data(k.k) ||
+                   !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
+                                            m->ptr, m->offset))
+                       goto nomatch;
+
+               if (m->data_cmd == DATA_REWRITE &&
+                   !bch2_extent_has_device(bkey_s_c_to_extent(k),
+                                           m->data_opts.rewrite_dev))
+                       goto nomatch;
+
+               bkey_reassemble(&_insert.k, k);
+               insert = bkey_i_to_extent(&_insert.k);
+
+               bkey_copy(&_new.k, bch2_keylist_front(keys));
+               new = bkey_i_to_extent(&_new.k);
+
+               bch2_cut_front(iter.pos, &insert->k_i);
+               bch2_cut_back(new->k.p, &insert->k);
+               bch2_cut_back(insert->k.p, &new->k);
+
+               if (m->data_cmd == DATA_REWRITE) {
+                       ptr = (struct bch_extent_ptr *)
+                               bch2_extent_has_device(extent_i_to_s_c(insert),
+                                                      m->data_opts.rewrite_dev);
+                       bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
+               }
+
+               extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
+                       if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
+                               /*
+                                * raced with another move op? extent already
+                                * has a pointer to the device we just wrote
+                                * data to
+                                */
+                               continue;
+                       }
+
+                       bch2_extent_crc_append(insert, crc);
+                       extent_ptr_append(insert, *ptr);
+                       did_work = true;
+               }
+
+               if (!did_work)
+                       goto nomatch;
+
+               bch2_extent_narrow_crcs(insert,
+                               (struct bch_extent_crc_unpacked) { 0 });
+               bch2_extent_normalize(c, extent_i_to_s(insert).s);
+               bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
+                                                op->opts.background_target,
+                                                op->opts.data_replicas);
+
+               /*
+                * It's possible we race, and for whatever reason the extent now
+                * has fewer replicas than when we last looked at it - meaning
+                * we need to get a disk reservation here:
+                */
+               nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
+                       (bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved);
+               if (nr > 0) {
+                       /*
+                        * can't call bch2_disk_reservation_add() with btree
+                        * locks held, at least not without a song and dance
+                        */
+                       bch2_btree_iter_unlock(&iter);
+
+                       ret = bch2_disk_reservation_add(c, &op->res,
+                                       keylist_sectors(keys) * nr, 0);
+                       if (ret)
+                               goto out;
+
+                       m->nr_ptrs_reserved += nr;
+                       goto next;
+               }
+
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+                                             extent_i_to_s_c(insert).s_c);
+               if (ret)
+                       break;
+
+               ret = bch2_btree_insert_at(c, &op->res,
+                               NULL, op_journal_seq(op),
+                               BTREE_INSERT_ATOMIC|
+                               BTREE_INSERT_NOFAIL|
+                               BTREE_INSERT_USE_RESERVE|
+                               m->data_opts.btree_insert_flags,
+                               BTREE_INSERT_ENTRY(&iter, &insert->k_i));
+               if (!ret)
+                       atomic_long_inc(&c->extent_migrate_done);
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret)
+                       break;
+next:
+               while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
+                       bch2_keylist_pop_front(keys);
+                       if (bch2_keylist_empty(keys))
+                               goto out;
+               }
+
+               bch2_cut_front(iter.pos, bch2_keylist_front(keys));
+               continue;
+nomatch:
+               if (m->ctxt)
+                       atomic64_add(k.k->p.offset - iter.pos.offset,
+                                    &m->ctxt->stats->sectors_raced);
+               atomic_long_inc(&c->extent_migrate_raced);
+               trace_move_race(&new->k);
+               bch2_btree_iter_next_slot(&iter);
+               goto next;
+       }
+out:
+       bch2_btree_iter_unlock(&iter);
+       return ret;
+}
+
+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
+{
+       /* write bio must own pages: */
+       BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+       m->ptr          = rbio->pick.ptr;
+       m->offset       = rbio->pos.offset - rbio->pick.crc.offset;
+       m->op.devs_have = rbio->devs_have;
+       m->op.pos       = rbio->pos;
+       m->op.version   = rbio->version;
+       m->op.crc       = rbio->pick.crc;
+       m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
+
+       if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
+               m->op.nonce     = m->op.crc.nonce + m->op.crc.offset;
+               m->op.csum_type = m->op.crc.csum_type;
+       }
+
+       if (m->data_cmd == DATA_REWRITE)
+               bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
+}
+
+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
+                           struct write_point_specifier wp,
+                           struct bch_io_opts io_opts,
+                           enum data_cmd data_cmd,
+                           struct data_opts data_opts,
+                           struct bkey_s_c k)
+{
+       int ret;
+
+       m->data_cmd     = data_cmd;
+       m->data_opts    = data_opts;
+       m->nr_ptrs_reserved = 0;
+
+       bch2_write_op_init(&m->op, c, io_opts);
+       m->op.compression_type =
+               bch2_compression_opt_to_type[io_opts.background_compression ?:
+                                            io_opts.compression];
+       m->op.target    = data_opts.target,
+       m->op.write_point = wp;
+
+       if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+               m->op.alloc_reserve = RESERVE_MOVINGGC;
+
+       m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
+               BCH_WRITE_PAGES_STABLE|
+               BCH_WRITE_PAGES_OWNED|
+               BCH_WRITE_DATA_ENCODED|
+               BCH_WRITE_NOMARK_REPLICAS;
+
+       m->op.nr_replicas       = 1;
+       m->op.nr_replicas_required = 1;
+       m->op.index_update_fn   = bch2_migrate_index_update;
+
+       switch (data_cmd) {
+       case DATA_ADD_REPLICAS: {
+               int nr = (int) io_opts.data_replicas -
+                       bch2_extent_nr_dirty_ptrs(k);
+
+               if (nr > 0) {
+                       m->op.nr_replicas = m->nr_ptrs_reserved = nr;
+
+                       ret = bch2_disk_reservation_get(c, &m->op.res,
+                                       k.k->size, m->op.nr_replicas, 0);
+                       if (ret)
+                               return ret;
+               }
+               break;
+       }
+       case DATA_REWRITE:
+               break;
+       case DATA_PROMOTE:
+               m->op.flags     |= BCH_WRITE_ALLOC_NOWAIT;
+               m->op.flags     |= BCH_WRITE_CACHED;
+               break;
+       default:
+               BUG();
+       }
+
+       return 0;
+}
+
+static void move_free(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, cl);
+       struct moving_context *ctxt = io->write.ctxt;
+       struct bvec_iter_all iter;
+       struct bio_vec *bv;
+
+       bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
+
+       bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
+               if (bv->bv_page)
+                       __free_page(bv->bv_page);
+
+       wake_up(&ctxt->wait);
+
+       kfree(io);
+}
+
+static void move_write_done(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, cl);
+
+       atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
+       closure_return_with_destructor(cl, move_free);
+}
+
+static void move_write(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, cl);
+
+       if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+               closure_return_with_destructor(cl, move_free);
+               return;
+       }
+
+       bch2_migrate_read_done(&io->write, &io->rbio);
+
+       atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+       closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+       continue_at(cl, move_write_done, NULL);
+}
+
+static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+{
+       struct moving_io *io =
+               list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
+
+       return io && io->read_completed ? io : NULL;
+}
+
+static void move_read_endio(struct bio *bio)
+{
+       struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+       struct moving_context *ctxt = io->write.ctxt;
+
+       atomic_sub(io->read_sectors, &ctxt->read_sectors);
+       io->read_completed = true;
+
+       if (next_pending_write(ctxt))
+               wake_up(&ctxt->wait);
+
+       closure_put(&ctxt->cl);
+}
+
+static void do_pending_writes(struct moving_context *ctxt)
+{
+       struct moving_io *io;
+
+       while ((io = next_pending_write(ctxt))) {
+               list_del(&io->list);
+               closure_call(&io->cl, move_write, NULL, &ctxt->cl);
+       }
+}
+
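+/*
+ * Issue writes for any reads that have completed, then sleep until either
+ * @_cond becomes true or another read completes:
+ */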
+#define move_ctxt_wait_event(_ctxt, _cond)                     \
+do {                                                           \
+       do_pending_writes(_ctxt);                               \
+                                                               \
+       if (_cond)                                              \
+               break;                                          \
+       __wait_event((_ctxt)->wait,                             \
+                    next_pending_write(_ctxt) || (_cond));     \
+} while (1)
+
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+{
+       unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
+
+       move_ctxt_wait_event(ctxt,
+               !atomic_read(&ctxt->write_sectors) ||
+               atomic_read(&ctxt->write_sectors) != sectors_pending);
+}
+
+static int bch2_move_extent(struct bch_fs *c,
+                           struct moving_context *ctxt,
+                           struct write_point_specifier wp,
+                           struct bch_io_opts io_opts,
+                           struct bkey_s_c_extent e,
+                           enum data_cmd data_cmd,
+                           struct data_opts data_opts)
+{
+       struct moving_io *io;
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+       unsigned sectors = e.k->size, pages;
+       int ret = -ENOMEM;
+
+       move_ctxt_wait_event(ctxt,
+               atomic_read(&ctxt->write_sectors) <
+               SECTORS_IN_FLIGHT_PER_DEVICE);
+
+       move_ctxt_wait_event(ctxt,
+               atomic_read(&ctxt->read_sectors) <
+               SECTORS_IN_FLIGHT_PER_DEVICE);
+
+       /* write path might have to decompress data: */
+       extent_for_each_ptr_crc(e, ptr, crc)
+               sectors = max_t(unsigned, sectors, crc.uncompressed_size);
+
+       pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+       io = kzalloc(sizeof(struct moving_io) +
+                    sizeof(struct bio_vec) * pages, GFP_KERNEL);
+       if (!io)
+               goto err;
+
+       io->write.ctxt          = ctxt;
+       io->read_sectors        = e.k->size;
+       io->write_sectors       = e.k->size;
+
+       bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+       bio_set_prio(&io->write.op.wbio.bio,
+                    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+       if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+                                GFP_KERNEL))
+               goto err_free;
+
+       io->rbio.opts = io_opts;
+       bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+       io->rbio.bio.bi_vcnt = pages;
+       bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+       io->rbio.bio.bi_iter.bi_size = sectors << 9;
+
+       io->rbio.bio.bi_opf             = REQ_OP_READ;
+       io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(e.k);
+       io->rbio.bio.bi_end_io          = move_read_endio;
+
+       ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
+                                     data_cmd, data_opts, e.s_c);
+       if (ret)
+               goto err_free_pages;
+
+       atomic64_inc(&ctxt->stats->keys_moved);
+       atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
+
+       trace_move_extent(e.k);
+
+       atomic_add(io->read_sectors, &ctxt->read_sectors);
+       list_add_tail(&io->list, &ctxt->reads);
+
+       /*
+        * dropped by move_read_endio() - guards against use after free of
+        * ctxt when doing wakeup
+        */
+       closure_get(&ctxt->cl);
+       bch2_read_extent(c, &io->rbio, e.s_c,
+                        BCH_READ_NODECODE|
+                        BCH_READ_LAST_FRAGMENT);
+       return 0;
+err_free_pages:
+       bio_free_pages(&io->write.op.wbio.bio);
+err_free:
+       kfree(io);
+err:
+       trace_move_alloc_fail(e.k);
+       return ret;
+}
+
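+/*
+ * Walk extents from @start to @end: for each extent, @pred decides whether and
+ * how to move it (add replicas, rewrite, promote), and the data is read and
+ * rewritten via the moving_context machinery above, throttled by @rate if
+ * non NULL.
+ */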
+int bch2_move_data(struct bch_fs *c,
+                  struct bch_ratelimit *rate,
+                  struct write_point_specifier wp,
+                  struct bpos start,
+                  struct bpos end,
+                  move_pred_fn pred, void *arg,
+                  struct bch_move_stats *stats)
+{
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
+       struct moving_context ctxt = { .stats = stats };
+       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+       BKEY_PADDED(k) tmp;
+       struct bkey_s_c k;
+       struct bkey_s_c_extent e;
+       struct data_opts data_opts;
+       enum data_cmd data_cmd;
+       u64 cur_inum = U64_MAX;
+       int ret = 0, ret2;
+
+       closure_init_stack(&ctxt.cl);
+       INIT_LIST_HEAD(&ctxt.reads);
+       init_waitqueue_head(&ctxt.wait);
+
+       stats->data_type = BCH_DATA_USER;
+       bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
+                            BTREE_ITER_PREFETCH);
+
+       if (rate)
+               bch2_ratelimit_reset(rate);
+
+       while (!kthread || !(ret = kthread_should_stop())) {
+               if (rate &&
+                   bch2_ratelimit_delay(rate) &&
+                   (bch2_btree_iter_unlock(&stats->iter),
+                    (ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
+                       break;
+peek:
+               k = bch2_btree_iter_peek(&stats->iter);
+               if (!k.k)
+                       break;
+               ret = btree_iter_err(k);
+               if (ret)
+                       break;
+               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+                       break;
+
+               if (!bkey_extent_is_data(k.k))
+                       goto next_nondata;
+
+               e = bkey_s_c_to_extent(k);
+
+               if (cur_inum != k.k->p.inode) {
+                       struct bch_inode_unpacked inode;
+
+                       /* don't hold btree locks while looking up inode: */
+                       bch2_btree_iter_unlock(&stats->iter);
+
+                       io_opts = bch2_opts_to_inode_opts(c->opts);
+                       if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+                               bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
+                       cur_inum = k.k->p.inode;
+                       goto peek;
+               }
+
+               switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
+                                        &io_opts, &data_opts))) {
+               case DATA_SKIP:
+                       goto next;
+               case DATA_SCRUB:
+                       BUG();
+               case DATA_ADD_REPLICAS:
+               case DATA_REWRITE:
+               case DATA_PROMOTE:
+                       break;
+               default:
+                       BUG();
+               }
+
+               /* unlock before doing IO: */
+               bkey_reassemble(&tmp.k, k);
+               k = bkey_i_to_s_c(&tmp.k);
+               bch2_btree_iter_unlock(&stats->iter);
+
+               ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
+                                       bkey_s_c_to_extent(k),
+                                       data_cmd, data_opts);
+               if (ret2) {
+                       if (ret2 == -ENOMEM) {
+                               /* memory allocation failure, wait for some IO to finish */
+                               bch2_move_ctxt_wait_for_io(&ctxt);
+                               continue;
+                       }
+
+                       /* XXX signal failure */
+                       goto next;
+               }
+
+               if (rate)
+                       bch2_ratelimit_increment(rate, k.k->size);
+next:
+               atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
+                            &stats->sectors_seen);
+next_nondata:
+               bch2_btree_iter_next(&stats->iter);
+               bch2_btree_iter_cond_resched(&stats->iter);
+       }
+
+       bch2_btree_iter_unlock(&stats->iter);
+
+       move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+       closure_sync(&ctxt.cl);
+
+       EBUG_ON(atomic_read(&ctxt.write_sectors));
+
+       trace_move_data(c,
+                       atomic64_read(&stats->sectors_moved),
+                       atomic64_read(&stats->keys_moved));
+
+       return ret;
+}
+
+static int bch2_gc_data_replicas(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+                          BTREE_ITER_PREFETCH, k) {
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
+               if (ret)
+                       break;
+       }
+       ret = bch2_btree_iter_unlock(&iter) ?: ret;
+
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+}
+
+static int bch2_gc_btree_replicas(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct btree *b;
+       unsigned id;
+       int ret = 0;
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+                       ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                                     bkey_i_to_s_c(&b->key));
+
+                       bch2_btree_iter_cond_resched(&iter);
+               }
+
+               ret = bch2_btree_iter_unlock(&iter) ?: ret;
+       }
+
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+}
+
+static int bch2_move_btree(struct bch_fs *c,
+                          move_pred_fn pred,
+                          void *arg,
+                          struct bch_move_stats *stats)
+{
+       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+       struct btree *b;
+       unsigned id;
+       struct data_opts data_opts;
+       enum data_cmd cmd;
+       int ret = 0;
+
+       stats->data_type = BCH_DATA_BTREE;
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+                       switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
+                                           bkey_i_to_s_c_extent(&b->key),
+                                           &io_opts,
+                                           &data_opts))) {
+                       case DATA_SKIP:
+                               goto next;
+                       case DATA_SCRUB:
+                               BUG();
+                       case DATA_ADD_REPLICAS:
+                       case DATA_REWRITE:
+                               break;
+                       default:
+                               BUG();
+                       }
+
+                       ret = bch2_btree_node_rewrite(c, &stats->iter,
+                                       b->data->keys.seq, 0) ?: ret;
+next:
+                       bch2_btree_iter_cond_resched(&stats->iter);
+               }
+
+               ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
+       }
+
+       return ret;
+}
+
+#if 0
+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
+                               enum bkey_type type,
+                               struct bkey_s_c_extent e,
+                               struct bch_io_opts *io_opts,
+                               struct data_opts *data_opts)
+{
+       return DATA_SCRUB;
+}
+#endif
+
+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
+                                     enum bkey_type type,
+                                     struct bkey_s_c_extent e,
+                                     struct bch_io_opts *io_opts,
+                                     struct data_opts *data_opts)
+{
+       unsigned nr_good = bch2_extent_durability(c, e);
+       unsigned replicas = type == BKEY_TYPE_BTREE
+               ? c->opts.metadata_replicas
+               : io_opts->data_replicas;
+
+       if (!nr_good || nr_good >= replicas)
+               return DATA_SKIP;
+
+       data_opts->target               = 0;
+       data_opts->btree_insert_flags   = 0;
+       return DATA_ADD_REPLICAS;
+}
+
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
+                                 enum bkey_type type,
+                                 struct bkey_s_c_extent e,
+                                 struct bch_io_opts *io_opts,
+                                 struct data_opts *data_opts)
+{
+       struct bch_ioctl_data *op = arg;
+
+       if (!bch2_extent_has_device(e, op->migrate.dev))
+               return DATA_SKIP;
+
+       data_opts->target               = 0;
+       data_opts->btree_insert_flags   = 0;
+       data_opts->rewrite_dev          = op->migrate.dev;
+       return DATA_REWRITE;
+}
+
+int bch2_data_job(struct bch_fs *c,
+                 struct bch_move_stats *stats,
+                 struct bch_ioctl_data op)
+{
+       int ret = 0;
+
+       switch (op.op) {
+       case BCH_DATA_OP_REREPLICATE:
+               stats->data_type = BCH_DATA_JOURNAL;
+               ret = bch2_journal_flush_device_pins(&c->journal, -1);
+
+               ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+               ret = bch2_gc_btree_replicas(c) ?: ret;
+
+               ret = bch2_move_data(c, NULL,
+                                    writepoint_hashed((unsigned long) current),
+                                    op.start,
+                                    op.end,
+                                    rereplicate_pred, c, stats) ?: ret;
+               ret = bch2_gc_data_replicas(c) ?: ret;
+               break;
+       case BCH_DATA_OP_MIGRATE:
+               if (op.migrate.dev >= c->sb.nr_devices)
+                       return -EINVAL;
+
+               stats->data_type = BCH_DATA_JOURNAL;
+               ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
+
+               ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
+               ret = bch2_gc_btree_replicas(c) ?: ret;
+
+               ret = bch2_move_data(c, NULL,
+                                    writepoint_hashed((unsigned long) current),
+                                    op.start,
+                                    op.end,
+                                    migrate_pred, &op, stats) ?: ret;
+               ret = bch2_gc_data_replicas(c) ?: ret;
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
new file mode 100644 (file)
index 0000000..3f7e31c
--- /dev/null
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_H
+#define _BCACHEFS_MOVE_H
+
+#include "btree_iter.h"
+#include "buckets.h"
+#include "io_types.h"
+#include "move_types.h"
+
+struct bch_read_bio;
+struct moving_context;
+
+enum data_cmd {
+       DATA_SKIP,
+       DATA_SCRUB,
+       DATA_ADD_REPLICAS,
+       DATA_REWRITE,
+       DATA_PROMOTE,
+};
+
+struct data_opts {
+       u16             target;
+       unsigned        rewrite_dev;
+       int             btree_insert_flags;
+};
+
+struct migrate_write {
+       enum data_cmd           data_cmd;
+       struct data_opts        data_opts;
+
+       unsigned                nr_ptrs_reserved;
+
+       struct moving_context   *ctxt;
+
+       /* what we read: */
+       struct bch_extent_ptr   ptr;
+       u64                     offset;
+
+       struct bch_write_op     op;
+};
+
+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
+                           struct write_point_specifier,
+                           struct bch_io_opts,
+                           enum data_cmd, struct data_opts,
+                           struct bkey_s_c);
+
+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
+                               enum bkey_type, struct bkey_s_c_extent,
+                               struct bch_io_opts *, struct data_opts *);
+
+int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
+                  struct write_point_specifier,
+                  struct bpos, struct bpos,
+                  move_pred_fn, void *,
+                  struct bch_move_stats *);
+
+int bch2_data_job(struct bch_fs *,
+                 struct bch_move_stats *,
+                 struct bch_ioctl_data);
+
+#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
new file mode 100644 (file)
index 0000000..8dbeb6e
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_TYPES_H
+#define _BCACHEFS_MOVE_TYPES_H
+
+struct bch_move_stats {
+       enum bch_data_type      data_type;
+       struct btree_iter       iter;
+
+       atomic64_t              keys_moved;
+       atomic64_t              sectors_moved;
+       atomic64_t              sectors_seen;
+       atomic64_t              sectors_raced;
+};
+
+#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
new file mode 100644 (file)
index 0000000..8b61b16
--- /dev/null
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "extents.h"
+#include "eytzinger.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "movinggc.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+#include <linux/wait.h>
+
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca)                                    \
+       ((ca)->free[RESERVE_MOVINGGC].size / 2)
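+
+/*
+ * For example, with a 128 bucket copygc reserve, a single pass allocates at
+ * most 64 buckets.  Even in the worst case - where every bucket this pass
+ * frees is released while the reserve is still full, and so goes back to the
+ * general allocator instead - the reserve still holds 64 buckets for the next
+ * pass, whose frees can then land in the slots this pass emptied.
+ */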
+
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca)                                    \
+       ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
+
+static inline int sectors_used_cmp(copygc_heap *heap,
+                                  struct copygc_heap_entry l,
+                                  struct copygc_heap_entry r)
+{
+       return (l.sectors > r.sectors) - (l.sectors < r.sectors);
+}
+
+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
+{
+       const struct copygc_heap_entry *l = _l;
+       const struct copygc_heap_entry *r = _r;
+
+       return (l->offset > r->offset) - (l->offset < r->offset);
+}
+
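+/*
+ * Does this extent have a pointer into one of the buckets selected for copygc?
+ * h->data is sorted by bucket offset (eytzinger order - see bch2_copygc()), so
+ * find the last entry at or before ptr->offset and check that the pointer
+ * falls within that bucket and the generation matches:
+ */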
+static bool __copygc_pred(struct bch_dev *ca,
+                         struct bkey_s_c_extent e)
+{
+       copygc_heap *h = &ca->copygc_heap;
+       const struct bch_extent_ptr *ptr =
+               bch2_extent_has_device(e, ca->dev_idx);
+
+       if (ptr) {
+               struct copygc_heap_entry search = { .offset = ptr->offset };
+
+               ssize_t i = eytzinger0_find_le(h->data, h->used,
+                                              sizeof(h->data[0]),
+                                              bucket_offset_cmp, &search);
+
+               return (i >= 0 &&
+                       ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+                       ptr->gen == h->data[i].gen);
+       }
+
+       return false;
+}
+
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+                                enum bkey_type type,
+                                struct bkey_s_c_extent e,
+                                struct bch_io_opts *io_opts,
+                                struct data_opts *data_opts)
+{
+       struct bch_dev *ca = arg;
+
+       if (!__copygc_pred(ca, e))
+               return DATA_SKIP;
+
+       data_opts->target               = dev_to_target(ca->dev_idx);
+       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
+       data_opts->rewrite_dev          = ca->dev_idx;
+       return DATA_REWRITE;
+}
+
+static bool have_copygc_reserve(struct bch_dev *ca)
+{
+       bool ret;
+
+       spin_lock(&ca->freelist_lock);
+       ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
+               ca->allocator_blocked;
+       spin_unlock(&ca->freelist_lock);
+
+       return ret;
+}
+
+static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
+{
+       copygc_heap *h = &ca->copygc_heap;
+       struct copygc_heap_entry e, *i;
+       struct bucket_array *buckets;
+       struct bch_move_stats move_stats;
+       u64 sectors_to_move = 0, sectors_not_moved = 0;
+       u64 buckets_to_move, buckets_not_moved = 0;
+       size_t b;
+       int ret;
+
+       memset(&move_stats, 0, sizeof(move_stats));
+       closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+
+       /*
+        * Find buckets with lowest sector counts, skipping completely
+        * empty buckets, by building a maxheap sorted by sector count,
+        * and repeatedly replacing the maximum element until all
+        * buckets have been visited.
+        */
+       h->used = 0;
+
+       /*
+        * We need bucket marks to be up to date - gc can't be recalculating
+        * them:
+        */
+       down_read(&c->gc_lock);
+       down_read(&ca->bucket_lock);
+       buckets = bucket_array(ca);
+
+       for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
+               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+               struct copygc_heap_entry e;
+
+               if (m.owned_by_allocator ||
+                   m.data_type != BCH_DATA_USER ||
+                   !bucket_sectors_used(m) ||
+                   bucket_sectors_used(m) >= ca->mi.bucket_size)
+                       continue;
+
+               e = (struct copygc_heap_entry) {
+                       .gen            = m.gen,
+                       .sectors        = bucket_sectors_used(m),
+                       .offset         = bucket_to_sector(ca, b),
+               };
+               heap_add_or_replace(h, e, -sectors_used_cmp);
+       }
+       up_read(&ca->bucket_lock);
+       up_read(&c->gc_lock);
+
+       for (i = h->data; i < h->data + h->used; i++)
+               sectors_to_move += i->sectors;
+
+       while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
+               BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
+               sectors_to_move -= e.sectors;
+       }
+
+       buckets_to_move = h->used;
+
+       if (!buckets_to_move)
+               return;
+
+       eytzinger0_sort(h->data, h->used,
+                       sizeof(h->data[0]),
+                       bucket_offset_cmp, NULL);
+
+       ret = bch2_move_data(c, &ca->copygc_pd.rate,
+                            writepoint_ptr(&ca->copygc_write_point),
+                            POS_MIN, POS_MAX,
+                            copygc_pred, ca,
+                            &move_stats);
+
+       down_read(&ca->bucket_lock);
+       buckets = bucket_array(ca);
+       for (i = h->data; i < h->data + h->used; i++) {
+               size_t b = sector_to_bucket(ca, i->offset);
+               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+
+               if (i->gen == m.gen && bucket_sectors_used(m)) {
+                       sectors_not_moved += bucket_sectors_used(m);
+                       buckets_not_moved++;
+               }
+       }
+       up_read(&ca->bucket_lock);
+
+       if (sectors_not_moved && !ret)
+               bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
+                        sectors_not_moved, sectors_to_move,
+                        buckets_not_moved, buckets_to_move);
+
+       trace_copygc(ca,
+                    atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
+                    buckets_to_move, buckets_not_moved);
+}
+
+static int bch2_copygc_thread(void *arg)
+{
+       struct bch_dev *ca = arg;
+       struct bch_fs *c = ca->fs;
+       struct io_clock *clock = &c->io_clock[WRITE];
+       struct bch_dev_usage usage;
+       unsigned long last;
+       u64 available, fragmented, reserve, next;
+
+       set_freezable();
+
+       while (!kthread_should_stop()) {
+               if (kthread_wait_freezable(c->copy_gc_enabled))
+                       break;
+
+               last = atomic_long_read(&clock->now);
+
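+               /* half of the gc reserve, in sectors: */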
+               reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
+                                ca->mi.bucket_size *
+                                c->opts.gc_reserve_percent, 200);
+
+               usage = bch2_dev_usage_read(c, ca);
+
+               /*
+                * don't start copygc until less than half the gc reserve is
+                * available:
+                */
+               available = __dev_buckets_available(ca, usage) *
+                       ca->mi.bucket_size;
+               if (available > reserve) {
+                       next = last + available - reserve;
+                       bch2_kthread_io_clock_wait(clock, next,
+                                       MAX_SCHEDULE_TIMEOUT);
+                       continue;
+               }
+
+               /*
+                * don't start copygc until there's more than half the copygc
+                * reserve of fragmented space:
+                */
+               fragmented = usage.sectors_fragmented;
+               if (fragmented < reserve) {
+                       next = last + reserve - fragmented;
+                       bch2_kthread_io_clock_wait(clock, next,
+                                       MAX_SCHEDULE_TIMEOUT);
+                       continue;
+               }
+
+               bch2_copygc(c, ca);
+       }
+
+       return 0;
+}
+
+void bch2_copygc_stop(struct bch_dev *ca)
+{
+       ca->copygc_pd.rate.rate = UINT_MAX;
+       bch2_ratelimit_reset(&ca->copygc_pd.rate);
+
+       if (ca->copygc_thread) {
+               kthread_stop(ca->copygc_thread);
+               put_task_struct(ca->copygc_thread);
+       }
+       ca->copygc_thread = NULL;
+}
+
+int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct task_struct *t;
+
+       BUG_ON(ca->copygc_thread);
+
+       if (c->opts.nochanges)
+               return 0;
+
+       if (bch2_fs_init_fault("copygc_start"))
+               return -ENOMEM;
+
+       t = kthread_create(bch2_copygc_thread, ca,
+                          "bch_copygc[%s]", ca->name);
+       if (IS_ERR(t))
+               return PTR_ERR(t);
+
+       get_task_struct(t);
+
+       ca->copygc_thread = t;
+       wake_up_process(ca->copygc_thread);
+
+       return 0;
+}
+
+void bch2_dev_copygc_init(struct bch_dev *ca)
+{
+       bch2_pd_controller_init(&ca->copygc_pd);
+       ca->copygc_pd.d_term = 0;
+}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
new file mode 100644 (file)
index 0000000..dcd4796
--- /dev/null
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVINGGC_H
+#define _BCACHEFS_MOVINGGC_H
+
+void bch2_copygc_stop(struct bch_dev *);
+int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
+void bch2_dev_copygc_init(struct bch_dev *);
+
+#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
new file mode 100644 (file)
index 0000000..9351cae
--- /dev/null
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "opts.h"
+#include "super-io.h"
+#include "util.h"
+
+const char * const bch2_error_actions[] = {
+       "continue",
+       "remount-ro",
+       "panic",
+       NULL
+};
+
+const char * const bch2_csum_types[] = {
+       "none",
+       "crc32c",
+       "crc64",
+       NULL
+};
+
+const char * const bch2_compression_types[] = {
+       "none",
+       "lz4",
+       "gzip",
+       "zstd",
+       NULL
+};
+
+const char * const bch2_str_hash_types[] = {
+       "crc32c",
+       "crc64",
+       "siphash",
+       NULL
+};
+
+const char * const bch2_data_types[] = {
+       "none",
+       "sb",
+       "journal",
+       "btree",
+       "data",
+       "cached",
+       NULL
+};
+
+const char * const bch2_cache_replacement_policies[] = {
+       "lru",
+       "fifo",
+       "random",
+       NULL
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+const char * const bch2_cache_modes[] = {
+       "default",
+       "writethrough",
+       "writeback",
+       "writearound",
+       "none",
+       NULL
+};
+
+const char * const bch2_dev_state[] = {
+       "readwrite",
+       "readonly",
+       "failed",
+       "spare",
+       NULL
+};
+
+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
+{
+#define BCH_OPT(_name, ...)                                            \
+       if (opt_defined(src, _name))                                    \
+               opt_set(*dst, _name, src._name);
+
+       BCH_OPTS()
+#undef BCH_OPT
+}
+
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+       switch (id) {
+#define BCH_OPT(_name, ...)                                            \
+       case Opt_##_name:                                               \
+               return opt_defined(*opts, _name);
+       BCH_OPTS()
+#undef BCH_OPT
+       default:
+               BUG();
+       }
+}
+
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+       switch (id) {
+#define BCH_OPT(_name, ...)                                            \
+       case Opt_##_name:                                               \
+               return opts->_name;
+       BCH_OPTS()
+#undef BCH_OPT
+       default:
+               BUG();
+       }
+}
+
+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
+{
+       switch (id) {
+#define BCH_OPT(_name, ...)                                            \
+       case Opt_##_name:                                               \
+               opt_set(*opts, _name, v);                               \
+               break;
+       BCH_OPTS()
+#undef BCH_OPT
+       default:
+               BUG();
+       }
+}
+
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
+{
+       struct bch_opts opts = bch2_opts_empty();
+
+#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default)         \
+       if (_sb_opt != NO_SB_OPT)                                       \
+               opt_set(opts, _name, _sb_opt(sb));
+       BCH_OPTS()
+#undef BCH_OPT
+
+       return opts;
+}
+
+const struct bch_option bch2_opt_table[] = {
+#define OPT_BOOL()             .type = BCH_OPT_BOOL
+#define OPT_UINT(_min, _max)   .type = BCH_OPT_UINT, .min = _min, .max = _max
+#define OPT_STR(_choices)      .type = BCH_OPT_STR, .choices = _choices
+#define OPT_FN(_fn)            .type = BCH_OPT_FN,                     \
+                               .parse = _fn##_parse,                   \
+                               .print = _fn##_print
+
+#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default)         \
+       [Opt_##_name] = {                                               \
+               .attr   = {                                             \
+                       .name   = #_name,                               \
+                       .mode = _mode == OPT_RUNTIME ? 0644 : 0444,     \
+               },                                                      \
+               .mode   = _mode,                                        \
+               .set_sb = SET_##_sb_opt,                                \
+               _type                                                   \
+       },
+
+       BCH_OPTS()
+#undef BCH_OPT
+};
+
+int bch2_opt_lookup(const char *name)
+{
+       const struct bch_option *i;
+
+       for (i = bch2_opt_table;
+            i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
+            i++)
+               if (!strcmp(name, i->attr.name))
+                       return i - bch2_opt_table;
+
+       return -1;
+}
+
+struct synonym {
+       const char      *s1, *s2;
+};
+
+static const struct synonym bch_opt_synonyms[] = {
+       { "quota",      "usrquota" },
+};
+
+static int bch2_mount_opt_lookup(const char *name)
+{
+       const struct synonym *i;
+
+       for (i = bch_opt_synonyms;
+            i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
+            i++)
+               if (!strcmp(name, i->s1))
+                       name = i->s2;
+
+       return bch2_opt_lookup(name);
+}
+
+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
+                  const char *val, u64 *res)
+{
+       ssize_t ret;
+
+       switch (opt->type) {
+       case BCH_OPT_BOOL:
+               ret = kstrtou64(val, 10, res);
+               if (ret < 0)
+                       return ret;
+
+               if (*res > 1)
+                       return -ERANGE;
+               break;
+       case BCH_OPT_UINT:
+               ret = kstrtou64(val, 10, res);
+               if (ret < 0)
+                       return ret;
+
+               if (*res < opt->min || *res >= opt->max)
+                       return -ERANGE;
+               break;
+       case BCH_OPT_STR:
+               ret = match_string(opt->choices, -1, val);
+               if (ret < 0)
+                       return ret;
+
+               *res = ret;
+               break;
+       case BCH_OPT_FN:
+               if (!c)
+                       return -EINVAL;
+
+               return opt->parse(c, val, res);
+       }
+
+       return 0;
+}
+
+int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len,
+                    const struct bch_option *opt, u64 v,
+                    unsigned flags)
+{
+       char *out = buf, *end = buf + len;
+
+       if (flags & OPT_SHOW_MOUNT_STYLE) {
+               if (opt->type == BCH_OPT_BOOL)
+                       return scnprintf(out, end - out, "%s%s",
+                                        v ? "" : "no",
+                                        opt->attr.name);
+
+               out += scnprintf(out, end - out, "%s=", opt->attr.name);
+       }
+
+       switch (opt->type) {
+       case BCH_OPT_BOOL:
+       case BCH_OPT_UINT:
+               out += scnprintf(out, end - out, "%lli", v);
+               break;
+       case BCH_OPT_STR:
+               out += (flags & OPT_SHOW_FULL_LIST)
+                       ? bch2_scnprint_string_list(out, end - out, opt->choices, v)
+                       : scnprintf(out, end - out, "%s", opt->choices[v]);
+               break;
+       case BCH_OPT_FN:
+               return opt->print(c, out, end - out, v);
+       default:
+               BUG();
+       }
+
+       return out - buf;
+}
+
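+/*
+ * Parse a comma separated mount option string, e.g.
+ * "degraded,errors=remount-ro,noacl": "name=value" pairs are parsed according
+ * to the option type; a bare name sets a boolean option, and a "no" prefix
+ * clears it.
+ */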
+int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
+{
+       char *opt, *name, *val;
+       int ret, id;
+       u64 v;
+
+       while ((opt = strsep(&options, ",")) != NULL) {
+               name    = strsep(&opt, "=");
+               val     = opt;
+
+               if (val) {
+                       id = bch2_mount_opt_lookup(name);
+                       if (id < 0)
+                               goto bad_opt;
+
+                       ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v);
+                       if (ret < 0)
+                               goto bad_val;
+               } else {
+                       id = bch2_mount_opt_lookup(name);
+                       v = 1;
+
+                       if (id < 0 &&
+                           !strncmp("no", name, 2)) {
+                               id = bch2_mount_opt_lookup(name + 2);
+                               v = 0;
+                       }
+
+                       if (id < 0)
+                               goto bad_opt;
+
+                       if (bch2_opt_table[id].type != BCH_OPT_BOOL)
+                               goto no_val;
+               }
+
+               if (bch2_opt_table[id].mode < OPT_MOUNT)
+                       goto bad_opt;
+
+               if (id == Opt_acl &&
+                   !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+                       goto bad_opt;
+
+               if ((id == Opt_usrquota ||
+                    id == Opt_grpquota) &&
+                   !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+                       goto bad_opt;
+
+               bch2_opt_set_by_id(opts, id, v);
+       }
+
+       return 0;
+bad_opt:
+       pr_err("Bad mount option %s", name);
+       return -1;
+bad_val:
+       pr_err("Invalid value %s for mount option %s", val, name);
+       return -1;
+no_val:
+       pr_err("Mount option %s requires a value", name);
+       return -1;
+}
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+       struct bch_io_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (opt_defined(src, _name))                                    \
+               opt_set(ret, _name, src._name);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       return ret;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+{
+       struct bch_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (opt_defined(src, _name))                                    \
+               opt_set(ret, _name, src._name);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       return ret;
+}
+
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+{
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (opt_defined(src, _name))                                    \
+               opt_set(*dst, _name, src._name);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+}
+
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+       static const enum bch_opt_id inode_opt_list[] = {
+#define BCH_INODE_OPT(_name, _bits)    Opt_##_name,
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       };
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+               if (inode_opt_list[i] == id)
+                       return true;
+
+       return false;
+}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
new file mode 100644 (file)
index 0000000..3b5eddb
--- /dev/null
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_OPTS_H
+#define _BCACHEFS_OPTS_H
+
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include "bcachefs_format.h"
+
+extern const char * const bch2_error_actions[];
+extern const char * const bch2_csum_types[];
+extern const char * const bch2_compression_types[];
+extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_data_types[];
+extern const char * const bch2_cache_replacement_policies[];
+extern const char * const bch2_cache_modes[];
+extern const char * const bch2_dev_state[];
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * Each option carries its own "defined" bit (see struct bch_opts below). This
+ * means we can pass the mount options to bch2_fs_alloc() as a whole struct,
+ * and then only apply the options from that struct that are actually defined.
+ */
+
+/* dummy option, for options that aren't stored in the superblock */
+LE64_BITMASK(NO_SB_OPT,                struct bch_sb, flags[0], 0, 0);
+
+enum opt_mode {
+       OPT_INTERNAL,
+       OPT_FORMAT,
+       OPT_MOUNT,
+       OPT_RUNTIME,
+};
+
+enum opt_type {
+       BCH_OPT_BOOL,
+       BCH_OPT_UINT,
+       BCH_OPT_STR,
+       BCH_OPT_FN,
+};
+
+/**
+ * BCH_OPT(name, in-memory type, mode, type, sb_opt, default)
+ *
+ * @name       - name of mount option, sysfs attribute, and struct bch_opts
+ *               member
+ *
+ * @mode       - when opt may be set
+ *
+ * @sb_opt     - name of corresponding superblock option
+ *
+ * @type       - one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_FN
+ */
+
+/*
+ * XXX: add fields for
+ *  - default value
+ *  - helptext
+ */
+
+#define BCH_OPTS()                                                     \
+       BCH_OPT(block_size,             u16,    OPT_FORMAT,             \
+               OPT_UINT(1, 128),                                       \
+               BCH_SB_BLOCK_SIZE,              8)                      \
+       BCH_OPT(btree_node_size,        u16,    OPT_FORMAT,             \
+               OPT_UINT(1, 128),                                       \
+               BCH_SB_BTREE_NODE_SIZE,         512)                    \
+       BCH_OPT(errors,                 u8,     OPT_RUNTIME,            \
+               OPT_STR(bch2_error_actions),                            \
+               BCH_SB_ERROR_ACTION,            BCH_ON_ERROR_RO)        \
+       BCH_OPT(metadata_replicas,      u8,     OPT_RUNTIME,            \
+               OPT_UINT(1, BCH_REPLICAS_MAX),                          \
+               BCH_SB_META_REPLICAS_WANT,      1)                      \
+       BCH_OPT(data_replicas,          u8,     OPT_RUNTIME,            \
+               OPT_UINT(1, BCH_REPLICAS_MAX),                          \
+               BCH_SB_DATA_REPLICAS_WANT,      1)                      \
+       BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT,              \
+               OPT_UINT(1, BCH_REPLICAS_MAX),                          \
+               BCH_SB_META_REPLICAS_REQ,       1)                      \
+       BCH_OPT(data_replicas_required, u8,     OPT_MOUNT,              \
+               OPT_UINT(1, BCH_REPLICAS_MAX),                          \
+               BCH_SB_DATA_REPLICAS_REQ,       1)                      \
+       BCH_OPT(metadata_checksum,      u8,     OPT_RUNTIME,            \
+               OPT_STR(bch2_csum_types),                               \
+               BCH_SB_META_CSUM_TYPE,          BCH_CSUM_OPT_CRC32C)    \
+       BCH_OPT(data_checksum,          u8,     OPT_RUNTIME,            \
+               OPT_STR(bch2_csum_types),                               \
+               BCH_SB_DATA_CSUM_TYPE,          BCH_CSUM_OPT_CRC32C)    \
+       BCH_OPT(compression,            u8,     OPT_RUNTIME,            \
+               OPT_STR(bch2_compression_types),                        \
+               BCH_SB_COMPRESSION_TYPE,        BCH_COMPRESSION_OPT_NONE)\
+       BCH_OPT(background_compression, u8,     OPT_RUNTIME,            \
+               OPT_STR(bch2_compression_types),                        \
+               BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\
+       BCH_OPT(str_hash,               u8,     OPT_RUNTIME,            \
+               OPT_STR(bch2_str_hash_types),                           \
+               BCH_SB_STR_HASH_TYPE,           BCH_STR_HASH_SIPHASH)   \
+       BCH_OPT(foreground_target,      u16,    OPT_RUNTIME,            \
+               OPT_FN(bch2_opt_target),                                \
+               BCH_SB_FOREGROUND_TARGET,       0)                      \
+       BCH_OPT(background_target,      u16,    OPT_RUNTIME,            \
+               OPT_FN(bch2_opt_target),                                \
+               BCH_SB_BACKGROUND_TARGET,       0)                      \
+       BCH_OPT(promote_target,         u16,    OPT_RUNTIME,            \
+               OPT_FN(bch2_opt_target),                                \
+               BCH_SB_PROMOTE_TARGET,  0)                              \
+       BCH_OPT(inodes_32bit,           u8,     OPT_RUNTIME,            \
+               OPT_BOOL(),                                             \
+               BCH_SB_INODE_32BIT,             false)                  \
+       BCH_OPT(gc_reserve_percent,     u8,     OPT_MOUNT,              \
+               OPT_UINT(5, 21),                                        \
+               BCH_SB_GC_RESERVE,              8)                      \
+       BCH_OPT(root_reserve_percent,   u8,     OPT_MOUNT,              \
+               OPT_UINT(0, 100),                                       \
+               BCH_SB_ROOT_RESERVE,            0)                      \
+       BCH_OPT(wide_macs,              u8,     OPT_RUNTIME,            \
+               OPT_BOOL(),                                             \
+               BCH_SB_128_BIT_MACS,            false)                  \
+       BCH_OPT(acl,                    u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               BCH_SB_POSIX_ACL,               true)                   \
+       BCH_OPT(usrquota,               u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               BCH_SB_USRQUOTA,                false)                  \
+       BCH_OPT(grpquota,               u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               BCH_SB_GRPQUOTA,                false)                  \
+       BCH_OPT(prjquota,               u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               BCH_SB_PRJQUOTA,                false)                  \
+       BCH_OPT(degraded,               u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(discard,                u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(verbose_recovery,       u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(verbose_init,           u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(journal_flush_disabled, u8,     OPT_RUNTIME,            \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(nofsck,                 u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(fix_errors,             u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(nochanges,              u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(noreplay,               u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(norecovery,             u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(noexcl,                 u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(sb,                     u64,    OPT_MOUNT,              \
+               OPT_UINT(0, S64_MAX),                                   \
+               NO_SB_OPT,                      BCH_SB_SECTOR)          \
+       BCH_OPT(read_only,              u8,     OPT_INTERNAL,           \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(nostart,                u8,     OPT_INTERNAL,           \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
+       BCH_OPT(no_data_io,             u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)
+
+struct bch_opts {
+#define BCH_OPT(_name, _bits, ...)     unsigned _name##_defined:1;
+       BCH_OPTS()
+#undef BCH_OPT
+
+#define BCH_OPT(_name, _bits, ...)     _bits   _name;
+       BCH_OPTS()
+#undef BCH_OPT
+};
+
+static const struct bch_opts bch2_opts_default = {
+#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default)         \
+       ._name##_defined = true,                                        \
+       ._name = _default,                                              \
+
+       BCH_OPTS()
+#undef BCH_OPT
+};
+
+#define opt_defined(_opts, _name)      ((_opts)._name##_defined)
+
+#define opt_get(_opts, _name)                                          \
+       (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
+
+#define opt_set(_opts, _name, _v)                                      \
+do {                                                                   \
+       (_opts)._name##_defined = true;                                 \
+       (_opts)._name = _v;                                             \
+} while (0)
+
+static inline struct bch_opts bch2_opts_empty(void)
+{
+       return (struct bch_opts) { 0 };
+}
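+
+/*
+ * Example usage (illustrative only - any option in BCH_OPTS() works the same
+ * way):
+ *
+ *     struct bch_opts opts = bch2_opts_empty();
+ *
+ *     opt_set(opts, metadata_replicas, 2);
+ *     if (opt_defined(opts, metadata_replicas))
+ *             pr_info("%u", opt_get(opts, metadata_replicas));
+ */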
+
+void bch2_opts_apply(struct bch_opts *, struct bch_opts);
+
+enum bch_opt_id {
+#define BCH_OPT(_name, ...)    Opt_##_name,
+       BCH_OPTS()
+#undef BCH_OPT
+       bch2_opts_nr
+};
+
+struct bch_fs;
+
+struct bch_option {
+       struct attribute        attr;
+       void                    (*set_sb)(struct bch_sb *, u64);
+       enum opt_mode           mode;
+       enum opt_type           type;
+
+       union {
+       struct {
+               u64             min, max;
+       };
+       struct {
+               const char * const *choices;
+       };
+       struct {
+               int (*parse)(struct bch_fs *, const char *, u64 *);
+               int (*print)(struct bch_fs *, char *, size_t, u64);
+       };
+       };
+
+};
+
+extern const struct bch_option bch2_opt_table[];
+
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+
+struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+
+int bch2_opt_lookup(const char *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
+
+#define OPT_SHOW_FULL_LIST     (1 << 0)
+#define OPT_SHOW_MOUNT_STYLE   (1 << 1)
+
+int bch2_opt_to_text(struct bch_fs *, char *, size_t,
+                    const struct bch_option *, u64, unsigned);
+
+int bch2_parse_mount_opts(struct bch_opts *, char *);
+
+/* inode opts: */
+
+#define BCH_INODE_OPTS()                                       \
+       BCH_INODE_OPT(data_checksum,                    8)      \
+       BCH_INODE_OPT(compression,                      8)      \
+       BCH_INODE_OPT(background_compression,           8)      \
+       BCH_INODE_OPT(data_replicas,                    8)      \
+       BCH_INODE_OPT(promote_target,                   16)     \
+       BCH_INODE_OPT(foreground_target,                16)     \
+       BCH_INODE_OPT(background_target,                16)
+
+struct bch_io_opts {
+#define BCH_INODE_OPT(_name, _bits)    unsigned _name##_defined:1;
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+
+#define BCH_INODE_OPT(_name, _bits)    u##_bits _name;
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
+#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
new file mode 100644 (file)
index 0000000..0adbfe5
--- /dev/null
@@ -0,0 +1,790 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "inode.h"
+#include "quota.h"
+#include "super-io.h"
+
+static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+                                         struct bch_sb_field *f)
+{
+       struct bch_sb_field_quota *q = field_to_type(f, quota);
+
+       if (vstruct_bytes(&q->field) != sizeof(*q))
+               return "invalid field quota: wrong size";
+
+       return NULL;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+       .validate       = bch2_sb_validate_quota,
+};
+
+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_quota dq;
+
+       if (k.k->p.inode >= QTYP_NR)
+               return "invalid quota type";
+
+       switch (k.k->type) {
+       case BCH_QUOTA: {
+               dq = bkey_s_c_to_quota(k);
+
+               if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
+                       return "incorrect value size";
+
+               return NULL;
+       }
+       default:
+               return "invalid type";
+       }
+}
+
+static const char * const bch2_quota_counters[] = {
+       "space",
+       "inodes",
+};
+
+void bch2_quota_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
+{
+       char *out = buf, *end = buf + size;
+       struct bkey_s_c_quota dq;
+       unsigned i;
+
+       switch (k.k->type) {
+       case BCH_QUOTA:
+               dq = bkey_s_c_to_quota(k);
+
+               for (i = 0; i < Q_COUNTERS; i++)
+                       out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu ",
+                                        bch2_quota_counters[i],
+                                        le64_to_cpu(dq.v->c[i].hardlimit),
+                                        le64_to_cpu(dq.v->c[i].softlimit));
+               break;
+       }
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+
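+/* Next quota type at or after @i set in @qtypes, or QTYP_NR if none: */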
+static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
+{
+       qtypes >>= i;
+       return qtypes ? i + __ffs(qtypes) : QTYP_NR;
+}
+
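+/* Iterate over the quota types set in @_qtypes; @_q points at c->quotas[_i]: */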
+#define for_each_set_qtype(_c, _i, _q, _qtypes)                                \
+       for (_i = 0;                                                    \
+            (_i = __next_qtype(_i, _qtypes),                           \
+             _q = &(_c)->quotas[_i],                                   \
+             _i < QTYP_NR);                                            \
+            _i++)
+
+static bool ignore_hardlimit(struct bch_memquota_type *q)
+{
+       if (capable(CAP_SYS_RESOURCE))
+               return true;
+#if 0
+       struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+       return capable(CAP_SYS_RESOURCE) &&
+              (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+               !(info->dqi_flags & DQF_ROOT_SQUASH));
+#endif
+       return false;
+}
+
+enum quota_msg {
+       SOFTWARN,       /* Softlimit reached */
+       SOFTLONGWARN,   /* Grace time expired */
+       HARDWARN,       /* Hardlimit reached */
+
+       HARDBELOW,      /* Usage got below inode hardlimit */
+       SOFTBELOW,      /* Usage got below inode softlimit */
+};
+
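+/* Netlink warning codes, indexed by [quota_msg][quota_counter]: */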
+static int quota_nl[][Q_COUNTERS] = {
+       [HARDWARN][Q_SPC]       = QUOTA_NL_BHARDWARN,
+       [SOFTLONGWARN][Q_SPC]   = QUOTA_NL_BSOFTLONGWARN,
+       [SOFTWARN][Q_SPC]       = QUOTA_NL_BSOFTWARN,
+       [HARDBELOW][Q_SPC]      = QUOTA_NL_BHARDBELOW,
+       [SOFTBELOW][Q_SPC]      = QUOTA_NL_BSOFTBELOW,
+
+       [HARDWARN][Q_INO]       = QUOTA_NL_IHARDWARN,
+       [SOFTLONGWARN][Q_INO]   = QUOTA_NL_ISOFTLONGWARN,
+       [SOFTWARN][Q_INO]       = QUOTA_NL_ISOFTWARN,
+       [HARDBELOW][Q_INO]      = QUOTA_NL_IHARDBELOW,
+       [SOFTBELOW][Q_INO]      = QUOTA_NL_ISOFTBELOW,
+};
+
+struct quota_msgs {
+       u8              nr;
+       struct {
+               u8      qtype;
+               u8      msg;
+       }               m[QTYP_NR * Q_COUNTERS];
+};
+
+static void prepare_msg(unsigned qtype,
+                       enum quota_counters counter,
+                       struct quota_msgs *msgs,
+                       enum quota_msg msg_type)
+{
+       BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
+
+       msgs->m[msgs->nr].qtype = qtype;
+       msgs->m[msgs->nr].msg   = quota_nl[msg_type][counter];
+       msgs->nr++;
+}
+
+static void prepare_warning(struct memquota_counter *qc,
+                           unsigned qtype,
+                           enum quota_counters counter,
+                           struct quota_msgs *msgs,
+                           enum quota_msg msg_type)
+{
+       if (qc->warning_issued & (1 << msg_type))
+               return;
+
+       prepare_msg(qtype, counter, msgs, msg_type);
+}
+
+static void flush_warnings(struct bch_qid qid,
+                          struct super_block *sb,
+                          struct quota_msgs *msgs)
+{
+       unsigned i;
+
+       for (i = 0; i < msgs->nr; i++)
+               quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype,
+                                            qid.q[msgs->m[i].qtype]),
+                                  sb->s_dev, msgs->m[i].msg);
+}
+
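+/*
+ * Check whether adding @v to @counter would exceed the soft or hard limits:
+ * queues warnings, and in BCH_QUOTA_PREALLOC mode returns -EDQUOT instead of
+ * exceeding a limit.
+ */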
+static int bch2_quota_check_limit(struct bch_fs *c,
+                                 unsigned qtype,
+                                 struct bch_memquota *mq,
+                                 struct quota_msgs *msgs,
+                                 enum quota_counters counter,
+                                 s64 v,
+                                 enum quota_acct_mode mode)
+{
+       struct bch_memquota_type *q = &c->quotas[qtype];
+       struct memquota_counter *qc = &mq->c[counter];
+       u64 n = qc->v + v;
+
+       BUG_ON((s64) n < 0);
+
+       if (mode == BCH_QUOTA_NOCHECK)
+               return 0;
+
+       if (v <= 0) {
+               if (n < qc->hardlimit &&
+                   (qc->warning_issued & (1 << HARDWARN))) {
+                       qc->warning_issued &= ~(1 << HARDWARN);
+                       prepare_msg(qtype, counter, msgs, HARDBELOW);
+               }
+
+               if (n < qc->softlimit &&
+                   (qc->warning_issued & (1 << SOFTWARN))) {
+                       qc->warning_issued &= ~(1 << SOFTWARN);
+                       prepare_msg(qtype, counter, msgs, SOFTBELOW);
+               }
+
+               qc->warning_issued = 0;
+               return 0;
+       }
+
+       if (qc->hardlimit &&
+           qc->hardlimit < n &&
+           !ignore_hardlimit(q)) {
+               if (mode == BCH_QUOTA_PREALLOC)
+                       return -EDQUOT;
+
+               prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+       }
+
+       if (qc->softlimit &&
+           qc->softlimit < n &&
+           qc->timer &&
+           ktime_get_real_seconds() >= qc->timer &&
+           !ignore_hardlimit(q)) {
+               if (mode == BCH_QUOTA_PREALLOC)
+                       return -EDQUOT;
+
+               prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
+       }
+
+       if (qc->softlimit &&
+           qc->softlimit < n &&
+           qc->timer == 0) {
+               if (mode == BCH_QUOTA_PREALLOC)
+                       return -EDQUOT;
+
+               prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+
+               /* XXX is this the right one? */
+               qc->timer = ktime_get_real_seconds() +
+                       q->limits[counter].warnlimit;
+       }
+
+       return 0;
+}
+
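+/*
+ * Account a usage change of @v (sectors for Q_SPC, inodes for Q_INO) against
+ * @qid for each enabled quota type; counters are only updated if every limit
+ * check passes.
+ */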
+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+                   enum quota_counters counter, s64 v,
+                   enum quota_acct_mode mode)
+{
+       unsigned qtypes = enabled_qtypes(c);
+       struct bch_memquota_type *q;
+       struct bch_memquota *mq[QTYP_NR];
+       struct quota_msgs msgs;
+       unsigned i;
+       int ret = 0;
+
+       memset(&msgs, 0, sizeof(msgs));
+
+       for_each_set_qtype(c, i, q, qtypes)
+               mutex_lock_nested(&q->lock, i);
+
+       for_each_set_qtype(c, i, q, qtypes) {
+               mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
+               if (!mq[i]) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
+               if (ret)
+                       goto err;
+       }
+
+       for_each_set_qtype(c, i, q, qtypes)
+               mq[i]->c[counter].v += v;
+err:
+       for_each_set_qtype(c, i, q, qtypes)
+               mutex_unlock(&q->lock);
+
+       flush_warnings(qid, c->vfs_sb, &msgs);
+
+       return ret;
+}
+
+static void __bch2_quota_transfer(struct bch_memquota *src_q,
+                                 struct bch_memquota *dst_q,
+                                 enum quota_counters counter, s64 v)
+{
+       BUG_ON(v > src_q->c[counter].v);
+       BUG_ON(v + dst_q->c[counter].v < v);
+
+       src_q->c[counter].v -= v;
+       dst_q->c[counter].v += v;
+}
+
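+/*
+ * Move @space sectors and one inode of usage from @src to @dst, checking the
+ * destination's limits first:
+ */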
+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+                       struct bch_qid dst,
+                       struct bch_qid src, u64 space)
+{
+       struct bch_memquota_type *q;
+       struct bch_memquota *src_q[3], *dst_q[3];
+       struct quota_msgs msgs;
+       unsigned i;
+       int ret = 0;
+
+       qtypes &= enabled_qtypes(c);
+
+       memset(&msgs, 0, sizeof(msgs));
+
+       for_each_set_qtype(c, i, q, qtypes)
+               mutex_lock_nested(&q->lock, i);
+
+       for_each_set_qtype(c, i, q, qtypes) {
+               src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
+               dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
+
+               if (!src_q[i] || !dst_q[i]) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
+                                            dst_q[i]->c[Q_SPC].v + space,
+                                            BCH_QUOTA_PREALLOC);
+               if (ret)
+                       goto err;
+
+               ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
+                                            dst_q[i]->c[Q_INO].v + 1,
+                                            BCH_QUOTA_PREALLOC);
+               if (ret)
+                       goto err;
+       }
+
+       for_each_set_qtype(c, i, q, qtypes) {
+               __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
+               __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
+       }
+
+err:
+       for_each_set_qtype(c, i, q, qtypes)
+               mutex_unlock(&q->lock);
+
+       flush_warnings(dst, c->vfs_sb, &msgs);
+
+       return ret;
+}
+
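+/* Copy the limits from an on-disk quota key into the in-memory table: */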
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_quota dq;
+       struct bch_memquota_type *q;
+       struct bch_memquota *mq;
+       unsigned i;
+
+       BUG_ON(k.k->p.inode >= QTYP_NR);
+
+       switch (k.k->type) {
+       case BCH_QUOTA:
+               dq = bkey_s_c_to_quota(k);
+               q = &c->quotas[k.k->p.inode];
+
+               mutex_lock(&q->lock);
+               mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
+               if (!mq) {
+                       mutex_unlock(&q->lock);
+                       return -ENOMEM;
+               }
+
+               for (i = 0; i < Q_COUNTERS; i++) {
+                       mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
+                       mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
+               }
+
+               mutex_unlock(&q->lock);
+       }
+
+       return 0;
+}
+
+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0),
+                          BTREE_ITER_PREFETCH, k) {
+               if (k.k->p.inode != type)
+                       break;
+
+               ret = __bch2_quota_set(c, k);
+               if (ret)
+                       break;
+       }
+
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+void bch2_fs_quota_exit(struct bch_fs *c)
+{
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+               genradix_free(&c->quotas[i].table);
+}
+
+void bch2_fs_quota_init(struct bch_fs *c)
+{
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+               mutex_init(&c->quotas[i].lock);
+}
+
+static void bch2_sb_quota_read(struct bch_fs *c)
+{
+       struct bch_sb_field_quota *sb_quota;
+       unsigned i, j;
+
+       sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+       if (!sb_quota)
+               return;
+
+       for (i = 0; i < QTYP_NR; i++) {
+               struct bch_memquota_type *q = &c->quotas[i];
+
+               for (j = 0; j < Q_COUNTERS; j++) {
+                       q->limits[j].timelimit =
+                               le32_to_cpu(sb_quota->q[i].c[j].timelimit);
+                       q->limits[j].warnlimit =
+                               le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
+               }
+       }
+}
+
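+/*
+ * At mount time: load limits from the superblock and quota btree, then walk
+ * the inodes btree to reconstruct per-qid usage:
+ */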
+int bch2_fs_quota_read(struct bch_fs *c)
+{
+       unsigned i, qtypes = enabled_qtypes(c);
+       struct bch_memquota_type *q;
+       struct btree_iter iter;
+       struct bch_inode_unpacked u;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       mutex_lock(&c->sb_lock);
+       bch2_sb_quota_read(c);
+       mutex_unlock(&c->sb_lock);
+
+       for_each_set_qtype(c, i, q, qtypes) {
+               ret = bch2_quota_init_type(c, i);
+               if (ret)
+                       return ret;
+       }
+
+       for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN,
+                          BTREE_ITER_PREFETCH, k) {
+               switch (k.k->type) {
+               case BCH_INODE_FS:
+                       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+                       if (ret)
+                               return ret;
+
+                       bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+                                       BCH_QUOTA_NOCHECK);
+                       bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+                                       BCH_QUOTA_NOCHECK);
+               }
+       }
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/* Enable/disable/delete quotas for an entire filesystem: */
+
+static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
+{
+       struct bch_fs *c = sb->s_fs_info;
+
+       if (sb->s_flags & SB_RDONLY)
+               return -EROFS;
+
+       /* Accounting must be enabled at mount time: */
+       if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
+               return -EINVAL;
+
+       /* Can't enable enforcement without accounting: */
+       if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
+               return -EINVAL;
+
+       if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
+               return -EINVAL;
+
+       if ((uflags & FS_QUOTA_PDQ_ENFD) && !c->opts.prjquota)
+               return -EINVAL;
+
+       mutex_lock(&c->sb_lock);
+       if (uflags & FS_QUOTA_UDQ_ENFD)
+               SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
+
+       if (uflags & FS_QUOTA_GDQ_ENFD)
+               SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
+
+       if (uflags & FS_QUOTA_PDQ_ENFD)
+               SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
+{
+       struct bch_fs *c = sb->s_fs_info;
+
+       if (sb->s_flags & SB_RDONLY)
+               return -EROFS;
+
+       mutex_lock(&c->sb_lock);
+       if (uflags & FS_QUOTA_UDQ_ENFD)
+               SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
+
+       if (uflags & FS_QUOTA_GDQ_ENFD)
+               SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
+
+       if (uflags & FS_QUOTA_PDQ_ENFD)
+               SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       int ret;
+
+       if (sb->s_flags & SB_RDONLY)
+               return -EROFS;
+
+       if (uflags & FS_USER_QUOTA) {
+               if (c->opts.usrquota)
+                       return -EINVAL;
+
+               ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+                                             POS(QTYP_USR, 0),
+                                             POS(QTYP_USR + 1, 0),
+                                             ZERO_VERSION, NULL, NULL, NULL);
+               if (ret)
+                       return ret;
+       }
+
+       if (uflags & FS_GROUP_QUOTA) {
+               if (c->opts.grpquota)
+                       return -EINVAL;
+
+               ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+                                             POS(QTYP_GRP, 0),
+                                             POS(QTYP_GRP + 1, 0),
+                                             ZERO_VERSION, NULL, NULL, NULL);
+               if (ret)
+                       return ret;
+       }
+
+       if (uflags & FS_PROJ_QUOTA) {
+               if (c->opts.prjquota)
+                       return -EINVAL;
+
+               ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+                                             POS(QTYP_PRJ, 0),
+                                             POS(QTYP_PRJ + 1, 0),
+                                             ZERO_VERSION, NULL, NULL, NULL);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       unsigned qtypes = enabled_qtypes(c);
+       unsigned i;
+
+       memset(state, 0, sizeof(*state));
+
+       for (i = 0; i < QTYP_NR; i++) {
+               state->s_state[i].flags |= QCI_SYSFILE;
+
+               if (!(qtypes & (1 << i)))
+                       continue;
+
+               state->s_state[i].flags |= QCI_ACCT_ENABLED;
+
+               state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
+               state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
+
+               state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
+               state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
+       }
+
+       return 0;
+}
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int bch2_quota_set_info(struct super_block *sb, int type,
+                              struct qc_info *info)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       struct bch_sb_field_quota *sb_quota;
+       struct bch_memquota_type *q;
+
+       if (sb->s_flags & SB_RDONLY)
+               return -EROFS;
+
+       if (type >= QTYP_NR)
+               return -EINVAL;
+
+       if (!((1 << type) & enabled_qtypes(c)))
+               return -ESRCH;
+
+       if (info->i_fieldmask &
+           ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
+               return -EINVAL;
+
+       q = &c->quotas[type];
+
+       mutex_lock(&c->sb_lock);
+       sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+       if (!sb_quota) {
+               sb_quota = bch2_sb_resize_quota(&c->disk_sb,
+                                       sizeof(*sb_quota) / sizeof(u64));
+               if (!sb_quota) {
+                       mutex_unlock(&c->sb_lock);
+                       return -ENOSPC;
+               }
+       }
+
+       if (info->i_fieldmask & QC_SPC_TIMER)
+               sb_quota->q[type].c[Q_SPC].timelimit =
+                       cpu_to_le32(info->i_spc_timelimit);
+
+       if (info->i_fieldmask & QC_SPC_WARNS)
+               sb_quota->q[type].c[Q_SPC].warnlimit =
+                       cpu_to_le32(info->i_spc_warnlimit);
+
+       if (info->i_fieldmask & QC_INO_TIMER)
+               sb_quota->q[type].c[Q_INO].timelimit =
+                       cpu_to_le32(info->i_ino_timelimit);
+
+       if (info->i_fieldmask & QC_INO_WARNS)
+               sb_quota->q[type].c[Q_INO].warnlimit =
+                       cpu_to_le32(info->i_ino_warnlimit);
+
+       bch2_sb_quota_read(c);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+/* Get/set individual quotas: */
+
+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
+{
+       dst->d_space            = src->c[Q_SPC].v << 9;
+       dst->d_spc_hardlimit    = src->c[Q_SPC].hardlimit << 9;
+       dst->d_spc_softlimit    = src->c[Q_SPC].softlimit << 9;
+       dst->d_spc_timer        = src->c[Q_SPC].timer;
+       dst->d_spc_warns        = src->c[Q_SPC].warns;
+
+       dst->d_ino_count        = src->c[Q_INO].v;
+       dst->d_ino_hardlimit    = src->c[Q_INO].hardlimit;
+       dst->d_ino_softlimit    = src->c[Q_INO].softlimit;
+       dst->d_ino_timer        = src->c[Q_INO].timer;
+       dst->d_ino_warns        = src->c[Q_INO].warns;
+}
+
+static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
+                         struct qc_dqblk *qdq)
+{
+       struct bch_fs *c                = sb->s_fs_info;
+       struct bch_memquota_type *q     = &c->quotas[kqid.type];
+       qid_t qid                       = from_kqid(&init_user_ns, kqid);
+       struct bch_memquota *mq;
+
+       memset(qdq, 0, sizeof(*qdq));
+
+       mutex_lock(&q->lock);
+       mq = genradix_ptr(&q->table, qid);
+       if (mq)
+               __bch2_quota_get(qdq, mq);
+       mutex_unlock(&q->lock);
+
+       return 0;
+}
+
+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
+                              struct qc_dqblk *qdq)
+{
+       struct bch_fs *c                = sb->s_fs_info;
+       struct bch_memquota_type *q     = &c->quotas[kqid->type];
+       qid_t qid                       = from_kqid(&init_user_ns, *kqid);
+       struct genradix_iter iter       = genradix_iter_init(&q->table, qid);
+       struct bch_memquota *mq;
+       int ret = 0;
+
+       mutex_lock(&q->lock);
+
+       while ((mq = genradix_iter_peek(&iter, &q->table))) {
+               if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
+                       __bch2_quota_get(qdq, mq);
+                       *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
+                       goto found;
+               }
+
+               genradix_iter_advance(&iter, &q->table);
+       }
+
+       ret = -ENOENT;
+found:
+       mutex_unlock(&q->lock);
+       return ret;
+}
+
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+                         struct qc_dqblk *qdq)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_quota new_quota;
+       int ret;
+
+       if (sb->s_flags & SB_RDONLY)
+               return -EROFS;
+
+       bkey_quota_init(&new_quota.k_i);
+       new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+
+       ret = btree_iter_err(k);
+       if (unlikely(ret))
+               return ret;
+
+       switch (k.k->type) {
+       case BCH_QUOTA:
+               new_quota.v = *bkey_s_c_to_quota(k).v;
+               break;
+       }
+
+       if (qdq->d_fieldmask & QC_SPC_SOFT)
+               new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+       if (qdq->d_fieldmask & QC_SPC_HARD)
+               new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+
+       if (qdq->d_fieldmask & QC_INO_SOFT)
+               new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+       if (qdq->d_fieldmask & QC_INO_HARD)
+               new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                  BTREE_INSERT_ENTRY(&iter, &new_quota.k_i));
+       bch2_btree_iter_unlock(&iter);
+
+       if (ret)
+               return ret;
+
+       ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+
+       return ret;
+}
+
+const struct quotactl_ops bch2_quotactl_operations = {
+       .quota_enable           = bch2_quota_enable,
+       .quota_disable          = bch2_quota_disable,
+       .rm_xquota              = bch2_quota_remove,
+
+       .get_state              = bch2_quota_get_state,
+       .set_info               = bch2_quota_set_info,
+
+       .get_dqblk              = bch2_get_quota,
+       .get_nextdqblk          = bch2_get_next_quota,
+       .set_dqblk              = bch2_set_quota,
+};
+
+#endif /* CONFIG_BCACHEFS_QUOTA */
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
new file mode 100644 (file)
index 0000000..4a76b49
--- /dev/null
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_H
+#define _BCACHEFS_QUOTA_H
+
+#include "inode.h"
+#include "quota_types.h"
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
+
+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_quota_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_quota_invalid,           \
+       .val_to_text    = bch2_quota_to_text,           \
+}
+
+enum quota_acct_mode {
+       BCH_QUOTA_PREALLOC,
+       BCH_QUOTA_WARN,
+       BCH_QUOTA_NOCHECK,
+};
+
+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
+{
+       return (struct bch_qid) {
+               .q[QTYP_USR] = u->bi_uid,
+               .q[QTYP_GRP] = u->bi_gid,
+               .q[QTYP_PRJ] = u->bi_project,
+       };
+}
+
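+/* Bitmask of quota types enabled via mount options: */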
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+       return ((c->opts.usrquota << QTYP_USR)|
+               (c->opts.grpquota << QTYP_GRP)|
+               (c->opts.prjquota << QTYP_PRJ));
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
+                   s64, enum quota_acct_mode);
+
+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
+                       struct bch_qid, u64);
+
+void bch2_fs_quota_exit(struct bch_fs *);
+void bch2_fs_quota_init(struct bch_fs *);
+int bch2_fs_quota_read(struct bch_fs *);
+
+extern const struct quotactl_ops bch2_quotactl_operations;
+
+#else
+
+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+                                 enum quota_counters counter, s64 v,
+                                 enum quota_acct_mode mode)
+{
+       return 0;
+}
+
+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+                                     struct bch_qid dst,
+                                     struct bch_qid src, u64 space)
+{
+       return 0;
+}
+
+static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
+static inline void bch2_fs_quota_init(struct bch_fs *c) {}
+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
+
+#endif
+
+#endif /* _BCACHEFS_QUOTA_H */
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
new file mode 100644 (file)
index 0000000..9eda6c3
--- /dev/null
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_TYPES_H
+#define _BCACHEFS_QUOTA_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+struct bch_qid {
+       u32             q[QTYP_NR];
+};
+
+struct memquota_counter {
+       u64                             v;
+       u64                             hardlimit;
+       u64                             softlimit;
+       s64                             timer;
+       int                             warns;
+       int                             warning_issued;
+};
+
+struct bch_memquota {
+       struct memquota_counter         c[Q_COUNTERS];
+};
+
+typedef GENRADIX(struct bch_memquota)  bch_memquota_table;
+
+struct quota_limit {
+       u32                             timelimit;
+       u32                             warnlimit;
+};
+
+struct bch_memquota_type {
+       struct quota_limit              limits[Q_COUNTERS];
+       bch_memquota_table              table;
+       struct mutex                    lock;
+};
+
+#endif /* _BCACHEFS_QUOTA_TYPES_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644 (file)
index 0000000..04824f6
--- /dev/null
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "extents.h"
+#include "io.h"
+#include "move.h"
+#include "rebalance.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+
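+/*
+ * A pointer needs rebalancing if it's a non-cached pointer outside the
+ * background target, or if it isn't compressed with the background
+ * compression type:
+ */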
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+                                     const struct bch_extent_ptr *ptr,
+                                     struct bch_extent_crc_unpacked crc,
+                                     struct bch_io_opts *io_opts)
+{
+       if (io_opts->background_target &&
+           !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
+           !ptr->cached)
+               return true;
+
+       if (io_opts->background_compression &&
+           crc.compression_type !=
+           bch2_compression_opt_to_type[io_opts->background_compression])
+               return true;
+
+       return false;
+}
+
+void bch2_rebalance_add_key(struct bch_fs *c,
+                           struct bkey_s_c k,
+                           struct bch_io_opts *io_opts)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+       struct bkey_s_c_extent e;
+
+       if (!bkey_extent_is_data(k.k))
+               return;
+
+       if (!io_opts->background_target &&
+           !io_opts->background_compression)
+               return;
+
+       e = bkey_s_c_to_extent(k);
+
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+                       if (atomic64_add_return(crc.compressed_size,
+                                               &ca->rebalance_work) ==
+                           crc.compressed_size)
+                               rebalance_wakeup(c);
+               }
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+       if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+           sectors)
+               rebalance_wakeup(c);
+}
+
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+                                   enum bkey_type type,
+                                   struct bkey_s_c_extent e,
+                                   struct bch_io_opts *io_opts,
+                                   struct data_opts *data_opts)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+
+       /* Make sure we have room to add a new pointer: */
+       if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+           BKEY_EXTENT_VAL_U64s_MAX)
+               return DATA_SKIP;
+
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (rebalance_ptr_pred(c, ptr, crc, io_opts))
+                       goto found;
+
+       return DATA_SKIP;
+found:
+       data_opts->target               = io_opts->background_target;
+       data_opts->btree_insert_flags   = 0;
+       return DATA_ADD_REPLICAS;
+}
+
+struct rebalance_work {
+       int             dev_most_full_idx;
+       unsigned        dev_most_full_percent;
+       u64             dev_most_full_work;
+       u64             dev_most_full_capacity;
+       u64             total_work;
+};
+
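+/* Track the device with the most work relative to its size, and the total: */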
+static void rebalance_work_accumulate(struct rebalance_work *w,
+               u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+{
+       unsigned percent_full;
+       u64 work = dev_work + unknown_dev;
+
+       if (work < dev_work || work < unknown_dev)
+               work = U64_MAX;
+       work = min(work, capacity);
+
+       percent_full = div_u64(work * 100, capacity);
+
+       if (percent_full >= w->dev_most_full_percent) {
+               w->dev_most_full_idx            = idx;
+               w->dev_most_full_percent        = percent_full;
+               w->dev_most_full_work           = work;
+               w->dev_most_full_capacity       = capacity;
+       }
+
+       if (w->total_work + dev_work >= w->total_work &&
+           w->total_work + dev_work >= dev_work)
+               w->total_work += dev_work;
+}
+
+static struct rebalance_work rebalance_work(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       struct rebalance_work ret = { .dev_most_full_idx = -1 };
+       u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
+       unsigned i;
+
+       for_each_online_member(ca, c, i)
+               rebalance_work_accumulate(&ret,
+                       atomic64_read(&ca->rebalance_work),
+                       unknown_dev,
+                       bucket_to_sector(ca, ca->mi.nbuckets -
+                                        ca->mi.first_bucket),
+                       i);
+
+       rebalance_work_accumulate(&ret,
+               unknown_dev, 0, c->capacity, -1);
+
+       return ret;
+}
+
+static void rebalance_work_reset(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_online_member(ca, c, i)
+               atomic64_set(&ca->rebalance_work, 0);
+
+       atomic64_set(&c->rebalance.work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+       u64 utime, stime;
+
+       task_cputime_adjusted(current, &utime, &stime);
+       return nsecs_to_jiffies(utime + stime);
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+       struct bch_fs *c = arg;
+       struct bch_fs_rebalance *r = &c->rebalance;
+       struct io_clock *clock = &c->io_clock[WRITE];
+       struct rebalance_work w, p;
+       unsigned long start, prev_start;
+       unsigned long prev_run_time, prev_run_cputime;
+       unsigned long cputime, prev_cputime;
+       unsigned long io_start;
+       long throttle;
+
+       set_freezable();
+
+       io_start        = atomic_long_read(&clock->now);
+       p               = rebalance_work(c);
+       prev_start      = jiffies;
+       prev_cputime    = curr_cputime();
+
+       while (!kthread_wait_freezable(r->enabled)) {
+               start                   = jiffies;
+               cputime                 = curr_cputime();
+
+               prev_run_time           = start - prev_start;
+               prev_run_cputime        = cputime - prev_cputime;
+
+               w                       = rebalance_work(c);
+               BUG_ON(!w.dev_most_full_capacity);
+
+               if (!w.total_work) {
+                       r->state = REBALANCE_WAITING;
+                       kthread_wait_freezable(rebalance_work(c).total_work);
+                       continue;
+               }
+
+               /*
+                * If there isn't much work to do, throttle cpu usage:
+                */
+               throttle = prev_run_cputime * 100 /
+                       max(1U, w.dev_most_full_percent) -
+                       prev_run_time;
+
+               if (w.dev_most_full_percent < 20 && throttle > 0) {
+                       r->state = REBALANCE_THROTTLED;
+                       r->throttled_until_iotime = io_start +
+                               div_u64(w.dev_most_full_capacity *
+                                       (20 - w.dev_most_full_percent),
+                                       50);
+                       r->throttled_until_cputime = start + throttle;
+
+                       bch2_kthread_io_clock_wait(clock,
+                               r->throttled_until_iotime,
+                               throttle);
+                       continue;
+               }
+
+               /* minimum 1 mb/sec: */
+               r->pd.rate.rate =
+                       max_t(u64, 1 << 11,
+                             r->pd.rate.rate *
+                             max(p.dev_most_full_percent, 1U) /
+                             max(w.dev_most_full_percent, 1U));
+
+               io_start        = atomic_long_read(&clock->now);
+               p               = w;
+               prev_start      = start;
+               prev_cputime    = cputime;
+
+               r->state = REBALANCE_RUNNING;
+               memset(&r->move_stats, 0, sizeof(r->move_stats));
+               rebalance_work_reset(c);
+
+               bch2_move_data(c,
+                              /* ratelimiting disabled for now */
+                              NULL, /*  &r->pd.rate, */
+                              writepoint_ptr(&c->rebalance_write_point),
+                              POS_MIN, POS_MAX,
+                              rebalance_pred, NULL,
+                              &r->move_stats);
+       }
+
+       return 0;
+}
+
+ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
+{
+       char *out = buf, *end = out + PAGE_SIZE;
+       struct bch_fs_rebalance *r = &c->rebalance;
+       struct rebalance_work w = rebalance_work(c);
+       char h1[21], h2[21];
+
+       bch2_hprint(h1, w.dev_most_full_work << 9);
+       bch2_hprint(h2, w.dev_most_full_capacity << 9);
+       out += scnprintf(out, end - out,
+                        "fullest_dev (%i):\t%s/%s\n",
+                        w.dev_most_full_idx, h1, h2);
+
+       bch2_hprint(h1, w.total_work << 9);
+       bch2_hprint(h2, c->capacity << 9);
+       out += scnprintf(out, end - out,
+                        "total work:\t\t%s/%s\n",
+                        h1, h2);
+
+       out += scnprintf(out, end - out,
+                        "rate:\t\t\t%u\n",
+                        r->pd.rate.rate);
+
+       switch (r->state) {
+       case REBALANCE_WAITING:
+               out += scnprintf(out, end - out, "waiting\n");
+               break;
+       case REBALANCE_THROTTLED:
+               bch2_hprint(h1,
+                           (r->throttled_until_iotime -
+                            atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+               out += scnprintf(out, end - out,
+                                "throttled for %lu sec or %s io\n",
+                                (r->throttled_until_cputime - jiffies) / HZ,
+                                h1);
+               break;
+       case REBALANCE_RUNNING:
+               out += scnprintf(out, end - out, "running\n");
+               out += scnprintf(out, end - out, "pos %llu:%llu\n",
+                                r->move_stats.iter.pos.inode,
+                                r->move_stats.iter.pos.offset);
+               break;
+       }
+
+       return out - buf;
+}
+
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       c->rebalance.pd.rate.rate = UINT_MAX;
+       bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+       p = rcu_dereference_protected(c->rebalance.thread, 1);
+       c->rebalance.thread = NULL;
+
+       if (p) {
+               /* for synchronizing with rebalance_wakeup() */
+               synchronize_rcu();
+
+               kthread_stop(p);
+               put_task_struct(p);
+       }
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       if (c->opts.nochanges)
+               return 0;
+
+       p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       get_task_struct(p);
+       rcu_assign_pointer(c->rebalance.thread, p);
+       wake_up_process(p);
+       return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+       bch2_pd_controller_init(&c->rebalance.pd);
+
+       atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
+}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
new file mode 100644 (file)
index 0000000..99e2a1f
--- /dev/null
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_H
+#define _BCACHEFS_REBALANCE_H
+
+#include "rebalance_types.h"
+
+static inline void rebalance_wakeup(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       rcu_read_lock();
+       p = rcu_dereference(c->rebalance.thread);
+       if (p)
+               wake_up_process(p);
+       rcu_read_unlock();
+}
+
+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
+                           struct bch_io_opts *);
+void bch2_rebalance_add_work(struct bch_fs *, u64);
+
+ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
+
+void bch2_rebalance_stop(struct bch_fs *);
+int bch2_rebalance_start(struct bch_fs *);
+void bch2_fs_rebalance_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
new file mode 100644 (file)
index 0000000..192c6be
--- /dev/null
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
+#define _BCACHEFS_REBALANCE_TYPES_H
+
+#include "move_types.h"
+
+enum rebalance_state {
+       REBALANCE_WAITING,
+       REBALANCE_THROTTLED,
+       REBALANCE_RUNNING,
+};
+
+struct bch_fs_rebalance {
+       struct task_struct __rcu *thread;
+       struct bch_pd_controller pd;
+
+       atomic64_t              work_unknown_dev;
+
+       enum rebalance_state    state;
+       unsigned long           throttled_until_iotime;
+       unsigned long           throttled_until_cputime;
+       struct bch_move_stats   move_stats;
+
+       unsigned                enabled:1;
+};
+
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
new file mode 100644 (file)
index 0000000..2596c3c
--- /dev/null
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "dirent.h"
+#include "error.h"
+#include "fsck.h"
+#include "journal_io.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super-io.h"
+
+#include <linux/stat.h>
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
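+/*
+ * Find the root for btree @id, from the clean shutdown section of the
+ * superblock if present, otherwise from the journal:
+ */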
+struct bkey_i *btree_root_find(struct bch_fs *c,
+                              struct bch_sb_field_clean *clean,
+                              struct jset *j,
+                              enum btree_id id, unsigned *level)
+{
+       struct bkey_i *k;
+       struct jset_entry *entry, *start, *end;
+
+       if (clean) {
+               start = clean->start;
+               end = vstruct_end(&clean->field);
+       } else {
+               start = j->start;
+               end = vstruct_last(j);
+       }
+
+       for (entry = start; entry < end; entry = vstruct_next(entry))
+               if (entry->type == BCH_JSET_ENTRY_btree_root &&
+                   entry->btree_id == id)
+                       goto found;
+
+       return NULL;
+found:
+       if (!entry->u64s)
+               return ERR_PTR(-EINVAL);
+
+       k = entry->start;
+       *level = entry->level;
+       return k;
+}
+
+static int verify_superblock_clean(struct bch_fs *c,
+                                  struct bch_sb_field_clean *clean,
+                                  struct jset *j)
+{
+       unsigned i;
+       int ret = 0;
+
+       if (!clean || !j)
+               return 0;
+
+       if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+                       "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+                       le64_to_cpu(clean->journal_seq),
+                       le64_to_cpu(j->seq)))
+               bch2_fs_mark_clean(c, false);
+
+       mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+                       "superblock read clock doesn't match journal after clean shutdown");
+       mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+                       "superblock write clock doesn't match journal after clean shutdown");
+
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               struct bkey_i *k1, *k2;
+               unsigned l1 = 0, l2 = 0;
+
+               k1 = btree_root_find(c, clean, NULL, i, &l1);
+               k2 = btree_root_find(c, NULL, j, i, &l2);
+
+               if (!k1 && !k2)
+                       continue;
+
+               mustfix_fsck_err_on(!k1 || !k2 ||
+                                   IS_ERR(k1) ||
+                                   IS_ERR(k2) ||
+                                   k1->k.u64s != k2->k.u64s ||
+                                   memcmp(k1, k2, bkey_bytes(k1)) ||
+                                   l1 != l2, c,
+                       "superblock btree root doesn't match journal after clean shutdown");
+       }
+fsck_err:
+       return ret;
+}
+
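+/* True if the journal contains nothing that would need to be replayed: */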
+static bool journal_empty(struct list_head *journal)
+{
+       struct journal_replay *i;
+       struct jset_entry *entry;
+
+       if (list_empty(journal))
+               return true;
+
+       i = list_last_entry(journal, struct journal_replay, list);
+
+       if (i->j.last_seq != i->j.seq)
+               return false;
+
+       list_for_each_entry(i, journal, list) {
+               vstruct_for_each(&i->j, entry) {
+                       if (entry->type == BCH_JSET_ENTRY_btree_root)
+                               continue;
+
+                       if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+                           !entry->u64s)
+                               continue;
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+int bch2_fs_recovery(struct bch_fs *c)
+{
+       const char *err = "cannot allocate memory";
+       struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
+       LIST_HEAD(journal);
+       struct jset *j = NULL;
+       unsigned i;
+       int ret;
+
+       mutex_lock(&c->sb_lock);
+       if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
+               bch_info(c, "building replicas info");
+               set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+       }
+
+       if (c->sb.clean)
+               sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+       if (sb_clean) {
+               clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+                               GFP_KERNEL);
+               if (!clean) {
+                       ret = -ENOMEM;
+                       mutex_unlock(&c->sb_lock);
+                       goto err;
+               }
+       }
+       mutex_unlock(&c->sb_lock);
+
+       if (clean)
+               bch_info(c, "recovering from clean shutdown, journal seq %llu",
+                        le64_to_cpu(clean->journal_seq));
+
+       if (!clean || !c->opts.nofsck) {
+               ret = bch2_journal_read(c, &journal);
+               if (ret)
+                       goto err;
+
+               j = &list_entry(journal.prev, struct journal_replay, list)->j;
+       } else {
+               ret = bch2_journal_set_seq(c,
+                                          le64_to_cpu(clean->journal_seq),
+                                          le64_to_cpu(clean->journal_seq));
+               BUG_ON(ret);
+       }
+
+       ret = verify_superblock_clean(c, clean, j);
+       if (ret)
+               goto err;
+
+       fsck_err_on(clean && !journal_empty(&journal), c,
+                   "filesystem marked clean but journal not empty");
+
+       if (clean) {
+               c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
+               c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
+       } else {
+               c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
+               c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               unsigned level;
+               struct bkey_i *k;
+
+               k = btree_root_find(c, clean, j, i, &level);
+               if (!k)
+                       continue;
+
+               err = "invalid btree root pointer";
+               if (IS_ERR(k))
+                       goto err;
+
+               err = "error reading btree root";
+               if (bch2_btree_root_read(c, i, k, level)) {
+                       if (i != BTREE_ID_ALLOC)
+                               goto err;
+
+                       mustfix_fsck_err(c, "error reading btree root");
+               }
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               if (!c->btree_roots[i].b)
+                       bch2_btree_root_alloc(c, i);
+
+       err = "error reading allocation information";
+       ret = bch2_alloc_read(c, &journal);
+       if (ret)
+               goto err;
+
+       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+
+       bch_verbose(c, "starting mark and sweep:");
+       err = "error in recovery";
+       ret = bch2_initial_gc(c, &journal);
+       if (ret)
+               goto err;
+       bch_verbose(c, "mark and sweep done");
+
+       if (c->opts.noreplay)
+               goto out;
+
+       /*
+        * Mark dirty before journal replay, fsck:
+        * XXX: after a clean shutdown, this could be done lazily only when fsck
+        * finds an error
+        */
+       bch2_fs_mark_clean(c, false);
+
+       /*
+        * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
+        * will give spurious errors about oldest_gen > bucket_gen -
+        * this is a hack but oh well.
+        */
+       bch2_fs_journal_start(&c->journal);
+
+       err = "error starting allocator";
+       ret = bch2_fs_allocator_start(c);
+       if (ret)
+               goto err;
+
+       bch_verbose(c, "starting journal replay:");
+       err = "journal replay failed";
+       ret = bch2_journal_replay(c, &journal);
+       if (ret)
+               goto err;
+       bch_verbose(c, "journal replay done");
+
+       if (c->opts.norecovery)
+               goto out;
+
+       err = "error in fsck";
+       ret = bch2_fsck(c);
+       if (ret)
+               goto err;
+
+       if (enabled_qtypes(c)) {
+               bch_verbose(c, "reading quotas:");
+               ret = bch2_fs_quota_read(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "quotas done");
+       }
+
+out:
+       bch2_journal_entries_free(&journal);
+       kfree(clean);
+       return ret;
+err:
+fsck_err:
+       BUG_ON(!ret);
+       goto out;
+}
+
+int bch2_fs_initialize(struct bch_fs *c)
+{
+       struct bch_inode_unpacked root_inode, lostfound_inode;
+       struct bkey_inode_buf packed_inode;
+       struct bch_hash_info root_hash_info;
+       struct qstr lostfound = QSTR("lost+found");
+       const char *err = "cannot allocate memory";
+       struct bch_dev *ca;
+       LIST_HEAD(journal);
+       unsigned i;
+       int ret;
+
+       bch_notice(c, "initializing new filesystem");
+
+       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+
+       ret = bch2_initial_gc(c, &journal);
+       if (ret)
+               goto err;
+
+       err = "unable to allocate journal buckets";
+       for_each_online_member(ca, c, i)
+               if (bch2_dev_journal_alloc(ca)) {
+                       percpu_ref_put(&ca->io_ref);
+                       goto err;
+               }
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               bch2_btree_root_alloc(c, i);
+
+       /*
+        * journal_res_get() will crash if called before this has
+        * set up the journal.pin FIFO and journal.cur pointer:
+        */
+       bch2_fs_journal_start(&c->journal);
+       bch2_journal_set_replay_done(&c->journal);
+
+       err = "error starting allocator";
+       ret = bch2_fs_allocator_start(c);
+       if (ret)
+               goto err;
+
+       bch2_inode_init(c, &root_inode, 0, 0,
+                       S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
+       root_inode.bi_inum = BCACHEFS_ROOT_INO;
+       root_inode.bi_nlink++; /* lost+found */
+       bch2_inode_pack(&packed_inode, &root_inode);
+
+       err = "error creating root directory";
+       ret = bch2_btree_insert(c, BTREE_ID_INODES,
+                               &packed_inode.inode.k_i,
+                               NULL, NULL, NULL, 0);
+       if (ret)
+               goto err;
+
+       bch2_inode_init(c, &lostfound_inode, 0, 0,
+                       S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
+                       &root_inode);
+       lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
+       bch2_inode_pack(&packed_inode, &lostfound_inode);
+
+       err = "error creating lost+found";
+       ret = bch2_btree_insert(c, BTREE_ID_INODES,
+                               &packed_inode.inode.k_i,
+                               NULL, NULL, NULL, 0);
+       if (ret)
+               goto err;
+
+       root_hash_info = bch2_hash_info_init(c, &root_inode);
+
+       ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
+                                &lostfound, lostfound_inode.bi_inum, NULL,
+                                BTREE_INSERT_NOFAIL);
+       if (ret)
+               goto err;
+
+       atomic_long_set(&c->nr_inodes, 2);
+
+       if (enabled_qtypes(c)) {
+               ret = bch2_fs_quota_read(c);
+               if (ret)
+                       goto err;
+       }
+
+       err = "error writing first journal entry";
+       ret = bch2_journal_meta(&c->journal);
+       if (ret)
+               goto err;
+
+       mutex_lock(&c->sb_lock);
+       SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+err:
+       BUG_ON(!ret);
+       return ret;
+}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
new file mode 100644 (file)
index 0000000..9129291
--- /dev/null
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_H
+#define _BCACHEFS_RECOVERY_H
+
+int bch2_fs_recovery(struct bch_fs *);
+int bch2_fs_initialize(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
new file mode 100644 (file)
index 0000000..4b87aa8
--- /dev/null
@@ -0,0 +1,698 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+                                           struct bch_replicas_cpu *);
+
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i)                            \
+       for (_i = (_r)->entries;                                        \
+            (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+            _i = (void *) (_i) + (_r)->entry_size)
+
+static inline struct bch_replicas_cpu_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+       return (void *) r->entries + r->entry_size * i;
+}
+
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+       eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
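+/* Each replicas entry tracks its devices as a bitmap, one bit per device index: */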
+static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
+                                    unsigned dev)
+{
+       return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+}
+
+static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
+                                   unsigned dev)
+{
+       e->devs[dev >> 3] |= 1 << (dev & 7);
+}
+
+static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+{
+       return (r->entry_size -
+               offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+}
+
+int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
+                             char *buf, size_t size)
+{
+       char *out = buf, *end = out + size;
+       struct bch_replicas_cpu_entry *e;
+       bool first = true;
+       unsigned i;
+
+       for_each_cpu_replicas_entry(r, e) {
+               bool first_e = true;
+
+               if (!first)
+                       out += scnprintf(out, end - out, " ");
+               first = false;
+
+               out += scnprintf(out, end - out, "%u: [", e->data_type);
+
+               for (i = 0; i < replicas_dev_slots(r); i++)
+                       if (replicas_test_dev(e, i)) {
+                               if (!first_e)
+                                       out += scnprintf(out, end - out, " ");
+                               first_e = false;
+                               out += scnprintf(out, end - out, "%u", i);
+                       }
+               out += scnprintf(out, end - out, "]");
+       }
+
+       return out - buf;
+}
+
+static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+                                       enum bch_data_type data_type,
+                                       struct bch_replicas_cpu_entry *r,
+                                       unsigned *max_dev)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned nr = 0;
+
+       BUG_ON(!data_type ||
+              data_type == BCH_DATA_SB ||
+              data_type >= BCH_DATA_NR);
+
+       memset(r, 0, sizeof(*r));
+       r->data_type = data_type;
+
+       *max_dev = 0;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached) {
+                       *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+                       replicas_set_dev(r, ptr->dev);
+                       nr++;
+               }
+       return nr;
+}
+
+static inline void devlist_to_replicas(struct bch_devs_list devs,
+                                      enum bch_data_type data_type,
+                                      struct bch_replicas_cpu_entry *r,
+                                      unsigned *max_dev)
+{
+       unsigned i;
+
+       BUG_ON(!data_type ||
+              data_type == BCH_DATA_SB ||
+              data_type >= BCH_DATA_NR);
+
+       memset(r, 0, sizeof(*r));
+       r->data_type = data_type;
+
+       *max_dev = 0;
+
+       for (i = 0; i < devs.nr; i++) {
+               *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
+               replicas_set_dev(r, devs.devs[i]);
+       }
+}
+
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+                      struct bch_replicas_cpu_entry new_entry,
+                      unsigned max_dev)
+{
+       struct bch_replicas_cpu *new;
+       unsigned i, nr, entry_size;
+
+       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+               DIV_ROUND_UP(max_dev + 1, 8);
+       entry_size = max(entry_size, old->entry_size);
+       nr = old->nr + 1;
+
+       new = kzalloc(sizeof(struct bch_replicas_cpu) +
+                     nr * entry_size, GFP_NOIO);
+       if (!new)
+               return NULL;
+
+       new->nr         = nr;
+       new->entry_size = entry_size;
+
+       for (i = 0; i < old->nr; i++)
+               memcpy(cpu_replicas_entry(new, i),
+                      cpu_replicas_entry(old, i),
+                      min(new->entry_size, old->entry_size));
+
+       memcpy(cpu_replicas_entry(new, old->nr),
+              &new_entry,
+              new->entry_size);
+
+       bch2_cpu_replicas_sort(new);
+       return new;
+}
+
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+                               struct bch_replicas_cpu_entry search,
+                               unsigned max_dev)
+{
+       return max_dev < replicas_dev_slots(r) &&
+               eytzinger0_find(r->entries, r->nr,
+                               r->entry_size,
+                               memcmp, &search) < r->nr;
+}
+
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+                               struct bch_replicas_cpu_entry new_entry,
+                               unsigned max_dev)
+{
+       struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
+       int ret = -ENOMEM;
+
+       mutex_lock(&c->sb_lock);
+
+       old_gc = rcu_dereference_protected(c->replicas_gc,
+                                          lockdep_is_held(&c->sb_lock));
+       if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+               new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+               if (!new_gc)
+                       goto err;
+       }
+
+       old_r = rcu_dereference_protected(c->replicas,
+                                         lockdep_is_held(&c->sb_lock));
+       if (!replicas_has_entry(old_r, new_entry, max_dev)) {
+               new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+               if (!new_r)
+                       goto err;
+
+               ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+               if (ret)
+                       goto err;
+       }
+
+       /* allocations done, now commit: */
+
+       if (new_r)
+               bch2_write_super(c);
+
+       /* don't update in-memory replicas until changes are persistent */
+
+       if (new_gc) {
+               rcu_assign_pointer(c->replicas_gc, new_gc);
+               kfree_rcu(old_gc, rcu);
+       }
+
+       if (new_r) {
+               rcu_assign_pointer(c->replicas, new_r);
+               kfree_rcu(old_r, rcu);
+       }
+
+       mutex_unlock(&c->sb_lock);
+       return 0;
+err:
+       mutex_unlock(&c->sb_lock);
+       kfree(new_gc);
+       kfree(new_r);
+       return ret;
+}
+
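+/*
+ * Fast path note: under rcu_read_lock() we only check whether this
+ * (data type, device set) combination is already present in the in-memory
+ * table(s); only when it's missing do we take sb_lock and go through the
+ * slowpath above, which reallocates the table and persists it to the
+ * superblock.
+ */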
+int bch2_mark_replicas(struct bch_fs *c,
+                      enum bch_data_type data_type,
+                      struct bch_devs_list devs)
+{
+       struct bch_replicas_cpu_entry search;
+       struct bch_replicas_cpu *r, *gc_r;
+       unsigned max_dev;
+       bool marked;
+
+       if (!devs.nr)
+               return 0;
+
+       BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+
+       devlist_to_replicas(devs, data_type, &search, &max_dev);
+
+       rcu_read_lock();
+       r = rcu_dereference(c->replicas);
+       gc_r = rcu_dereference(c->replicas_gc);
+       marked = replicas_has_entry(r, search, max_dev) &&
+               (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+       rcu_read_unlock();
+
+       return likely(marked) ? 0
+               : bch2_mark_replicas_slowpath(c, search, max_dev);
+}
+
+int bch2_mark_bkey_replicas(struct bch_fs *c,
+                           enum bch_data_type data_type,
+                           struct bkey_s_c k)
+{
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
+       int ret;
+
+       for (i = 0; i < cached.nr; i++)
+               if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+                                             bch2_dev_list_single(cached.devs[i]))))
+                       return ret;
+
+       return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
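+/*
+ * Replicas GC protocol (overview of the code below and the slowpath above):
+ * gc_start() installs a second table, c->replicas_gc, seeded with every entry
+ * whose data type is *not* in @typemask; while GC runs, bch2_mark_replicas()
+ * adds new entries to both tables; gc_end() writes the gc table to the
+ * superblock and swaps it in as the live table, dropping stale entries of the
+ * GC'd types that were never re-marked.
+ */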
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+{
+       struct bch_replicas_cpu *new_r, *old_r;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+
+       new_r = rcu_dereference_protected(c->replicas_gc,
+                                         lockdep_is_held(&c->sb_lock));
+       rcu_assign_pointer(c->replicas_gc, NULL);
+
+       if (ret)
+               goto err;
+
+       if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+               ret = -ENOSPC;
+               goto err;
+       }
+
+       bch2_write_super(c);
+
+       /* don't update in-memory replicas until changes are persistent */
+
+       old_r = rcu_dereference_protected(c->replicas,
+                                         lockdep_is_held(&c->sb_lock));
+
+       rcu_assign_pointer(c->replicas, new_r);
+       kfree_rcu(old_r, rcu);
+out:
+       mutex_unlock(&c->sb_lock);
+       return ret;
+err:
+       kfree_rcu(new_r, rcu);
+       goto out;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+       struct bch_replicas_cpu *dst, *src;
+       struct bch_replicas_cpu_entry *e;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+       BUG_ON(c->replicas_gc);
+
+       src = rcu_dereference_protected(c->replicas,
+                                       lockdep_is_held(&c->sb_lock));
+
+       dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+                     src->nr * src->entry_size, GFP_NOIO);
+       if (!dst) {
+               mutex_unlock(&c->sb_lock);
+               return -ENOMEM;
+       }
+
+       dst->nr         = 0;
+       dst->entry_size = src->entry_size;
+
+       for_each_cpu_replicas_entry(src, e)
+               if (!((1 << e->data_type) & typemask))
+                       memcpy(cpu_replicas_entry(dst, dst->nr++),
+                              e, dst->entry_size);
+
+       bch2_cpu_replicas_sort(dst);
+
+       rcu_assign_pointer(c->replicas_gc, dst);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+/* Replicas tracking - superblock: */
+
+static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
+                                       unsigned *nr,
+                                       unsigned *bytes,
+                                       unsigned *max_dev)
+{
+       struct bch_replicas_entry *i;
+       unsigned j;
+
+       *nr     = 0;
+       *bytes  = sizeof(*r);
+       *max_dev = 0;
+
+       if (!r)
+               return;
+
+       for_each_replicas_entry(r, i) {
+               for (j = 0; j < i->nr; j++)
+                       *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
+               (*nr)++;
+       }
+
+       *bytes = (void *) i - (void *) r;
+}
+
+static struct bch_replicas_cpu *
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
+{
+       struct bch_replicas_cpu *cpu_r;
+       unsigned i, nr, bytes, max_dev, entry_size;
+
+       bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+
+       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+               DIV_ROUND_UP(max_dev + 1, 8);
+
+       cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
+                       nr * entry_size, GFP_NOIO);
+       if (!cpu_r)
+               return NULL;
+
+       cpu_r->nr               = nr;
+       cpu_r->entry_size       = entry_size;
+
+       if (nr) {
+               struct bch_replicas_cpu_entry *dst =
+                       cpu_replicas_entry(cpu_r, 0);
+               struct bch_replicas_entry *src = sb_r->entries;
+
+               while (dst < cpu_replicas_entry(cpu_r, nr)) {
+                       dst->data_type = src->data_type;
+                       for (i = 0; i < src->nr; i++)
+                               replicas_set_dev(dst, src->devs[i]);
+
+                       src     = replicas_entry_next(src);
+                       dst     = (void *) dst + entry_size;
+               }
+       }
+
+       bch2_cpu_replicas_sort(cpu_r);
+       return cpu_r;
+}
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_cpu *cpu_r, *old_r;
+
+       sb_r    = bch2_sb_get_replicas(c->disk_sb.sb);
+       cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+       if (!cpu_r)
+               return -ENOMEM;
+
+       old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
+       rcu_assign_pointer(c->replicas, cpu_r);
+       if (old_r)
+               kfree_rcu(old_r, rcu);
+
+       return 0;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+                                           struct bch_replicas_cpu *r)
+{
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_entry *sb_e;
+       struct bch_replicas_cpu_entry *e;
+       size_t i, bytes;
+
+       bytes = sizeof(struct bch_sb_field_replicas);
+
+       for_each_cpu_replicas_entry(r, e) {
+               bytes += sizeof(struct bch_replicas_entry);
+               for (i = 0; i < r->entry_size - 1; i++)
+                       bytes += hweight8(e->devs[i]);
+       }
+
+       sb_r = bch2_sb_resize_replicas(&c->disk_sb,
+                       DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+       if (!sb_r)
+               return -ENOSPC;
+
+       memset(&sb_r->entries, 0,
+              vstruct_end(&sb_r->field) -
+              (void *) &sb_r->entries);
+
+       sb_e = sb_r->entries;
+       for_each_cpu_replicas_entry(r, e) {
+               sb_e->data_type = e->data_type;
+
+               for (i = 0; i < replicas_dev_slots(r); i++)
+                       if (replicas_test_dev(e, i))
+                               sb_e->devs[sb_e->nr++] = i;
+
+               sb_e = replicas_entry_next(sb_e);
+
+               BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+       }
+
+       return 0;
+}
+
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
+{
+       struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+       struct bch_replicas_cpu *cpu_r = NULL;
+       struct bch_replicas_entry *e;
+       const char *err;
+       unsigned i;
+
+       for_each_replicas_entry(sb_r, e) {
+               err = "invalid replicas entry: invalid data type";
+               if (e->data_type >= BCH_DATA_NR)
+                       goto err;
+
+               err = "invalid replicas entry: no devices";
+               if (!e->nr)
+                       goto err;
+
+               err = "invalid replicas entry: too many devices";
+               if (e->nr >= BCH_REPLICAS_MAX)
+                       goto err;
+
+               err = "invalid replicas entry: invalid device";
+               for (i = 0; i < e->nr; i++)
+                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
+                               goto err;
+       }
+
+       err = "cannot allocate memory";
+       cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+       if (!cpu_r)
+               goto err;
+
+       sort_cmp_size(cpu_r->entries,
+                     cpu_r->nr,
+                     cpu_r->entry_size,
+                     memcmp, NULL);
+
+       for (i = 0; i + 1 < cpu_r->nr; i++) {
+               struct bch_replicas_cpu_entry *l =
+                       cpu_replicas_entry(cpu_r, i);
+               struct bch_replicas_cpu_entry *r =
+                       cpu_replicas_entry(cpu_r, i + 1);
+
+               BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+
+               err = "duplicate replicas entry";
+               if (!memcmp(l, r, cpu_r->entry_size))
+                       goto err;
+       }
+
+       err = NULL;
+err:
+       kfree(cpu_r);
+       return err;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+       .validate       = bch2_sb_validate_replicas,
+};
+
+int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
+{
+       char *out = buf, *end = out + size;
+       struct bch_replicas_entry *e;
+       bool first = true;
+       unsigned i;
+
+       if (!r) {
+               out += scnprintf(out, end - out, "(no replicas section found)");
+               return out - buf;
+       }
+
+       for_each_replicas_entry(r, e) {
+               if (!first)
+                       out += scnprintf(out, end - out, " ");
+               first = false;
+
+               out += scnprintf(out, end - out, "%u: [", e->data_type);
+
+               for (i = 0; i < e->nr; i++)
+                       out += scnprintf(out, end - out,
+                                        i ? " %u" : "%u", e->devs[i]);
+               out += scnprintf(out, end - out, "]");
+       }
+
+       return out - buf;
+}
+
+/* Query replicas: */
+
+bool bch2_replicas_marked(struct bch_fs *c,
+                         enum bch_data_type data_type,
+                         struct bch_devs_list devs)
+{
+       struct bch_replicas_cpu_entry search;
+       unsigned max_dev;
+       bool ret;
+
+       if (!devs.nr)
+               return true;
+
+       devlist_to_replicas(devs, data_type, &search, &max_dev);
+
+       rcu_read_lock();
+       ret = replicas_has_entry(rcu_dereference(c->replicas),
+                                search, max_dev);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+                              enum bch_data_type data_type,
+                              struct bkey_s_c k)
+{
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
+
+       for (i = 0; i < cached.nr; i++)
+               if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+                                         bch2_dev_list_single(cached.devs[i])))
+                       return false;
+
+       return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *c,
+                                             struct bch_devs_mask online_devs)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_replicas_cpu_entry *e;
+       struct bch_replicas_cpu *r;
+       unsigned i, dev, dev_slots, nr_online, nr_offline;
+       struct replicas_status ret;
+
+       memset(&ret, 0, sizeof(ret));
+
+       for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
+               ret.replicas[i].nr_online = UINT_MAX;
+
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       rcu_read_lock();
+
+       r = rcu_dereference(c->replicas);
+       dev_slots = replicas_dev_slots(r);
+
+       for_each_cpu_replicas_entry(r, e) {
+               if (e->data_type >= ARRAY_SIZE(ret.replicas))
+                       panic("e %p data_type %u\n", e, e->data_type);
+
+               nr_online = nr_offline = 0;
+
+               for (dev = 0; dev < dev_slots; dev++) {
+                       if (!replicas_test_dev(e, dev))
+                               continue;
+
+                       BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
+
+                       if (test_bit(dev, online_devs.d))
+                               nr_online++;
+                       else
+                               nr_offline++;
+               }
+
+               ret.replicas[e->data_type].nr_online =
+                       min(ret.replicas[e->data_type].nr_online,
+                           nr_online);
+
+               ret.replicas[e->data_type].nr_offline =
+                       max(ret.replicas[e->data_type].nr_offline,
+                           nr_offline);
+       }
+
+       rcu_read_unlock();
+
+       return ret;
+}
+
+struct replicas_status bch2_replicas_status(struct bch_fs *c)
+{
+       return __bch2_replicas_status(c, bch2_online_devs(c));
+}
+
+static bool have_enough_devs(struct replicas_status s,
+                            enum bch_data_type type,
+                            bool force_if_degraded,
+                            bool force_if_lost)
+{
+       return (!s.replicas[type].nr_offline || force_if_degraded) &&
+               (s.replicas[type].nr_online || force_if_lost);
+}
+
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+       return (have_enough_devs(s, BCH_DATA_JOURNAL,
+                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                                flags & BCH_FORCE_IF_METADATA_LOST) &&
+               have_enough_devs(s, BCH_DATA_BTREE,
+                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                                flags & BCH_FORCE_IF_METADATA_LOST) &&
+               have_enough_devs(s, BCH_DATA_USER,
+                                flags & BCH_FORCE_IF_DATA_DEGRADED,
+                                flags & BCH_FORCE_IF_DATA_LOST));
+}
+
+unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
+{
+       struct replicas_status s = bch2_replicas_status(c);
+
+       return meta
+               ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
+                     s.replicas[BCH_DATA_BTREE].nr_online)
+               : s.replicas[BCH_DATA_USER].nr_online;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bch_replicas_cpu_entry *e;
+       struct bch_replicas_cpu *r;
+       unsigned ret = 0;
+
+       rcu_read_lock();
+       r = rcu_dereference(c->replicas);
+
+       if (ca->dev_idx >= replicas_dev_slots(r))
+               goto out;
+
+       for_each_cpu_replicas_entry(r, e)
+               if (replicas_test_dev(e, ca->dev_idx))
+                       ret |= 1 << e->data_type;
+out:
+       rcu_read_unlock();
+
+       return ret;
+}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
new file mode 100644 (file)
index 0000000..de506cf
--- /dev/null
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_H
+#define _BCACHEFS_REPLICAS_H
+
+bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
+                         struct bch_devs_list);
+bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
+                              struct bkey_s_c);
+int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
+                      struct bch_devs_list);
+int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
+                           struct bkey_s_c);
+
+int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
+int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
+
+struct replicas_status {
+       struct {
+               unsigned        nr_online;
+               unsigned        nr_offline;
+       }                       replicas[BCH_DATA_NR];
+};
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *,
+                                             struct bch_devs_mask);
+struct replicas_status bch2_replicas_status(struct bch_fs *);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
+
+unsigned bch2_replicas_online(struct bch_fs *, bool);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+       return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i)                                        \
+       for (_i = (_r)->entries;                                        \
+            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+            (_i) = replicas_entry_next(_i))
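+
+/*
+ * Illustrative sketch of using this iterator from a userspace tool (names are
+ * placeholders; bch2_sb_replicas_to_text() is the in-kernel equivalent):
+ *
+ *	struct bch_sb_field_replicas *sb_r = ...;
+ *	struct bch_replicas_entry *e;
+ *	unsigned i;
+ *
+ *	for_each_replicas_entry(sb_r, e) {
+ *		printf("%u:", e->data_type);
+ *		for (i = 0; i < e->nr; i++)
+ *			printf(" %u", e->devs[i]);
+ *		printf("\n");
+ *	}
+ */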
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+
+#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
new file mode 100644 (file)
index 0000000..c062edb
--- /dev/null
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*     $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound.  Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+       while (rounds--) {
+               ctx->v[0] += ctx->v[1];
+               ctx->v[2] += ctx->v[3];
+               ctx->v[1] = rol64(ctx->v[1], 13);
+               ctx->v[3] = rol64(ctx->v[3], 16);
+
+               ctx->v[1] ^= ctx->v[0];
+               ctx->v[3] ^= ctx->v[2];
+               ctx->v[0] = rol64(ctx->v[0], 32);
+
+               ctx->v[2] += ctx->v[1];
+               ctx->v[0] += ctx->v[3];
+               ctx->v[1] = rol64(ctx->v[1], 17);
+               ctx->v[3] = rol64(ctx->v[3], 21);
+
+               ctx->v[1] ^= ctx->v[2];
+               ctx->v[3] ^= ctx->v[0];
+               ctx->v[2] = rol64(ctx->v[2], 32);
+       }
+}
+
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+       u64 m = get_unaligned_le64(ptr);
+
+       ctx->v[3] ^= m;
+       SipHash_Rounds(ctx, rounds);
+       ctx->v[0] ^= m;
+}
+
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+       u64 k0, k1;
+
+       k0 = le64_to_cpu(key->k0);
+       k1 = le64_to_cpu(key->k1);
+
+       ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+       ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+       ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+       ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+       memset(ctx->buf, 0, sizeof(ctx->buf));
+       ctx->bytes = 0;
+}
+
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+                   const void *src, size_t len)
+{
+       const u8 *ptr = src;
+       size_t left, used;
+
+       if (len == 0)
+               return;
+
+       used = ctx->bytes % sizeof(ctx->buf);
+       ctx->bytes += len;
+
+       if (used > 0) {
+               left = sizeof(ctx->buf) - used;
+
+               if (len >= left) {
+                       memcpy(&ctx->buf[used], ptr, left);
+                       SipHash_CRounds(ctx, ctx->buf, rc);
+                       len -= left;
+                       ptr += left;
+                       used = 0; /* buffer flushed; any tail goes to the front */
+               } else {
+                       memcpy(&ctx->buf[used], ptr, len);
+                       return;
+               }
+       }
+
+       while (len >= sizeof(ctx->buf)) {
+               SipHash_CRounds(ctx, ptr, rc);
+               len -= sizeof(ctx->buf);
+               ptr += sizeof(ctx->buf);
+       }
+
+       if (len > 0)
+               memcpy(&ctx->buf[used], ptr, len);
+}
+
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+       u64 r;
+
+       r = SipHash_End(ctx, rc, rf);
+
+       *((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+       u64 r;
+       size_t left, used;
+
+       used = ctx->bytes % sizeof(ctx->buf);
+       left = sizeof(ctx->buf) - used;
+       memset(&ctx->buf[used], 0, left - 1);
+       ctx->buf[7] = ctx->bytes;
+
+       SipHash_CRounds(ctx, ctx->buf, rc);
+       ctx->v[2] ^= 0xff;
+       SipHash_Rounds(ctx, rf);
+
+       r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+       memset(ctx, 0, sizeof(*ctx));
+       return (r);
+}
+
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+       SIPHASH_CTX ctx;
+
+       SipHash_Init(&ctx, key);
+       SipHash_Update(&ctx, rc, rf, src, len);
+       return SipHash_End(&ctx, rc, rf);
+}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
new file mode 100644 (file)
index 0000000..3dfaf34
--- /dev/null
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is selected via the wrapper macros below:
+ *  SipHash24_*() for the fast and reasonably strong version
+ *  SipHash48_*() for the stronger version (half as fast)
+ *
+ * SIPHASH_CTX ctx;
+ * SIPHASH_KEY key;             (k0/k1 hold the 128-bit key, little endian)
+ *
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH    8
+#define SIPHASH_KEY_LENGTH     16
+#define SIPHASH_DIGEST_LENGTH   8
+
+typedef struct _SIPHASH_CTX {
+       u64             v[4];
+       u8              buf[SIPHASH_BLOCK_LENGTH];
+       u32             bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+       __le64          k0;
+       __le64          k1;
+} SIPHASH_KEY;
+
+void   SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void   SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64    SipHash_End(SIPHASH_CTX *, int, int);
+void   SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64    SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k)         SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l)   SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d)              SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c)                SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l)          SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k)         SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l)   SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d)              SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c)                SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l)          SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
new file mode 100644 (file)
index 0000000..9dd4b71
--- /dev/null
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/slab.h>
+
+#include "six.h"
+
+#ifdef DEBUG
+#define EBUG_ON(cond)          BUG_ON(cond)
+#else
+#define EBUG_ON(cond)          do {} while (0)
+#endif
+
+#define six_acquire(l, t)      lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
+#define six_release(l)         lock_release(l, _RET_IP_)
+
+struct six_lock_vals {
+       /* Value we add to the lock in order to take the lock: */
+       u64                     lock_val;
+
+       /* If the lock has this value (used as a mask), taking the lock fails: */
+       u64                     lock_fail;
+
+       /* Value we add to the lock in order to release the lock: */
+       u64                     unlock_val;
+
+       /* Mask that indicates lock is held for this type: */
+       u64                     held_mask;
+
+       /* Waitlist we wakeup when releasing the lock: */
+       enum six_lock_type      unlock_wakeup;
+};
+
+#define __SIX_LOCK_HELD_read   __SIX_VAL(read_lock, ~0)
+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
+#define __SIX_LOCK_HELD_write  __SIX_VAL(seq, 1)
+
+#define LOCK_VALS {                                                    \
+       [SIX_LOCK_read] = {                                             \
+               .lock_val       = __SIX_VAL(read_lock, 1),              \
+               .lock_fail      = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
+               .unlock_val     = -__SIX_VAL(read_lock, 1),             \
+               .held_mask      = __SIX_LOCK_HELD_read,                 \
+               .unlock_wakeup  = SIX_LOCK_write,                       \
+       },                                                              \
+       [SIX_LOCK_intent] = {                                           \
+               .lock_val       = __SIX_VAL(intent_lock, 1),            \
+               .lock_fail      = __SIX_LOCK_HELD_intent,               \
+               .unlock_val     = -__SIX_VAL(intent_lock, 1),           \
+               .held_mask      = __SIX_LOCK_HELD_intent,               \
+               .unlock_wakeup  = SIX_LOCK_intent,                      \
+       },                                                              \
+       [SIX_LOCK_write] = {                                            \
+               .lock_val       = __SIX_VAL(seq, 1),                    \
+               .lock_fail      = __SIX_LOCK_HELD_read,                 \
+               .unlock_val     = __SIX_VAL(seq, 1),                    \
+               .held_mask      = __SIX_LOCK_HELD_write,                \
+               .unlock_wakeup  = SIX_LOCK_read,                        \
+       },                                                              \
+}
+
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+                                union six_lock_state old)
+{
+       if (type != SIX_LOCK_intent)
+               return;
+
+       if (!old.intent_lock) {
+               EBUG_ON(lock->owner);
+               lock->owner = current;
+       } else {
+               EBUG_ON(lock->owner != current);
+       }
+}
+
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+       unsigned read_count = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               read_count += *per_cpu_ptr(lock->readers, cpu);
+       return read_count;
+}
+
+struct six_lock_waiter {
+       struct list_head        list;
+       struct task_struct      *task;
+};
+
+/* This is probably up there with the more evil things I've done */
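+/*
+ * (It builds a six_lock_state with only the waiter bit for the given lock type
+ * set, then takes ilog2() of the raw word to recover that bit's position
+ * within ->state.v, so it can be passed to set_bit()/clear_bit() below.)
+ */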
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+                                  union six_lock_state state,
+                                  unsigned waitlist_id)
+{
+       if (waitlist_id == SIX_LOCK_write) {
+               if (state.write_locking && !state.read_lock) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+
+                       if (p)
+                               wake_up_process(p);
+               }
+       } else {
+               struct list_head *wait_list = &lock->wait_list[waitlist_id];
+               struct six_lock_waiter *w, *next;
+
+               if (!(state.waiters & (1 << waitlist_id)))
+                       return;
+
+               clear_bit(waitlist_bitnr(waitlist_id),
+                         (unsigned long *) &lock->state.v);
+
+               raw_spin_lock(&lock->wait_lock);
+
+               list_for_each_entry_safe(w, next, wait_list, list) {
+                       list_del_init(&w->list);
+
+                       if (wake_up_process(w->task) &&
+                           waitlist_id != SIX_LOCK_read) {
+                               if (!list_empty(wait_list))
+                                       set_bit(waitlist_bitnr(waitlist_id),
+                                               (unsigned long *) &lock->state.v);
+                               break;
+                       }
+               }
+
+               raw_spin_unlock(&lock->wait_lock);
+       }
+}
+
+static __always_inline bool do_six_trylock_type(struct six_lock *lock,
+                                               enum six_lock_type type,
+                                               bool try)
+{
+       const struct six_lock_vals l[] = LOCK_VALS;
+       union six_lock_state old, new;
+       bool ret;
+       u64 v;
+
+       EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+       EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
+
+       EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
+
+       /*
+        * Percpu reader mode:
+        *
+        * The basic idea behind this algorithm is that you can implement a lock
+        * between two threads without any atomics, just memory barriers:
+        *
+        * For two threads you'll need two variables, one variable for "thread a
+        * has the lock" and another for "thread b has the lock".
+        *
+        * To take the lock, a thread sets its variable indicating that it holds
+        * the lock, then issues a full memory barrier, then reads from the
+        * other thread's variable to check if the other thread thinks it has
+        * the lock. If we raced, we backoff and retry/sleep.
+        */
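+
+       /*
+        * Illustrative sketch of that two-thread idea (not the code below,
+        * which generalizes it to per-cpu read counts vs. the writer's
+        * write_locking bit):
+        *
+        *      thread A                thread B
+        *      a_wants = true;         b_wants = true;
+        *      smp_mb();               smp_mb();
+        *      if (b_wants)            if (a_wants)
+        *              back off;               back off;
+        */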
+
+       if (type == SIX_LOCK_read && lock->readers) {
+retry:
+               preempt_disable();
+               this_cpu_inc(*lock->readers); /* signal that we own lock */
+
+               smp_mb();
+
+               old.v = READ_ONCE(lock->state.v);
+               ret = !(old.v & l[type].lock_fail);
+
+               this_cpu_sub(*lock->readers, !ret);
+               preempt_enable();
+
+               /*
+                * If we failed because a writer was trying to take the
+                * lock, issue a wakeup because we might have caused a
+                * spurious trylock failure:
+                */
+               if (old.write_locking) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+
+                       if (p)
+                               wake_up_process(p);
+               }
+
+               /*
+                * If we failed from the lock path and the waiting bit wasn't
+                * set, set it:
+                */
+               if (!try && !ret) {
+                       v = old.v;
+
+                       do {
+                               new.v = old.v = v;
+
+                               if (!(old.v & l[type].lock_fail))
+                                       goto retry;
+
+                               if (new.waiters & (1 << type))
+                                       break;
+
+                               new.waiters |= 1 << type;
+                       } while ((v = atomic64_cmpxchg(&lock->state.counter,
+                                                      old.v, new.v)) != old.v);
+               }
+       } else if (type == SIX_LOCK_write && lock->readers) {
+               if (try) {
+                       atomic64_add(__SIX_VAL(write_locking, 1),
+                                    &lock->state.counter);
+                       smp_mb__after_atomic();
+               }
+
+               ret = !pcpu_read_count(lock);
+
+               /*
+                * On success, we increment lock->seq; also we clear
+                * write_locking unless we failed from the lock path:
+                */
+               v = 0;
+               if (ret)
+                       v += __SIX_VAL(seq, 1);
+               if (ret || try)
+                       v -= __SIX_VAL(write_locking, 1);
+
+               if (try && !ret) {
+                       old.v = atomic64_add_return(v, &lock->state.counter);
+                       six_lock_wakeup(lock, old, SIX_LOCK_read);
+               } else {
+                       atomic64_add(v, &lock->state.counter);
+               }
+       } else {
+               v = READ_ONCE(lock->state.v);
+               do {
+                       new.v = old.v = v;
+
+                       if (!(old.v & l[type].lock_fail)) {
+                               new.v += l[type].lock_val;
+
+                               if (type == SIX_LOCK_write)
+                                       new.write_locking = 0;
+                       } else if (!try && type != SIX_LOCK_write &&
+                                  !(new.waiters & (1 << type)))
+                               new.waiters |= 1 << type;
+                       else
+                               break; /* waiting bit already set */
+               } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+                                       old.v, new.v)) != old.v);
+
+               ret = !(old.v & l[type].lock_fail);
+
+               EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
+       }
+
+       if (ret)
+               six_set_owner(lock, type, old);
+
+       EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+
+       return ret;
+}
+
+__always_inline __flatten
+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       if (!do_six_trylock_type(lock, type, true))
+               return false;
+
+       if (type != SIX_LOCK_write)
+               six_acquire(&lock->dep_map, 1);
+       return true;
+}
+
+__always_inline __flatten
+static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
+                             unsigned seq)
+{
+       const struct six_lock_vals l[] = LOCK_VALS;
+       union six_lock_state old;
+       u64 v;
+
+       EBUG_ON(type == SIX_LOCK_write);
+
+       if (type == SIX_LOCK_read &&
+           lock->readers) {
+               bool ret;
+
+               preempt_disable();
+               this_cpu_inc(*lock->readers);
+
+               smp_mb();
+
+               old.v = READ_ONCE(lock->state.v);
+               ret = !(old.v & l[type].lock_fail) && old.seq == seq;
+
+               this_cpu_sub(*lock->readers, !ret);
+               preempt_enable();
+
+               /*
+                * Similar to the lock path, we may have caused a spurious write
+                * lock fail and need to issue a wakeup:
+                */
+               if (old.write_locking) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+
+                       if (p)
+                               wake_up_process(p);
+               }
+
+               if (ret)
+                       six_acquire(&lock->dep_map, 1);
+
+               return ret;
+       }
+
+       v = READ_ONCE(lock->state.v);
+       do {
+               old.v = v;
+
+               if (old.seq != seq || old.v & l[type].lock_fail)
+                       return false;
+       } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+                               old.v,
+                               old.v + l[type].lock_val)) != old.v);
+
+       six_set_owner(lock, type, old);
+       if (type != SIX_LOCK_write)
+               six_acquire(&lock->dep_map, 1);
+       return true;
+}
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+
+static inline int six_can_spin_on_owner(struct six_lock *lock)
+{
+       struct task_struct *owner;
+       int retval = 1;
+
+       if (need_resched())
+               return 0;
+
+       rcu_read_lock();
+       owner = READ_ONCE(lock->owner);
+       if (owner)
+               retval = owner->on_cpu;
+       rcu_read_unlock();
+       /*
+        * If lock->owner is not set, the lock owner may have just acquired
+        * it and not set the owner yet, or the lock may have been released.
+        */
+       return retval;
+}
+
+static inline bool six_spin_on_owner(struct six_lock *lock,
+                                    struct task_struct *owner)
+{
+       bool ret = true;
+
+       rcu_read_lock();
+       while (lock->owner == owner) {
+               /*
+                * Ensure we emit the owner->on_cpu dereference _after_
+                * checking lock->owner still matches owner. If that fails,
+                * owner might point to freed memory. If it still matches,
+                * the rcu_read_lock() ensures the memory stays valid.
+                */
+               barrier();
+
+               if (!owner->on_cpu || need_resched()) {
+                       ret = false;
+                       break;
+               }
+
+               cpu_relax();
+       }
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+       struct task_struct *task = current;
+
+       if (type == SIX_LOCK_write)
+               return false;
+
+       preempt_disable();
+       if (!six_can_spin_on_owner(lock))
+               goto fail;
+
+       if (!osq_lock(&lock->osq))
+               goto fail;
+
+       while (1) {
+               struct task_struct *owner;
+
+               /*
+                * If there's an owner, wait for it to either
+                * release the lock or go to sleep.
+                */
+               owner = READ_ONCE(lock->owner);
+               if (owner && !six_spin_on_owner(lock, owner))
+                       break;
+
+               if (do_six_trylock_type(lock, type, false)) {
+                       osq_unlock(&lock->osq);
+                       preempt_enable();
+                       return true;
+               }
+
+               /*
+                * When there's no owner, we might have preempted between the
+                * owner acquiring the lock and setting the owner field. If
+                * we're an RT task, that will live-lock because we won't let
+                * the owner complete.
+                */
+               if (!owner && (need_resched() || rt_task(task)))
+                       break;
+
+               /*
+                * The cpu_relax() call is a compiler barrier which forces
+                * everything in this loop to be re-loaded. We don't need
+                * memory barriers as we'll eventually observe the right
+                * values at the cost of a few extra spins.
+                */
+               cpu_relax();
+       }
+
+       osq_unlock(&lock->osq);
+fail:
+       preempt_enable();
+
+       /*
+        * If we fell out of the spin path because of need_resched(),
+        * reschedule now, before we try-lock again. This avoids getting
+        * scheduled out right after we obtained the lock.
+        */
+       if (need_resched())
+               schedule();
+
+       return false;
+}
+
+#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+       return false;
+}
+
+#endif
+
+noinline
+static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
+                                   six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       union six_lock_state old;
+       struct six_lock_waiter wait;
+       int ret = 0;
+
+       if (type == SIX_LOCK_write) {
+               EBUG_ON(lock->state.write_locking);
+               atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+               smp_mb__after_atomic();
+       }
+
+       ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+       if (ret)
+               goto out_before_sleep;
+
+       if (six_optimistic_spin(lock, type))
+               goto out_before_sleep;
+
+       lock_contended(&lock->dep_map, _RET_IP_);
+
+       INIT_LIST_HEAD(&wait.list);
+       wait.task = current;
+
+       while (1) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (type == SIX_LOCK_write)
+                       EBUG_ON(lock->owner != current);
+               else if (list_empty_careful(&wait.list)) {
+                       raw_spin_lock(&lock->wait_lock);
+                       list_add_tail(&wait.list, &lock->wait_list[type]);
+                       raw_spin_unlock(&lock->wait_lock);
+               }
+
+               if (do_six_trylock_type(lock, type, false))
+                       break;
+
+               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+               if (ret)
+                       break;
+
+               schedule();
+       }
+
+       __set_current_state(TASK_RUNNING);
+
+       if (!list_empty_careful(&wait.list)) {
+               raw_spin_lock(&lock->wait_lock);
+               list_del_init(&wait.list);
+               raw_spin_unlock(&lock->wait_lock);
+       }
+out_before_sleep:
+       if (ret && type == SIX_LOCK_write) {
+               old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
+                                           &lock->state.counter);
+               six_lock_wakeup(lock, old, SIX_LOCK_read);
+       }
+
+       return ret;
+}
+
+__always_inline
+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
+                          six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       int ret;
+
+       if (type != SIX_LOCK_write)
+               six_acquire(&lock->dep_map, 0);
+
+       ret = do_six_trylock_type(lock, type, true) ? 0
+               : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
+
+       if (ret && type != SIX_LOCK_write)
+               six_release(&lock->dep_map);
+       if (!ret)
+               lock_acquired(&lock->dep_map, _RET_IP_);
+
+       return ret;
+}
+
+__always_inline __flatten
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       const struct six_lock_vals l[] = LOCK_VALS;
+       union six_lock_state state;
+
+       EBUG_ON(type == SIX_LOCK_write &&
+               !(lock->state.v & __SIX_LOCK_HELD_intent));
+
+       if (type != SIX_LOCK_write)
+               six_release(&lock->dep_map);
+
+       if (type == SIX_LOCK_intent) {
+               EBUG_ON(lock->owner != current);
+
+               if (lock->intent_lock_recurse) {
+                       --lock->intent_lock_recurse;
+                       return;
+               }
+
+               lock->owner = NULL;
+       }
+
+       if (type == SIX_LOCK_read &&
+           lock->readers) {
+               smp_mb(); /* unlock barrier */
+               this_cpu_dec(*lock->readers);
+               smp_mb(); /* between unlocking and checking for waiters */
+               state.v = READ_ONCE(lock->state.v);
+       } else {
+               EBUG_ON(!(lock->state.v & l[type].held_mask));
+               state.v = atomic64_add_return_release(l[type].unlock_val,
+                                                     &lock->state.counter);
+       }
+
+       six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+#define __SIX_LOCK(type)                                               \
+bool six_trylock_##type(struct six_lock *lock)                         \
+{                                                                      \
+       return __six_trylock_type(lock, SIX_LOCK_##type);               \
+}                                                                      \
+EXPORT_SYMBOL_GPL(six_trylock_##type);                                 \
+                                                                       \
+bool six_relock_##type(struct six_lock *lock, u32 seq)                 \
+{                                                                      \
+       return __six_relock_type(lock, SIX_LOCK_##type, seq);           \
+}                                                                      \
+EXPORT_SYMBOL_GPL(six_relock_##type);                                  \
+                                                                       \
+int six_lock_##type(struct six_lock *lock,                             \
+                   six_lock_should_sleep_fn should_sleep_fn, void *p)  \
+{                                                                      \
+       return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
+}                                                                      \
+EXPORT_SYMBOL_GPL(six_lock_##type);                                    \
+                                                                       \
+void six_unlock_##type(struct six_lock *lock)                          \
+{                                                                      \
+       __six_unlock_type(lock, SIX_LOCK_##type);                       \
+}                                                                      \
+EXPORT_SYMBOL_GPL(six_unlock_##type);
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+
+#undef __SIX_LOCK
+
+/* Convert from intent to read: */
+void six_lock_downgrade(struct six_lock *lock)
+{
+       six_lock_increment(lock, SIX_LOCK_read);
+       six_unlock_intent(lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+bool six_lock_tryupgrade(struct six_lock *lock)
+{
+       union six_lock_state old, new;
+       u64 v = READ_ONCE(lock->state.v);
+
+       do {
+               new.v = old.v = v;
+
+               if (new.intent_lock)
+                       return false;
+
+               if (!lock->readers) {
+                       EBUG_ON(!new.read_lock);
+                       new.read_lock--;
+               }
+
+               new.intent_lock = 1;
+       } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+                               old.v, new.v)) != old.v);
+
+       if (lock->readers)
+               this_cpu_dec(*lock->readers);
+
+       six_set_owner(lock, SIX_LOCK_intent, old);
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
+
+bool six_trylock_convert(struct six_lock *lock,
+                        enum six_lock_type from,
+                        enum six_lock_type to)
+{
+       EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
+
+       if (to == from)
+               return true;
+
+       if (to == SIX_LOCK_read) {
+               six_lock_downgrade(lock);
+               return true;
+       } else {
+               return six_lock_tryupgrade(lock);
+       }
+}
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/*
+ * Increment read/intent lock count, assuming we already have it read or intent
+ * locked:
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+       const struct six_lock_vals l[] = LOCK_VALS;
+
+       six_acquire(&lock->dep_map, 0);
+
+       /* XXX: assert already locked, and that we don't overflow: */
+
+       switch (type) {
+       case SIX_LOCK_read:
+               if (lock->readers) {
+                       this_cpu_inc(*lock->readers);
+               } else {
+                       EBUG_ON(!lock->state.read_lock &&
+                               !lock->state.intent_lock);
+                       atomic64_add(l[type].lock_val, &lock->state.counter);
+               }
+               break;
+       case SIX_LOCK_intent:
+               EBUG_ON(!lock->state.intent_lock);
+               lock->intent_lock_recurse++;
+               break;
+       case SIX_LOCK_write:
+               BUG();
+               break;
+       }
+}
+EXPORT_SYMBOL_GPL(six_lock_increment);
+
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+       struct six_lock_waiter *w;
+
+       raw_spin_lock(&lock->wait_lock);
+
+       list_for_each_entry(w, &lock->wait_list[0], list)
+               wake_up_process(w->task);
+       list_for_each_entry(w, &lock->wait_list[1], list)
+               wake_up_process(w->task);
+
+       raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+struct free_pcpu_rcu {
+       struct rcu_head         rcu;
+       void __percpu           *p;
+};
+
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
+{
+       struct free_pcpu_rcu *rcu =
+               container_of(_rcu, struct free_pcpu_rcu, rcu);
+
+       free_percpu(rcu->p);
+       kfree(rcu);
+}
+
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
+{
+       struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
+
+       if (!rcu)
+               return;
+
+       rcu->p = lock->readers;
+       lock->readers = NULL;
+
+       call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
+
+void six_lock_pcpu_free(struct six_lock *lock)
+{
+       BUG_ON(lock->readers && pcpu_read_count(lock));
+       BUG_ON(lock->state.read_lock);
+
+       free_percpu(lock->readers);
+       lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
+
+void six_lock_pcpu_alloc(struct six_lock *lock)
+{
+#ifdef __KERNEL__
+       if (!lock->readers)
+               lock->readers = alloc_percpu(unsigned);
+#endif
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
+
+/*
+ * Returns lock held counts, for both read and intent
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+       struct six_lock_count ret = { 0, lock->state.intent_lock };
+
+       if (!lock->readers)
+               ret.read += lock->state.read_lock;
+       else {
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       ret.read += *per_cpu_ptr(lock->readers, cpu);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
new file mode 100644 (file)
index 0000000..08d0e0c
--- /dev/null
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/*
+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
+ * semaphores, except with a third intermediate state, intent. Basic operations
+ * are:
+ *
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * six_lock_intent(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ *
+ * Intent locks block other intent locks, but do not block read locks, and you
+ * must have an intent lock held before taking a write lock, like so:
+ *
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ *
+ *   six_trylock_read()
+ *   six_trylock_intent()
+ *   six_trylock_write()
+ *
+ *   six_lock_downgrade():     convert from intent to read
+ *   six_lock_tryupgrade():    attempt to convert from read to intent
+ *
+ * Locks also embed a sequence number, which is incremented when the lock is
+ * locked or unlocked for write. The current sequence number can be grabbed
+ * while a lock is held from lock->state.seq; then, if you drop the lock you can
+ * use six_relock_(read|intent|write)(lock, seq) to attempt to retake the lock
+ * iff it hasn't been locked for write in the meantime.
+ *
+ * There are also operations that take the lock type as a parameter, where the
+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
+ *
+ *   six_lock_type(lock, type)
+ *   six_unlock_type(lock, type)
+ *   six_relock(lock, type, seq)
+ *   six_trylock_type(lock, type)
+ *   six_trylock_convert(lock, from, to)
+ *
+ * A lock may be held multiple times by the same thread (for read or intent,
+ * not write). However, the six locks code does _not_ track recursive holders
+ * itself - rather, if your code (e.g. the btree iterator code) knows that the
+ * current thread already holds a lock of the correct type,
+ * six_lock_increment() may be used to bump up the counter for that type - the
+ * only effect is that one more call to unlock will be required before the
+ * lock is unlocked.
+ */
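+
+/*
+ * An illustrative sketch of the sequence number interface, reusing the
+ * hypothetical foo from the examples above (here the optional should_sleep_fn
+ * callback and its argument are spelled out as NULL, matching the declared
+ * prototypes): grab the sequence number while holding the lock, drop the
+ * lock, and later try to retake it with six_relock_read() - success means no
+ * write lock was taken in the meantime, so state read under the old lock is
+ * still valid:
+ *
+ *   six_lock_read(&foo->lock, NULL, NULL);
+ *   u32 seq = foo->lock.state.seq;
+ *   // ... read state guarded by the lock ...
+ *   six_unlock_read(&foo->lock);
+ *
+ *   // ... possibly sleep or do other work ...
+ *
+ *   if (!six_relock_read(&foo->lock, seq)) {
+ *       // a write lock was taken in the meantime: reacquire and revalidate
+ *       six_lock_read(&foo->lock, NULL, NULL);
+ *   }
+ */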
+
+#include <linux/lockdep.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+#include <linux/osq_lock.h>
+#endif
+
+#define SIX_LOCK_SEPARATE_LOCKFNS
+
+union six_lock_state {
+       struct {
+               atomic64_t      counter;
+       };
+
+       struct {
+               u64             v;
+       };
+
+       struct {
+               /* for waitlist_bitnr() */
+               unsigned long   l;
+       };
+
+       struct {
+               unsigned        read_lock:27;
+               unsigned        write_locking:1;
+               unsigned        intent_lock:1;
+               unsigned        waiters:3;
+               /*
+                * seq works much like in seqlocks: it's incremented every time
+                * we lock and unlock for write.
+                *
+                * If it's odd, a write lock is held; if it's even, the lock is
+                * unlocked.
+                *
+                * Thus readers can unlock, and then relock later iff the lock
+                * hasn't been taken for write in the meantime.
+                */
+               u32             seq;
+       };
+};
+
+enum six_lock_type {
+       SIX_LOCK_read,
+       SIX_LOCK_intent,
+       SIX_LOCK_write,
+};
+
+struct six_lock {
+       union six_lock_state    state;
+       unsigned                intent_lock_recurse;
+       struct task_struct      *owner;
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+       struct optimistic_spin_queue osq;
+#endif
+       unsigned __percpu       *readers;
+
+       raw_spinlock_t          wait_lock;
+       struct list_head        wait_list[2];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lockdep_map      dep_map;
+#endif
+};
+
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
+
+static __always_inline void __six_lock_init(struct six_lock *lock,
+                                           const char *name,
+                                           struct lock_class_key *key)
+{
+       atomic64_set(&lock->state.counter, 0);
+       raw_spin_lock_init(&lock->wait_lock);
+       INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
+       INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+       lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+
+#define six_lock_init(lock)                                            \
+do {                                                                   \
+       static struct lock_class_key __key;                             \
+                                                                       \
+       __six_lock_init((lock), #lock, &__key);                         \
+} while (0)
+
+#define __SIX_VAL(field, _v)   (((union six_lock_state) { .field = _v }).v)
+
+#define __SIX_LOCK(type)                                               \
+bool six_trylock_##type(struct six_lock *);                            \
+bool six_relock_##type(struct six_lock *, u32);                                \
+int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
+void six_unlock_##type(struct six_lock *);
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+#undef __SIX_LOCK
+
+#define SIX_LOCK_DISPATCH(type, fn, ...)                       \
+       switch (type) {                                         \
+       case SIX_LOCK_read:                                     \
+               return fn##_read(__VA_ARGS__);                  \
+       case SIX_LOCK_intent:                                   \
+               return fn##_intent(__VA_ARGS__);                \
+       case SIX_LOCK_write:                                    \
+               return fn##_write(__VA_ARGS__);                 \
+       default:                                                \
+               BUG();                                          \
+       }
+
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       SIX_LOCK_DISPATCH(type, six_trylock, lock);
+}
+
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+                                  unsigned seq)
+{
+       SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
+}
+
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+                               six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
+}
+
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       SIX_LOCK_DISPATCH(type, six_unlock, lock);
+}
+
+void six_lock_downgrade(struct six_lock *);
+bool six_lock_tryupgrade(struct six_lock *);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+                        enum six_lock_type);
+
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+
+void six_lock_wakeup_all(struct six_lock *);
+
+void six_lock_pcpu_free_rcu(struct six_lock *);
+void six_lock_pcpu_free(struct six_lock *);
+void six_lock_pcpu_alloc(struct six_lock *);
+
+struct six_lock_count {
+       unsigned read;
+       unsigned intent;
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+
+#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
new file mode 100644 (file)
index 0000000..0947fdc
--- /dev/null
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_STR_HASH_H
+#define _BCACHEFS_STR_HASH_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "checksum.h"
+#include "error.h"
+#include "inode.h"
+#include "siphash.h"
+#include "super.h"
+
+#include <linux/crc32c.h>
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+
+struct bch_hash_info {
+       u8                      type;
+       union {
+               __le64          crc_key;
+               SIPHASH_KEY     siphash_key;
+       };
+};
+
+static inline struct bch_hash_info
+bch2_hash_info_init(struct bch_fs *c,
+                  const struct bch_inode_unpacked *bi)
+{
+       /* XXX ick */
+       struct bch_hash_info info = {
+               .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
+                       ~(~0U << INODE_STR_HASH_BITS)
+       };
+
+       switch (info.type) {
+       case BCH_STR_HASH_CRC32C:
+       case BCH_STR_HASH_CRC64:
+               info.crc_key = bi->bi_hash_seed;
+               break;
+       case BCH_STR_HASH_SIPHASH: {
+               SHASH_DESC_ON_STACK(desc, c->sha256);
+               u8 digest[SHA256_DIGEST_SIZE];
+
+               desc->tfm = c->sha256;
+
+               crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
+                                   sizeof(bi->bi_hash_seed), digest);
+               memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
+               break;
+       }
+       default:
+               BUG();
+       }
+
+       return info;
+}
+
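+/*
+ * Incremental hashing context: call bch2_str_hash_init(), then
+ * bch2_str_hash_update() over the name being hashed, then bch2_str_hash_end()
+ * to get the 64 bit hash used as the key's offset within the btree.
+ */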
+struct bch_str_hash_ctx {
+       union {
+               u32             crc32c;
+               u64             crc64;
+               SIPHASH_CTX     siphash;
+       };
+};
+
+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
+                                    const struct bch_hash_info *info)
+{
+       switch (info->type) {
+       case BCH_STR_HASH_CRC32C:
+               ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
+               break;
+       case BCH_STR_HASH_CRC64:
+               ctx->crc64 = bch2_crc64_update(~0, &info->crc_key, sizeof(info->crc_key));
+               break;
+       case BCH_STR_HASH_SIPHASH:
+               SipHash24_Init(&ctx->siphash, &info->siphash_key);
+               break;
+       default:
+               BUG();
+       }
+}
+
+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
+                                      const struct bch_hash_info *info,
+                                      const void *data, size_t len)
+{
+       switch (info->type) {
+       case BCH_STR_HASH_CRC32C:
+               ctx->crc32c = crc32c(ctx->crc32c, data, len);
+               break;
+       case BCH_STR_HASH_CRC64:
+               ctx->crc64 = bch2_crc64_update(ctx->crc64, data, len);
+               break;
+       case BCH_STR_HASH_SIPHASH:
+               SipHash24_Update(&ctx->siphash, data, len);
+               break;
+       default:
+               BUG();
+       }
+}
+
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+                                  const struct bch_hash_info *info)
+{
+       switch (info->type) {
+       case BCH_STR_HASH_CRC32C:
+               return ctx->crc32c;
+       case BCH_STR_HASH_CRC64:
+               return ctx->crc64 >> 1;
+       case BCH_STR_HASH_SIPHASH:
+               return SipHash24_End(&ctx->siphash) >> 1;
+       default:
+               BUG();
+       }
+}
+
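+/*
+ * A bch_hash_desc describes one on-disk hash table (dirents and xattrs use
+ * this code): the btree it lives in, the key and whiteout types it uses, and
+ * callbacks for hashing and comparing both lookup keys and existing bkeys.
+ */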
+struct bch_hash_desc {
+       enum btree_id   btree_id;
+       u8              key_type;
+       u8              whiteout_type;
+
+       u64             (*hash_key)(const struct bch_hash_info *, const void *);
+       u64             (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
+       bool            (*cmp_key)(struct bkey_s_c, const void *);
+       bool            (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+};
+
+static inline struct btree_iter *
+bch2_hash_lookup(struct btree_trans *trans,
+                const struct bch_hash_desc desc,
+                const struct bch_hash_info *info,
+                u64 inode, const void *key,
+                unsigned flags)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+
+       iter = bch2_trans_get_iter(trans, desc.btree_id,
+                                  POS(inode, desc.hash_key(info, key)),
+                                  BTREE_ITER_SLOTS|flags);
+       if (IS_ERR(iter))
+               return iter;
+
+       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+               if (iter->pos.inode != inode)
+                       break;
+
+               if (k.k->type == desc.key_type) {
+                       if (!desc.cmp_key(k, key))
+                               return iter;
+               } else if (k.k->type == desc.whiteout_type) {
+                       ;
+               } else {
+                       /* hole, not found */
+                       break;
+               }
+       }
+
+       return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
+}
+
+static inline struct btree_iter *
+bch2_hash_hole(struct btree_trans *trans,
+              const struct bch_hash_desc desc,
+              const struct bch_hash_info *info,
+              u64 inode, const void *key)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+
+       iter = bch2_trans_get_iter(trans, desc.btree_id,
+                                  POS(inode, desc.hash_key(info, key)),
+                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return iter;
+
+       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+               if (iter->pos.inode != inode)
+                       break;
+
+               if (k.k->type != desc.key_type)
+                       return iter;
+       }
+
+       return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
+}
+
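+/*
+ * When deleting an entry we can't always just drop it: with linear probing, a
+ * later entry in the same probe chain may hash to a slot at or before the one
+ * being deleted, and lookups would then stop early at the resulting hole - in
+ * that case the deleted slot must be replaced with a whiteout. Returns true if
+ * a whiteout is needed, false if not, or a negative error code.
+ */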
+static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
+                                          const struct bch_hash_desc desc,
+                                          const struct bch_hash_info *info,
+                                          struct btree_iter *start)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+
+       iter = bch2_trans_copy_iter(trans, start);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+
+       bch2_btree_iter_next_slot(iter);
+
+       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+               if (k.k->type != desc.key_type &&
+                   k.k->type != desc.whiteout_type)
+                       return false;
+
+               if (k.k->type == desc.key_type &&
+                   desc.hash_bkey(info, k) <= start->pos.offset)
+                       return true;
+       }
+       return btree_iter_err(k);
+}
+
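+/*
+ * Insert via linear probing: walk slots starting at the key's hash position,
+ * replacing a matching entry if one is found, otherwise remembering the first
+ * empty or whiteout slot seen and inserting there once the end of the probe
+ * chain is reached. BCH_HASH_SET_MUST_REPLACE and BCH_HASH_SET_MUST_CREATE
+ * turn the not-found and found cases into -ENOENT and -EEXIST respectively.
+ */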
+static inline int __bch2_hash_set(struct btree_trans *trans,
+                                 const struct bch_hash_desc desc,
+                                 const struct bch_hash_info *info,
+                                 u64 inode, struct bkey_i *insert, int flags)
+{
+       struct btree_iter *iter, *slot = NULL;
+       struct bkey_s_c k;
+
+       iter = bch2_trans_get_iter(trans, desc.btree_id,
+                       POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+
+       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+               if (iter->pos.inode != inode)
+                       break;
+
+               if (k.k->type == desc.key_type) {
+                       if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
+                               goto found;
+
+                       /* hash collision: */
+                       continue;
+               }
+
+               if (!slot &&
+                   !(flags & BCH_HASH_SET_MUST_REPLACE)) {
+                       slot = bch2_trans_copy_iter(trans, iter);
+                       if (IS_ERR(slot))
+                               return PTR_ERR(slot);
+               }
+
+               if (k.k->type != desc.whiteout_type)
+                       goto not_found;
+       }
+
+       return btree_iter_err(k) ?: -ENOSPC;
+not_found:
+       if (flags & BCH_HASH_SET_MUST_REPLACE)
+               return -ENOENT;
+
+       insert->k.p = slot->pos;
+       bch2_trans_update(trans, slot, insert, 0);
+       return 0;
+found:
+       if (flags & BCH_HASH_SET_MUST_CREATE)
+               return -EEXIST;
+
+       insert->k.p = iter->pos;
+       bch2_trans_update(trans, iter, insert, 0);
+       return 0;
+}
+
+static inline int bch2_hash_set(const struct bch_hash_desc desc,
+                              const struct bch_hash_info *info,
+                              struct bch_fs *c, u64 inode,
+                              u64 *journal_seq,
+                              struct bkey_i *insert, int flags)
+{
+       return bch2_trans_do(c, journal_seq, flags|BTREE_INSERT_ATOMIC,
+                       __bch2_hash_set(&trans, desc, info,
+                                       inode, insert, flags));
+}
+
+static inline int bch2_hash_delete_at(struct btree_trans *trans,
+                                     const struct bch_hash_desc desc,
+                                     const struct bch_hash_info *info,
+                                     struct btree_iter *iter)
+{
+       struct bkey_i *delete;
+       int ret;
+
+       ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
+       if (ret < 0)
+               return ret;
+
+       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+       if (IS_ERR(delete))
+               return PTR_ERR(delete);
+
+       bkey_init(&delete->k);
+       delete->k.p = iter->pos;
+       delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+       bch2_trans_update(trans, iter, delete, 0);
+       return 0;
+}
+
+static inline int bch2_hash_delete(struct btree_trans *trans,
+                                  const struct bch_hash_desc desc,
+                                  const struct bch_hash_info *info,
+                                  u64 inode, const void *key)
+{
+       struct btree_iter *iter;
+
+       iter = bch2_hash_lookup(trans, desc, info, inode, key,
+                               BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+
+       return bch2_hash_delete_at(trans, desc, info, iter);
+}
+
+#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
new file mode 100644 (file)
index 0000000..64c2375
--- /dev/null
@@ -0,0 +1,971 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "io.h"
+#include "journal.h"
+#include "replicas.h"
+#include "quota.h"
+#include "super-io.h"
+#include "super.h"
+#include "vstructs.h"
+
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
+const char * const bch2_sb_fields[] = {
+#define x(name, nr)    #name,
+       BCH_SB_FIELDS()
+#undef x
+       NULL
+};
+
+static const char *bch2_sb_field_validate(struct bch_sb *,
+                                         struct bch_sb_field *);
+
+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
+                                     enum bch_sb_field_type type)
+{
+       struct bch_sb_field *f;
+
+       /* XXX: need locking around superblock to access optional fields */
+
+       vstruct_for_each(sb, f)
+               if (le32_to_cpu(f->type) == type)
+                       return f;
+       return NULL;
+}
+
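+/*
+ * Resize (or, if f is NULL, append) a superblock field in place: fields after
+ * it are shifted up or down to make room, newly added space is zeroed, and
+ * the superblock's total u64s count is updated.
+ */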
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+                                                  struct bch_sb_field *f,
+                                                  unsigned u64s)
+{
+       unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+       unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
+
+       BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) >
+              sb->page_order);
+
+       if (!f) {
+               f = vstruct_last(sb->sb);
+               memset(f, 0, sizeof(u64) * u64s);
+               f->u64s = cpu_to_le32(u64s);
+               f->type = 0;
+       } else {
+               void *src, *dst;
+
+               src = vstruct_end(f);
+               f->u64s = cpu_to_le32(u64s);
+               dst = vstruct_end(f);
+
+               memmove(dst, src, vstruct_end(sb->sb) - src);
+
+               if (dst > src)
+                       memset(src, 0, dst - src);
+       }
+
+       sb->sb->u64s = cpu_to_le32(sb_u64s);
+
+       return f;
+}
+
+/* Superblock realloc/free: */
+
+void bch2_free_super(struct bch_sb_handle *sb)
+{
+       if (sb->bio)
+               kfree(sb->bio);
+       if (!IS_ERR_OR_NULL(sb->bdev))
+               blkdev_put(sb->bdev, sb->holder);
+       kfree(sb->holder);
+
+       free_pages((unsigned long) sb->sb, sb->page_order);
+       memset(sb, 0, sizeof(*sb));
+}
+
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
+{
+       size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+       unsigned order = get_order(new_bytes);
+       struct bch_sb *new_sb;
+       struct bio *bio;
+
+       if (sb->sb && sb->page_order >= order)
+               return 0;
+
+       if (sb->have_layout) {
+               u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+
+               if (new_bytes > max_bytes) {
+                       pr_err("%pg: superblock too big: want %zu but have %llu",
+                              sb->bdev, new_bytes, max_bytes);
+                       return -ENOSPC;
+               }
+       }
+
+       if (dynamic_fault("bcachefs:add:super_realloc"))
+               return -ENOMEM;
+
+       if (sb->have_bio) {
+               unsigned nr_bvecs = 1 << order;
+
+               bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+               if (!bio)
+                       return -ENOMEM;
+
+               bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+               if (sb->bio)
+                       kfree(sb->bio);
+               sb->bio = bio;
+       }
+
+       new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+       if (!new_sb)
+               return -ENOMEM;
+
+       if (sb->sb)
+               memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
+
+       free_pages((unsigned long) sb->sb, sb->page_order);
+       sb->sb = new_sb;
+
+       sb->page_order = order;
+
+       return 0;
+}
+
+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
+                                         enum bch_sb_field_type type,
+                                         unsigned u64s)
+{
+       struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+       ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+       ssize_t d = -old_u64s + u64s;
+
+       if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+               return NULL;
+
+       if (sb->fs_sb) {
+               struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+               struct bch_dev *ca;
+               unsigned i;
+
+               lockdep_assert_held(&c->sb_lock);
+
+               /* XXX: we're not checking that offline devices have enough space */
+
+               for_each_online_member(ca, c, i) {
+                       struct bch_sb_handle *sb = &ca->disk_sb;
+
+                       if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+                               percpu_ref_put(&ca->ref);
+                               return NULL;
+                       }
+               }
+       }
+
+       f = __bch2_sb_field_resize(sb, f, u64s);
+       f->type = cpu_to_le32(type);
+       return f;
+}
+
+/* Superblock validate: */
+
+static inline void __bch2_sb_layout_size_assert(void)
+{
+       BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+}
+
+static const char *validate_sb_layout(struct bch_sb_layout *layout)
+{
+       u64 offset, prev_offset, max_sectors;
+       unsigned i;
+
+       if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
+           !uuid_equal(&layout->magic, &BCHFS_MAGIC))
+               return "Not a bcachefs superblock layout";
+
+       if (layout->layout_type != 0)
+               return "Invalid superblock layout type";
+
+       if (!layout->nr_superblocks)
+               return "Invalid superblock layout: no superblocks";
+
+       if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
+               return "Invalid superblock layout: too many superblocks";
+
+       max_sectors = 1 << layout->sb_max_size_bits;
+
+       prev_offset = le64_to_cpu(layout->sb_offset[0]);
+
+       for (i = 1; i < layout->nr_superblocks; i++) {
+               offset = le64_to_cpu(layout->sb_offset[i]);
+
+               if (offset < prev_offset + max_sectors)
+                       return "Invalid superblock layout: superblocks overlap";
+               prev_offset = offset;
+       }
+
+       return NULL;
+}
+
+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
+{
+       struct bch_sb *sb = disk_sb->sb;
+       struct bch_sb_field *f;
+       struct bch_sb_field_members *mi;
+       const char *err;
+       u16 block_size;
+
+       if (le16_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
+           le16_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
+               return "Unsupported superblock version";
+
+       if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
+               SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);
+               SET_BCH_SB_POSIX_ACL(sb, 1);
+       }
+
+       block_size = le16_to_cpu(sb->block_size);
+
+       if (!is_power_of_2(block_size) ||
+           block_size > PAGE_SECTORS)
+               return "Bad block size";
+
+       if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid)))
+               return "Bad user UUID";
+
+       if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid)))
+               return "Bad internal UUID";
+
+       if (!sb->nr_devices ||
+           sb->nr_devices <= sb->dev_idx ||
+           sb->nr_devices > BCH_SB_MEMBERS_MAX)
+               return "Bad number of member devices";
+
+       if (!BCH_SB_META_REPLICAS_WANT(sb) ||
+           BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+               return "Invalid number of metadata replicas";
+
+       if (!BCH_SB_META_REPLICAS_REQ(sb) ||
+           BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+               return "Invalid number of metadata replicas";
+
+       if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
+           BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+               return "Invalid number of data replicas";
+
+       if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
+           BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+               return "Invalid number of data replicas";
+
+       if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+               return "Invalid metadata checksum type";
+
+       if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+               return "Invalid data checksum type";
+
+       if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
+               return "Invalid compression type";
+
+       if (!BCH_SB_BTREE_NODE_SIZE(sb))
+               return "Btree node size not set";
+
+       if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
+               return "Btree node size not a power of two";
+
+       if (BCH_SB_GC_RESERVE(sb) < 5)
+               return "gc reserve percentage too small";
+
+       if (!sb->time_precision ||
+           le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
+               return "invalid time precision";
+
+       /* validate layout */
+       err = validate_sb_layout(&sb->layout);
+       if (err)
+               return err;
+
+       vstruct_for_each(sb, f) {
+               if (!f->u64s)
+                       return "Invalid superblock: invalid optional field";
+
+               if (vstruct_next(f) > vstruct_last(sb))
+                       return "Invalid superblock: invalid optional field";
+       }
+
+       /* members must be validated first: */
+       mi = bch2_sb_get_members(sb);
+       if (!mi)
+               return "Invalid superblock: member info area missing";
+
+       err = bch2_sb_field_validate(sb, &mi->field);
+       if (err)
+               return err;
+
+       vstruct_for_each(sb, f) {
+               if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
+                       continue;
+
+               err = bch2_sb_field_validate(sb, f);
+               if (err)
+                       return err;
+       }
+
+       if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
+           bch2_sb_get_crypt(sb) &&
+           BCH_SB_INITIALIZED(sb))
+               return "Incompatible extent nonces";
+
+       sb->version = cpu_to_le16(BCH_SB_VERSION_MAX);
+
+       return NULL;
+}
+
+/* device open: */
+
+static void bch2_sb_update(struct bch_fs *c)
+{
+       struct bch_sb *src = c->disk_sb.sb;
+       struct bch_sb_field_members *mi = bch2_sb_get_members(src);
+       struct bch_dev *ca;
+       unsigned i;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       c->sb.uuid              = src->uuid;
+       c->sb.user_uuid         = src->user_uuid;
+       c->sb.nr_devices        = src->nr_devices;
+       c->sb.clean             = BCH_SB_CLEAN(src);
+       c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
+       c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
+       c->sb.time_base_lo      = le64_to_cpu(src->time_base_lo);
+       c->sb.time_base_hi      = le32_to_cpu(src->time_base_hi);
+       c->sb.time_precision    = le32_to_cpu(src->time_precision);
+       c->sb.features          = le64_to_cpu(src->features[0]);
+
+       for_each_member_device(ca, c, i)
+               ca->mi = bch2_mi_to_cpu(mi->members + i);
+}
+
+/* doesn't copy member info */
+static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
+{
+       struct bch_sb_field *src_f, *dst_f;
+       struct bch_sb *dst = dst_handle->sb;
+
+       dst->version            = src->version;
+       dst->seq                = src->seq;
+       dst->uuid               = src->uuid;
+       dst->user_uuid          = src->user_uuid;
+       memcpy(dst->label,      src->label, sizeof(dst->label));
+
+       dst->block_size         = src->block_size;
+       dst->nr_devices         = src->nr_devices;
+
+       dst->time_base_lo       = src->time_base_lo;
+       dst->time_base_hi       = src->time_base_hi;
+       dst->time_precision     = src->time_precision;
+
+       memcpy(dst->flags,      src->flags,     sizeof(dst->flags));
+       memcpy(dst->features,   src->features,  sizeof(dst->features));
+       memcpy(dst->compat,     src->compat,    sizeof(dst->compat));
+
+       vstruct_for_each(src, src_f) {
+               if (src_f->type == BCH_SB_FIELD_journal)
+                       continue;
+
+               dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
+               dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+                                              le32_to_cpu(src_f->u64s));
+
+               memcpy(dst_f, src_f, vstruct_bytes(src_f));
+       }
+}
+
+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
+{
+       struct bch_sb_field_journal *journal_buckets =
+               bch2_sb_get_journal(src);
+       unsigned journal_u64s = journal_buckets
+               ? le32_to_cpu(journal_buckets->field.u64s)
+               : 0;
+       int ret;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       ret = bch2_sb_realloc(&c->disk_sb,
+                             le32_to_cpu(src->u64s) - journal_u64s);
+       if (ret)
+               return ret;
+
+       __copy_super(&c->disk_sb, src);
+
+       ret = bch2_sb_replicas_to_cpu_replicas(c);
+       if (ret)
+               return ret;
+
+       ret = bch2_sb_disk_groups_to_cpu(c);
+       if (ret)
+               return ret;
+
+       bch2_sb_update(c);
+       return 0;
+}
+
+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
+       struct bch_sb_field_journal *journal_buckets =
+               bch2_sb_get_journal(dst);
+       unsigned journal_u64s = journal_buckets
+               ? le32_to_cpu(journal_buckets->field.u64s)
+               : 0;
+       unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
+       int ret;
+
+       ret = bch2_sb_realloc(&ca->disk_sb, u64s);
+       if (ret)
+               return ret;
+
+       __copy_super(&ca->disk_sb, src);
+       return 0;
+}
+
+/* read superblock: */
+
+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
+{
+       struct bch_csum csum;
+       size_t bytes;
+reread:
+       bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+       sb->bio->bi_iter.bi_sector = offset;
+       sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
+       bch2_bio_map(sb->bio, sb->sb);
+
+       if (submit_bio_wait(sb->bio))
+               return "IO error";
+
+       if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
+           !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC))
+               return "Not a bcachefs superblock";
+
+       if (le16_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
+           le16_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
+               return "Unsupported superblock version";
+
+       bytes = vstruct_bytes(sb->sb);
+
+       if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
+               return "Bad superblock: too big";
+
+       if (get_order(bytes) > sb->page_order) {
+               if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
+                       return "cannot allocate memory";
+               goto reread;
+       }
+
+       if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
+               return "unknown csum type";
+
+       /* XXX: verify MACs */
+       csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
+                           null_nonce(), sb->sb);
+
+       if (bch2_crc_cmp(csum, sb->sb->csum))
+               return "bad checksum reading superblock";
+
+       return NULL;
+}
+
+int bch2_read_super(const char *path, struct bch_opts *opts,
+                   struct bch_sb_handle *sb)
+{
+       u64 offset = opt_get(*opts, sb);
+       struct bch_sb_layout layout;
+       const char *err;
+       __le64 *i;
+       int ret;
+
+       pr_verbose_init(*opts, "");
+
+       memset(sb, 0, sizeof(*sb));
+       sb->mode        = BLK_OPEN_READ;
+       sb->have_bio    = true;
+       sb->holder      = kmalloc(1, GFP_KERNEL);
+       if (!sb->holder)
+               return -ENOMEM;
+
+       if (!opt_get(*opts, noexcl))
+               sb->mode |= BLK_OPEN_EXCL;
+
+       if (!opt_get(*opts, nochanges))
+               sb->mode |= BLK_OPEN_WRITE;
+
+       sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+       if (IS_ERR(sb->bdev) &&
+           PTR_ERR(sb->bdev) == -EACCES &&
+           opt_get(*opts, read_only)) {
+               sb->mode &= ~BLK_OPEN_WRITE;
+
+               sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+               if (!IS_ERR(sb->bdev))
+                       opt_set(*opts, nochanges, true);
+       }
+
+       if (IS_ERR(sb->bdev)) {
+               ret = PTR_ERR(sb->bdev);
+               goto out;
+       }
+
+       err = "cannot allocate memory";
+       ret = bch2_sb_realloc(sb, 0);
+       if (ret)
+               goto err;
+
+       ret = -EFAULT;
+       err = "dynamic fault";
+       if (bch2_fs_init_fault("read_super"))
+               goto err;
+
+       ret = -EINVAL;
+       err = read_one_super(sb, offset);
+       if (!err)
+               goto got_super;
+
+       if (opt_defined(*opts, sb))
+               goto err;
+
+       pr_err("error reading default superblock: %s", err);
+
+       /*
+        * Error reading primary superblock - read location of backup
+        * superblocks:
+        */
+       bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+       sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+       sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
+       /*
+        * Use the sb buffer to read the layout, since the sb buffer is page
+        * aligned but the layout struct on the stack won't be:
+        */
+       bch2_bio_map(sb->bio, sb->sb);
+
+       err = "IO error";
+       if (submit_bio_wait(sb->bio))
+               goto err;
+
+       memcpy(&layout, sb->sb, sizeof(layout));
+       err = validate_sb_layout(&layout);
+       if (err)
+               goto err;
+
+       for (i = layout.sb_offset;
+            i < layout.sb_offset + layout.nr_superblocks; i++) {
+               offset = le64_to_cpu(*i);
+
+               if (offset == opt_get(*opts, sb))
+                       continue;
+
+               err = read_one_super(sb, offset);
+               if (!err)
+                       goto got_super;
+       }
+
+       ret = -EINVAL;
+       goto err;
+
+got_super:
+       err = "Superblock block size smaller than device block size";
+       ret = -EINVAL;
+       if (le16_to_cpu(sb->sb->block_size) << 9 <
+           bdev_logical_block_size(sb->bdev))
+               goto err;
+
+       ret = 0;
+       sb->have_layout = true;
+out:
+       pr_verbose_init(*opts, "ret %i", ret);
+       return ret;
+err:
+       bch2_free_super(sb);
+       pr_err("error reading superblock: %s", err);
+       goto out;
+}
+
+/* write superblock: */
+
+static void write_super_endio(struct bio *bio)
+{
+       struct bch_dev *ca = bio->bi_private;
+
+       /* XXX: return errors directly */
+
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
+               ca->sb_write_error = 1;
+
+       closure_put(&ca->fs->sb_write);
+       percpu_ref_put(&ca->io_ref);
+}
+
+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
+{
+       struct bch_sb *sb = ca->disk_sb.sb;
+       struct bio *bio = ca->disk_sb.bio;
+
+       sb->offset = sb->layout.sb_offset[idx];
+
+       SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+       sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
+                               null_nonce(), sb);
+
+       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+       bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
+       bio->bi_iter.bi_size    =
+               roundup((size_t) vstruct_bytes(sb),
+                       bdev_logical_block_size(ca->disk_sb.bdev));
+       bio->bi_end_io          = write_super_endio;
+       bio->bi_private         = ca;
+       bch2_bio_map(bio, sb);
+
+       this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
+                    bio_sectors(bio));
+
+       percpu_ref_get(&ca->io_ref);
+       closure_bio_submit(bio, &c->sb_write);
+}
+
+void bch2_write_super(struct bch_fs *c)
+{
+       struct closure *cl = &c->sb_write;
+       struct bch_dev *ca;
+       unsigned i, sb = 0, nr_wrote;
+       const char *err;
+       struct bch_devs_mask sb_written;
+       bool wrote, can_mount_without_written, can_mount_with_written;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       closure_init_stack(cl);
+       memset(&sb_written, 0, sizeof(sb_written));
+
+       le64_add_cpu(&c->disk_sb.sb->seq, 1);
+
+       for_each_online_member(ca, c, i)
+               bch2_sb_from_fs(c, ca);
+
+       for_each_online_member(ca, c, i) {
+               err = bch2_sb_validate(&ca->disk_sb);
+               if (err) {
+                       bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
+                       goto out;
+               }
+       }
+
+       if (c->opts.nochanges ||
+           test_bit(BCH_FS_ERROR, &c->flags))
+               goto out;
+
+       for_each_online_member(ca, c, i) {
+               __set_bit(ca->dev_idx, sb_written.d);
+               ca->sb_write_error = 0;
+       }
+
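+       /*
+        * Write superblock copy 0 to every online device, wait for those
+        * writes to complete, then copy 1, and so on, until every device has
+        * had all of its superblock copies written:
+        */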
+       do {
+               wrote = false;
+               for_each_online_member(ca, c, i)
+                       if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
+                               write_one_super(c, ca, sb);
+                               wrote = true;
+                       }
+               closure_sync(cl);
+               sb++;
+       } while (wrote);
+
+       for_each_online_member(ca, c, i)
+               if (ca->sb_write_error)
+                       __clear_bit(ca->dev_idx, sb_written.d);
+
+       nr_wrote = dev_mask_nr(&sb_written);
+
+       can_mount_with_written =
+               bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+                                     BCH_FORCE_IF_DEGRADED);
+
+       for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+               sb_written.d[i] = ~sb_written.d[i];
+
+       can_mount_without_written =
+               bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+                                     BCH_FORCE_IF_DEGRADED);
+
+       /*
+        * If we would be able to mount _without_ the devices we successfully
+        * wrote superblocks to, we weren't able to write to enough devices:
+        *
+        * Exception: if we can mount without the successes because we haven't
+        * written anything (new filesystem), we continue if we'd be able to
+        * mount with the devices we did successfully write to:
+        */
+       bch2_fs_fatal_err_on(!nr_wrote ||
+                            (can_mount_without_written &&
+                             !can_mount_with_written), c,
+               "Unable to write superblock to sufficient devices");
+out:
+       /* Make new options visible after they're persistent: */
+       bch2_sb_update(c);
+}
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+       u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
+
+       return l < r ? -1 : l > r ? 1 : 0;
+}
+
+static const char *bch2_sb_validate_journal(struct bch_sb *sb,
+                                           struct bch_sb_field *f)
+{
+       struct bch_sb_field_journal *journal = field_to_type(f, journal);
+       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+       const char *err;
+       unsigned nr;
+       unsigned i;
+       u64 *b;
+
+       if (!journal)
+               return NULL;
+
+       nr = bch2_nr_journal_buckets(journal);
+       if (!nr)
+               return NULL;
+
+       b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+       if (!b)
+               return "cannot allocate memory";
+
+       for (i = 0; i < nr; i++)
+               b[i] = le64_to_cpu(journal->buckets[i]);
+
+       sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+       err = "journal bucket at sector 0";
+       if (!b[0])
+               goto err;
+
+       err = "journal bucket before first bucket";
+       if (m && b[0] < le16_to_cpu(m->first_bucket))
+               goto err;
+
+       err = "journal bucket past end of device";
+       if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
+               goto err;
+
+       err = "duplicate journal buckets";
+       for (i = 0; i + 1 < nr; i++)
+               if (b[i] == b[i + 1])
+                       goto err;
+
+       err = NULL;
+err:
+       kfree(b);
+       return err;
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+       .validate       = bch2_sb_validate_journal,
+};
+
+/* BCH_SB_FIELD_members: */
+
+static const char *bch2_sb_validate_members(struct bch_sb *sb,
+                                           struct bch_sb_field *f)
+{
+       struct bch_sb_field_members *mi = field_to_type(f, members);
+       struct bch_member *m;
+
+       if ((void *) (mi->members + sb->nr_devices) >
+           vstruct_end(&mi->field))
+               return "Invalid superblock: bad member info";
+
+       for (m = mi->members;
+            m < mi->members + sb->nr_devices;
+            m++) {
+               if (!bch2_member_exists(m))
+                       continue;
+
+               if (le64_to_cpu(m->nbuckets) > LONG_MAX)
+                       return "Too many buckets";
+
+               if (le64_to_cpu(m->nbuckets) -
+                   le16_to_cpu(m->first_bucket) < 1 << 10)
+                       return "Not enough buckets";
+
+               if (le16_to_cpu(m->bucket_size) <
+                   le16_to_cpu(sb->block_size))
+                       return "bucket size smaller than block size";
+
+               if (le16_to_cpu(m->bucket_size) <
+                   BCH_SB_BTREE_NODE_SIZE(sb))
+                       return "bucket size smaller than btree node size";
+       }
+
+       if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
+               for (m = mi->members;
+                    m < mi->members + sb->nr_devices;
+                    m++)
+                       SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);
+
+       return NULL;
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_members = {
+       .validate       = bch2_sb_validate_members,
+};
+
+/* BCH_SB_FIELD_crypt: */
+
+static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
+                                         struct bch_sb_field *f)
+{
+       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+       if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
+               return "invalid field crypt: wrong size";
+
+       if (BCH_CRYPT_KDF_TYPE(crypt))
+               return "invalid field crypt: bad kdf type";
+
+       return NULL;
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+       .validate       = bch2_sb_validate_crypt,
+};
+
+/* BCH_SB_FIELD_clean: */
+
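+/*
+ * Record the filesystem's clean/dirty state in the superblock; when marking
+ * clean, the current btree roots, bucket clock hands and journal sequence
+ * number are also written into the clean section.
+ */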
+void bch2_fs_mark_clean(struct bch_fs *c, bool clean)
+{
+       struct bch_sb_field_clean *sb_clean;
+       unsigned u64s = sizeof(*sb_clean) / sizeof(u64);
+       struct jset_entry *entry;
+       struct btree_root *r;
+
+       mutex_lock(&c->sb_lock);
+       if (clean == BCH_SB_CLEAN(c->disk_sb.sb))
+               goto out;
+
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, clean);
+
+       if (!clean)
+               goto write_super;
+
+       mutex_lock(&c->btree_root_lock);
+
+       for (r = c->btree_roots;
+            r < c->btree_roots + BTREE_ID_NR;
+            r++)
+               if (r->alive)
+                       u64s += jset_u64s(r->key.u64s);
+
+       sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+       if (!sb_clean) {
+               bch_err(c, "error resizing superblock while setting filesystem clean");
+               goto out;
+       }
+
+       sb_clean->flags         = 0;
+       sb_clean->read_clock    = cpu_to_le16(c->bucket_clock[READ].hand);
+       sb_clean->write_clock   = cpu_to_le16(c->bucket_clock[WRITE].hand);
+       sb_clean->journal_seq   = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+
+       entry = sb_clean->start;
+       memset(entry, 0,
+              vstruct_end(&sb_clean->field) - (void *) entry);
+
+       for (r = c->btree_roots;
+            r < c->btree_roots + BTREE_ID_NR;
+            r++)
+               if (r->alive) {
+                       entry->u64s     = r->key.u64s;
+                       entry->btree_id = r - c->btree_roots;
+                       entry->level    = r->level;
+                       entry->type     = BCH_JSET_ENTRY_btree_root;
+                       bkey_copy(&entry->start[0], &r->key);
+                       entry = vstruct_next(entry);
+                       BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+               }
+
+       BUG_ON(entry != vstruct_end(&sb_clean->field));
+
+       mutex_unlock(&c->btree_root_lock);
+write_super:
+       bch2_write_super(c);
+out:
+       mutex_unlock(&c->sb_lock);
+}
+
+static const char *bch2_sb_validate_clean(struct bch_sb *sb,
+                                         struct bch_sb_field *f)
+{
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+       if (vstruct_bytes(&clean->field) < sizeof(*clean))
+               return "invalid field clean: wrong size";
+
+       return NULL;
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+       .validate       = bch2_sb_validate_clean,
+};
+
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr)                                       \
+       [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+       BCH_SB_FIELDS()
+#undef x
+};
+
+static const char *bch2_sb_field_validate(struct bch_sb *sb,
+                                         struct bch_sb_field *f)
+{
+       unsigned type = le32_to_cpu(f->type);
+
+       return type < BCH_SB_FIELD_NR
+               ? bch2_sb_field_ops[type]->validate(sb, f)
+               : NULL;
+}
+
+size_t bch2_sb_field_to_text(char *buf, size_t size,
+                            struct bch_sb *sb, struct bch_sb_field *f)
+{
+       unsigned type = le32_to_cpu(f->type);
+       size_t (*to_text)(char *, size_t, struct bch_sb *,
+                                  struct bch_sb_field *) =
+               type < BCH_SB_FIELD_NR
+               ? bch2_sb_field_ops[type]->to_text
+               : NULL;
+
+       if (!to_text) {
+               if (size)
+                       buf[0] = '\0';
+               return 0;
+       }
+
+       return to_text(buf, size, sb, f);
+}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
new file mode 100644 (file)
index 0000000..1ea91f7
--- /dev/null
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_IO_H
+#define _BCACHEFS_SUPER_IO_H
+
+#include "extents.h"
+#include "eytzinger.h"
+#include "super_types.h"
+#include "super.h"
+
+#include <asm/byteorder.h>
+
+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
+                                         enum bch_sb_field_type, unsigned);
+
+#define field_to_type(_f, _name)                                       \
+       container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+#define x(_name, _nr)                                                  \
+static inline struct bch_sb_field_##_name *                            \
+bch2_sb_get_##_name(struct bch_sb *sb)                                 \
+{                                                                      \
+       return field_to_type(bch2_sb_field_get(sb,                      \
+                               BCH_SB_FIELD_##_name), _name);          \
+}                                                                      \
+                                                                       \
+static inline struct bch_sb_field_##_name *                            \
+bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s)        \
+{                                                                      \
+       return field_to_type(bch2_sb_field_resize(sb,                   \
+                               BCH_SB_FIELD_##_name, u64s), _name);    \
+}
+
+BCH_SB_FIELDS()
+#undef x
+
+extern const char * const bch2_sb_fields[];
+
+struct bch_sb_field_ops {
+       const char *    (*validate)(struct bch_sb *, struct bch_sb_field *);
+       size_t          (*to_text)(char *, size_t, struct bch_sb *,
+                                  struct bch_sb_field *);
+};
+
+static inline bool bch2_sb_test_feature(struct bch_sb *sb,
+                                       enum bch_sb_features f)
+{
+       unsigned w = f / 64;
+       unsigned b = f % 64;
+
+       return le64_to_cpu(sb->features[w]) & (1ULL << b);
+}
+
+static inline void bch2_sb_set_feature(struct bch_sb *sb,
+                                      enum bch_sb_features f)
+{
+       if (!bch2_sb_test_feature(sb, f)) {
+               unsigned w = f / 64;
+               unsigned b = f % 64;
+
+               le64_add_cpu(&sb->features[w], 1ULL << b);
+       }
+}
+
+static inline __le64 bch2_sb_magic(struct bch_fs *c)
+{
+       __le64 ret;
+       memcpy(&ret, &c->sb.uuid, sizeof(ret));
+       return ret;
+}
+
+static inline __u64 jset_magic(struct bch_fs *c)
+{
+       return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
+}
+
+static inline __u64 bset_magic(struct bch_fs *c)
+{
+       return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
+}
+
+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
+
+void bch2_free_super(struct bch_sb_handle *);
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+
+const char *bch2_sb_validate(struct bch_sb_handle *);
+
+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+void bch2_write_super(struct bch_fs *);
+
+/* BCH_SB_FIELD_journal: */
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+       return j
+               ? (__le64 *) vstruct_end(&j->field) - j->buckets
+               : 0;
+}
+
+/* BCH_SB_FIELD_members: */
+
+static inline bool bch2_member_exists(struct bch_member *m)
+{
+       return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+}
+
+static inline bool bch2_dev_exists(struct bch_sb *sb,
+                                  struct bch_sb_field_members *mi,
+                                  unsigned dev)
+{
+       return dev < sb->nr_devices &&
+               bch2_member_exists(&mi->members[dev]);
+}
+
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+       return (struct bch_member_cpu) {
+               .nbuckets       = le64_to_cpu(mi->nbuckets),
+               .first_bucket   = le16_to_cpu(mi->first_bucket),
+               .bucket_size    = le16_to_cpu(mi->bucket_size),
+               .group          = BCH_MEMBER_GROUP(mi),
+               .state          = BCH_MEMBER_STATE(mi),
+               .replacement    = BCH_MEMBER_REPLACEMENT(mi),
+               .discard        = BCH_MEMBER_DISCARD(mi),
+               .data_allowed   = BCH_MEMBER_DATA_ALLOWED(mi),
+               .durability     = BCH_MEMBER_DURABILITY(mi)
+                       ? BCH_MEMBER_DURABILITY(mi) - 1
+                       : 1,
+               .valid          = bch2_member_exists(mi),
+       };
+}
+
+/* BCH_SB_FIELD_clean: */
+
+void bch2_fs_mark_clean(struct bch_fs *, bool);
+
+size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
+                            struct bch_sb_field *);
+
+#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
new file mode 100644 (file)
index 0000000..3191d4c
--- /dev/null
@@ -0,0 +1,1754 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "chardev.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "move.h"
+#include "migrate.h"
+#include "movinggc.h"
+#include "quota.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "sysfs.h"
+#include "trace.h"
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/sysfs.h>
+#include <crypto/hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+#define KTYPE(type)                                                    \
+static const struct attribute_group type ## _group = {                 \
+       .attrs = type ## _files                                         \
+};                                                                     \
+                                                                       \
+static const struct attribute_group *type ## _groups[] = {             \
+       &type ## _group,                                                \
+       NULL                                                            \
+};                                                                     \
+                                                                       \
+static const struct kobj_type type ## _ktype = {                       \
+       .release        = type ## _release,                             \
+       .sysfs_ops      = &type ## _sysfs_ops,                          \
+       .default_groups = type ## _groups                               \
+}
+
+static void bch2_fs_release(struct kobject *);
+static void bch2_dev_release(struct kobject *);
+
+static void bch2_fs_internal_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_opts_dir_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_time_stats_release(struct kobject *k)
+{
+}
+
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
+
+static struct kset *bcachefs_kset;
+static LIST_HEAD(bch_fs_list);
+static DEFINE_MUTEX(bch_fs_list_lock);
+
+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
+
+static void bch2_dev_free(struct bch_dev *);
+static int bch2_dev_alloc(struct bch_fs *, unsigned);
+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
+
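+/*
+ * Look up the filesystem that owns block device @dev; on success a ref is
+ * taken on the filesystem's closure, which the caller must drop with
+ * closure_put():
+ */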
+struct bch_fs *bch2_dev_to_fs(dev_t dev)
+{
+       struct bch_fs *c;
+       struct bch_dev *ca;
+       unsigned i;
+
+       mutex_lock(&bch_fs_list_lock);
+       rcu_read_lock();
+
+       list_for_each_entry(c, &bch_fs_list, list)
+               for_each_member_device_rcu(ca, c, i, NULL)
+                       if (ca->disk_sb.bdev->bd_dev == dev) {
+                               closure_get(&c->cl);
+                               goto found;
+                       }
+       c = NULL;
+found:
+       rcu_read_unlock();
+       mutex_unlock(&bch_fs_list_lock);
+
+       return c;
+}
+
+static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
+{
+       struct bch_fs *c;
+
+       lockdep_assert_held(&bch_fs_list_lock);
+
+       list_for_each_entry(c, &bch_fs_list, list)
+               if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
+                       return c;
+
+       return NULL;
+}
+
+struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
+{
+       struct bch_fs *c;
+
+       mutex_lock(&bch_fs_list_lock);
+       c = __bch2_uuid_to_fs(uuid);
+       if (c)
+               closure_get(&c->cl);
+       mutex_unlock(&bch_fs_list_lock);
+
+       return c;
+}
+
+/* Filesystem RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and rebalance (to free up space)
+ *
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
+ *   don't because they either reserve ahead of time or don't block if
+ *   allocations fail, but allocations can require mark and sweep gc to run
+ *   because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
+
+static void __bch2_fs_read_only(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       bch2_rebalance_stop(c);
+
+       for_each_member_device(ca, c, i)
+               bch2_copygc_stop(ca);
+
+       bch2_gc_thread_stop(c);
+
+       /*
+        * Flush journal before stopping allocators, because flushing journal
+        * blacklist entries involves allocating new btree nodes:
+        */
+       bch2_journal_flush_all_pins(&c->journal);
+
+       for_each_member_device(ca, c, i)
+               bch2_dev_allocator_stop(ca);
+
+       bch2_journal_flush_all_pins(&c->journal);
+
+       /*
+        * We need to explicitly wait on btree interior updates to complete
+        * before stopping the journal; flushing all journal pins isn't
+        * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
+        * interior updates have to drop their journal pin before they're
+        * fully complete:
+        */
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
+
+       bch2_fs_journal_stop(&c->journal);
+
+       /*
+        * The journal kicks off btree writes via reclaim - wait for any
+        * in-flight writes after stopping the journal:
+        */
+       if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+               bch2_btree_flush_all_writes(c);
+       else
+               bch2_btree_verify_flushed(c);
+
+       /*
+        * Now that the journal is stopped, remove devices from the allocator:
+        */
+       for_each_member_device(ca, c, i)
+               bch2_dev_allocator_remove(c, ca);
+}
+
+static void bch2_writes_disabled(struct percpu_ref *writes)
+{
+       struct bch_fs *c = container_of(writes, struct bch_fs, writes);
+
+       set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+       wake_up(&bch_read_only_wait);
+}
+
+void bch2_fs_read_only(struct bch_fs *c)
+{
+       if (c->state == BCH_FS_RO)
+               return;
+
+       BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+       /*
+        * Block new foreground-end write operations from starting - any new
+        * writes will return -EROFS:
+        *
+        * (This is really blocking new _allocations_; writes to previously
+        * allocated space can still happen until the allocator is stopped in
+        * bch2_dev_allocator_stop()).
+        */
+       percpu_ref_kill(&c->writes);
+
+       cancel_delayed_work(&c->pd_controllers_update);
+
+       /*
+        * If we're not doing an emergency shutdown, we want to wait on
+        * outstanding writes to complete so they don't see spurious errors due
+        * to shutting down the allocator:
+        *
+        * If we are doing an emergency shutdown, outstanding writes may
+        * hang until we shut down the allocator, so we don't want to wait
+        * on them before shutting everything down - but we do need to
+        * wait on them before returning and signalling that going RO is
+        * complete:
+        */
+       wait_event(bch_read_only_wait,
+                  test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+                  test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+
+       __bch2_fs_read_only(c);
+
+       wait_event(bch_read_only_wait,
+                  test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+       clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+
+       if (!bch2_journal_error(&c->journal) &&
+           !test_bit(BCH_FS_ERROR, &c->flags) &&
+           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+               bch2_fs_mark_clean(c, true);
+
+       if (c->state != BCH_FS_STOPPING)
+               c->state = BCH_FS_RO;
+}
+
+static void bch2_fs_read_only_work(struct work_struct *work)
+{
+       struct bch_fs *c =
+               container_of(work, struct bch_fs, read_only_work);
+
+       mutex_lock(&c->state_lock);
+       bch2_fs_read_only(c);
+       mutex_unlock(&c->state_lock);
+}
+
+static void bch2_fs_read_only_async(struct bch_fs *c)
+{
+       queue_work(system_long_wq, &c->read_only_work);
+}
+
+bool bch2_fs_emergency_read_only(struct bch_fs *c)
+{
+       bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+
+       bch2_fs_read_only_async(c);
+       bch2_journal_halt(&c->journal);
+
+       wake_up(&bch_read_only_wait);
+       return ret;
+}
+
+const char *bch2_fs_read_write(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       const char *err = NULL;
+       unsigned i;
+
+       if (c->state == BCH_FS_RW)
+               return NULL;
+
+       bch2_fs_mark_clean(c, false);
+
+       for_each_rw_member(ca, c, i)
+               bch2_dev_allocator_add(c, ca);
+       bch2_recalc_capacity(c);
+
+       err = "error starting allocator thread";
+       for_each_rw_member(ca, c, i)
+               if (bch2_dev_allocator_start(ca)) {
+                       percpu_ref_put(&ca->io_ref);
+                       goto err;
+               }
+
+       err = "error starting btree GC thread";
+       if (bch2_gc_thread_start(c))
+               goto err;
+
+       err = "error starting copygc thread";
+       for_each_rw_member(ca, c, i)
+               if (bch2_copygc_start(c, ca)) {
+                       percpu_ref_put(&ca->io_ref);
+                       goto err;
+               }
+
+       err = "error starting rebalance thread";
+       if (bch2_rebalance_start(c))
+               goto err;
+
+       schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
+
+       if (c->state != BCH_FS_STARTING)
+               percpu_ref_reinit(&c->writes);
+
+       c->state = BCH_FS_RW;
+       return NULL;
+err:
+       __bch2_fs_read_only(c);
+       return err;
+}
+
+/* Filesystem startup/shutdown: */
+
+static void bch2_fs_free(struct bch_fs *c)
+{
+       unsigned i;
+
+       for (i = 0; i < BCH_TIME_STAT_NR; i++)
+               bch2_time_stats_exit(&c->times[i]);
+
+       bch2_fs_quota_exit(c);
+       bch2_fs_fsio_exit(c);
+       bch2_fs_encryption_exit(c);
+       bch2_fs_io_exit(c);
+       bch2_fs_btree_cache_exit(c);
+       bch2_fs_journal_exit(&c->journal);
+       bch2_io_clock_exit(&c->io_clock[WRITE]);
+       bch2_io_clock_exit(&c->io_clock[READ]);
+       bch2_fs_compress_exit(c);
+       percpu_free_rwsem(&c->usage_lock);
+       free_percpu(c->usage_percpu);
+       mempool_exit(&c->btree_bounce_pool);
+       bioset_exit(&c->btree_bio);
+       mempool_exit(&c->btree_interior_update_pool);
+       mempool_exit(&c->btree_reserve_pool);
+       mempool_exit(&c->fill_iter);
+       percpu_ref_exit(&c->writes);
+       kfree(rcu_dereference_protected(c->replicas, 1));
+       kfree(rcu_dereference_protected(c->disk_groups, 1));
+
+       if (c->copygc_wq)
+               destroy_workqueue(c->copygc_wq);
+       if (c->wq)
+               destroy_workqueue(c->wq);
+
+       free_pages((unsigned long) c->disk_sb.sb,
+                  c->disk_sb.page_order);
+       kvpfree(c, sizeof(*c));
+       module_put(THIS_MODULE);
+}
+
+static void bch2_fs_release(struct kobject *kobj)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+       bch2_fs_free(c);
+}
+
+void bch2_fs_stop(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i)
+               if (ca->kobj.state_in_sysfs &&
+                   ca->disk_sb.bdev)
+                       sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+       if (c->kobj.state_in_sysfs)
+               kobject_del(&c->kobj);
+
+       bch2_fs_debug_exit(c);
+       bch2_fs_chardev_exit(c);
+
+       kobject_put(&c->time_stats);
+       kobject_put(&c->opts_dir);
+       kobject_put(&c->internal);
+
+       mutex_lock(&bch_fs_list_lock);
+       list_del(&c->list);
+       mutex_unlock(&bch_fs_list_lock);
+
+       closure_sync(&c->cl);
+       closure_debug_destroy(&c->cl);
+
+       mutex_lock(&c->state_lock);
+       bch2_fs_read_only(c);
+       mutex_unlock(&c->state_lock);
+
+       /* btree prefetch might have kicked off reads in the background: */
+       bch2_btree_flush_all_reads(c);
+
+       for_each_member_device(ca, c, i)
+               cancel_work_sync(&ca->io_error_work);
+
+       cancel_work_sync(&c->btree_write_error_work);
+       cancel_delayed_work_sync(&c->pd_controllers_update);
+       cancel_work_sync(&c->read_only_work);
+
+       for (i = 0; i < c->sb.nr_devices; i++)
+               if (c->devs[i])
+                       bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
+
+       kobject_put(&c->kobj);
+}
+
+static const char *bch2_fs_online(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       const char *err = NULL;
+       unsigned i;
+       int ret;
+
+       lockdep_assert_held(&bch_fs_list_lock);
+
+       if (!list_empty(&c->list))
+               return NULL;
+
+       if (__bch2_uuid_to_fs(c->sb.uuid))
+               return "filesystem UUID already open";
+
+       ret = bch2_fs_chardev_init(c);
+       if (ret)
+               return "error creating character device";
+
+       bch2_fs_debug_init(c);
+
+       if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
+           kobject_add(&c->internal, &c->kobj, "internal") ||
+           kobject_add(&c->opts_dir, &c->kobj, "options") ||
+           kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
+           bch2_opts_create_sysfs_files(&c->opts_dir))
+               return "error creating sysfs objects";
+
+       mutex_lock(&c->state_lock);
+
+       err = "error creating sysfs objects";
+       __for_each_member_device(ca, c, i, NULL)
+               if (bch2_dev_sysfs_online(c, ca))
+                       goto err;
+
+       list_add(&c->list, &bch_fs_list);
+       err = NULL;
+err:
+       mutex_unlock(&c->state_lock);
+       return err;
+}
+
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_fs *c;
+       unsigned i, iter_size;
+       const char *err;
+
+       pr_verbose_init(opts, "");
+
+       c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+       if (!c)
+               goto out;
+
+       __module_get(THIS_MODULE);
+
+       c->minor                = -1;
+       c->disk_sb.fs_sb        = true;
+
+       mutex_init(&c->state_lock);
+       mutex_init(&c->sb_lock);
+       mutex_init(&c->replicas_gc_lock);
+       mutex_init(&c->btree_root_lock);
+       INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+
+       init_rwsem(&c->gc_lock);
+
+       for (i = 0; i < BCH_TIME_STAT_NR; i++)
+               bch2_time_stats_init(&c->times[i]);
+
+       bch2_fs_allocator_init(c);
+       bch2_fs_rebalance_init(c);
+       bch2_fs_quota_init(c);
+
+       INIT_LIST_HEAD(&c->list);
+
+       INIT_LIST_HEAD(&c->btree_interior_update_list);
+       mutex_init(&c->btree_reserve_cache_lock);
+       mutex_init(&c->btree_interior_update_lock);
+
+       mutex_init(&c->bio_bounce_pages_lock);
+
+       bio_list_init(&c->btree_write_error_list);
+       spin_lock_init(&c->btree_write_error_lock);
+       INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
+
+       INIT_LIST_HEAD(&c->fsck_errors);
+       mutex_init(&c->fsck_error_lock);
+
+       seqcount_init(&c->gc_pos_lock);
+
+       c->copy_gc_enabled              = 1;
+       c->rebalance.enabled            = 1;
+       c->promote_whole_extents        = true;
+
+       c->journal.write_time   = &c->times[BCH_TIME_journal_write];
+       c->journal.delay_time   = &c->times[BCH_TIME_journal_delay];
+       c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
+       c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
+
+       bch2_fs_btree_cache_init_early(&c->btree_cache);
+
+       mutex_lock(&c->sb_lock);
+
+       if (bch2_sb_to_fs(c, sb)) {
+               mutex_unlock(&c->sb_lock);
+               goto err;
+       }
+
+       mutex_unlock(&c->sb_lock);
+
+       scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
+
+       c->opts = bch2_opts_default;
+       bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
+       bch2_opts_apply(&c->opts, opts);
+
+       c->block_bits           = ilog2(c->opts.block_size);
+       c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
+
+       c->opts.nochanges       |= c->opts.noreplay;
+       c->opts.read_only       |= c->opts.nochanges;
+
+       if (bch2_fs_init_fault("fs_alloc"))
+               goto err;
+
+       iter_size = sizeof(struct btree_node_iter_large) +
+               (btree_blocks(c) + 1) * 2 *
+               sizeof(struct btree_node_iter_set);
+
+       if (!(c->wq = alloc_workqueue("bcachefs",
+                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+           !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+           percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
+           mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
+                                     sizeof(struct btree_reserve)) ||
+           mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+                                     sizeof(struct btree_update)) ||
+           mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+           bioset_init(&c->btree_bio, 1,
+                       max(offsetof(struct btree_read_bio, bio),
+                           offsetof(struct btree_write_bio, wbio.bio)),
+                       BIOSET_NEED_BVECS) ||
+           !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
+           percpu_init_rwsem(&c->usage_lock) ||
+           mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
+                                       btree_bytes(c)) ||
+           bch2_io_clock_init(&c->io_clock[READ]) ||
+           bch2_io_clock_init(&c->io_clock[WRITE]) ||
+           bch2_fs_journal_init(&c->journal) ||
+           bch2_fs_btree_cache_init(c) ||
+           bch2_fs_io_init(c) ||
+           bch2_fs_encryption_init(c) ||
+           bch2_fs_compress_init(c) ||
+           bch2_fs_fsio_init(c))
+               goto err;
+
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       for (i = 0; i < c->sb.nr_devices; i++)
+               if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
+                   bch2_dev_alloc(c, i))
+                       goto err;
+
+       /*
+        * Now that all allocations have succeeded, init various refcounty
+        * things that let us shutdown:
+        */
+       closure_init(&c->cl, NULL);
+
+       c->kobj.kset = bcachefs_kset;
+       kobject_init(&c->kobj, &bch2_fs_ktype);
+       kobject_init(&c->internal, &bch2_fs_internal_ktype);
+       kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
+       kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+
+       mutex_lock(&bch_fs_list_lock);
+       err = bch2_fs_online(c);
+       mutex_unlock(&bch_fs_list_lock);
+       if (err) {
+               bch_err(c, "bch2_fs_online() error: %s", err);
+               goto err;
+       }
+out:
+       pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
+       return c;
+err:
+       bch2_fs_free(c);
+       c = NULL;
+       goto out;
+}
+
+const char *bch2_fs_start(struct bch_fs *c)
+{
+       const char *err = "cannot allocate memory";
+       struct bch_sb_field_members *mi;
+       struct bch_dev *ca;
+       time64_t now = ktime_get_seconds();
+       unsigned i;
+       int ret = -EINVAL;
+
+       mutex_lock(&c->state_lock);
+
+       BUG_ON(c->state != BCH_FS_STARTING);
+
+       mutex_lock(&c->sb_lock);
+
+       for_each_online_member(ca, c, i)
+               bch2_sb_from_fs(c, ca);
+
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       for_each_online_member(ca, c, i)
+               mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+
+       mutex_unlock(&c->sb_lock);
+
+       for_each_rw_member(ca, c, i)
+               bch2_dev_allocator_add(c, ca);
+       bch2_recalc_capacity(c);
+
+       ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
+               ? bch2_fs_recovery(c)
+               : bch2_fs_initialize(c);
+       if (ret)
+               goto err;
+
+       err = "dynamic fault";
+       if (bch2_fs_init_fault("fs_start"))
+               goto err;
+
+       if (c->opts.read_only) {
+               bch2_fs_read_only(c);
+       } else {
+               err = bch2_fs_read_write(c);
+               if (err)
+                       goto err;
+       }
+
+       set_bit(BCH_FS_STARTED, &c->flags);
+
+       err = NULL;
+out:
+       mutex_unlock(&c->state_lock);
+       return err;
+err:
+       switch (ret) {
+       case BCH_FSCK_ERRORS_NOT_FIXED:
+               bch_err(c, "filesystem contains errors: please report this to the developers");
+               pr_cont("mount with -o fix_errors to repair\n");
+               err = "fsck error";
+               break;
+       case BCH_FSCK_REPAIR_UNIMPLEMENTED:
+               bch_err(c, "filesystem contains errors: please report this to the developers");
+               pr_cont("repair unimplemented: inform the developers so that it can be added\n");
+               err = "fsck error";
+               break;
+       case BCH_FSCK_REPAIR_IMPOSSIBLE:
+               bch_err(c, "filesystem contains errors, but repair impossible");
+               err = "fsck error";
+               break;
+       case BCH_FSCK_UNKNOWN_VERSION:
+               err = "unknown metadata version";
+               break;
+       case -ENOMEM:
+               err = "cannot allocate memory";
+               break;
+       case -EIO:
+               err = "IO error";
+               break;
+       }
+
+       BUG_ON(!err);
+       set_bit(BCH_FS_ERROR, &c->flags);
+       goto out;
+}
+
+static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
+{
+       struct bch_sb_field_members *sb_mi;
+
+       sb_mi = bch2_sb_get_members(sb);
+       if (!sb_mi)
+               return "Invalid superblock: member info area missing";
+
+       if (le16_to_cpu(sb->block_size) != c->opts.block_size)
+               return "mismatched block size";
+
+       if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
+           BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
+               return "new cache bucket size is too small";
+
+       return NULL;
+}
+
+static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+{
+       struct bch_sb *newest =
+               le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+       struct bch_sb_field_members *mi = bch2_sb_get_members(newest);
+
+       if (!uuid_equal(&fs->uuid, &sb->uuid))
+               return "device not a member of filesystem";
+
+       if (!bch2_dev_exists(newest, mi, sb->dev_idx))
+               return "device has been removed";
+
+       if (fs->block_size != sb->block_size)
+               return "mismatched block size";
+
+       return NULL;
+}
+
+/* Device startup/shutdown: */
+
+static void bch2_dev_release(struct kobject *kobj)
+{
+       struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+
+       kfree(ca);
+}
+
+static void bch2_dev_free(struct bch_dev *ca)
+{
+       cancel_work_sync(&ca->io_error_work);
+
+       if (ca->kobj.state_in_sysfs &&
+           ca->disk_sb.bdev)
+               sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+       if (ca->kobj.state_in_sysfs)
+               kobject_del(&ca->kobj);
+
+       bch2_free_super(&ca->disk_sb);
+       bch2_dev_journal_exit(ca);
+
+       free_percpu(ca->io_done);
+       bioset_exit(&ca->replica_set);
+       bch2_dev_buckets_free(ca);
+
+       bch2_time_stats_exit(&ca->io_latency[WRITE]);
+       bch2_time_stats_exit(&ca->io_latency[READ]);
+
+       percpu_ref_exit(&ca->io_ref);
+       percpu_ref_exit(&ca->ref);
+       kobject_put(&ca->kobj);
+}
+
+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
+{
+       lockdep_assert_held(&c->state_lock);
+
+       if (percpu_ref_is_zero(&ca->io_ref))
+               return;
+
+       __bch2_dev_read_only(c, ca);
+
+       reinit_completion(&ca->io_ref_completion);
+       percpu_ref_kill(&ca->io_ref);
+       wait_for_completion(&ca->io_ref_completion);
+
+       if (ca->kobj.state_in_sysfs) {
+               sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+               sysfs_remove_link(&ca->kobj, "block");
+       }
+
+       bch2_free_super(&ca->disk_sb);
+       bch2_dev_journal_exit(ca);
+}
+
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
+{
+       struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
+
+       complete(&ca->ref_completion);
+}
+
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+{
+       struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
+
+       complete(&ca->io_ref_completion);
+}
+
+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
+{
+       int ret;
+
+       if (!c->kobj.state_in_sysfs)
+               return 0;
+
+       if (!ca->kobj.state_in_sysfs) {
+               ret = kobject_add(&ca->kobj, &c->kobj,
+                                 "dev-%u", ca->dev_idx);
+               if (ret)
+                       return ret;
+       }
+
+       if (ca->disk_sb.bdev) {
+               struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
+
+               ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
+               if (ret)
+                       return ret;
+
+               ret = sysfs_create_link(&ca->kobj, block, "block");
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
+                                       struct bch_member *member)
+{
+       struct bch_dev *ca;
+
+       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+       if (!ca)
+               return NULL;
+
+       kobject_init(&ca->kobj, &bch2_dev_ktype);
+       init_completion(&ca->ref_completion);
+       init_completion(&ca->io_ref_completion);
+
+       init_rwsem(&ca->bucket_lock);
+
+       writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
+
+       spin_lock_init(&ca->freelist_lock);
+       bch2_dev_copygc_init(ca);
+
+       INIT_WORK(&ca->io_error_work, bch2_io_error_work);
+
+       bch2_time_stats_init(&ca->io_latency[READ]);
+       bch2_time_stats_init(&ca->io_latency[WRITE]);
+
+       ca->mi = bch2_mi_to_cpu(member);
+       ca->uuid = member->uuid;
+
+       if (opt_defined(c->opts, discard))
+               ca->mi.discard = opt_get(c->opts, discard);
+
+       if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
+                           0, GFP_KERNEL) ||
+           percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+                           PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+           bch2_dev_buckets_alloc(c, ca) ||
+           bioset_init(&ca->replica_set, 4,
+                       offsetof(struct bch_write_bio, bio), 0) ||
+           !(ca->io_done       = alloc_percpu(*ca->io_done)))
+               goto err;
+
+       return ca;
+err:
+       bch2_dev_free(ca);
+       return NULL;
+}
+
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
+                           unsigned dev_idx)
+{
+       ca->dev_idx = dev_idx;
+       __set_bit(ca->dev_idx, ca->self.d);
+       scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
+       ca->fs = c;
+       rcu_assign_pointer(c->devs[ca->dev_idx], ca);
+
+       if (bch2_dev_sysfs_online(c, ca))
+               pr_warn("error creating sysfs objects");
+}
+
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+{
+       struct bch_member *member =
+               bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
+       struct bch_dev *ca = NULL;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
+       if (bch2_fs_init_fault("dev_alloc"))
+               goto err;
+
+       ca = __bch2_dev_alloc(c, member);
+       if (!ca)
+               goto err;
+
+       bch2_dev_attach(c, ca, dev_idx);
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
+err:
+       if (ca)
+               bch2_dev_free(ca);
+       ret = -ENOMEM;
+       goto out;
+}
+
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
+{
+       int ret;
+
+       if (bch2_dev_is_online(ca)) {
+               bch_err(ca, "already have device online in slot %u",
+                       sb->sb->dev_idx);
+               return -EINVAL;
+       }
+
+       if (get_capacity(sb->bdev->bd_disk) <
+           ca->mi.bucket_size * ca->mi.nbuckets) {
+               bch_err(ca, "cannot online: device too small");
+               return -EINVAL;
+       }
+
+       BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
+       ret = bch2_dev_journal_init(ca, sb->sb);
+       if (ret)
+               return ret;
+
+       /* Commit: */
+       ca->disk_sb = *sb;
+       memset(sb, 0, sizeof(*sb));
+
+       if (ca->fs)
+               mutex_lock(&ca->fs->sb_lock);
+
+       bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+
+       if (ca->fs)
+               mutex_unlock(&ca->fs->sb_lock);
+
+       percpu_ref_reinit(&ca->io_ref);
+
+       return 0;
+}
+
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+{
+       struct bch_dev *ca;
+       int ret;
+
+       lockdep_assert_held(&c->state_lock);
+
+       if (le64_to_cpu(sb->sb->seq) >
+           le64_to_cpu(c->disk_sb.sb->seq))
+               bch2_sb_to_fs(c, sb->sb);
+
+       BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+              !c->devs[sb->sb->dev_idx]);
+
+       ca = bch_dev_locked(c, sb->sb->dev_idx);
+
+       ret = __bch2_dev_attach_bdev(ca, sb);
+       if (ret)
+               return ret;
+
+       bch2_dev_sysfs_online(c, ca);
+
+       if (c->sb.nr_devices == 1)
+               snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
+       snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
+
+       rebalance_wakeup(c);
+       return 0;
+}
+
+/* Device management: */
+
+/*
+ * Note: this function is also used by the error paths - when a particular
+ * device sees an error, we call it to determine whether we can just set the
+ * device RO, or - if this function returns false - we'll set the whole
+ * filesystem RO:
+ *
+ * XXX: maybe we should be more explicit about whether we're changing state
+ * because we got an error or what have you?
+ */
+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
+                           enum bch_member_state new_state, int flags)
+{
+       struct bch_devs_mask new_online_devs;
+       struct replicas_status s;
+       struct bch_dev *ca2;
+       int i, nr_rw = 0, required;
+
+       lockdep_assert_held(&c->state_lock);
+
+       switch (new_state) {
+       case BCH_MEMBER_STATE_RW:
+               return true;
+       case BCH_MEMBER_STATE_RO:
+               if (ca->mi.state != BCH_MEMBER_STATE_RW)
+                       return true;
+
+               /* do we have enough devices to write to?  */
+               for_each_member_device(ca2, c, i)
+                       if (ca2 != ca)
+                               nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+
+               required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+                              ? c->opts.metadata_replicas
+                              : c->opts.metadata_replicas_required,
+                              !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+                              ? c->opts.data_replicas
+                              : c->opts.data_replicas_required);
+
+               return nr_rw >= required;
+       case BCH_MEMBER_STATE_FAILED:
+       case BCH_MEMBER_STATE_SPARE:
+               if (ca->mi.state != BCH_MEMBER_STATE_RW &&
+                   ca->mi.state != BCH_MEMBER_STATE_RO)
+                       return true;
+
+               /* do we have enough devices to read from?  */
+               new_online_devs = bch2_online_devs(c);
+               __clear_bit(ca->dev_idx, new_online_devs.d);
+
+               s = __bch2_replicas_status(c, new_online_devs);
+
+               return bch2_have_enough_devs(s, flags);
+       default:
+               BUG();
+       }
+}
+
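+/*
+ * Can we start with the devices currently online?  Unless mounting with
+ * -o degraded, every RW or RO member must be present, and we must have
+ * enough devices for the configured replication levels:
+ */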
+static bool bch2_fs_may_start(struct bch_fs *c)
+{
+       struct replicas_status s;
+       struct bch_sb_field_members *mi;
+       struct bch_dev *ca;
+       unsigned i, flags = c->opts.degraded
+               ? BCH_FORCE_IF_DEGRADED
+               : 0;
+
+       if (!c->opts.degraded) {
+               mutex_lock(&c->sb_lock);
+               mi = bch2_sb_get_members(c->disk_sb.sb);
+
+               for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+                       if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
+                               continue;
+
+                       ca = bch_dev_locked(c, i);
+
+                       if (!bch2_dev_is_online(ca) &&
+                           (ca->mi.state == BCH_MEMBER_STATE_RW ||
+                            ca->mi.state == BCH_MEMBER_STATE_RO)) {
+                               mutex_unlock(&c->sb_lock);
+                               return false;
+                       }
+               }
+               mutex_unlock(&c->sb_lock);
+       }
+
+       s = bch2_replicas_status(c);
+
+       return bch2_have_enough_devs(s, flags);
+}
+
+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
+{
+       bch2_copygc_stop(ca);
+
+       /*
+        * The allocator thread itself allocates btree nodes, so stop it first:
+        */
+       bch2_dev_allocator_stop(ca);
+       bch2_dev_allocator_remove(c, ca);
+       bch2_dev_journal_stop(&c->journal, ca);
+}
+
+static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+{
+       lockdep_assert_held(&c->state_lock);
+
+       BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
+
+       bch2_dev_allocator_add(c, ca);
+       bch2_recalc_capacity(c);
+
+       if (bch2_dev_allocator_start(ca))
+               return "error starting allocator thread";
+
+       if (bch2_copygc_start(c, ca))
+               return "error starting copygc thread";
+
+       return NULL;
+}
+
+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+                        enum bch_member_state new_state, int flags)
+{
+       struct bch_sb_field_members *mi;
+       int ret = 0;
+
+       if (ca->mi.state == new_state)
+               return 0;
+
+       if (!bch2_dev_state_allowed(c, ca, new_state, flags))
+               return -EINVAL;
+
+       if (new_state != BCH_MEMBER_STATE_RW)
+               __bch2_dev_read_only(c, ca);
+
+       bch_notice(ca, "%s", bch2_dev_state[new_state]);
+
+       mutex_lock(&c->sb_lock);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       if (new_state == BCH_MEMBER_STATE_RW &&
+           __bch2_dev_read_write(c, ca))
+               ret = -ENOMEM;
+
+       rebalance_wakeup(c);
+
+       return ret;
+}
+
+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+                      enum bch_member_state new_state, int flags)
+{
+       int ret;
+
+       mutex_lock(&c->state_lock);
+       ret = __bch2_dev_set_state(c, ca, new_state, flags);
+       mutex_unlock(&c->state_lock);
+
+       return ret;
+}
+
+/* Device add/removal: */
+
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+       struct bch_sb_field_members *mi;
+       unsigned dev_idx = ca->dev_idx, data;
+       int ret = -EINVAL;
+
+       mutex_lock(&c->state_lock);
+
+       percpu_ref_put(&ca->ref); /* XXX */
+
+       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+               bch_err(ca, "Cannot remove without losing data");
+               goto err;
+       }
+
+       __bch2_dev_read_only(c, ca);
+
+       /*
+        * XXX: verify that dev_idx is really not in use anymore, anywhere
+        *
+        * flag_data_bad() does not check btree pointers
+        */
+       ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
+       if (ret) {
+               bch_err(ca, "Remove failed: error %i dropping data", ret);
+               goto err;
+       }
+
+       ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+       if (ret) {
+               bch_err(ca, "Remove failed: error %i flushing journal", ret);
+               goto err;
+       }
+
+       data = bch2_dev_has_data(c, ca);
+       if (data) {
+               char data_has_str[100];
+               bch2_scnprint_flag_list(data_has_str,
+                                       sizeof(data_has_str),
+                                       bch2_data_types,
+                                       data);
+               bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+               ret = -EBUSY;
+               goto err;
+       }
+
+       ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+                                     POS(ca->dev_idx, 0),
+                                     POS(ca->dev_idx + 1, 0),
+                                     ZERO_VERSION,
+                                     NULL, NULL, NULL);
+       if (ret) {
+               bch_err(ca, "Remove failed, error deleting alloc info");
+               goto err;
+       }
+
+       /*
+        * We must flush all existing journal entries; they might contain
+        * (overwritten) keys that point to the device we're removing:
+        */
+       bch2_journal_flush_all_pins(&c->journal);
+       ret = bch2_journal_error(&c->journal);
+       if (ret) {
+               bch_err(ca, "Remove failed, journal error");
+               goto err;
+       }
+
+       __bch2_dev_offline(c, ca);
+
+       mutex_lock(&c->sb_lock);
+       rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+       mutex_unlock(&c->sb_lock);
+
+       percpu_ref_kill(&ca->ref);
+       wait_for_completion(&ca->ref_completion);
+
+       bch2_dev_free(ca);
+
+       /*
+        * Free this device's slot in the bch_member array - all pointers to
+        * this device must be gone:
+        */
+       mutex_lock(&c->sb_lock);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+
+       bch2_write_super(c);
+
+       mutex_unlock(&c->sb_lock);
+       mutex_unlock(&c->state_lock);
+       return 0;
+err:
+       if (ca->mi.state == BCH_MEMBER_STATE_RW)
+               __bch2_dev_read_write(c, ca);
+       mutex_unlock(&c->state_lock);
+       return ret;
+}
+
+/* Add new device to running filesystem: */
+int bch2_dev_add(struct bch_fs *c, const char *path)
+{
+       struct bch_opts opts = bch2_opts_empty();
+       struct bch_sb_handle sb;
+       const char *err;
+       struct bch_dev *ca = NULL;
+       struct bch_sb_field_members *mi;
+       struct bch_member dev_mi;
+       unsigned dev_idx, nr_devices, u64s;
+       int ret;
+
+       ret = bch2_read_super(path, &opts, &sb);
+       if (ret)
+               return ret;
+
+       err = bch2_sb_validate(&sb);
+       if (err) {
+               bch2_free_super(&sb);
+               return -EINVAL;
+       }
+
+       dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+
+       err = bch2_dev_may_add(sb.sb, c);
+       if (err) {
+               bch2_free_super(&sb);
+               return -EINVAL;
+       }
+
+       ca = __bch2_dev_alloc(c, &dev_mi);
+       if (!ca) {
+               bch2_free_super(&sb);
+               return -ENOMEM;
+       }
+
+       ret = __bch2_dev_attach_bdev(ca, &sb);
+       if (ret) {
+               bch2_dev_free(ca);
+               return ret;
+       }
+
+       err = "journal alloc failed";
+       ret = bch2_dev_journal_alloc(ca);
+       if (ret)
+               goto err;
+
+       mutex_lock(&c->state_lock);
+       mutex_lock(&c->sb_lock);
+
+       err = "insufficient space in new superblock";
+       ret = bch2_sb_from_fs(c, ca);
+       if (ret)
+               goto err_unlock;
+
+       mi = bch2_sb_get_members(ca->disk_sb.sb);
+
+       if (!bch2_sb_resize_members(&ca->disk_sb,
+                               le32_to_cpu(mi->field.u64s) +
+                               sizeof(dev_mi) / sizeof(u64))) {
+               ret = -ENOSPC;
+               goto err_unlock;
+       }
+
+       if (dynamic_fault("bcachefs:add:no_slot"))
+               goto no_slot;
+
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
+               if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
+                       goto have_slot;
+no_slot:
+       err = "no slots available in superblock";
+       ret = -ENOSPC;
+       goto err_unlock;
+
+have_slot:
+       nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+       u64s = (sizeof(struct bch_sb_field_members) +
+               sizeof(struct bch_member) * nr_devices) / sizeof(u64);
+
+       err = "no space in superblock for member info";
+       ret = -ENOSPC;
+
+       mi = bch2_sb_resize_members(&c->disk_sb, u64s);
+       if (!mi)
+               goto err_unlock;
+
+       /* success: */
+
+       mi->members[dev_idx] = dev_mi;
+       mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds());
+       c->disk_sb.sb->nr_devices       = nr_devices;
+
+       ca->disk_sb.sb->dev_idx = dev_idx;
+       bch2_dev_attach(c, ca, dev_idx);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+               err = __bch2_dev_read_write(c, ca);
+               if (err)
+                       goto err_late;
+       }
+
+       mutex_unlock(&c->state_lock);
+       return 0;
+
+err_unlock:
+       mutex_unlock(&c->sb_lock);
+       mutex_unlock(&c->state_lock);
+err:
+       if (ca)
+               bch2_dev_free(ca);
+       bch2_free_super(&sb);
+       bch_err(c, "Unable to add device: %s", err);
+       return ret;
+err_late:
+       mutex_unlock(&c->state_lock);
+       bch_err(c, "Error going rw after adding device: %s", err);
+       return -EINVAL;
+}
+
+/* Hot add existing device to running filesystem: */
+int bch2_dev_online(struct bch_fs *c, const char *path)
+{
+       struct bch_opts opts = bch2_opts_empty();
+       struct bch_sb_handle sb = { NULL };
+       struct bch_sb_field_members *mi;
+       struct bch_dev *ca;
+       unsigned dev_idx;
+       const char *err;
+       int ret;
+
+       mutex_lock(&c->state_lock);
+
+       ret = bch2_read_super(path, &opts, &sb);
+       if (ret) {
+               mutex_unlock(&c->state_lock);
+               return ret;
+       }
+
+       dev_idx = sb.sb->dev_idx;
+
+       err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+       if (err)
+               goto err;
+
+       if (bch2_dev_attach_bdev(c, &sb)) {
+               err = "bch2_dev_attach_bdev() error";
+               goto err;
+       }
+
+       ca = bch_dev_locked(c, dev_idx);
+       if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+               err = __bch2_dev_read_write(c, ca);
+               if (err)
+                       goto err;
+       }
+
+       mutex_lock(&c->sb_lock);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+
+       mi->members[ca->dev_idx].last_mount =
+               cpu_to_le64(ktime_get_seconds());
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       mutex_unlock(&c->state_lock);
+       return 0;
+err:
+       mutex_unlock(&c->state_lock);
+       bch2_free_super(&sb);
+       bch_err(c, "error bringing %s online: %s", path, err);
+       return -EINVAL;
+}
+
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+       mutex_lock(&c->state_lock);
+
+       if (!bch2_dev_is_online(ca)) {
+               bch_err(ca, "Already offline");
+               mutex_unlock(&c->state_lock);
+               return 0;
+       }
+
+       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+               bch_err(ca, "Cannot offline required disk");
+               mutex_unlock(&c->state_lock);
+               return -EINVAL;
+       }
+
+       __bch2_dev_offline(c, ca);
+
+       mutex_unlock(&c->state_lock);
+       return 0;
+}
+
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+       struct bch_member *mi;
+       int ret = 0;
+
+       mutex_lock(&c->state_lock);
+
+       if (nbuckets < ca->mi.nbuckets) {
+               bch_err(ca, "Cannot shrink yet");
+               ret = -EINVAL;
+               goto err;
+       }
+
+       if (bch2_dev_is_online(ca) &&
+           get_capacity(ca->disk_sb.bdev->bd_disk) <
+           ca->mi.bucket_size * nbuckets) {
+               bch_err(ca, "New size larger than device");
+               ret = -EINVAL;
+               goto err;
+       }
+
+       ret = bch2_dev_buckets_resize(c, ca, nbuckets);
+       if (ret) {
+               bch_err(ca, "Resize error: %i", ret);
+               goto err;
+       }
+
+       mutex_lock(&c->sb_lock);
+       mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+       mi->nbuckets = cpu_to_le64(nbuckets);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       bch2_recalc_capacity(c);
+err:
+       mutex_unlock(&c->state_lock);
+       return ret;
+}
+
+/* return with ref on ca->ref: */
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+{
+       struct bch_dev *ca;
+       dev_t dev;
+       unsigned i;
+       int ret;
+
+       ret = lookup_bdev(path, &dev);
+       if (ret)
+               return ERR_PTR(ret);
+
+       for_each_member_device(ca, c, i)
+               if (ca->disk_sb.bdev->bd_dev == dev)
+                       goto found;
+
+       ca = ERR_PTR(-ENOENT);
+found:
+       return ca;
+}
+
+/* Filesystem open: */
+
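+/*
+ * Open a filesystem from a set of devices: read and validate every
+ * superblock, take the newest as authoritative, then allocate the
+ * filesystem, attach each device and (unless -o nostart) start it:
+ */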
+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
+                           struct bch_opts opts)
+{
+       struct bch_sb_handle *sb = NULL;
+       struct bch_fs *c = NULL;
+       unsigned i, best_sb = 0;
+       const char *err;
+       int ret = -ENOMEM;
+
+       pr_verbose_init(opts, "");
+
+       if (!nr_devices) {
+               c = ERR_PTR(-EINVAL);
+               goto out2;
+       }
+
+       if (!try_module_get(THIS_MODULE)) {
+               c = ERR_PTR(-ENODEV);
+               goto out2;
+       }
+
+       sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
+       if (!sb)
+               goto err;
+
+       for (i = 0; i < nr_devices; i++) {
+               ret = bch2_read_super(devices[i], &opts, &sb[i]);
+               if (ret)
+                       goto err;
+
+               err = bch2_sb_validate(&sb[i]);
+               if (err)
+                       goto err_print;
+       }
+
+       for (i = 1; i < nr_devices; i++)
+               if (le64_to_cpu(sb[i].sb->seq) >
+                   le64_to_cpu(sb[best_sb].sb->seq))
+                       best_sb = i;
+
+       for (i = 0; i < nr_devices; i++) {
+               err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+               if (err)
+                       goto err_print;
+       }
+
+       ret = -ENOMEM;
+       c = bch2_fs_alloc(sb[best_sb].sb, opts);
+       if (!c)
+               goto err;
+
+       err = "bch2_dev_online() error";
+       mutex_lock(&c->state_lock);
+       for (i = 0; i < nr_devices; i++)
+               if (bch2_dev_attach_bdev(c, &sb[i])) {
+                       mutex_unlock(&c->state_lock);
+                       goto err_print;
+               }
+       mutex_unlock(&c->state_lock);
+
+       err = "insufficient devices";
+       if (!bch2_fs_may_start(c))
+               goto err_print;
+
+       if (!c->opts.nostart) {
+               err = bch2_fs_start(c);
+               if (err)
+                       goto err_print;
+       }
+out:
+       kfree(sb);
+       module_put(THIS_MODULE);
+out2:
+       pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
+       return c;
+err_print:
+       pr_err("bch2_fs_open() error opening %s: %s",
+              devices[0], err);
+       ret = -EINVAL;
+err:
+       if (c)
+               bch2_fs_stop(c);
+       for (i = 0; i < nr_devices; i++)
+               bch2_free_super(&sb[i]);
+       c = ERR_PTR(ret);
+       goto out;
+}
+
+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
+                                             struct bch_opts opts)
+{
+       const char *err;
+       struct bch_fs *c;
+       bool allocated_fs = false;
+
+       err = bch2_sb_validate(sb);
+       if (err)
+               return err;
+
+       mutex_lock(&bch_fs_list_lock);
+       c = __bch2_uuid_to_fs(sb->sb->uuid);
+       if (c) {
+               closure_get(&c->cl);
+
+               err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
+               if (err)
+                       goto err;
+       } else {
+               c = bch2_fs_alloc(sb->sb, opts);
+               err = "cannot allocate memory";
+               if (!c)
+                       goto err;
+
+               allocated_fs = true;
+       }
+
+       err = "bch2_dev_online() error";
+
+       mutex_lock(&c->sb_lock);
+       if (bch2_dev_attach_bdev(c, sb)) {
+               mutex_unlock(&c->sb_lock);
+               goto err;
+       }
+       mutex_unlock(&c->sb_lock);
+
+       if (!c->opts.nostart && bch2_fs_may_start(c)) {
+               err = bch2_fs_start(c);
+               if (err)
+                       goto err;
+       }
+
+       closure_put(&c->cl);
+       mutex_unlock(&bch_fs_list_lock);
+
+       return NULL;
+err:
+       mutex_unlock(&bch_fs_list_lock);
+
+       if (allocated_fs)
+               bch2_fs_stop(c);
+       else if (c)
+               closure_put(&c->cl);
+
+       return err;
+}
+
+const char *bch2_fs_open_incremental(const char *path)
+{
+       struct bch_sb_handle sb;
+       struct bch_opts opts = bch2_opts_empty();
+       const char *err;
+
+       if (bch2_read_super(path, &opts, &sb))
+               return "error reading superblock";
+
+       err = __bch2_fs_open_incremental(&sb, opts);
+       bch2_free_super(&sb);
+
+       return err;
+}
+
+/* Global interfaces/init */
+
+static void bcachefs_exit(void)
+{
+       bch2_debug_exit();
+       bch2_vfs_exit();
+       bch2_chardev_exit();
+       if (bcachefs_kset)
+               kset_unregister(bcachefs_kset);
+}
+
+static int __init bcachefs_init(void)
+{
+       bch2_bkey_pack_test();
+       bch2_inode_pack_test();
+
+       if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+           bch2_chardev_init() ||
+           bch2_vfs_init() ||
+           bch2_debug_init())
+               goto err;
+
+       return 0;
+err:
+       bcachefs_exit();
+       return -ENOMEM;
+}
+
+#define BCH_DEBUG_PARAM(name, description)                     \
+       bool bch2_##name;                                       \
+       module_param_named(name, bch2_##name, bool, 0644);      \
+       MODULE_PARM_DESC(name, description);
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+unsigned bch2_metadata_version = BCH_SB_VERSION_MAX;
+module_param_named(version, bch2_metadata_version, uint, 0400);
+
+module_exit(bcachefs_exit);
+module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
new file mode 100644 (file)
index 0000000..3f73016
--- /dev/null
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_H
+#define _BCACHEFS_SUPER_H
+
+#include "extents.h"
+
+#include "bcachefs_ioctl.h"
+
+#include <linux/math64.h>
+
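+/*
+ * Helpers for converting between sectors and bucket indices on a device.
+ * Illustrative example (values not from this patch): with a bucket_size of
+ * 1024 sectors, sector 5000 is in bucket 4 (4 * 1024 = 4096) and
+ * bucket_remainder() returns 904:
+ */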
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+       return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+       return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+       u32 remainder;
+
+       div_u64_rem(s, ca->mi.bucket_size, &remainder);
+       return remainder;
+}
+
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+       return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+       return bch2_dev_is_online(ca) &&
+               ca->mi.state != BCH_MEMBER_STATE_FAILED;
+}
+
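+/*
+ * Try to take an io_ref on @ca for I/O in direction @rw; fails if the
+ * device is going away, or if it's RO and a write was requested.  On
+ * success the caller must drop the ref with percpu_ref_put(&ca->io_ref):
+ */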
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+       if (!percpu_ref_tryget(&ca->io_ref))
+               return false;
+
+       if (ca->mi.state == BCH_MEMBER_STATE_RW ||
+           (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
+               return true;
+
+       percpu_ref_put(&ca->io_ref);
+       return false;
+}
+
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+       return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+                                        unsigned dev)
+{
+       unsigned i;
+
+       for (i = 0; i < devs.nr; i++)
+               if (devs.devs[i] == dev)
+                       return true;
+
+       return false;
+}
+
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+                                         unsigned dev)
+{
+       unsigned i;
+
+       for (i = 0; i < devs->nr; i++)
+               if (devs->devs[i] == dev) {
+                       array_remove_item(devs->devs, devs->nr, i);
+                       return;
+               }
+}
+
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+                                        unsigned dev)
+{
+       BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+       BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+       devs->devs[devs->nr++] = dev;
+}
+
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+       return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
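+/*
+ * Walk c->devs[], optionally restricted to the devices set in @mask,
+ * skipping slots that don't currently have a bch_dev:
+ */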
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
+                                             const struct bch_devs_mask *mask)
+{
+       struct bch_dev *ca = NULL;
+
+       while ((*iter = mask
+               ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
+               : *iter) < c->sb.nr_devices &&
+              !(ca = rcu_dereference_check(c->devs[*iter],
+                                           lockdep_is_held(&c->state_lock))))
+               (*iter)++;
+
+       return ca;
+}
+
+#define __for_each_member_device(ca, c, iter, mask)                    \
+       for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+
+#define for_each_member_device_rcu(ca, c, iter, mask)                  \
+       __for_each_member_device(ca, c, iter, mask)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+{
+       struct bch_dev *ca;
+
+       rcu_read_lock();
+       if ((ca = __bch2_next_dev(c, iter, NULL)))
+               percpu_ref_get(&ca->ref);
+       rcu_read_unlock();
+
+       return ca;
+}
+
+/*
+ * If you break early, you must drop your ref on the current device
+ */
+#define for_each_member_device(ca, c, iter)                            \
+       for ((iter) = 0;                                                \
+            (ca = bch2_get_next_dev(c, &(iter)));                      \
+            percpu_ref_put(&ca->ref), (iter)++)
+
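+/*
+ * Illustrative use (mirrors callers in super.c) - the macro takes a ref on
+ * each device and drops it when advancing, so a full walk needs no manual
+ * ref handling:
+ *
+ *     for_each_member_device(ca, c, i)
+ *             bch2_copygc_stop(ca);
+ */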
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+                                                     unsigned *iter,
+                                                     int state_mask)
+{
+       struct bch_dev *ca;
+
+       rcu_read_lock();
+       while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+              (!((1 << ca->mi.state) & state_mask) ||
+               !percpu_ref_tryget(&ca->io_ref)))
+               (*iter)++;
+       rcu_read_unlock();
+
+       return ca;
+}
+
+#define __for_each_online_member(ca, c, iter, state_mask)              \
+       for ((iter) = 0;                                                \
+            (ca = bch2_get_next_online_dev(c, &(iter), state_mask));   \
+            percpu_ref_put(&ca->io_ref), (iter)++)
+
+#define for_each_online_member(ca, c, iter)                            \
+       __for_each_online_member(ca, c, iter, ~0)
+
+#define for_each_rw_member(ca, c, iter)                                        \
+       __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW)
+
+#define for_each_readable_member(ca, c, iter)                          \
+       __for_each_online_member(ca, c, iter,                           \
+               (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_protected(c->devs[idx],
+                                        lockdep_is_held(&c->sb_lock) ||
+                                        lockdep_is_held(&c->state_lock));
+}
+
+/* XXX kill, move to struct bch_fs */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+       struct bch_devs_mask devs;
+       struct bch_dev *ca;
+       unsigned i;
+
+       memset(&devs, 0, sizeof(devs));
+       for_each_online_member(ca, c, i)
+               __set_bit(ca->dev_idx, devs.d);
+       return devs;
+}
+
+struct bch_fs *bch2_dev_to_fs(dev_t);
+struct bch_fs *bch2_uuid_to_fs(__uuid_t);
+
+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
+                          enum bch_member_state, int);
+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+                       enum bch_member_state, int);
+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+                     enum bch_member_state, int);
+
+int bch2_dev_fail(struct bch_dev *, int);
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_add(struct bch_fs *, const char *);
+int bch2_dev_online(struct bch_fs *, const char *);
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
+
+bool bch2_fs_emergency_read_only(struct bch_fs *);
+void bch2_fs_read_only(struct bch_fs *);
+const char *bch2_fs_read_write(struct bch_fs *);
+
+void bch2_fs_stop(struct bch_fs *);
+
+const char *bch2_fs_start(struct bch_fs *);
+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+const char *bch2_fs_open_incremental(const char *path);
+
+#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
new file mode 100644 (file)
index 0000000..4d8265b
--- /dev/null
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_TYPES_H
+#define _BCACHEFS_SUPER_TYPES_H
+
+struct bch_sb_handle {
+       struct bch_sb           *sb;
+       struct block_device     *bdev;
+       struct bio              *bio;
+       void                    *holder;
+       unsigned                page_order;
+       fmode_t                 mode;
+       unsigned                have_layout:1;
+       unsigned                have_bio:1;
+       unsigned                fs_sb:1;
+};
+
+struct bch_devs_mask {
+       unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+};
+
+struct bch_devs_list {
+       u8                      nr;
+       u8                      devs[BCH_REPLICAS_MAX + 1];
+};
+
+struct bch_member_cpu {
+       u64                     nbuckets;       /* device size */
+       u16                     first_bucket;   /* index of first bucket used */
+       u16                     bucket_size;    /* sectors */
+       u16                     group;
+       u8                      state;
+       u8                      replacement;
+       u8                      discard;
+       u8                      data_allowed;
+       u8                      durability;
+       u8                      valid;
+};
+
+struct bch_replicas_cpu_entry {
+       u8                      data_type;
+       u8                      devs[BCH_SB_MEMBERS_MAX / 8];
+};
+
+struct bch_replicas_cpu {
+       struct rcu_head         rcu;
+       unsigned                nr;
+       unsigned                entry_size;
+       struct bch_replicas_cpu_entry entries[];
+};
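+
+/*
+ * Illustrative sketch, assuming the usual packed-bitmap layout for
+ * bch_replicas_cpu_entry: device i is bit (i & 7) of devs[i >> 3].
+ * replicas_entry_has_dev() is a hypothetical helper:
+ *
+ *	static inline bool replicas_entry_has_dev(const struct bch_replicas_cpu_entry *e,
+ *						  unsigned dev)
+ *	{
+ *		return e->devs[dev >> 3] & (1U << (dev & 7));
+ *	}
+ */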
+
+struct bch_disk_group_cpu {
+       bool                            deleted;
+       u16                             parent;
+       struct bch_devs_mask            devs;
+};
+
+struct bch_disk_groups_cpu {
+       struct rcu_head                 rcu;
+       unsigned                        nr;
+       struct bch_disk_group_cpu       entries[];
+};
+
+#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
new file mode 100644 (file)
index 0000000..430dcbc
--- /dev/null
@@ -0,0 +1,1027 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#ifndef NO_BCACHEFS_SYSFS
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "compress.h"
+#include "sysfs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "disk_groups.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "opts.h"
+#include "rebalance.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "tests.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/sched/clock.h>
+
+#include "util.h"
+
+#define SYSFS_OPS(type)                                                        \
+struct sysfs_ops type ## _sysfs_ops = {                                        \
+       .show   = type ## _show,                                        \
+       .store  = type ## _store                                        \
+}
+
+#define SHOW(fn)                                                       \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+                          char *buf)                                   \
+
+#define STORE(fn)                                                      \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+                           const char *buf, size_t size)               \
+
+#define __sysfs_attribute(_name, _mode)                                        \
+       static struct attribute sysfs_##_name =                         \
+               { .name = #_name, .mode = _mode }
+
+#define write_attribute(n)     __sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n)      __sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n)                __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+#define sysfs_printf(file, fmt, ...)                                   \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
+} while (0)
+
+#define sysfs_print(file, var)                                         \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return snprint(buf, PAGE_SIZE, var);                    \
+} while (0)
+
+#define sysfs_hprint(file, val)                                                \
+do {                                                                   \
+       if (attr == &sysfs_ ## file) {                                  \
+               ssize_t ret = bch2_hprint(buf, val);                    \
+               strcat(buf, "\n");                                      \
+               return ret + 1;                                         \
+       }                                                               \
+} while (0)
+
+#define var_printf(_var, fmt)  sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var)                sysfs_print(_var, var(_var))
+#define var_hprint(_var)       sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var)                                       \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return strtoul_safe(buf, var) ?: (ssize_t) size;        \
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max)                       \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return strtoul_safe_clamp(buf, var, min, max)           \
+                       ?: (ssize_t) size;                              \
+} while (0)
+
+#define strtoul_or_return(cp)                                          \
+({                                                                     \
+       unsigned long _v;                                               \
+       int _r = kstrtoul(cp, 10, &_v);                                 \
+       if (_r)                                                         \
+               return _r;                                              \
+       _v;                                                             \
+})
+
+#define strtoul_restrict_or_return(cp, min, max)                       \
+({                                                                     \
+       unsigned long __v = 0;                                          \
+       int _r = strtoul_safe_restrict(cp, __v, min, max);              \
+       if (_r)                                                         \
+               return _r;                                              \
+       __v;                                                            \
+})
+
+#define strtoi_h_or_return(cp)                                         \
+({                                                                     \
+       u64 _v;                                                         \
+       int _r = strtoi_h(cp, &_v);                                     \
+       if (_r)                                                         \
+               return _r;                                              \
+       _v;                                                             \
+})
+
+#define sysfs_hatoi(file, var)                                         \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return strtoi_h(buf, &var) ?: (ssize_t) size;           \
+} while (0)
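+
+/*
+ * Sketch of how the helpers above combine (illustrative only; "foo",
+ * "example" and struct example are hypothetical names, the real attributes
+ * follow below): an attribute is declared once, matched against 'attr' in
+ * the SHOW()/STORE() bodies, and listed in a *_files[] array:
+ *
+ *	rw_attribute(foo);
+ *
+ *	SHOW(example)
+ *	{
+ *		struct example *e = container_of(kobj, struct example, kobj);
+ *
+ *		sysfs_print(foo, e->foo);
+ *		return 0;
+ *	}
+ *
+ *	STORE(example)
+ *	{
+ *		struct example *e = container_of(kobj, struct example, kobj);
+ *
+ *		sysfs_strtoul(foo, e->foo);
+ *		return size;
+ *	}
+ *	SYSFS_OPS(example);
+ *
+ *	struct attribute *example_files[] = { &sysfs_foo, NULL };
+ */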
+
+write_attribute(trigger_journal_flush);
+write_attribute(trigger_btree_coalesce);
+write_attribute(trigger_gc);
+write_attribute(prune_cache);
+rw_attribute(btree_gc_periodic);
+
+read_attribute(uuid);
+read_attribute(minor);
+read_attribute(bucket_size);
+read_attribute(block_size);
+read_attribute(btree_node_size);
+read_attribute(first_bucket);
+read_attribute(nbuckets);
+read_attribute(durability);
+read_attribute(iodone);
+
+read_attribute(io_latency_read);
+read_attribute(io_latency_write);
+read_attribute(io_latency_stats_read);
+read_attribute(io_latency_stats_write);
+read_attribute(congested);
+
+read_attribute(bucket_quantiles_last_read);
+read_attribute(bucket_quantiles_last_write);
+read_attribute(bucket_quantiles_fragmentation);
+read_attribute(bucket_quantiles_oldest_gen);
+
+read_attribute(reserve_stats);
+read_attribute(btree_cache_size);
+read_attribute(compression_stats);
+read_attribute(journal_debug);
+read_attribute(journal_pins);
+read_attribute(btree_updates);
+read_attribute(dirty_btree_nodes);
+
+read_attribute(internal_uuid);
+
+read_attribute(has_data);
+read_attribute(alloc_debug);
+write_attribute(wake_allocator);
+
+read_attribute(read_realloc_races);
+read_attribute(extent_migrate_done);
+read_attribute(extent_migrate_raced);
+
+rw_attribute(journal_write_delay_ms);
+rw_attribute(journal_reclaim_delay_ms);
+
+rw_attribute(discard);
+rw_attribute(cache_replacement_policy);
+rw_attribute(label);
+
+rw_attribute(copy_gc_enabled);
+sysfs_pd_controller_attribute(copy_gc);
+
+rw_attribute(rebalance_enabled);
+sysfs_pd_controller_attribute(rebalance);
+read_attribute(rebalance_work);
+rw_attribute(promote_whole_extents);
+
+rw_attribute(pd_controllers_update_seconds);
+
+read_attribute(meta_replicas_have);
+read_attribute(data_replicas_have);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+write_attribute(perf_test);
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#define BCH_DEBUG_PARAM(name, description)                             \
+       rw_attribute(name);
+
+       BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#define x(_name)                                               \
+       static struct attribute sysfs_time_stat_##_name =               \
+               { .name = #_name, .mode = S_IRUGO };
+       BCH_TIME_STATS()
+#undef x
+
+static struct attribute sysfs_state_rw = {
+       .name = "state",
+       .mode = S_IRUGO
+};
+
+static size_t bch2_btree_cache_size(struct bch_fs *c)
+{
+       size_t ret = 0;
+       struct btree *b;
+
+       mutex_lock(&c->btree_cache.lock);
+       list_for_each_entry(b, &c->btree_cache.live, list)
+               ret += btree_bytes(c);
+
+       mutex_unlock(&c->btree_cache.lock);
+       return ret;
+}
+
+static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
+{
+       struct bch_fs_usage stats = bch2_fs_usage_read(c);
+
+       return scnprintf(buf, PAGE_SIZE,
+                        "capacity:\t\t%llu\n"
+                        "1 replicas:\n"
+                        "\tmeta:\t\t%llu\n"
+                        "\tdirty:\t\t%llu\n"
+                        "\treserved:\t%llu\n"
+                        "2 replicas:\n"
+                        "\tmeta:\t\t%llu\n"
+                        "\tdirty:\t\t%llu\n"
+                        "\treserved:\t%llu\n"
+                        "3 replicas:\n"
+                        "\tmeta:\t\t%llu\n"
+                        "\tdirty:\t\t%llu\n"
+                        "\treserved:\t%llu\n"
+                        "4 replicas:\n"
+                        "\tmeta:\t\t%llu\n"
+                        "\tdirty:\t\t%llu\n"
+                        "\treserved:\t%llu\n"
+                        "online reserved:\t%llu\n",
+                        c->capacity,
+                        stats.s[0].data[S_META],
+                        stats.s[0].data[S_DIRTY],
+                        stats.s[0].persistent_reserved,
+                        stats.s[1].data[S_META],
+                        stats.s[1].data[S_DIRTY],
+                        stats.s[1].persistent_reserved,
+                        stats.s[2].data[S_META],
+                        stats.s[2].data[S_DIRTY],
+                        stats.s[2].persistent_reserved,
+                        stats.s[3].data[S_META],
+                        stats.s[3].data[S_DIRTY],
+                        stats.s[3].persistent_reserved,
+                        stats.online_reserved);
+}
+
+static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+           nr_compressed_extents = 0,
+           compressed_sectors_compressed = 0,
+           compressed_sectors_uncompressed = 0;
+
+       if (!bch2_fs_running(c))
+               return -EPERM;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k)
+               if (k.k->type == BCH_EXTENT) {
+                       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+                       const struct bch_extent_ptr *ptr;
+                       struct bch_extent_crc_unpacked crc;
+
+                       extent_for_each_ptr_crc(e, ptr, crc) {
+                               if (crc.compression_type == BCH_COMPRESSION_NONE) {
+                                       nr_uncompressed_extents++;
+                                       uncompressed_sectors += e.k->size;
+                               } else {
+                                       nr_compressed_extents++;
+                                       compressed_sectors_compressed +=
+                                               crc.compressed_size;
+                                       compressed_sectors_uncompressed +=
+                                               crc.uncompressed_size;
+                               }
+
+                               /* only looking at the first ptr */
+                               break;
+                       }
+               }
+       bch2_btree_iter_unlock(&iter);
+
+       return scnprintf(buf, PAGE_SIZE,
+                       "uncompressed data:\n"
+                       "       nr extents:                     %llu\n"
+                       "       size (bytes):                   %llu\n"
+                       "compressed data:\n"
+                       "       nr extents:                     %llu\n"
+                       "       compressed size (bytes):        %llu\n"
+                       "       uncompressed size (bytes):      %llu\n",
+                       nr_uncompressed_extents,
+                       uncompressed_sectors << 9,
+                       nr_compressed_extents,
+                       compressed_sectors_compressed << 9,
+                       compressed_sectors_uncompressed << 9);
+}
+
+SHOW(bch2_fs)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+       sysfs_print(minor,                      c->minor);
+       sysfs_printf(internal_uuid, "%pU",      c->sb.uuid.b);
+
+       sysfs_print(journal_write_delay_ms,     c->journal.write_delay_ms);
+       sysfs_print(journal_reclaim_delay_ms,   c->journal.reclaim_delay_ms);
+
+       sysfs_print(block_size,                 block_bytes(c));
+       sysfs_print(btree_node_size,            btree_bytes(c));
+       sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
+
+       sysfs_print(read_realloc_races,
+                   atomic_long_read(&c->read_realloc_races));
+       sysfs_print(extent_migrate_done,
+                   atomic_long_read(&c->extent_migrate_done));
+       sysfs_print(extent_migrate_raced,
+                   atomic_long_read(&c->extent_migrate_raced));
+
+       sysfs_printf(btree_gc_periodic, "%u",   (int) c->btree_gc_periodic);
+
+       sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+
+       sysfs_print(pd_controllers_update_seconds,
+                   c->pd_controllers_update_seconds);
+
+       sysfs_printf(rebalance_enabled,         "%i", c->rebalance.enabled);
+       sysfs_pd_controller_show(rebalance,     &c->rebalance.pd); /* XXX */
+
+       if (attr == &sysfs_rebalance_work)
+               return bch2_rebalance_work_show(c, buf);
+
+       sysfs_print(promote_whole_extents,      c->promote_whole_extents);
+
+       sysfs_printf(meta_replicas_have, "%u",  bch2_replicas_online(c, true));
+       sysfs_printf(data_replicas_have, "%u",  bch2_replicas_online(c, false));
+
+       /* Debugging: */
+
+       if (attr == &sysfs_alloc_debug)
+               return show_fs_alloc_debug(c, buf);
+
+       if (attr == &sysfs_journal_debug)
+               return bch2_journal_print_debug(&c->journal, buf);
+
+       if (attr == &sysfs_journal_pins)
+               return bch2_journal_print_pins(&c->journal, buf);
+
+       if (attr == &sysfs_btree_updates)
+               return bch2_btree_updates_print(c, buf);
+
+       if (attr == &sysfs_dirty_btree_nodes)
+               return bch2_dirty_btree_nodes_print(c, buf);
+
+       if (attr == &sysfs_compression_stats)
+               return bch2_compression_stats(c, buf);
+
+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
+       BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+       return 0;
+}
+
+STORE(__bch2_fs)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+       sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
+       sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+
+       if (attr == &sysfs_btree_gc_periodic) {
+               ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
+                       ?: (ssize_t) size;
+
+               wake_up_process(c->gc_thread);
+               return ret;
+       }
+
+       if (attr == &sysfs_copy_gc_enabled) {
+               struct bch_dev *ca;
+               unsigned i;
+               ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
+                       ?: (ssize_t) size;
+
+               for_each_member_device(ca, c, i)
+                       if (ca->copygc_thread)
+                               wake_up_process(ca->copygc_thread);
+               return ret;
+       }
+
+       if (attr == &sysfs_rebalance_enabled) {
+               ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
+                       ?: (ssize_t) size;
+
+               rebalance_wakeup(c);
+               return ret;
+       }
+
+       sysfs_strtoul(pd_controllers_update_seconds,
+                     c->pd_controllers_update_seconds);
+       sysfs_pd_controller_store(rebalance,    &c->rebalance.pd);
+
+       sysfs_strtoul(promote_whole_extents,    c->promote_whole_extents);
+
+       /* Debugging: */
+
+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
+       BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+       if (!bch2_fs_running(c))
+               return -EPERM;
+
+       /* Debugging: */
+
+       if (attr == &sysfs_trigger_journal_flush)
+               bch2_journal_meta_async(&c->journal, NULL);
+
+       if (attr == &sysfs_trigger_btree_coalesce)
+               bch2_coalesce(c);
+
+       if (attr == &sysfs_trigger_gc)
+               bch2_gc(c);
+
+       if (attr == &sysfs_prune_cache) {
+               struct shrink_control sc;
+
+               sc.gfp_mask = GFP_KERNEL;
+               sc.nr_to_scan = strtoul_or_return(buf);
+               c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+       }
+#ifdef CONFIG_BCACHEFS_TESTS
+       if (attr == &sysfs_perf_test) {
+               char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
+               char *test              = strsep(&p, " \t\n");
+               char *nr_str            = strsep(&p, " \t\n");
+               char *threads_str       = strsep(&p, " \t\n");
+               unsigned threads;
+               u64 nr;
+               int ret = -EINVAL;
+
+               if (threads_str &&
+                   !(ret = kstrtouint(threads_str, 10, &threads)) &&
+                   !(ret = bch2_strtoull_h(nr_str, &nr)))
+                       bch2_btree_perf_test(c, test, nr, threads);
+               else
+                       size = ret;
+               kfree(tmp);
+       }
+#endif
+       return size;
+}
+
+STORE(bch2_fs)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+       mutex_lock(&c->state_lock);
+       size = __bch2_fs_store(kobj, attr, buf, size);
+       mutex_unlock(&c->state_lock);
+
+       return size;
+}
+SYSFS_OPS(bch2_fs);
+
+struct attribute *bch2_fs_files[] = {
+       &sysfs_minor,
+       &sysfs_block_size,
+       &sysfs_btree_node_size,
+       &sysfs_btree_cache_size,
+
+       &sysfs_meta_replicas_have,
+       &sysfs_data_replicas_have,
+
+       &sysfs_journal_write_delay_ms,
+       &sysfs_journal_reclaim_delay_ms,
+
+       &sysfs_promote_whole_extents,
+
+       &sysfs_compression_stats,
+
+#ifdef CONFIG_BCACHEFS_TESTS
+       &sysfs_perf_test,
+#endif
+       NULL
+};
+
+/* internal dir - just a wrapper */
+
+SHOW(bch2_fs_internal)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+       return bch2_fs_show(&c->kobj, attr, buf);
+}
+
+STORE(bch2_fs_internal)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+       return bch2_fs_store(&c->kobj, attr, buf, size);
+}
+SYSFS_OPS(bch2_fs_internal);
+
+struct attribute *bch2_fs_internal_files[] = {
+       &sysfs_alloc_debug,
+       &sysfs_journal_debug,
+       &sysfs_journal_pins,
+       &sysfs_btree_updates,
+       &sysfs_dirty_btree_nodes,
+
+       &sysfs_read_realloc_races,
+       &sysfs_extent_migrate_done,
+       &sysfs_extent_migrate_raced,
+
+       &sysfs_trigger_journal_flush,
+       &sysfs_trigger_btree_coalesce,
+       &sysfs_trigger_gc,
+       &sysfs_prune_cache,
+
+       &sysfs_copy_gc_enabled,
+
+       &sysfs_rebalance_enabled,
+       &sysfs_rebalance_work,
+       sysfs_pd_controller_files(rebalance),
+
+       &sysfs_internal_uuid,
+
+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
+       BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+       NULL
+};
+
+/* options */
+
+SHOW(bch2_fs_opts_dir)
+{
+       char *out = buf, *end = buf + PAGE_SIZE;
+       struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+       const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+       int id = opt - bch2_opt_table;
+       u64 v = bch2_opt_get_by_id(&c->opts, id);
+
+       out += bch2_opt_to_text(c, out, end - out, opt, v, OPT_SHOW_FULL_LIST);
+       out += scnprintf(out, end - out, "\n");
+
+       return out - buf;
+}
+
+STORE(bch2_fs_opts_dir)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+       const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+       int ret, id = opt - bch2_opt_table;
+       char *tmp;
+       u64 v;
+
+       tmp = kstrdup(buf, GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       ret = bch2_opt_parse(c, opt, strim(tmp), &v);
+       kfree(tmp);
+
+       if (ret < 0)
+               return ret;
+
+       if (id == Opt_compression ||
+           id == Opt_background_compression) {
+               int ret = bch2_check_set_has_compressed_data(c, v);
+               if (ret)
+                       return ret;
+       }
+
+       if (opt->set_sb != SET_NO_SB_OPT) {
+               mutex_lock(&c->sb_lock);
+               opt->set_sb(c->disk_sb.sb, v);
+               bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
+       }
+
+       bch2_opt_set_by_id(&c->opts, id, v);
+
+       if ((id == Opt_background_target ||
+            id == Opt_background_compression) && v) {
+               bch2_rebalance_add_work(c, S64_MAX);
+               rebalance_wakeup(c);
+       }
+
+       return size;
+}
+SYSFS_OPS(bch2_fs_opts_dir);
+
+struct attribute *bch2_fs_opts_dir_files[] = { NULL };
+
+int bch2_opts_create_sysfs_files(struct kobject *kobj)
+{
+       const struct bch_option *i;
+       int ret;
+
+       for (i = bch2_opt_table;
+            i < bch2_opt_table + bch2_opts_nr;
+            i++) {
+               if (i->mode == OPT_INTERNAL)
+                       continue;
+
+               ret = sysfs_create_file(kobj, &i->attr);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/* time stats */
+
+SHOW(bch2_fs_time_stats)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name)                                                \
+       if (attr == &sysfs_time_stat_##name)                            \
+               return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
+                                            buf, PAGE_SIZE);
+       BCH_TIME_STATS()
+#undef x
+
+       return 0;
+}
+
+STORE(bch2_fs_time_stats)
+{
+       return size;
+}
+SYSFS_OPS(bch2_fs_time_stats);
+
+struct attribute *bch2_fs_time_stats_files[] = {
+#define x(name)                                                \
+       &sysfs_time_stat_##name,
+       BCH_TIME_STATS()
+#undef x
+       NULL
+};
+
+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
+                                size_t, void *);
+
+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
+                                 size_t b, void *private)
+{
+       int rw = (private ? 1 : 0);
+
+       return bucket_last_io(c, bucket(ca, b), rw);
+}
+
+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
+                                      size_t b, void *private)
+{
+       struct bucket *g = bucket(ca, b);
+       return bucket_sectors_used(g->mark);
+}
+
+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
+                                    size_t b, void *private)
+{
+       return bucket_gc_gen(ca, b);
+}
+
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+       unsigned l = *((unsigned *) _l);
+       unsigned r = *((unsigned *) _r);
+
+       return (l > r) - (l < r);
+}
+
+static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
+                             char *buf, bucket_map_fn *fn, void *private)
+{
+       size_t i, n;
+       /* Compute 31 quantiles */
+       unsigned q[31], *p;
+       ssize_t ret = 0;
+
+       down_read(&ca->bucket_lock);
+       n = ca->mi.nbuckets;
+
+       p = vzalloc(n * sizeof(unsigned));
+       if (!p) {
+               up_read(&ca->bucket_lock);
+               return -ENOMEM;
+       }
+
+       for (i = ca->mi.first_bucket; i < n; i++)
+               p[i] = fn(c, ca, i, private);
+
+       sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
+       up_read(&ca->bucket_lock);
+
+       while (n &&
+              !p[n - 1])
+               --n;
+
+       for (i = 0; i < ARRAY_SIZE(q); i++)
+               q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
+
+       vfree(p);
+
+       for (i = 0; i < ARRAY_SIZE(q); i++)
+               ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                                "%u ", q[i]);
+       buf[ret - 1] = '\n';
+
+       return ret;
+}
+
+static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
+{
+       enum alloc_reserve i;
+       ssize_t ret;
+
+       spin_lock(&ca->freelist_lock);
+
+       ret = scnprintf(buf, PAGE_SIZE,
+                       "free_inc:\t%zu\t%zu\n",
+                       fifo_used(&ca->free_inc),
+                       ca->free_inc.size);
+
+       for (i = 0; i < RESERVE_NR; i++)
+               ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+                                "free[%u]:\t%zu\t%zu\n", i,
+                                fifo_used(&ca->free[i]),
+                                ca->free[i].size);
+
+       spin_unlock(&ca->freelist_lock);
+
+       return ret;
+}
+
+static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
+{
+       struct bch_fs *c = ca->fs;
+       struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+
+       return scnprintf(buf, PAGE_SIZE,
+               "free_inc:               %zu/%zu\n"
+               "free[RESERVE_BTREE]:    %zu/%zu\n"
+               "free[RESERVE_MOVINGGC]: %zu/%zu\n"
+               "free[RESERVE_NONE]:     %zu/%zu\n"
+               "buckets:\n"
+               "    capacity:           %llu\n"
+               "    alloc:              %llu\n"
+               "    sb:                 %llu\n"
+               "    journal:            %llu\n"
+               "    meta:               %llu\n"
+               "    user:               %llu\n"
+               "    cached:             %llu\n"
+               "    available:          %llu\n"
+               "sectors:\n"
+               "    sb:                 %llu\n"
+               "    journal:            %llu\n"
+               "    meta:               %llu\n"
+               "    user:               %llu\n"
+               "    cached:             %llu\n"
+               "freelist_wait:          %s\n"
+               "open buckets:           %u/%u (reserved %u)\n"
+               "open_buckets_wait:      %s\n",
+               fifo_used(&ca->free_inc),               ca->free_inc.size,
+               fifo_used(&ca->free[RESERVE_BTREE]),    ca->free[RESERVE_BTREE].size,
+               fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
+               fifo_used(&ca->free[RESERVE_NONE]),     ca->free[RESERVE_NONE].size,
+               ca->mi.nbuckets - ca->mi.first_bucket,
+               stats.buckets_alloc,
+               stats.buckets[BCH_DATA_SB],
+               stats.buckets[BCH_DATA_JOURNAL],
+               stats.buckets[BCH_DATA_BTREE],
+               stats.buckets[BCH_DATA_USER],
+               stats.buckets[BCH_DATA_CACHED],
+               __dev_buckets_available(ca, stats),
+               stats.sectors[BCH_DATA_SB],
+               stats.sectors[BCH_DATA_JOURNAL],
+               stats.sectors[BCH_DATA_BTREE],
+               stats.sectors[BCH_DATA_USER],
+               stats.sectors[BCH_DATA_CACHED],
+               c->freelist_wait.list.first             ? "waiting" : "empty",
+               c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
+               c->open_buckets_wait.list.first         ? "waiting" : "empty");
+}
+
+static const char * const bch2_rw[] = {
+       "read",
+       "write",
+       NULL
+};
+
+static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
+{
+       char *out = buf, *end = buf + PAGE_SIZE;
+       int rw, i, cpu;
+
+       for (rw = 0; rw < 2; rw++) {
+               out += scnprintf(out, end - out, "%s:\n", bch2_rw[rw]);
+
+               for (i = 1; i < BCH_DATA_NR; i++) {
+                       u64 n = 0;
+
+                       for_each_possible_cpu(cpu)
+                               n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i];
+
+                       out += scnprintf(out, end - out, "%-12s:%12llu\n",
+                                        bch2_data_types[i], n << 9);
+               }
+       }
+
+       return out - buf;
+}
+
+SHOW(bch2_dev)
+{
+       struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+       struct bch_fs *c = ca->fs;
+       char *out = buf, *end = buf + PAGE_SIZE;
+
+       sysfs_printf(uuid,              "%pU\n", ca->uuid.b);
+
+       sysfs_print(bucket_size,        bucket_bytes(ca));
+       sysfs_print(block_size,         block_bytes(c));
+       sysfs_print(first_bucket,       ca->mi.first_bucket);
+       sysfs_print(nbuckets,           ca->mi.nbuckets);
+       sysfs_print(durability,         ca->mi.durability);
+       sysfs_print(discard,            ca->mi.discard);
+
+       if (attr == &sysfs_label) {
+               if (ca->mi.group) {
+                       mutex_lock(&c->sb_lock);
+                       out += bch2_disk_path_print(&c->disk_sb, out, end - out,
+                                                   ca->mi.group - 1);
+                       mutex_unlock(&c->sb_lock);
+               } else {
+                       out += scnprintf(out, end - out, "none");
+               }
+
+               out += scnprintf(out, end - out, "\n");
+               return out - buf;
+       }
+
+       if (attr == &sysfs_has_data) {
+               out += bch2_scnprint_flag_list(out, end - out,
+                                              bch2_data_types,
+                                              bch2_dev_has_data(c, ca));
+               out += scnprintf(out, end - out, "\n");
+               return out - buf;
+       }
+
+       sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);
+
+       if (attr == &sysfs_cache_replacement_policy) {
+               out += bch2_scnprint_string_list(out, end - out,
+                                                bch2_cache_replacement_policies,
+                                                ca->mi.replacement);
+               out += scnprintf(out, end - out, "\n");
+               return out - buf;
+       }
+
+       if (attr == &sysfs_state_rw) {
+               out += bch2_scnprint_string_list(out, end - out,
+                                                bch2_dev_state,
+                                                ca->mi.state);
+               out += scnprintf(out, end - out, "\n");
+               return out - buf;
+       }
+
+       if (attr == &sysfs_iodone)
+               return show_dev_iodone(ca, buf);
+
+       sysfs_print(io_latency_read,            atomic64_read(&ca->cur_latency[READ]));
+       sysfs_print(io_latency_write,           atomic64_read(&ca->cur_latency[WRITE]));
+
+       if (attr == &sysfs_io_latency_stats_read)
+               return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
+       if (attr == &sysfs_io_latency_stats_write)
+               return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
+
+       sysfs_printf(congested,                 "%u%%",
+                    clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
+                    * 100 / CONGESTED_MAX);
+
+       if (attr == &sysfs_bucket_quantiles_last_read)
+               return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
+       if (attr == &sysfs_bucket_quantiles_last_write)
+               return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
+       if (attr == &sysfs_bucket_quantiles_fragmentation)
+               return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
+       if (attr == &sysfs_bucket_quantiles_oldest_gen)
+               return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
+
+       if (attr == &sysfs_reserve_stats)
+               return show_reserve_stats(ca, buf);
+       if (attr == &sysfs_alloc_debug)
+               return show_dev_alloc_debug(ca, buf);
+
+       return 0;
+}
+
+STORE(bch2_dev)
+{
+       struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+       struct bch_fs *c = ca->fs;
+       struct bch_member *mi;
+
+       sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);
+
+       if (attr == &sysfs_discard) {
+               bool v = strtoul_or_return(buf);
+
+               mutex_lock(&c->sb_lock);
+               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+
+               if (v != BCH_MEMBER_DISCARD(mi)) {
+                       SET_BCH_MEMBER_DISCARD(mi, v);
+                       bch2_write_super(c);
+               }
+               mutex_unlock(&c->sb_lock);
+       }
+
+       if (attr == &sysfs_cache_replacement_policy) {
+               ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
+
+               if (v < 0)
+                       return v;
+
+               mutex_lock(&c->sb_lock);
+               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+
+               if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
+                       SET_BCH_MEMBER_REPLACEMENT(mi, v);
+                       bch2_write_super(c);
+               }
+               mutex_unlock(&c->sb_lock);
+       }
+
+       if (attr == &sysfs_label) {
+               char *tmp;
+               int ret;
+
+               tmp = kstrdup(buf, GFP_KERNEL);
+               if (!tmp)
+                       return -ENOMEM;
+
+               ret = bch2_dev_group_set(c, ca, strim(tmp));
+               kfree(tmp);
+               if (ret)
+                       return ret;
+       }
+
+       if (attr == &sysfs_wake_allocator)
+               bch2_wake_allocator(ca);
+
+       return size;
+}
+SYSFS_OPS(bch2_dev);
+
+struct attribute *bch2_dev_files[] = {
+       &sysfs_uuid,
+       &sysfs_bucket_size,
+       &sysfs_block_size,
+       &sysfs_first_bucket,
+       &sysfs_nbuckets,
+       &sysfs_durability,
+
+       /* settings: */
+       &sysfs_discard,
+       &sysfs_cache_replacement_policy,
+       &sysfs_state_rw,
+       &sysfs_label,
+
+       &sysfs_has_data,
+       &sysfs_iodone,
+
+       &sysfs_io_latency_read,
+       &sysfs_io_latency_write,
+       &sysfs_io_latency_stats_read,
+       &sysfs_io_latency_stats_write,
+       &sysfs_congested,
+
+       /* alloc info - other stats: */
+       &sysfs_bucket_quantiles_last_read,
+       &sysfs_bucket_quantiles_last_write,
+       &sysfs_bucket_quantiles_fragmentation,
+       &sysfs_bucket_quantiles_oldest_gen,
+
+       &sysfs_reserve_stats,
+
+       /* debug: */
+       &sysfs_alloc_debug,
+       &sysfs_wake_allocator,
+
+       sysfs_pd_controller_files(copy_gc),
+       NULL
+};
+
+#endif /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
new file mode 100644 (file)
index 0000000..525fd05
--- /dev/null
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SYSFS_H_
+#define _BCACHEFS_SYSFS_H_
+
+#include <linux/sysfs.h>
+
+#ifndef NO_BCACHEFS_SYSFS
+
+struct attribute;
+struct sysfs_ops;
+
+extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_internal_files[];
+extern struct attribute *bch2_fs_opts_dir_files[];
+extern struct attribute *bch2_fs_time_stats_files[];
+extern struct attribute *bch2_dev_files[];
+
+extern struct sysfs_ops bch2_fs_sysfs_ops;
+extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern struct sysfs_ops bch2_dev_sysfs_ops;
+
+int bch2_opts_create_sysfs_files(struct kobject *);
+
+#else
+
+static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_internal_files[] = {};
+static struct attribute *bch2_fs_opts_dir_files[] = {};
+static struct attribute *bch2_fs_time_stats_files[] = {};
+static struct attribute *bch2_dev_files[] = {};
+
+static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+static const struct sysfs_ops bch2_dev_sysfs_ops;
+
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+
+#endif /* NO_BCACHEFS_SYSFS */
+
+#endif  /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
new file mode 100644 (file)
index 0000000..c522fb7
--- /dev/null
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_TESTS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "journal_reclaim.h"
+#include "tests.h"
+
+#include <linux/kthread.h>
+#include <linux/random.h>
+
+static void delete_test_keys(struct bch_fs *c)
+{
+       int ret;
+
+       ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+                                     POS(0, 0), POS(0, U64_MAX),
+                                     ZERO_VERSION, NULL, NULL, NULL);
+       BUG_ON(ret);
+
+       ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+                                     POS(0, 0), POS(0, U64_MAX),
+                                     ZERO_VERSION, NULL, NULL, NULL);
+       BUG_ON(ret);
+}
+
+/* unit tests */
+
+static void test_delete(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_i_cookie k;
+       int ret;
+
+       bkey_cookie_init(&k.k_i);
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
+                            BTREE_ITER_INTENT);
+
+       ret = bch2_btree_iter_traverse(&iter);
+       BUG_ON(ret);
+
+       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                  BTREE_INSERT_ENTRY(&iter, &k.k_i));
+       BUG_ON(ret);
+
+       pr_info("deleting once");
+       ret = bch2_btree_delete_at(&iter, 0);
+       BUG_ON(ret);
+
+       pr_info("deleting twice");
+       ret = bch2_btree_delete_at(&iter, 0);
+       BUG_ON(ret);
+
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void test_delete_written(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_i_cookie k;
+       int ret;
+
+       bkey_cookie_init(&k.k_i);
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
+                            BTREE_ITER_INTENT);
+
+       ret = bch2_btree_iter_traverse(&iter);
+       BUG_ON(ret);
+
+       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                  BTREE_INSERT_ENTRY(&iter, &k.k_i));
+       BUG_ON(ret);
+
+       bch2_journal_flush_all_pins(&c->journal);
+
+       ret = bch2_btree_delete_at(&iter, 0);
+       BUG_ON(ret);
+
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void test_iterate(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 i;
+       int ret;
+
+       delete_test_keys(c);
+
+       pr_info("inserting test keys");
+
+       for (i = 0; i < nr; i++) {
+               struct bkey_i_cookie k;
+
+               bkey_cookie_init(&k.k_i);
+               k.k.p.offset = i;
+
+               ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+
+       pr_info("iterating forwards");
+
+       i = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k)
+               BUG_ON(k.k->p.offset != i++);
+       bch2_btree_iter_unlock(&iter);
+
+       BUG_ON(i != nr);
+
+       pr_info("iterating backwards");
+
+       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
+               BUG_ON(k.k->p.offset != --i);
+       bch2_btree_iter_unlock(&iter);
+
+       BUG_ON(i);
+}
+
+static void test_iterate_extents(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 i;
+       int ret;
+
+       delete_test_keys(c);
+
+       pr_info("inserting test extents");
+
+       for (i = 0; i < nr; i += 8) {
+               struct bkey_i_cookie k;
+
+               bkey_cookie_init(&k.k_i);
+               k.k.p.offset = i + 8;
+               k.k.size = 8;
+
+               ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+
+       pr_info("iterating forwards");
+
+       i = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
+               BUG_ON(bkey_start_offset(k.k) != i);
+               i = k.k->p.offset;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       BUG_ON(i != nr);
+
+       pr_info("iterating backwards");
+
+       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
+               BUG_ON(k.k->p.offset != i);
+               i = bkey_start_offset(k.k);
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       BUG_ON(i);
+}
+
+static void test_iterate_slots(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 i;
+       int ret;
+
+       delete_test_keys(c);
+
+       pr_info("inserting test keys");
+
+       for (i = 0; i < nr; i++) {
+               struct bkey_i_cookie k;
+
+               bkey_cookie_init(&k.k_i);
+               k.k.p.offset = i * 2;
+
+               ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+
+       pr_info("iterating forwards");
+
+       i = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k) {
+               BUG_ON(k.k->p.offset != i);
+               i += 2;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       BUG_ON(i != nr * 2);
+
+       pr_info("iterating forwards by slots");
+
+       i = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0),
+                          BTREE_ITER_SLOTS, k) {
+               BUG_ON(bkey_deleted(k.k) != (i & 1));
+               BUG_ON(k.k->p.offset != i++);
+
+               if (i == nr * 2)
+                       break;
+       }
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 i;
+       int ret;
+
+       delete_test_keys(c);
+
+       pr_info("inserting test keys");
+
+       for (i = 0; i < nr; i += 16) {
+               struct bkey_i_cookie k;
+
+               bkey_cookie_init(&k.k_i);
+               k.k.p.offset = i + 16;
+               k.k.size = 8;
+
+               ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+
+       pr_info("iterating forwards");
+
+       i = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
+               BUG_ON(bkey_start_offset(k.k) != i + 8);
+               BUG_ON(k.k->size != 8);
+               i += 16;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       BUG_ON(i != nr);
+
+       pr_info("iterating forwards by slots");
+
+       i = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0),
+                          BTREE_ITER_SLOTS, k) {
+               BUG_ON(bkey_deleted(k.k) != !(i % 16));
+
+               BUG_ON(bkey_start_offset(k.k) != i);
+               BUG_ON(k.k->size != 8);
+               i = k.k->p.offset;
+
+               if (i == nr)
+                       break;
+       }
+       bch2_btree_iter_unlock(&iter);
+}
+
+/* perf tests */
+
+static u64 test_rand(void)
+{
+       u64 v;
+#if 0
+       v = prandom_u32_max(U32_MAX);
+#else
+       get_random_bytes(&v, sizeof(v));
+#endif
+       return v;
+}
+
+static void rand_insert(struct bch_fs *c, u64 nr)
+{
+       struct bkey_i_cookie k;
+       int ret;
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               bkey_cookie_init(&k.k_i);
+               k.k.p.offset = test_rand();
+
+               ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+}
+
+static void rand_lookup(struct bch_fs *c, u64 nr)
+{
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+
+               bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
+                                    POS(0, test_rand()), 0);
+
+               k = bch2_btree_iter_peek(&iter);
+               bch2_btree_iter_unlock(&iter);
+       }
+}
+
+static void rand_mixed(struct bch_fs *c, u64 nr)
+{
+       int ret;
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+
+               bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
+                                    POS(0, test_rand()), 0);
+
+               k = bch2_btree_iter_peek(&iter);
+
+               if (!(i & 3) && k.k) {
+                       struct bkey_i_cookie k;
+
+                       bkey_cookie_init(&k.k_i);
+                       k.k.p = iter.pos;
+
+                       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                                  BTREE_INSERT_ENTRY(&iter, &k.k_i));
+                       BUG_ON(ret);
+               }
+
+               bch2_btree_iter_unlock(&iter);
+       }
+}
+
+static void rand_delete(struct bch_fs *c, u64 nr)
+{
+       struct bkey_i k;
+       int ret;
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               bkey_init(&k.k);
+               k.k.p.offset = test_rand();
+
+               ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+}
+
+static void seq_insert(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_cookie insert;
+       int ret;
+       u64 i = 0;
+
+       bkey_cookie_init(&insert.k_i);
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
+                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
+               insert.k.p = iter.pos;
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                               BTREE_INSERT_ENTRY(&iter, &insert.k_i));
+               BUG_ON(ret);
+
+               if (++i == nr)
+                       break;
+       }
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_lookup(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k)
+               ;
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_overwrite(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
+                          BTREE_ITER_INTENT, k) {
+               struct bkey_i_cookie u;
+
+               bkey_reassemble(&u.k_i, k);
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                          BTREE_INSERT_ENTRY(&iter, &u.k_i));
+               BUG_ON(ret);
+       }
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_delete(struct bch_fs *c, u64 nr)
+{
+       int ret;
+
+       ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+                                     POS(0, 0), POS(0, U64_MAX),
+                                     ZERO_VERSION, NULL, NULL, NULL);
+       BUG_ON(ret);
+}
+
+typedef void (*perf_test_fn)(struct bch_fs *, u64);
+
+struct test_job {
+       struct bch_fs                   *c;
+       u64                             nr;
+       unsigned                        nr_threads;
+       perf_test_fn                    fn;
+
+       atomic_t                        ready;
+       wait_queue_head_t               ready_wait;
+
+       atomic_t                        done;
+       struct completion               done_completion;
+
+       u64                             start;
+       u64                             finish;
+};
+
+static int btree_perf_test_thread(void *data)
+{
+       struct test_job *j = data;
+
+       if (atomic_dec_and_test(&j->ready)) {
+               wake_up(&j->ready_wait);
+               j->start = sched_clock();
+       } else {
+               wait_event(j->ready_wait, !atomic_read(&j->ready));
+       }
+
+       j->fn(j->c, j->nr / j->nr_threads);
+
+       if (atomic_dec_and_test(&j->done)) {
+               j->finish = sched_clock();
+               complete(&j->done_completion);
+       }
+
+       return 0;
+}
+
+void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+                         u64 nr, unsigned nr_threads)
+{
+       struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
+       char name_buf[20], nr_buf[20], per_sec_buf[20];
+       unsigned i;
+       u64 time;
+
+       atomic_set(&j.ready, nr_threads);
+       init_waitqueue_head(&j.ready_wait);
+
+       atomic_set(&j.done, nr_threads);
+       init_completion(&j.done_completion);
+
+#define perf_test(_test)                               \
+       if (!strcmp(testname, #_test)) j.fn = _test
+
+       perf_test(rand_insert);
+       perf_test(rand_lookup);
+       perf_test(rand_mixed);
+       perf_test(rand_delete);
+
+       perf_test(seq_insert);
+       perf_test(seq_lookup);
+       perf_test(seq_overwrite);
+       perf_test(seq_delete);
+
+       /* unit tests, not perf tests: */
+       perf_test(test_delete);
+       perf_test(test_delete_written);
+       perf_test(test_iterate);
+       perf_test(test_iterate_extents);
+       perf_test(test_iterate_slots);
+       perf_test(test_iterate_slots_extents);
+
+       if (!j.fn) {
+               pr_err("unknown test %s\n", testname);
+               return;
+       }
+
+       //pr_info("running test %s:", testname);
+
+       if (nr_threads == 1)
+               btree_perf_test_thread(&j);
+       else
+               for (i = 0; i < nr_threads; i++)
+                       kthread_run(btree_perf_test_thread, &j,
+                                   "bcachefs perf test[%u]", i);
+
+       while (wait_for_completion_interruptible(&j.done_completion))
+               ;
+
+       time = j.finish - j.start;
+
+       scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+       bch2_hprint(nr_buf, nr);
+       bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time);
+       printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+               name_buf, nr_buf, nr_threads,
+               time / NSEC_PER_SEC,
+               time * nr_threads / nr,
+               per_sec_buf);
+}
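+
+/*
+ * Hedged usage sketch: a test is started by writing "<name> <nr> <threads>"
+ * to the perf_test sysfs attribute (exact path is an assumption, typically
+ * under the filesystem's sysfs directory), e.g.
+ *
+ *	echo "rand_insert 1M 4" > /sys/fs/bcachefs/<uuid>/perf_test
+ *
+ * nr is parsed with bch2_strtoull_h(), so human-readable size suffixes
+ * should be accepted.
+ */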
+
+#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
new file mode 100644 (file)
index 0000000..551d076
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TEST_H
+#define _BCACHEFS_TEST_H
+
+struct bch_fs;
+
+#ifdef CONFIG_BCACHEFS_TESTS
+
+void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+
+#else
+
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
new file mode 100644 (file)
index 0000000..b770973
--- /dev/null
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "btree_types.h"
+#include "keylist.h"
+
+#include <linux/blktrace_api.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
new file mode 100644 (file)
index 0000000..d0b99c6
--- /dev/null
@@ -0,0 +1,536 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcachefs
+
+#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BCACHEFS_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(bpos,
+       TP_PROTO(struct bpos *p),
+       TP_ARGS(p),
+
+       TP_STRUCT__entry(
+               __field(u64,    inode                           )
+               __field(u64,    offset                          )
+       ),
+
+       TP_fast_assign(
+               __entry->inode  = p->inode;
+               __entry->offset = p->offset;
+       ),
+
+       TP_printk("%llu:%llu", __entry->inode, __entry->offset)
+);
+
+DECLARE_EVENT_CLASS(bkey,
+       TP_PROTO(const struct bkey *k),
+       TP_ARGS(k),
+
+       TP_STRUCT__entry(
+               __field(u64,    inode                           )
+               __field(u64,    offset                          )
+               __field(u32,    size                            )
+       ),
+
+       TP_fast_assign(
+               __entry->inode  = k->p.inode;
+               __entry->offset = k->p.offset;
+               __entry->size   = k->size;
+       ),
+
+       TP_printk("%llu:%llu len %u", __entry->inode,
+                 __entry->offset, __entry->size)
+);
+
+DECLARE_EVENT_CLASS(bch_dev,
+       TP_PROTO(struct bch_dev *ca),
+       TP_ARGS(ca),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16      )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, ca->uuid.b, 16);
+       ),
+
+       TP_printk("%pU", __entry->uuid)
+);
+
+DECLARE_EVENT_CLASS(bch_fs,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16 )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+       ),
+
+       TP_printk("%pU", __entry->uuid)
+);
+
+DECLARE_EVENT_CLASS(bio,
+       TP_PROTO(struct bio *bio),
+       TP_ARGS(bio),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(sector_t,       sector                  )
+               __field(unsigned int,   nr_sector               )
+               __array(char,           rwbs,   6               )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev ? bio_dev(bio) : 0;
+               __entry->sector         = bio->bi_iter.bi_sector;
+               __entry->nr_sector      = bio->bi_iter.bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
+       ),
+
+       TP_printk("%d,%d  %s %llu + %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector, __entry->nr_sector)
+);
+
+/* io.c: */
+
+DEFINE_EVENT(bio, read_split,
+       TP_PROTO(struct bio *bio),
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_bounce,
+       TP_PROTO(struct bio *bio),
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_retry,
+       TP_PROTO(struct bio *bio),
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, promote,
+       TP_PROTO(struct bio *bio),
+       TP_ARGS(bio)
+);
+
+/* Journal */
+
+DEFINE_EVENT(bch_fs, journal_full,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, journal_entry_full,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bio, journal_write,
+       TP_PROTO(struct bio *bio),
+       TP_ARGS(bio)
+);
+
+/* bset.c: */
+
+DEFINE_EVENT(bpos, bkey_pack_pos_fail,
+       TP_PROTO(struct bpos *p),
+       TP_ARGS(p)
+);
+
+/* Btree */
+
+DECLARE_EVENT_CLASS(btree_node,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,           16      )
+               __field(u8,             level                   )
+               __field(u8,             id                      )
+               __field(u64,            inode                   )
+               __field(u64,            offset                  )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->level          = b->level;
+               __entry->id             = b->btree_id;
+               __entry->inode          = b->key.k.p.inode;
+               __entry->offset         = b->key.k.p.offset;
+       ),
+
+       TP_printk("%pU  %u id %u %llu:%llu",
+                 __entry->uuid, __entry->level, __entry->id,
+                 __entry->inode, __entry->offset)
+);
+
+DEFINE_EVENT(btree_node, btree_read,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_write,
+       TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
+       TP_ARGS(b, bytes, sectors),
+
+       TP_STRUCT__entry(
+               __field(enum bkey_type, type)
+               __field(unsigned,       bytes                   )
+               __field(unsigned,       sectors                 )
+       ),
+
+       TP_fast_assign(
+               __entry->type   = btree_node_type(b);
+               __entry->bytes  = bytes;
+               __entry->sectors = sectors;
+       ),
+
+       TP_printk("bkey type %u bytes %u sectors %u",
+                 __entry->type, __entry->bytes, __entry->sectors)
+);
+
+DEFINE_EVENT(btree_node, btree_node_alloc,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_free,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_reap,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c),
+
+       TP_STRUCT__entry(
+               __array(char,                   uuid,   16      )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+       ),
+
+       TP_printk("%pU", __entry->uuid)
+);
+
+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+TRACE_EVENT(btree_reserve_get_fail,
+       TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl),
+       TP_ARGS(c, required, cl),
+
+       TP_STRUCT__entry(
+               __array(char,                   uuid,   16      )
+               __field(size_t,                 required        )
+               __field(struct closure *,       cl              )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->required = required;
+               __entry->cl = cl;
+       ),
+
+       TP_printk("%pU required %zu by %p", __entry->uuid,
+                 __entry->required, __entry->cl)
+);
+
+TRACE_EVENT(btree_insert_key,
+       TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k),
+       TP_ARGS(c, b, k),
+
+       TP_STRUCT__entry(
+               __field(u8,             id                      )
+               __field(u64,            inode                   )
+               __field(u64,            offset                  )
+               __field(u32,            size                    )
+       ),
+
+       TP_fast_assign(
+               __entry->id             = b->btree_id;
+               __entry->inode          = k->k.p.inode;
+               __entry->offset         = k->k.p.offset;
+               __entry->size           = k->k.size;
+       ),
+
+       TP_printk("btree %u: %llu:%llu len %u", __entry->id,
+                 __entry->inode, __entry->offset, __entry->size)
+);
+
+DEFINE_EVENT(btree_node, btree_split,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_compact,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_merge,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_set_root,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+/* Garbage collection */
+
+DEFINE_EVENT(btree_node, btree_gc_coalesce,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_gc_coalesce_fail,
+       TP_PROTO(struct bch_fs *c, int reason),
+       TP_ARGS(c, reason),
+
+       TP_STRUCT__entry(
+               __field(u8,             reason                  )
+               __array(char,           uuid,   16              )
+       ),
+
+       TP_fast_assign(
+               __entry->reason         = reason;
+               memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16);
+       ),
+
+       TP_printk("%pU: %u", __entry->uuid, __entry->reason)
+);
+
+DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(bch_fs, gc_start,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_end,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_coalesce_start,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_coalesce_end,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_dev, sectors_saturated,
+       TP_PROTO(struct bch_dev *ca),
+       TP_ARGS(ca)
+);
+
+DEFINE_EVENT(bch_fs, gc_sectors_saturated,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+/* Allocator */
+
+TRACE_EVENT(alloc_batch,
+       TP_PROTO(struct bch_dev *ca, size_t free, size_t total),
+       TP_ARGS(ca, free, total),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16      )
+               __field(size_t,         free            )
+               __field(size_t,         total           )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, ca->uuid.b, 16);
+               __entry->free = free;
+               __entry->total = total;
+       ),
+
+       TP_printk("%pU free %zu total %zu",
+               __entry->uuid, __entry->free, __entry->total)
+);
+
+TRACE_EVENT(invalidate,
+       TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
+       TP_ARGS(ca, offset, sectors),
+
+       TP_STRUCT__entry(
+               __field(unsigned,       sectors                 )
+               __field(dev_t,          dev                     )
+               __field(__u64,          offset                  )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = ca->disk_sb.bdev->bd_dev;
+               __entry->offset         = offset;
+               __entry->sectors        = sectors;
+       ),
+
+       TP_printk("invalidated %u sectors at %d,%d sector=%llu",
+                 __entry->sectors, MAJOR(__entry->dev),
+                 MINOR(__entry->dev), __entry->offset)
+);
+
+DEFINE_EVENT(bch_fs, rescale_prios,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DECLARE_EVENT_CLASS(bucket_alloc,
+       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
+       TP_ARGS(ca, reserve),
+
+       TP_STRUCT__entry(
+               __array(char,                   uuid,   16)
+               __field(enum alloc_reserve,     reserve   )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, ca->uuid.b, 16);
+               __entry->reserve = reserve;
+       ),
+
+       TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc,
+       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
+       TP_ARGS(ca, reserve)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
+       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
+       TP_ARGS(ca, reserve)
+);
+
+DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
+       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
+       TP_ARGS(ca, reserve)
+);
+
+/* Moving IO */
+
+DEFINE_EVENT(bkey, move_extent,
+       TP_PROTO(const struct bkey *k),
+       TP_ARGS(k)
+);
+
+DEFINE_EVENT(bkey, move_alloc_fail,
+       TP_PROTO(const struct bkey *k),
+       TP_ARGS(k)
+);
+
+DEFINE_EVENT(bkey, move_race,
+       TP_PROTO(const struct bkey *k),
+       TP_ARGS(k)
+);
+
+TRACE_EVENT(move_data,
+       TP_PROTO(struct bch_fs *c, u64 sectors_moved,
+                u64 keys_moved),
+       TP_ARGS(c, sectors_moved, keys_moved),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16      )
+               __field(u64,            sectors_moved   )
+               __field(u64,            keys_moved      )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->sectors_moved = sectors_moved;
+               __entry->keys_moved = keys_moved;
+       ),
+
+       TP_printk("%pU sectors_moved %llu keys_moved %llu",
+               __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
+);
+
+TRACE_EVENT(copygc,
+       TP_PROTO(struct bch_dev *ca,
+                u64 sectors_moved, u64 sectors_not_moved,
+                u64 buckets_moved, u64 buckets_not_moved),
+       TP_ARGS(ca,
+               sectors_moved, sectors_not_moved,
+               buckets_moved, buckets_not_moved),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16              )
+               __field(u64,            sectors_moved           )
+               __field(u64,            sectors_not_moved       )
+               __field(u64,            buckets_moved           )
+               __field(u64,            buckets_not_moved       )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, ca->uuid.b, 16);
+               __entry->sectors_moved          = sectors_moved;
+               __entry->sectors_not_moved      = sectors_not_moved;
+               __entry->buckets_moved          = buckets_moved;
+               __entry->buckets_not_moved      = buckets_not_moved;
+       ),
+
+       TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
+               __entry->uuid,
+               __entry->sectors_moved, __entry->sectors_not_moved,
+               __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+#endif /* _TRACE_BCACHEFS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../fs/bcachefs
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
new file mode 100644 (file)
index 0000000..6666c3a
--- /dev/null
@@ -0,0 +1,942 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/log2.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/sched/clock.h>
+
+#include "eytzinger.h"
+#include "util.h"
+
+#define simple_strtoint(c, end, base)  simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
+
+static const char si_units[] = "?kMGTPEZY";
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+                        u64 t_max, bool t_signed)
+{
+       bool positive = *cp != '-';
+       unsigned u;
+       u64 v = 0;
+
+       if (*cp == '+' || *cp == '-')
+               cp++;
+
+       if (!isdigit(*cp))
+               return -EINVAL;
+
+       do {
+               if (v > U64_MAX / 10)
+                       return -ERANGE;
+               v *= 10;
+               if (v > U64_MAX - (*cp - '0'))
+                       return -ERANGE;
+               v += *cp - '0';
+               cp++;
+       } while (isdigit(*cp));
+
+       for (u = 1; u < strlen(si_units); u++)
+               if (*cp == si_units[u]) {
+                       cp++;
+                       goto got_unit;
+               }
+       u = 0;
+got_unit:
+       if (*cp == '\n')
+               cp++;
+       if (*cp)
+               return -EINVAL;
+
+       if (fls64(v) + u * 10 > 64)
+               return -ERANGE;
+
+       v <<= u * 10;
+
+       if (positive) {
+               if (v > t_max)
+                       return -ERANGE;
+       } else {
+               if (v && !t_signed)
+                       return -ERANGE;
+
+               if (v > t_max + 1)
+                       return -ERANGE;
+               v = -v;
+       }
+
+       *res = v;
+       return 0;
+}
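+
+/*
+ * Worked example (illustrative): __bch2_strtoh() accepts an optional sign, a
+ * decimal number and an optional SI suffix from si_units[], so "1M" parses to
+ * 1 << 20 == 1048576, and "-2k" parses to -2048 when the target type is
+ * signed.
+ */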
+
+#define STRTO_H(name, type)                                    \
+int bch2_ ## name ## _h(const char *cp, type *res)             \
+{                                                              \
+       u64 v;                                                  \
+       int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type),      \
+                       ANYSINT_MAX(type) != ((type) ~0ULL));   \
+       *res = v;                                               \
+       return ret;                                             \
+}
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+
+ssize_t bch2_hprint(char *buf, s64 v)
+{
+       char dec[4] = "";
+       int u, t = 0;
+
+       for (u = 0; v >= 1024 || v <= -1024; u++) {
+               t = v & ~(~0U << 10);
+               v >>= 10;
+       }
+
+       if (!u)
+               return sprintf(buf, "%lli", v);
+
+       /*
+        * 103 is magic: t is in the range [-1023, 1023] and we want
+        * to turn it into [-9, 9]
+        */
+       if (v < 100 && v > -100)
+               scnprintf(dec, sizeof(dec), ".%i", t / 103);
+
+       return sprintf(buf, "%lli%s%c", v, dec, si_units[u]);
+}
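+
+/*
+ * Worked example (illustrative): bch2_hprint(buf, 1536) shifts down by one
+ * unit (v = 1, t = 512, u = 1) and prints the remainder as a single decimal
+ * digit, 512 / 103 == 4, giving "1.4k".
+ */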
+
+ssize_t bch2_scnprint_string_list(char *buf, size_t size,
+                                 const char * const list[],
+                                 size_t selected)
+{
+       char *out = buf;
+       size_t i;
+
+       if (size)
+               *out = '\0';
+
+       for (i = 0; list[i]; i++)
+               out += scnprintf(out, buf + size - out,
+                                i == selected ? "[%s] " : "%s ", list[i]);
+
+       if (out != buf)
+               *--out = '\0';
+
+       return out - buf;
+}
+
+ssize_t bch2_scnprint_flag_list(char *buf, size_t size,
+                               const char * const list[], u64 flags)
+{
+       char *out = buf, *end = buf + size;
+       unsigned bit, nr = 0;
+
+       while (list[nr])
+               nr++;
+
+       if (size)
+               *out = '\0';
+
+       while (flags && (bit = __ffs(flags)) < nr) {
+               out += scnprintf(out, end - out, "%s,", list[bit]);
+               flags ^= 1ULL << bit;
+       }
+
+       if (out != buf)
+               *--out = '\0';
+
+       return out - buf;
+}
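+
+/*
+ * Example (illustrative): with list = { "foo", "bar", "baz", NULL } and
+ * flags = 0x5 (bits 0 and 2 set), bch2_scnprint_flag_list() produces
+ * "foo,baz" - the trailing separator is overwritten by the terminating NUL.
+ */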
+
+u64 bch2_read_flag_list(char *opt, const char * const list[])
+{
+       u64 ret = 0;
+       char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL);
+
+       if (!d)
+               return -ENOMEM;
+
+       s = strim(d);
+
+       while ((p = strsep(&s, ","))) {
+               int flag = match_string(list, -1, p);
+               if (flag < 0) {
+                       ret = -1;
+                       break;
+               }
+
+               ret |= 1ULL << flag;
+       }
+
+       kfree(d);
+
+       return ret;
+}
+
+bool bch2_is_zero(const void *_p, size_t n)
+{
+       const char *p = _p;
+       size_t i;
+
+       for (i = 0; i < n; i++)
+               if (p[i])
+                       return false;
+       return true;
+}
+
+/* time stats: */
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
+{
+       unsigned i = 0;
+
+       while (i < ARRAY_SIZE(q->entries)) {
+               struct bch2_quantile_entry *e = q->entries + i;
+
+               if (unlikely(!e->step)) {
+                       e->m = v;
+                       e->step = max_t(unsigned, v / 2, 1024);
+               } else if (e->m > v) {
+                       e->m = e->m >= e->step
+                               ? e->m - e->step
+                               : 0;
+               } else if (e->m < v) {
+                       e->m = e->m + e->step > e->m
+                               ? e->m + e->step
+                               : U32_MAX;
+               }
+
+               if ((e->m > v ? e->m - v : v - e->m) < e->step)
+                       e->step = max_t(unsigned, e->step / 2, 1);
+
+               if (v >= e->m)
+                       break;
+
+               i = eytzinger0_child(i, v > e->m);
+       }
+}
+
+static void bch2_time_stats_update_one(struct bch2_time_stats *stats,
+                                      u64 start, u64 end)
+{
+       u64 duration, freq;
+
+       duration        = time_after64(end, start)
+               ? end - start : 0;
+       freq            = time_after64(end, stats->last_event)
+               ? end - stats->last_event : 0;
+
+       stats->count++;
+
+       stats->average_duration = stats->average_duration
+               ? ewma_add(stats->average_duration, duration, 6)
+               : duration;
+
+       stats->average_frequency = stats->average_frequency
+               ? ewma_add(stats->average_frequency, freq, 6)
+               : freq;
+
+       stats->max_duration = max(stats->max_duration, duration);
+
+       stats->last_event = end;
+
+       bch2_quantiles_update(&stats->quantiles, duration);
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+       unsigned long flags;
+
+       if (!stats->buffer) {
+               spin_lock_irqsave(&stats->lock, flags);
+               bch2_time_stats_update_one(stats, start, end);
+
+               if (stats->average_frequency < 32 &&
+                   stats->count > 1024)
+                       stats->buffer =
+                               alloc_percpu_gfp(struct bch2_time_stat_buffer,
+                                                GFP_ATOMIC);
+               spin_unlock_irqrestore(&stats->lock, flags);
+       } else {
+               struct bch2_time_stat_buffer_entry *i;
+               struct bch2_time_stat_buffer *b;
+
+               preempt_disable();
+               b = this_cpu_ptr(stats->buffer);
+
+               BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+               b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
+                       .start = start,
+                       .end = end
+               };
+
+               if (b->nr == ARRAY_SIZE(b->entries)) {
+                       spin_lock_irqsave(&stats->lock, flags);
+                       for (i = b->entries;
+                            i < b->entries + ARRAY_SIZE(b->entries);
+                            i++)
+                               bch2_time_stats_update_one(stats, i->start, i->end);
+                       spin_unlock_irqrestore(&stats->lock, flags);
+
+                       b->nr = 0;
+               }
+
+               preempt_enable();
+       }
+}
+#endif
+
+static const struct time_unit {
+       const char      *name;
+       u32             nsecs;
+} time_units[] = {
+       { "ns",         1               },
+       { "us",         NSEC_PER_USEC   },
+       { "ms",         NSEC_PER_MSEC   },
+       { "sec",        NSEC_PER_SEC    },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+       const struct time_unit *u;
+
+       for (u = time_units;
+            u + 1 < time_units + ARRAY_SIZE(time_units) &&
+            ns >= u[1].nsecs << 1;
+            u++)
+               ;
+
+       return u;
+}
+
+static size_t pr_time_units(char *buf, size_t len, u64 ns)
+{
+       const struct time_unit *u = pick_time_units(ns);
+
+       return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
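+
+/*
+ * Example (illustrative): pick_time_units() only moves up to the next unit
+ * once the value is at least twice that unit, so 1500000 ns is printed by
+ * pr_time_units() as "1500 us" rather than "1 ms".
+ */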
+
+size_t bch2_time_stats_print(struct bch2_time_stats *stats, char *buf, size_t len)
+{
+       char *out = buf, *end = buf + len;
+       const struct time_unit *u;
+       u64 freq = READ_ONCE(stats->average_frequency);
+       u64 q, last_q = 0;
+       int i;
+
+       out += scnprintf(out, end - out, "count:\t\t%llu\n",
+                        stats->count);
+       out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n",
+                        freq ?  div64_u64(NSEC_PER_SEC, freq) : 0);
+
+       out += scnprintf(out, end - out, "frequency:\t");
+       out += pr_time_units(out, end - out, freq);
+
+       out += scnprintf(out, end - out, "\navg duration:\t");
+       out += pr_time_units(out, end - out, stats->average_duration);
+
+       out += scnprintf(out, end - out, "\nmax duration:\t");
+       out += pr_time_units(out, end - out, stats->max_duration);
+
+       i = eytzinger0_first(NR_QUANTILES);
+       u = pick_time_units(stats->quantiles.entries[i].m);
+
+       out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name);
+       eytzinger0_for_each(i, NR_QUANTILES) {
+               bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+               q = max(stats->quantiles.entries[i].m, last_q);
+               out += scnprintf(out, end - out, "%llu%s",
+                                div_u64(q, u->nsecs),
+                                is_last ? "\n" : " ");
+               last_q = q;
+       }
+
+       return out - buf;
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+       free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+       memset(stats, 0, sizeof(*stats));
+       spin_lock_init(&stats->lock);
+}
+
+/* ratelimit: */
+
+/**
+ * bch2_ratelimit_delay() - return how long to delay until the next time to do
+ * some work
+ *
+ * @d: the struct bch_ratelimit to update
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
+{
+       u64 now = local_clock();
+
+       return time_after64(d->next, now)
+               ? nsecs_to_jiffies(d->next - now)
+               : 0;
+}
+
+/**
+ * bch2_ratelimit_increment() - increment @d by the amount of work done
+ *
+ * @d:    the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
+ */
+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+       u64 now = local_clock();
+
+       d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+       if (time_before64(now + NSEC_PER_SEC, d->next))
+               d->next = now + NSEC_PER_SEC;
+
+       if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+               d->next = now - NSEC_PER_SEC * 2;
+}
+
+int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
+{
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
+
+       while (1) {
+               u64 delay = bch2_ratelimit_delay(d);
+
+               if (delay)
+                       set_current_state(TASK_INTERRUPTIBLE);
+
+               if (kthread && kthread_should_stop())
+                       return 1;
+
+               if (!delay)
+                       return 0;
+
+               schedule_timeout(delay);
+               try_to_freeze();
+       }
+}
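+
+/*
+ * Typical usage (sketch, not taken from a real caller): set d->rate to the
+ * desired throughput in units per second, then for every chunk of work
+ *
+ *     bch2_ratelimit_increment(d, done);
+ *     if (bch2_ratelimit_wait_freezable_stoppable(d))
+ *             return;
+ *
+ * where a nonzero return means the calling kthread was asked to stop.
+ */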
+
+/* pd controller: */
+
+/*
+ * Updates pd_controller. Attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ */
+void bch2_pd_controller_update(struct bch_pd_controller *pd,
+                             s64 target, s64 actual, int sign)
+{
+       s64 proportional, derivative, change;
+
+       unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
+
+       if (seconds_since_update == 0)
+               return;
+
+       pd->last_update = jiffies;
+
+       proportional = actual - target;
+       proportional *= seconds_since_update;
+       proportional = div_s64(proportional, pd->p_term_inverse);
+
+       derivative = actual - pd->last_actual;
+       derivative = div_s64(derivative, seconds_since_update);
+       derivative = ewma_add(pd->smoothed_derivative, derivative,
+                             (pd->d_term / seconds_since_update) ?: 1);
+       derivative = derivative * pd->d_term;
+       derivative = div_s64(derivative, pd->p_term_inverse);
+
+       change = proportional + derivative;
+
+       /* Don't increase rate if not keeping up */
+       if (change > 0 &&
+           pd->backpressure &&
+           time_after64(local_clock(),
+                        pd->rate.next + NSEC_PER_MSEC))
+               change = 0;
+
+       change *= (sign * -1);
+
+       pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
+                               1, UINT_MAX);
+
+       pd->last_actual         = actual;
+       pd->last_derivative     = derivative;
+       pd->last_proportional   = proportional;
+       pd->last_change         = change;
+       pd->last_target         = target;
+}
+
+void bch2_pd_controller_init(struct bch_pd_controller *pd)
+{
+       pd->rate.rate           = 1024;
+       pd->last_update         = jiffies;
+       pd->p_term_inverse      = 6000;
+       pd->d_term              = 30;
+       pd->d_smooth            = pd->d_term;
+       pd->backpressure        = 1;
+}
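+
+/*
+ * Usage sketch (illustrative, not a real caller): after
+ * bch2_pd_controller_init(&pd), a user that wants @actual to track @target
+ * periodically calls
+ *
+ *     bch2_pd_controller_update(&pd, target, actual, 1);
+ *
+ * and paces its work with pd.rate via the bch_ratelimit helpers above;
+ * sign = 1 means that doing work faster makes @actual go up.
+ */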
+
+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+{
+       /* 2^64 - 1 is 20 digits, plus null byte */
+       char rate[21];
+       char actual[21];
+       char target[21];
+       char proportional[21];
+       char derivative[21];
+       char change[21];
+       s64 next_io;
+
+       bch2_hprint(rate,       pd->rate.rate);
+       bch2_hprint(actual,     pd->last_actual);
+       bch2_hprint(target,     pd->last_target);
+       bch2_hprint(proportional, pd->last_proportional);
+       bch2_hprint(derivative, pd->last_derivative);
+       bch2_hprint(change,     pd->last_change);
+
+       next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
+
+       return sprintf(buf,
+                      "rate:\t\t%s/sec\n"
+                      "target:\t\t%s\n"
+                      "actual:\t\t%s\n"
+                      "proportional:\t%s\n"
+                      "derivative:\t%s\n"
+                      "change:\t\t%s/sec\n"
+                      "next io:\t%llims\n",
+                      rate, target, actual, proportional,
+                      derivative, change, next_io);
+}
+
+/* misc: */
+
+void bch2_bio_map(struct bio *bio, void *base)
+{
+       size_t size = bio->bi_iter.bi_size;
+       struct bio_vec *bv = bio->bi_io_vec;
+
+       BUG_ON(!bio->bi_iter.bi_size);
+       BUG_ON(bio->bi_vcnt);
+
+       bv->bv_offset = base ? offset_in_page(base) : 0;
+       goto start;
+
+       for (; size; bio->bi_vcnt++, bv++) {
+               bv->bv_offset   = 0;
+start:         bv->bv_len      = min_t(size_t, PAGE_SIZE - bv->bv_offset,
+                                       size);
+               BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
+               if (base) {
+                       bv->bv_page = is_vmalloc_addr(base)
+                               ? vmalloc_to_page(base)
+                               : virt_to_page(base);
+
+                       base += bv->bv_len;
+               }
+
+               size -= bv->bv_len;
+       }
+}
+
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
+{
+       while (size) {
+               struct page *page = alloc_pages(gfp_mask, 0);
+               unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+               if (!page)
+                       return -ENOMEM;
+
+               if (unlikely(!bio_add_page(bio, page, len, 0))) {
+                       __free_page(page);
+                       break;
+               }
+
+               size -= len;
+       }
+
+       return 0;
+}
+
+size_t bch2_rand_range(size_t max)
+{
+       size_t rand;
+
+       if (!max)
+               return 0;
+
+       do {
+               rand = get_random_long();
+               rand &= roundup_pow_of_two(max) - 1;
+       } while (rand >= max);
+
+       return rand;
+}
+
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+{
+       struct bio_vec bv;
+       struct bvec_iter iter;
+
+       __bio_for_each_segment(bv, dst, iter, dst_iter) {
+               void *dstp = kmap_atomic(bv.bv_page);
+               memcpy(dstp + bv.bv_offset, src, bv.bv_len);
+               kunmap_atomic(dstp);
+
+               src += bv.bv_len;
+       }
+}
+
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{
+       struct bio_vec bv;
+       struct bvec_iter iter;
+
+       __bio_for_each_segment(bv, src, iter, src_iter) {
+               void *srcp = kmap_atomic(bv.bv_page);
+               memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
+               kunmap_atomic(srcp);
+
+               dst += bv.bv_len;
+       }
+}
+
+size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len)
+{
+       size_t n;
+
+       if (!size)
+               return 0;
+
+       n = min(size - 1, len);
+       memcpy(buf, src, n);
+       buf[n] = '\0';
+
+       return n;
+}
+
+#include "eytzinger.h"
+
+static int alignment_ok(const void *base, size_t align)
+{
+       return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+               ((unsigned long)base & (align - 1)) == 0;
+}
+
+static void u32_swap(void *a, void *b, size_t size)
+{
+       u32 t = *(u32 *)a;
+       *(u32 *)a = *(u32 *)b;
+       *(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, size_t size)
+{
+       u64 t = *(u64 *)a;
+       *(u64 *)a = *(u64 *)b;
+       *(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, size_t size)
+{
+       char t;
+
+       do {
+               t = *(char *)a;
+               *(char *)a++ = *(char *)b;
+               *(char *)b++ = t;
+       } while (--size > 0);
+}
+
+static inline int do_cmp(void *base, size_t n, size_t size,
+                        int (*cmp_func)(const void *, const void *, size_t),
+                        size_t l, size_t r)
+{
+       return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
+                       base + inorder_to_eytzinger0(r, n) * size,
+                       size);
+}
+
+static inline void do_swap(void *base, size_t n, size_t size,
+                          void (*swap_func)(void *, void *, size_t),
+                          size_t l, size_t r)
+{
+       swap_func(base + inorder_to_eytzinger0(l, n) * size,
+                 base + inorder_to_eytzinger0(r, n) * size,
+                 size);
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+                    int (*cmp_func)(const void *, const void *, size_t),
+                    void (*swap_func)(void *, void *, size_t))
+{
+       int i, c, r;
+
+       if (!swap_func) {
+               if (size == 4 && alignment_ok(base, 4))
+                       swap_func = u32_swap;
+               else if (size == 8 && alignment_ok(base, 8))
+                       swap_func = u64_swap;
+               else
+                       swap_func = generic_swap;
+       }
+
+       /* heapify */
+       for (i = n / 2 - 1; i >= 0; --i) {
+               for (r = i; r * 2 + 1 < n; r = c) {
+                       c = r * 2 + 1;
+
+                       if (c + 1 < n &&
+                           do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+                               c++;
+
+                       if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+                               break;
+
+                       do_swap(base, n, size, swap_func, r, c);
+               }
+       }
+
+       /* sort */
+       for (i = n - 1; i > 0; --i) {
+               do_swap(base, n, size, swap_func, 0, i);
+
+               for (r = 0; r * 2 + 1 < i; r = c) {
+                       c = r * 2 + 1;
+
+                       if (c + 1 < i &&
+                           do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+                               c++;
+
+                       if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+                               break;
+
+                       do_swap(base, n, size, swap_func, r, c);
+               }
+       }
+}
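+
+/*
+ * Example (illustrative): eytzinger0_sort() leaves the array in 0-based
+ * eytzinger (BFS) order rather than plain sorted order; sorting the values
+ * 0..6 yields the layout { 3, 1, 5, 0, 2, 4, 6 }, i.e. an implicit binary
+ * search tree with the median at index 0.
+ */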
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+         int (*cmp_func)(const void *, const void *, size_t),
+         void (*swap_func)(void *, void *, size_t size))
+{
+       /* pre-scale counters for performance */
+       int i = (num/2 - 1) * size, n = num * size, c, r;
+
+       if (!swap_func) {
+               if (size == 4 && alignment_ok(base, 4))
+                       swap_func = u32_swap;
+               else if (size == 8 && alignment_ok(base, 8))
+                       swap_func = u64_swap;
+               else
+                       swap_func = generic_swap;
+       }
+
+       /* heapify */
+       for ( ; i >= 0; i -= size) {
+               for (r = i; r * 2 + size < n; r  = c) {
+                       c = r * 2 + size;
+                       if (c < n - size &&
+                           cmp_func(base + c, base + c + size, size) < 0)
+                               c += size;
+                       if (cmp_func(base + r, base + c, size) >= 0)
+                               break;
+                       swap_func(base + r, base + c, size);
+               }
+       }
+
+       /* sort */
+       for (i = n - size; i > 0; i -= size) {
+               swap_func(base, base + i, size);
+               for (r = 0; r * 2 + size < i; r = c) {
+                       c = r * 2 + size;
+                       if (c < i - size &&
+                           cmp_func(base + c, base + c + size, size) < 0)
+                               c += size;
+                       if (cmp_func(base + r, base + c, size) >= 0)
+                               break;
+                       swap_func(base + r, base + c, size);
+               }
+       }
+}
+
+static void mempool_free_vp(void *element, void *pool_data)
+{
+       size_t size = (size_t) pool_data;
+
+       vpfree(element, size);
+}
+
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+       size_t size = (size_t) pool_data;
+
+       return vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+       return size < PAGE_SIZE
+               ? mempool_init_kmalloc_pool(pool, min_nr, size)
+               : mempool_init(pool, min_nr, mempool_alloc_vp,
+                              mempool_free_vp, (void *) size);
+}
+
+#if 0
+void eytzinger1_test(void)
+{
+       unsigned inorder, eytz, size;
+
+       pr_info("1 based eytzinger test:");
+
+       for (size = 2;
+            size < 65536;
+            size++) {
+               unsigned extra = eytzinger1_extra(size);
+
+               if (!(size % 4096))
+                       pr_info("tree size %u", size);
+
+               BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
+               BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
+
+               BUG_ON(eytzinger1_prev(eytzinger1_first(size), size)    != 0);
+               BUG_ON(eytzinger1_next(eytzinger1_last(size), size)     != 0);
+
+               inorder = 1;
+               eytzinger1_for_each(eytz, size) {
+                       BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
+                       BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
+                       BUG_ON(eytz != eytzinger1_last(size) &&
+                              eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
+
+                       inorder++;
+               }
+       }
+}
+
+void eytzinger0_test(void)
+{
+
+       unsigned inorder, eytz, size;
+
+       pr_info("0 based eytzinger test:");
+
+       for (size = 1;
+            size < 65536;
+            size++) {
+               unsigned extra = eytzinger0_extra(size);
+
+               if (!(size % 4096))
+                       pr_info("tree size %u", size);
+
+               BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
+               BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
+
+               BUG_ON(eytzinger0_prev(eytzinger0_first(size), size)    != -1);
+               BUG_ON(eytzinger0_next(eytzinger0_last(size), size)     != -1);
+
+               inorder = 0;
+               eytzinger0_for_each(eytz, size) {
+                       BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
+                       BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
+                       BUG_ON(eytz != eytzinger0_last(size) &&
+                              eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
+
+                       inorder++;
+               }
+       }
+}
+
+static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+{
+       const u16 *l = _l, *r = _r;
+
+       return (*l > *r) - (*l < *r);
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+       int i, c1 = -1, c2 = -1;
+       ssize_t r;
+
+       r = eytzinger0_find_le(test_array, nr,
+                              sizeof(test_array[0]),
+                              cmp_u16, &search);
+       if (r >= 0)
+               c1 = test_array[r];
+
+       for (i = 0; i < nr; i++)
+               if (test_array[i] <= search && test_array[i] > c2)
+                       c2 = test_array[i];
+
+       if (c1 != c2) {
+               eytzinger0_for_each(i, nr)
+                       pr_info("[%3u] = %12u", i, test_array[i]);
+               pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
+                       search, r, c1, c2);
+       }
+}
+
+void eytzinger0_find_test(void)
+{
+       unsigned i, nr, allocated = 1 << 12;
+       u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
+
+       for (nr = 1; nr < allocated; nr++) {
+               pr_info("testing %u elems", nr);
+
+               get_random_bytes(test_array, nr * sizeof(test_array[0]));
+               eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
+
+               /* verify array is sorted correctly: */
+               eytzinger0_for_each(i, nr)
+                       BUG_ON(i != eytzinger0_last(nr) &&
+                              test_array[i] > test_array[eytzinger0_next(i, nr)]);
+
+               for (i = 0; i < U16_MAX; i += 1 << 12)
+                       eytzinger0_find_test_val(test_array, nr, i);
+
+               for (i = 0; i < nr; i++) {
+                       eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
+                       eytzinger0_find_test_val(test_array, nr, test_array[i]);
+                       eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
+               }
+       }
+
+       kfree(test_array);
+}
+#endif
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
new file mode 100644 (file)
index 0000000..c0b2612
--- /dev/null
@@ -0,0 +1,737 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_UTIL_H
+#define _BCACHEFS_UTIL_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/closure.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/sched/clock.h>
+#include <linux/llist.h>
+#include <linux/log2.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#define PAGE_SECTOR_SHIFT      (PAGE_SHIFT - 9)
+
+struct closure;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+#define EBUG_ON(cond)          BUG_ON(cond)
+#define atomic_dec_bug(v)      BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i)   BUG_ON(atomic_inc_return(v) <= i)
+#define atomic_sub_bug(i, v)   BUG_ON(atomic_sub_return(i, v) < 0)
+#define atomic_add_bug(i, v)   BUG_ON(atomic_add_return(i, v) < 0)
+#define atomic_long_dec_bug(v)         BUG_ON(atomic_long_dec_return(v) < 0)
+#define atomic_long_sub_bug(i, v)      BUG_ON(atomic_long_sub_return(i, v) < 0)
+#define atomic64_dec_bug(v)    BUG_ON(atomic64_dec_return(v) < 0)
+#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i)
+#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
+#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
+
+#else /* DEBUG */
+
+#define EBUG_ON(cond)
+#define atomic_dec_bug(v)      atomic_dec(v)
+#define atomic_inc_bug(v, i)   atomic_inc(v)
+#define atomic_sub_bug(i, v)   atomic_sub(i, v)
+#define atomic_add_bug(i, v)   atomic_add(i, v)
+#define atomic_long_dec_bug(v)         atomic_long_dec(v)
+#define atomic_long_sub_bug(i, v)      atomic_long_sub(i, v)
+#define atomic64_dec_bug(v)    atomic64_dec(v)
+#define atomic64_inc_bug(v, i) atomic64_inc(v)
+#define atomic64_sub_bug(i, v) atomic64_sub(i, v)
+#define atomic64_add_bug(i, v) atomic64_add(i, v)
+
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define CPU_BIG_ENDIAN         0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define CPU_BIG_ENDIAN         1
+#endif
+
+/* type hackery */
+
+#define type_is_exact(_val, _type)                                     \
+       __builtin_types_compatible_p(typeof(_val), _type)
+
+#define type_is(_val, _type)                                           \
+       (__builtin_types_compatible_p(typeof(_val), _type) ||           \
+        __builtin_types_compatible_p(typeof(_val), const _type))
+
+/* Userspace doesn't align allocations as nicely as the kernel allocators: */
+static inline size_t buf_pages(void *p, size_t len)
+{
+       return DIV_ROUND_UP(len +
+                           ((unsigned long) p & (PAGE_SIZE - 1)),
+                           PAGE_SIZE);
+}
+
+static inline void vpfree(void *p, size_t size)
+{
+       if (is_vmalloc_addr(p))
+               vfree(p);
+       else
+               free_pages((unsigned long) p, get_order(size));
+}
+
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+       return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+                                        get_order(size)) ?:
+               __vmalloc(size, gfp_mask);
+}
+
+static inline void kvpfree(void *p, size_t size)
+{
+       if (size < PAGE_SIZE)
+               kfree(p);
+       else
+               vpfree(p, size);
+}
+
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+       return size < PAGE_SIZE
+               ? kmalloc(size, gfp_mask)
+               : vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
+
+#define HEAP(type)                                                     \
+struct {                                                               \
+       size_t size, used;                                              \
+       type *data;                                                     \
+}
+
+#define DECLARE_HEAP(type, name) HEAP(type) name
+
+#define init_heap(heap, _size, gfp)                                    \
+({                                                                     \
+       (heap)->used = 0;                                               \
+       (heap)->size = (_size);                                         \
+       (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+                                (gfp));                                \
+})
+
+#define free_heap(heap)                                                        \
+do {                                                                   \
+       kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));  \
+       (heap)->data = NULL;                                            \
+} while (0)
+
+#define heap_swap(h, i, j)     swap((h)->data[i], (h)->data[j])
+
+#define heap_peek(h)                                                   \
+({                                                                     \
+       EBUG_ON(!(h)->used);                                            \
+       (h)->data[0];                                                   \
+})
+
+#define heap_full(h)   ((h)->used == (h)->size)
+
+#define heap_sift_down(h, i, cmp)                                      \
+do {                                                                   \
+       size_t _c, _j = i;                                              \
+                                                                       \
+       for (; _j * 2 + 1 < (h)->used; _j = _c) {                       \
+               _c = _j * 2 + 1;                                        \
+               if (_c + 1 < (h)->used &&                               \
+                   cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0)      \
+                       _c++;                                           \
+                                                                       \
+               if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0)          \
+                       break;                                          \
+               heap_swap(h, _c, _j);                                   \
+       }                                                               \
+} while (0)
+
+#define heap_sift_up(h, i, cmp)                                                \
+do {                                                                   \
+       while (i) {                                                     \
+               size_t p = (i - 1) / 2;                                 \
+               if (cmp(h, (h)->data[i], (h)->data[p]) >= 0)            \
+                       break;                                          \
+               heap_swap(h, i, p);                                     \
+               i = p;                                                  \
+       }                                                               \
+} while (0)
+
+#define __heap_add(h, d, cmp)                                          \
+do {                                                                   \
+       size_t _i = (h)->used++;                                        \
+       (h)->data[_i] = d;                                              \
+                                                                       \
+       heap_sift_up(h, _i, cmp);                                       \
+} while (0)
+
+#define heap_add(h, d, cmp)                                            \
+({                                                                     \
+       bool _r = !heap_full(h);                                        \
+       if (_r)                                                         \
+               __heap_add(h, d, cmp);                                  \
+       _r;                                                             \
+})
+
+#define heap_add_or_replace(h, new, cmp)                               \
+do {                                                                   \
+       if (!heap_add(h, new, cmp) &&                                   \
+           cmp(h, new, heap_peek(h)) >= 0) {                           \
+               (h)->data[0] = new;                                     \
+               heap_sift_down(h, 0, cmp);                              \
+       }                                                               \
+} while (0)
+
+#define heap_del(h, i, cmp)                                            \
+do {                                                                   \
+       size_t _i = (i);                                                \
+                                                                       \
+       BUG_ON(_i >= (h)->used);                                        \
+       (h)->used--;                                                    \
+       heap_swap(h, _i, (h)->used);                                    \
+       heap_sift_up(h, _i, cmp);                                       \
+       heap_sift_down(h, _i, cmp);                                     \
+} while (0)
+
+#define heap_pop(h, d, cmp)                                            \
+({                                                                     \
+       bool _r = (h)->used;                                            \
+       if (_r) {                                                       \
+               (d) = (h)->data[0];                                     \
+               heap_del(h, 0, cmp);                                    \
+       }                                                               \
+       _r;                                                             \
+})
+
+#define heap_resort(heap, cmp)                                         \
+do {                                                                   \
+       ssize_t _i;                                                     \
+       for (_i = (ssize_t) (heap)->used / 2 -  1; _i >= 0; --_i)       \
+               heap_sift_down(heap, _i, cmp);                          \
+} while (0)
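+
+/*
+ * Usage sketch (illustrative): with a cmp that takes the heap plus two
+ * elements and returns negative when the first sorts before the second,
+ * e.g.
+ *
+ *     #define cmp_int_heap(h, l, r)   ((l) - (r))
+ *
+ *     DECLARE_HEAP(int, h);
+ *     int v;
+ *
+ *     init_heap(&h, 128, GFP_KERNEL);
+ *     heap_add(&h, 3, cmp_int_heap);
+ *     heap_add(&h, 1, cmp_int_heap);
+ *     heap_pop(&h, v, cmp_int_heap);
+ *
+ * the heap behaves as a min-heap and the first pop yields 1.
+ */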
+
+#define ANYSINT_MAX(t)                                                 \
+       ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+int bch2_strtoint_h(const char *, int *);
+int bch2_strtouint_h(const char *, unsigned int *);
+int bch2_strtoll_h(const char *, long long *);
+int bch2_strtoull_h(const char *, unsigned long long *);
+
+static inline int bch2_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+       return bch2_strtoint_h(cp, (int *) res);
+#else
+       return bch2_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch2_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+       return bch2_strtouint_h(cp, (unsigned int *) res);
+#else
+       return bch2_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res)                                              \
+       ( type_is(*res, int)            ? bch2_strtoint_h(cp, (void *) res)\
+       : type_is(*res, long)           ? bch2_strtol_h(cp, (void *) res)\
+       : type_is(*res, long long)      ? bch2_strtoll_h(cp, (void *) res)\
+       : type_is(*res, unsigned)       ? bch2_strtouint_h(cp, (void *) res)\
+       : type_is(*res, unsigned long)  ? bch2_strtoul_h(cp, (void *) res)\
+       : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
+       : -EINVAL)
+
+#define strtoul_safe(cp, var)                                          \
+({                                                                     \
+       unsigned long _v;                                               \
+       int _r = kstrtoul(cp, 10, &_v);                                 \
+       if (!_r)                                                        \
+               var = _v;                                               \
+       _r;                                                             \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max)                          \
+({                                                                     \
+       unsigned long _v;                                               \
+       int _r = kstrtoul(cp, 10, &_v);                                 \
+       if (!_r)                                                        \
+               var = clamp_t(typeof(var), _v, min, max);               \
+       _r;                                                             \
+})
+
+#define strtoul_safe_restrict(cp, var, min, max)                       \
+({                                                                     \
+       unsigned long _v;                                               \
+       int _r = kstrtoul(cp, 10, &_v);                                 \
+       if (!_r && _v >= min && _v <= max)                              \
+               var = _v;                                               \
+       else                                                            \
+               _r = -EINVAL;                                           \
+       _r;                                                             \
+})
+
+#define snprint(buf, size, var)                                                \
+       snprintf(buf, size,                                             \
+                  type_is(var, int)            ? "%i\n"                \
+                : type_is(var, unsigned)       ? "%u\n"                \
+                : type_is(var, long)           ? "%li\n"               \
+                : type_is(var, unsigned long)  ? "%lu\n"               \
+                : type_is(var, s64)            ? "%lli\n"              \
+                : type_is(var, u64)            ? "%llu\n"              \
+                : type_is(var, char *)         ? "%s\n"                \
+                : "%i\n", var)
+
+ssize_t bch2_hprint(char *buf, s64 v);
+
+bool bch2_is_zero(const void *, size_t);
+
+ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t);
+
+ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
+u64 bch2_read_flag_list(char *, const char * const[]);
+
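+/*
+ * Event/latency statistics: bch2_time_stats tracks an event count, running
+ * averages of event duration and time between events, the maximum duration,
+ * and quantile estimates, buffering recent samples per cpu; all durations
+ * and timestamps are in nanoseconds.
+ */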
+#define NR_QUANTILES   15
+#define QUANTILE_IDX(i)        inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST  eytzinger0_last(NR_QUANTILES)
+
+struct bch2_quantiles {
+       struct bch2_quantile_entry {
+               u64     m;
+               u64     step;
+       }               entries[NR_QUANTILES];
+};
+
+struct bch2_time_stat_buffer {
+       unsigned        nr;
+       struct bch2_time_stat_buffer_entry {
+               u64     start;
+               u64     end;
+       }               entries[32];
+};
+
+struct bch2_time_stats {
+       spinlock_t      lock;
+       u64             count;
+       /* all following fields are in nanoseconds */
+       u64             average_duration;
+       u64             average_frequency;
+       u64             max_duration;
+       u64             last_event;
+       struct bch2_quantiles quantiles;
+
+       struct bch2_time_stat_buffer __percpu *buffer;
+};
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void __bch2_time_stats_update(struct bch2_time_stats *, u64, u64);
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats,
+                                            u64 start, u64 end) {}
+#endif
+
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+       __bch2_time_stats_update(stats, start, local_clock());
+}
+
+size_t bch2_time_stats_print(struct bch2_time_stats *, char *, size_t);
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
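+/*
+ * Exponentially weighted moving average: evaluates to the updated average
+ * (the caller assigns it back), i.e. ewma - ewma/2^weight + val/2^weight.
+ */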
+#define ewma_add(ewma, val, weight)                                    \
+({                                                                     \
+       typeof(ewma) _ewma = (ewma);                                    \
+       typeof(weight) _weight = (weight);                              \
+                                                                       \
+       (((_ewma << _weight) - _ewma) + (val)) >> _weight;              \
+})
+
+struct bch_ratelimit {
+       /* Next time we want to do some work, in nanoseconds */
+       u64                     next;
+
+       /*
+        * Rate at which we want to do work, in units per nanosecond.
+        * The units here correspond to the units passed to
+        * bch2_ratelimit_increment().
+        */
+       unsigned                rate;
+};
+
+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
+{
+       d->next = local_clock();
+}
+
+u64 bch2_ratelimit_delay(struct bch_ratelimit *);
+void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
+int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *);
+
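+/*
+ * Proportional-differential controller: bch2_pd_controller_update() steers
+ * rate.rate toward a target using the error between target and actual value
+ * (scaled by p_term_inverse) plus a smoothed derivative term (d_term,
+ * d_smooth); the last_* fields only export state to sysfs.
+ */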
+struct bch_pd_controller {
+       struct bch_ratelimit    rate;
+       unsigned long           last_update;
+
+       s64                     last_actual;
+       s64                     smoothed_derivative;
+
+       unsigned                p_term_inverse;
+       unsigned                d_smooth;
+       unsigned                d_term;
+
+       /* for exporting to sysfs (no effect on behavior) */
+       s64                     last_derivative;
+       s64                     last_proportional;
+       s64                     last_change;
+       s64                     last_target;
+
+       /*
+        * If true, the rate will not increase if bch2_ratelimit_delay()
+        * is not being called often enough.
+        */
+       bool                    backpressure;
+};
+
+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
+void bch2_pd_controller_init(struct bch_pd_controller *);
+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
+
+#define sysfs_pd_controller_attribute(name)                            \
+       rw_attribute(name##_rate);                                      \
+       rw_attribute(name##_rate_bytes);                                \
+       rw_attribute(name##_rate_d_term);                               \
+       rw_attribute(name##_rate_p_term_inverse);                       \
+       read_attribute(name##_rate_debug)
+
+#define sysfs_pd_controller_files(name)                                        \
+       &sysfs_##name##_rate,                                           \
+       &sysfs_##name##_rate_bytes,                                     \
+       &sysfs_##name##_rate_d_term,                                    \
+       &sysfs_##name##_rate_p_term_inverse,                            \
+       &sysfs_##name##_rate_debug
+
+#define sysfs_pd_controller_show(name, var)                            \
+do {                                                                   \
+       sysfs_hprint(name##_rate,               (var)->rate.rate);      \
+       sysfs_print(name##_rate_bytes,          (var)->rate.rate);      \
+       sysfs_print(name##_rate_d_term,         (var)->d_term);         \
+       sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
+                                                                       \
+       if (attr == &sysfs_##name##_rate_debug)                         \
+               return bch2_pd_controller_print_debug(var, buf);                \
+} while (0)
+
+#define sysfs_pd_controller_store(name, var)                           \
+do {                                                                   \
+       sysfs_strtoul_clamp(name##_rate,                                \
+                           (var)->rate.rate, 1, UINT_MAX);             \
+       sysfs_strtoul_clamp(name##_rate_bytes,                          \
+                           (var)->rate.rate, 1, UINT_MAX);             \
+       sysfs_strtoul(name##_rate_d_term,       (var)->d_term);         \
+       sysfs_strtoul_clamp(name##_rate_p_term_inverse,                 \
+                           (var)->p_term_inverse, 1, INT_MAX);         \
+} while (0)
+
+#define __DIV_SAFE(n, d, zero)                                         \
+({                                                                     \
+       typeof(n) _n = (n);                                             \
+       typeof(d) _d = (d);                                             \
+       _d ? _n / _d : zero;                                            \
+})
+
+#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
+
+#define container_of_or_null(ptr, type, member)                                \
+({                                                                     \
+       typeof(ptr) _ptr = ptr;                                         \
+       _ptr ? container_of(_ptr, type, member) : NULL;                 \
+})
+
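+/*
+ * Generic rb-tree helpers: @member is the rb_node field embedded in the
+ * element type, @cmp compares two element pointers.  RB_INSERT returns 0 on
+ * success or -1 if an equal element already exists; RB_SEARCH returns the
+ * matching element or NULL; RB_GREATER returns the first element greater
+ * than @search.
+ */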
+#define RB_INSERT(root, new, member, cmp)                              \
+({                                                                     \
+       __label__ dup;                                                  \
+       struct rb_node **n = &(root)->rb_node, *parent = NULL;          \
+       typeof(new) this;                                               \
+       int res, ret = -1;                                              \
+                                                                       \
+       while (*n) {                                                    \
+               parent = *n;                                            \
+               this = container_of(*n, typeof(*(new)), member);        \
+               res = cmp(new, this);                                   \
+               if (!res)                                               \
+                       goto dup;                                       \
+               n = res < 0                                             \
+                       ? &(*n)->rb_left                                \
+                       : &(*n)->rb_right;                              \
+       }                                                               \
+                                                                       \
+       rb_link_node(&(new)->member, parent, n);                        \
+       rb_insert_color(&(new)->member, root);                          \
+       ret = 0;                                                        \
+dup:                                                                   \
+       ret;                                                            \
+})
+
+#define RB_SEARCH(root, search, member, cmp)                           \
+({                                                                     \
+       struct rb_node *n = (root)->rb_node;                            \
+       typeof(&(search)) this, ret = NULL;                             \
+       int res;                                                        \
+                                                                       \
+       while (n) {                                                     \
+               this = container_of(n, typeof(search), member);         \
+               res = cmp(&(search), this);                             \
+               if (!res) {                                             \
+                       ret = this;                                     \
+                       break;                                          \
+               }                                                       \
+               n = res < 0                                             \
+                       ? n->rb_left                                    \
+                       : n->rb_right;                                  \
+       }                                                               \
+       ret;                                                            \
+})
+
+#define RB_GREATER(root, search, member, cmp)                          \
+({                                                                     \
+       struct rb_node *n = (root)->rb_node;                            \
+       typeof(&(search)) this, ret = NULL;                             \
+       int res;                                                        \
+                                                                       \
+       while (n) {                                                     \
+               this = container_of(n, typeof(search), member);         \
+               res = cmp(&(search), this);                             \
+               if (res < 0) {                                          \
+                       ret = this;                                     \
+                       n = n->rb_left;                                 \
+               } else                                                  \
+                       n = n->rb_right;                                \
+       }                                                               \
+       ret;                                                            \
+})
+
+#define RB_FIRST(root, type, member)                                   \
+       container_of_or_null(rb_first(root), type, member)
+
+#define RB_LAST(root, type, member)                                    \
+       container_of_or_null(rb_last(root), type, member)
+
+#define RB_NEXT(ptr, member)                                           \
+       container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+
+#define RB_PREV(ptr, member)                                           \
+       container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+
+/* Does linear interpolation between powers of two */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+       unsigned fract = x & ~(~0U << fract_bits);
+
+       x >>= fract_bits;
+       x   = 1 << x;
+       x  += (x * fract) >> fract_bits;
+
+       return x;
+}
+
+void bch2_bio_map(struct bio *bio, void *base);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+       return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl)                                    \
+do {                                                                   \
+       closure_get(cl);                                                \
+       submit_bio(bio);                                                \
+} while (0)
+
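+/*
+ * Sleep (freezably) until @cond is true; evaluates to 0, or -1 if the
+ * kthread was asked to stop first.
+ */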
+#define kthread_wait_freezable(cond)                                   \
+({                                                                     \
+       int _ret = 0;                                                   \
+       while (1) {                                                     \
+               set_current_state(TASK_INTERRUPTIBLE);                  \
+               if (kthread_should_stop()) {                            \
+                       _ret = -1;                                      \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               if (cond)                                               \
+                       break;                                          \
+                                                                       \
+               schedule();                                             \
+               try_to_freeze();                                        \
+       }                                                               \
+       set_current_state(TASK_RUNNING);                                \
+       _ret;                                                           \
+})
+
+size_t bch2_rand_range(size_t);
+
+void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+
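+/*
+ * Copy/move helpers that work in whole u64s: memcpy_u64s() requires
+ * non-overlapping ranges (checked by EBUG_ON), memmove_u64s_down()/_up()
+ * handle overlap when copying toward lower/higher addresses respectively,
+ * and memmove_u64s() picks the direction automatically.
+ */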
+static inline void __memcpy_u64s(void *dst, const void *src,
+                                unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+       long d0, d1, d2;
+       asm volatile("rep ; movsq"
+                    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+                    : "0" (u64s), "1" (dst), "2" (src)
+                    : "memory");
+#else
+       u64 *d = dst;
+       const u64 *s = src;
+
+       while (u64s--)
+               *d++ = *s++;
+#endif
+}
+
+static inline void memcpy_u64s(void *dst, const void *src,
+                              unsigned u64s)
+{
+       EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
+                dst + u64s * sizeof(u64) <= src));
+
+       __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down(void *dst, const void *src,
+                                      unsigned u64s)
+{
+       __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down(void *dst, const void *src,
+                                    unsigned u64s)
+{
+       EBUG_ON(dst > src);
+
+       __memmove_u64s_down(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
+                                    unsigned u64s)
+{
+       u64 *dst = (u64 *) _dst + u64s - 1;
+       u64 *src = (u64 *) _src + u64s - 1;
+
+#ifdef CONFIG_X86_64
+       long d0, d1, d2;
+       asm volatile("std ;\n"
+                    "rep ; movsq\n"
+                    "cld ;\n"
+                    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+                    : "0" (u64s), "1" (dst), "2" (src)
+                    : "memory");
+#else
+       while (u64s--)
+               *dst-- = *src--;
+#endif
+}
+
+static inline void memmove_u64s_up(void *dst, const void *src,
+                                  unsigned u64s)
+{
+       EBUG_ON(dst < src);
+
+       __memmove_u64s_up(dst, src, u64s);
+}
+
+static inline void memmove_u64s(void *dst, const void *src,
+                               unsigned u64s)
+{
+       if (dst < src)
+               __memmove_u64s_down(dst, src, u64s);
+       else
+               __memmove_u64s_up(dst, src, u64s);
+}
+
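+/*
+ * next_contig_bvec() returns the next bvec, merged with any following bvecs
+ * whose pages are virtually contiguous (merging is skipped under
+ * CONFIG_HIGHMEM), so bio_for_each_contig_segment() can yield larger runs
+ * than bio_for_each_segment().
+ */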
+static inline struct bio_vec next_contig_bvec(struct bio *bio,
+                                             struct bvec_iter *iter)
+{
+       struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+       bio_advance_iter(bio, iter, bv.bv_len);
+#ifndef CONFIG_HIGHMEM
+       while (iter->bi_size) {
+               struct bio_vec next = bio_iter_iovec(bio, *iter);
+
+               if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
+                   page_address(next.bv_page) + next.bv_offset)
+                       break;
+
+               bv.bv_len += next.bv_len;
+               bio_advance_iter(bio, iter, next.bv_len);
+       }
+#endif
+       return bv;
+}
+
+#define __bio_for_each_contig_segment(bv, bio, iter, start)            \
+       for (iter = (start);                                            \
+            (iter).bi_size &&                                          \
+               ((bv = next_contig_bvec((bio), &(iter))), 1);)
+
+#define bio_for_each_contig_segment(bv, bio, iter)                     \
+       __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
+
+size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+         int (*cmp_func)(const void *, const void *, size_t),
+         void (*swap_func)(void *, void *, size_t));
+
+/* just the memmove, doesn't update @_nr */
+#define __array_insert_item(_array, _nr, _pos)                         \
+       memmove(&(_array)[(_pos) + 1],                                  \
+               &(_array)[(_pos)],                                      \
+               sizeof((_array)[0]) * ((_nr) - (_pos)))
+
+#define array_insert_item(_array, _nr, _pos, _new_item)                        \
+do {                                                                   \
+       __array_insert_item(_array, _nr, _pos);                         \
+       (_nr)++;                                                        \
+       (_array)[(_pos)] = (_new_item);                                 \
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove)           \
+do {                                                                   \
+       (_nr) -= (_nr_to_remove);                                       \
+       memmove(&(_array)[(_pos)],                                      \
+               &(_array)[(_pos) + (_nr_to_remove)],                    \
+               sizeof((_array)[0]) * ((_nr) - (_pos)));                \
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos)                           \
+       array_remove_items(_array, _nr, _pos, 1)
+
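+/*
+ * Simple in-place bubble sort for small arrays; @cmp takes two elements by
+ * value and returns > 0 when they are out of order.
+ */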
+#define bubble_sort(_base, _nr, _cmp)                                  \
+do {                                                                   \
+       ssize_t _i, _end;                                               \
+       bool _swapped = true;                                           \
+                                                                       \
+       for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+               _swapped = false;                                       \
+               for (_i = 0; _i < _end; _i++)                           \
+                       if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {   \
+                               swap((_base)[_i], (_base)[_i + 1]);     \
+                               _swapped = true;                        \
+                       }                                               \
+       }                                                               \
+} while (0)
+
+#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
new file mode 100644 (file)
index 0000000..c099cdc
--- /dev/null
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VSTRUCTS_H
+#define _VSTRUCTS_H
+
+#include "util.h"
+
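+/*
+ * Helpers for "vstructs": on disk structures that end in a flexible array
+ * of u64s (_data[]/start[]) whose length is given by a little endian u64s
+ * field in the struct; these macros compute sizes in bytes/sectors/blocks
+ * and iterate over the entries.
+ */
+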
+/*
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
+ * assumes u64 is little endian:
+ */
+#define __vstruct_u64s(_s)                                             \
+({                                                                     \
+       ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s)           \
+       : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s)           \
+       : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s)           \
+       : ((__force u8) ((_s)->u64s)));                                         \
+})
+
+#define __vstruct_bytes(_type, _u64s)                                  \
+({                                                                     \
+       BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64));             \
+                                                                       \
+       (offsetof(_type, _data) + (_u64s) * sizeof(u64));               \
+})
+
+#define vstruct_bytes(_s)                                              \
+       __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
+
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s)             \
+       (round_up(__vstruct_bytes(_type, _u64s),                        \
+                 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
+
+#define vstruct_blocks(_s, _sector_block_bits)                         \
+       __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
+
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s)             \
+       __vstruct_blocks(typeof(*(_s)), _sector_block_bits,             \
+                        __vstruct_u64s(_s) + (_u64s))
+
+#define vstruct_sectors(_s, _sector_block_bits)                                \
+       (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
+
+#define vstruct_next(_s)                                               \
+       ((typeof(_s))                   ((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_last(_s)                                               \
+       ((typeof(&(_s)->start[0]))      ((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_end(_s)                                                        \
+       ((void *)                       ((_s)->_data + __vstruct_u64s(_s)))
+
+#define vstruct_for_each(_s, _i)                                       \
+       for (_i = (_s)->start;                                          \
+            _i < vstruct_last(_s);                                     \
+            _i = vstruct_next(_i))
+
+#define vstruct_for_each_safe(_s, _i, _t)                              \
+       for (_i = (_s)->start;                                          \
+            _i < vstruct_last(_s) && (_t = vstruct_next(_i), true);    \
+            _i = _t)
+
+#define vstruct_idx(_s, _idx)                                          \
+       ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
+
+#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
new file mode 100644 (file)
index 0000000..f0440d1
--- /dev/null
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "compress.h"
+#include "extents.h"
+#include "fs.h"
+#include "rebalance.h"
+#include "str_hash.h"
+#include "xattr.h"
+
+#include <linux/dcache.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
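+/*
+ * Xattrs live in the BTREE_ID_XATTRS btree, hashed by xattr type plus name
+ * within each inode's keyspace; the helpers below plug into the generic
+ * str_hash code via bch2_xattr_hash_desc.
+ */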
+static u64 bch2_xattr_hash(const struct bch_hash_info *info,
+                         const struct xattr_search_key *key)
+{
+       struct bch_str_hash_ctx ctx;
+
+       bch2_str_hash_init(&ctx, info);
+       bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
+       bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
+
+       return bch2_str_hash_end(&ctx, info);
+}
+
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
+{
+       return bch2_xattr_hash(info, key);
+}
+
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+       struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
+
+       return bch2_xattr_hash(info,
+                &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+}
+
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+       struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+       const struct xattr_search_key *r = _r;
+
+       return l.v->x_type != r->type ||
+               l.v->x_name_len != r->name.len ||
+               memcmp(l.v->x_name, r->name.name, r->name.len);
+}
+
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+       struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+       struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
+
+       return l.v->x_type != r.v->x_type ||
+               l.v->x_name_len != r.v->x_name_len ||
+               memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+}
+
+const struct bch_hash_desc bch2_xattr_hash_desc = {
+       .btree_id       = BTREE_ID_XATTRS,
+       .key_type       = BCH_XATTR,
+       .whiteout_type  = BCH_XATTR_WHITEOUT,
+       .hash_key       = xattr_hash_key,
+       .hash_bkey      = xattr_hash_bkey,
+       .cmp_key        = xattr_cmp_key,
+       .cmp_bkey       = xattr_cmp_bkey,
+};
+
+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       const struct xattr_handler *handler;
+       struct bkey_s_c_xattr xattr;
+
+       switch (k.k->type) {
+       case BCH_XATTR:
+               if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
+                       return "value too small";
+
+               xattr = bkey_s_c_to_xattr(k);
+
+               if (bkey_val_u64s(k.k) <
+                       xattr_val_u64s(xattr.v->x_name_len,
+                                      le16_to_cpu(xattr.v->x_val_len)))
+                       return "value too small";
+
+               if (bkey_val_u64s(k.k) >
+                       xattr_val_u64s(xattr.v->x_name_len,
+                                      le16_to_cpu(xattr.v->x_val_len) + 4))
+                       return "value too big";
+
+               handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+               if (!handler)
+                       return "invalid type";
+
+               if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
+                       return "xattr name has invalid characters";
+
+               return NULL;
+       case BCH_XATTR_WHITEOUT:
+               return bkey_val_bytes(k.k) != 0
+                       ? "value size should be zero"
+                       : NULL;
+
+       default:
+               return "invalid type";
+       }
+}
+
+void bch2_xattr_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
+{
+       const struct xattr_handler *handler;
+       struct bkey_s_c_xattr xattr;
+       size_t n = 0;
+
+       switch (k.k->type) {
+       case BCH_XATTR:
+               xattr = bkey_s_c_to_xattr(k);
+
+               handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+               if (handler && handler->prefix)
+                       n += scnprintf(buf + n, size - n, "%s", handler->prefix);
+               else if (handler)
+                       n += scnprintf(buf + n, size - n, "(type %u)",
+                                      xattr.v->x_type);
+               else
+                       n += scnprintf(buf + n, size - n, "(unknown type %u)",
+                                      xattr.v->x_type);
+
+               n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name,
+                                  xattr.v->x_name_len);
+               n += scnprintf(buf + n, size - n, ":");
+               n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v),
+                                  le16_to_cpu(xattr.v->x_val_len));
+               break;
+       case BCH_XATTR_WHITEOUT:
+               scnprintf(buf, size, "whiteout");
+               break;
+       }
+}
+
+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
+                  const char *name, void *buffer, size_t size, int type)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c_xattr xattr;
+       int ret;
+
+       bch2_trans_init(&trans, c);
+
+       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+                               &inode->ei_str_hash, inode->v.i_ino,
+                               &X_SEARCH(type, name, strlen(name)),
+                               0);
+       if (IS_ERR(iter)) {
+               bch2_trans_exit(&trans);
+               BUG_ON(PTR_ERR(iter) == -EINTR);
+
+               return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter);
+       }
+
+       xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+       ret = le16_to_cpu(xattr.v->x_val_len);
+       if (buffer) {
+               if (ret > size)
+                       ret = -ERANGE;
+               else
+                       memcpy(buffer, xattr_val(xattr.v), ret);
+       }
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
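+/*
+ * Set (or, when @value is NULL, delete) an xattr within @trans: XATTR_CREATE
+ * and XATTR_REPLACE map to the corresponding hash set flags, and a missing
+ * xattr is only an error (-ENODATA) when XATTR_REPLACE was requested.
+ */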
+int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+                  const struct bch_hash_info *hash_info,
+                  const char *name, const void *value, size_t size,
+                  int type, int flags)
+{
+       int ret;
+
+       if (value) {
+               struct bkey_i_xattr *xattr;
+               unsigned namelen = strlen(name);
+               unsigned u64s = BKEY_U64s +
+                       xattr_val_u64s(namelen, size);
+
+               if (u64s > U8_MAX)
+                       return -ERANGE;
+
+               xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+               if (IS_ERR(xattr))
+                       return PTR_ERR(xattr);
+
+               bkey_xattr_init(&xattr->k_i);
+               xattr->k.u64s           = u64s;
+               xattr->v.x_type         = type;
+               xattr->v.x_name_len     = namelen;
+               xattr->v.x_val_len      = cpu_to_le16(size);
+               memcpy(xattr->v.x_name, name, namelen);
+               memcpy(xattr_val(&xattr->v), value, size);
+
+               ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+                             inum, &xattr->k_i,
+                             (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+                             (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+       } else {
+               struct xattr_search_key search =
+                       X_SEARCH(type, name, strlen(name));
+
+               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
+                                      hash_info, inum, &search);
+       }
+
+       if (ret == -ENOENT)
+               ret = flags & XATTR_REPLACE ? -ENODATA : 0;
+
+       return ret;
+}
+
+static size_t bch2_xattr_emit(struct dentry *dentry,
+                            const struct bch_xattr *xattr,
+                            char *buffer, size_t buffer_size)
+{
+       const struct xattr_handler *handler =
+               bch2_xattr_type_to_handler(xattr->x_type);
+
+       if (handler && (!handler->list || handler->list(dentry))) {
+               const char *prefix = handler->prefix ?: handler->name;
+               const size_t prefix_len = strlen(prefix);
+               const size_t total_len = prefix_len + xattr->x_name_len + 1;
+
+               if (buffer && total_len <= buffer_size) {
+                       memcpy(buffer, prefix, prefix_len);
+                       memcpy(buffer + prefix_len,
+                              xattr->x_name, xattr->x_name_len);
+                       buffer[prefix_len + xattr->x_name_len] = '\0';
+               }
+
+               return total_len;
+       } else {
+               return 0;
+       }
+}
+
+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+       struct bch_fs *c = dentry->d_sb->s_fs_info;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       const struct bch_xattr *xattr;
+       u64 inum = dentry->d_inode->i_ino;
+       ssize_t ret = 0;
+       size_t len;
+
+       for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) {
+               BUG_ON(k.k->p.inode < inum);
+
+               if (k.k->p.inode > inum)
+                       break;
+
+               if (k.k->type != BCH_XATTR)
+                       continue;
+
+               xattr = bkey_s_c_to_xattr(k).v;
+
+               len = bch2_xattr_emit(dentry, xattr, buffer, buffer_size);
+               if (buffer) {
+                       if (len > buffer_size) {
+                               bch2_btree_iter_unlock(&iter);
+                               return -ERANGE;
+                       }
+
+                       buffer += len;
+                       buffer_size -= len;
+               }
+
+               ret += len;
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return ret;
+}
+
+static int bch2_xattr_get_handler(const struct xattr_handler *handler,
+                                 struct dentry *dentry, struct inode *vinode,
+                                 const char *name, void *buffer, size_t size)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+}
+
+static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+                                 struct mnt_idmap *idmap,
+                                 struct dentry *dentry, struct inode *vinode,
+                                 const char *name, const void *value,
+                                 size_t size, int flags)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC,
+                       bch2_xattr_set(&trans, inode->v.i_ino,
+                                      &inode->ei_str_hash,
+                                      name, value, size,
+                                      handler->flags, flags));
+}
+
+static const struct xattr_handler bch_xattr_user_handler = {
+       .prefix = XATTR_USER_PREFIX,
+       .get    = bch2_xattr_get_handler,
+       .set    = bch2_xattr_set_handler,
+       .flags  = BCH_XATTR_INDEX_USER,
+};
+
+static bool bch2_xattr_trusted_list(struct dentry *dentry)
+{
+       return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+       .prefix = XATTR_TRUSTED_PREFIX,
+       .list   = bch2_xattr_trusted_list,
+       .get    = bch2_xattr_get_handler,
+       .set    = bch2_xattr_set_handler,
+       .flags  = BCH_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+       .prefix = XATTR_SECURITY_PREFIX,
+       .get    = bch2_xattr_get_handler,
+       .set    = bch2_xattr_set_handler,
+       .flags  = BCH_XATTR_INDEX_SECURITY,
+};
+
+#ifndef NO_BCACHEFS_FS
+
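+/*
+ * The "bcachefs." xattr namespace doesn't store real xattrs: it exposes the
+ * per inode options (e.g. bcachefs.compression) by translating the attribute
+ * name with bch2_opt_lookup() and reading or updating the inode's option
+ * fields directly.
+ */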
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+                                  struct dentry *dentry, struct inode *vinode,
+                                  const char *name, void *buffer, size_t size)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_opts opts =
+               bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+       const struct bch_option *opt;
+       int ret, id;
+       u64 v;
+
+       id = bch2_opt_lookup(name);
+       if (id < 0 || !bch2_opt_is_inode_opt(id))
+               return -EINVAL;
+
+       opt = bch2_opt_table + id;
+
+       if (!bch2_opt_defined_by_id(&opts, id))
+               return -ENODATA;
+
+       v = bch2_opt_get_by_id(&opts, id);
+
+       ret = bch2_opt_to_text(c, buffer, size, opt, v, 0);
+
+       return ret < size || !buffer ? ret : -ERANGE;
+}
+
+struct inode_opt_set {
+       int                     id;
+       u64                     v;
+       bool                    defined;
+};
+
+static int inode_opt_set_fn(struct bch_inode_info *inode,
+                           struct bch_inode_unpacked *bi,
+                           void *p)
+{
+       struct inode_opt_set *s = p;
+
+       if (s->defined)
+               bch2_inode_opt_set(bi, s->id, s->v);
+       else
+               bch2_inode_opt_clear(bi, s->id);
+       return 0;
+}
+
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+                                  struct mnt_idmap *idmap,
+                                  struct dentry *dentry, struct inode *vinode,
+                                  const char *name, const void *value,
+                                  size_t size, int flags)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       const struct bch_option *opt;
+       char *buf;
+       struct inode_opt_set s;
+       int ret;
+
+       s.id = bch2_opt_lookup(name);
+       if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
+               return -EINVAL;
+
+       opt = bch2_opt_table + s.id;
+
+       if (value) {
+               buf = kmalloc(size + 1, GFP_KERNEL);
+               if (!buf)
+                       return -ENOMEM;
+               memcpy(buf, value, size);
+               buf[size] = '\0';
+
+               ret = bch2_opt_parse(c, opt, buf, &s.v);
+               kfree(buf);
+
+               if (ret < 0)
+                       return ret;
+
+               if (s.id == Opt_compression ||
+                   s.id == Opt_background_compression) {
+                       ret = bch2_check_set_has_compressed_data(c, s.v);
+                       if (ret)
+                               return ret;
+               }
+
+               s.defined = true;
+       } else {
+               s.defined = false;
+       }
+
+       mutex_lock(&inode->ei_update_lock);
+       ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
+       mutex_unlock(&inode->ei_update_lock);
+
+       if (value &&
+           (s.id == Opt_background_compression ||
+            s.id == Opt_background_target))
+               bch2_rebalance_add_work(c, inode->v.i_blocks);
+
+       return ret;
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+       .prefix = "bcachefs.",
+       .get    = bch2_xattr_bcachefs_get,
+       .set    = bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
+const struct xattr_handler *bch2_xattr_handlers[] = {
+       &bch_xattr_user_handler,
+       &nop_posix_acl_access,
+       &nop_posix_acl_default,
+       &bch_xattr_trusted_handler,
+       &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+       &bch_xattr_bcachefs_handler,
+#endif
+       NULL
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+       [BCH_XATTR_INDEX_USER]                  = &bch_xattr_user_handler,
+       [BCH_XATTR_INDEX_POSIX_ACL_ACCESS]      =
+               &nop_posix_acl_access,
+       [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT]     =
+               &nop_posix_acl_default,
+       [BCH_XATTR_INDEX_TRUSTED]               = &bch_xattr_trusted_handler,
+       [BCH_XATTR_INDEX_SECURITY]              = &bch_xattr_security_handler,
+};
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
+{
+       return type < ARRAY_SIZE(bch_xattr_handler_map)
+               ? bch_xattr_handler_map[type]
+               : NULL;
+}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
new file mode 100644 (file)
index 0000000..0e7d2fa
--- /dev/null
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_H
+#define _BCACHEFS_XATTR_H
+
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
+
+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_xattr_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_xattr_invalid,           \
+       .val_to_text    = bch2_xattr_to_text,           \
+}
+
+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+       return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
+                           name_len + val_len, sizeof(u64));
+}
+
+#define xattr_val(_xattr)                                      \
+       ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+
+struct xattr_search_key {
+       u8              type;
+       struct qstr     name;
+};
+
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key)        \
+       { .type = _type, .name = QSTR_INIT(_name, _len) })
+
+struct dentry;
+struct xattr_handler;
+struct bch_hash_info;
+struct bch_inode_info;
+
+int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
+                 const char *, void *, size_t, int);
+
+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+                  const char *, const void *, size_t, int, int);
+
+ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch2_xattr_handlers[];
+
+#endif /* _BCACHEFS_XATTR_H */