fs/bcachefs/super.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * bcachefs setup/teardown code, and some metadata io - read a superblock and
   4  * figure out what to do with it.
   5  *
   6  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
   7  * Copyright 2012 Google, Inc.
   8  */
   9
  10 #include "bcachefs.h"
  11 #include "alloc_background.h"
  12 #include "alloc_foreground.h"
  13 #include "bkey_sort.h"
  14 #include "btree_cache.h"
  15 #include "btree_gc.h"
  16 #include "btree_key_cache.h"
  17 #include "btree_update_interior.h"
  18 #include "btree_io.h"
  19 #include "chardev.h"
  20 #include "checksum.h"
  21 #include "clock.h"
  22 #include "compress.h"
  23 #include "debug.h"
  24 #include "disk_groups.h"
  25 #include "ec.h"
  26 #include "error.h"
  27 #include "fs.h"
  28 #include "fs-io.h"
  29 #include "fsck.h"
  30 #include "inode.h"
  31 #include "io.h"
  32 #include "journal.h"
  33 #include "journal_reclaim.h"
  34 #include "journal_seq_blacklist.h"
  35 #include "move.h"
  36 #include "migrate.h"
  37 #include "movinggc.h"
  38 #include "quota.h"
  39 #include "rebalance.h"
  40 #include "recovery.h"
  41 #include "replicas.h"
  42 #include "super.h"
  43 #include "super-io.h"
  44 #include "sysfs.h"
  45 #include "trace.h"
  46
  47 #include <linux/backing-dev.h>
  48 #include <linux/blkdev.h>
  49 #include <linux/debugfs.h>
  50 #include <linux/device.h>
  51 #include <linux/idr.h>
  52 #include <linux/kthread.h>
  53 #include <linux/module.h>
  54 #include <linux/percpu.h>
  55 #include <linux/random.h>
  56 #include <linux/sysfs.h>
  57 #include <crypto/hash.h>
  58
  /* Module metadata: license and author strings. */
  59 MODULE_LICENSE("GPL");
  60 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
  61
  /*
   * KTYPE(type) - define the sysfs kobj_type for @type.
   *
   * Expands to three static const objects:
   *   - type_group:  an attribute_group whose .attrs is type_files
   *   - type_groups: a NULL-terminated array containing &type_group
   *   - type_ktype:  a kobj_type whose .release is type_release, whose
   *                  .sysfs_ops is &type_sysfs_ops and whose .default_groups
   *                  is type_groups
   *
   * The expansion therefore needs type_files, type_release and type_sysfs_ops
   * to be declared before the macro is used.  The last definition is left
   * without a trailing semicolon; the user of the macro supplies it.
   */
  62 #define KTYPE(type)                                                     \
  63 static const struct attribute_group type ## _group = {                  \
  64         .attrs = type ## _files                                         \
  65 };                                                                      \
  66                                                                         \
  67 static const struct attribute_group *type ## _groups[] = {              \
  68         &type ## _group,                                                \
  69         NULL                                                            \
  70 };                                                                      \
  71                                                                         \
  72 static const struct kobj_type type ## _ktype = {                        \
  73         .release        = type ## _release,                             \
  74         .sysfs_ops      = &type ## _sysfs_ops,                          \
  75         .default_groups = type ## _groups                               \
  76 }
  77
  /*
   * Forward declarations of the release callbacks that KTYPE(bch2_fs) and
   * KTYPE(bch2_dev) reference before their definitions.
   */
  78 static void bch2_fs_release(struct kobject *);
  79 static void bch2_dev_release(struct kobject *);
  80
  /* Release callback for the bch2_fs_internal kobj_type; performs no work. */
  static void bch2_fs_internal_release(struct kobject *kobj)
  {
          /* empty on purpose: this callback has nothing to do */
  }
  84
  /* Release callback for the bch2_fs_opts_dir kobj_type; performs no work. */
  static void bch2_fs_opts_dir_release(struct kobject *kobj)
  {
          /* empty on purpose: this callback has nothing to do */
  }
  88
  /* Release callback for the bch2_fs_time_stats kobj_type; performs no work. */
  static void bch2_fs_time_stats_release(struct kobject *kobj)
  {
          /* empty on purpose: this callback has nothing to do */
  }
  92
  /*
   * Instantiate one kobj_type (bch2_fs_ktype, bch2_fs_internal_ktype, ...)
   * per kind of sysfs object, using the KTYPE() macro.
   */
  93 KTYPE(bch2_fs);
  94 KTYPE(bch2_fs_internal);
  95 KTYPE(bch2_fs_opts_dir);
  96 KTYPE(bch2_fs_time_stats);
  97 KTYPE(bch2_dev);
  98
  /* kset that bch2_fs_alloc() assigns to each filesystem's kobject */
  99 static struct kset *bcachefs_kset;
     /*
      * List of filesystems, linked through bch_fs.list.  Walked and modified
      * with bch_fs_list_lock held.
      */
 100 static LIST_HEAD(bch_fs_list);
 101 static DEFINE_MUTEX(bch_fs_list_lock);
 102
     /*
      * Woken by bch2_writes_disabled() and bch2_fs_emergency_read_only();
      * bch2_fs_read_only() waits on it.
      */
 103 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
 104
     /* Forward declarations of device helpers used before their definitions. */
 105 static void bch2_dev_free(struct bch_dev *);
 106 static int bch2_dev_alloc(struct bch_fs *, unsigned);
 107 static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
 108 static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
 109
 110 struct bch_fs *bch2_dev_to_fs(dev_t dev)
 111 {
 112         struct bch_fs *c;
 113         struct bch_dev *ca;
 114         unsigned i;
 115
 116         mutex_lock(&bch_fs_list_lock);
 117         rcu_read_lock();
 118
 119         list_for_each_entry(c, &bch_fs_list, list)
 120                 for_each_member_device_rcu(ca, c, i, NULL)
 121                         if (ca->disk_sb.bdev->bd_dev == dev) {
 122                                 closure_get(&c->cl);
 123                                 goto found;
 124                         }
 125         c = NULL;
 126 found:
 127         rcu_read_unlock();
 128         mutex_unlock(&bch_fs_list_lock);
 129
 130         return c;
 131 }
 132
 133 static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
 134 {
 135         struct bch_fs *c;
 136
 137         lockdep_assert_held(&bch_fs_list_lock);
 138
 139         list_for_each_entry(c, &bch_fs_list, list)
 140                 if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
 141                         return c;
 142
 143         return NULL;
 144 }
 145
 146 struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
 147 {
 148         struct bch_fs *c;
 149
 150         mutex_lock(&bch_fs_list_lock);
 151         c = __bch2_uuid_to_fs(uuid);
 152         if (c)
 153                 closure_get(&c->cl);
 154         mutex_unlock(&bch_fs_list_lock);
 155
 156         return c;
 157 }
 158
 159 /* Filesystem RO/RW: */
 160
 161 /*
 162  * For startup/shutdown of RW stuff, the dependencies are:
 163  *
 164  * - foreground writes depend on copygc and rebalance (to free up space)
 165  *
 166  * - copygc and rebalance depend on mark and sweep gc (they actually probably
 167  *   don't because they either reserve ahead of time or don't block if
 168  *   allocations fail, but allocations can require mark and sweep gc to run
 169  *   because of generation number wraparound)
 170  *
 171  * - all of the above depends on the allocator threads
 172  *
 173  * - allocator depends on the journal (when it rewrites prios and gens)
 174  */
 175
 /*
  * __bch2_fs_read_only - stop the write-side machinery of filesystem @c.
  *
  * Stops rebalance, copygc and the gc thread and flushes journal pins.  If
  * the allocator threads are running it then writes out stripes and alloc
  * info and keeps flushing until the filesystem is quiescent, after which
  * BCH_FS_ALLOC_CLEAN is set.  Whether or not that part ran (or succeeded),
  * it finally waits for pending interior btree updates, stops every device's
  * allocator, stops the journal and removes the devices from the allocator.
  *
  * This function does not touch BCH_FS_RW or c->writes; bch2_fs_read_only()
  * handles those around the call.
  */
 176 static void __bch2_fs_read_only(struct bch_fs *c)
 177 {
 178         struct bch_dev *ca;
 179         bool wrote = false;
 180         unsigned i, clean_passes = 0;
 181         int ret;
 182
 183         bch2_rebalance_stop(c);
 184         bch2_copygc_stop(c);
 185         bch2_gc_thread_stop(c);
 186
 187         /*
 188          * Flush journal before stopping allocators, because flushing journal
 189          * blacklist entries involves allocating new btree nodes:
 190          */
 191         bch2_journal_flush_all_pins(&c->journal);
 192
 193         /*
 194          * If the allocator threads didn't all start up, the btree updates to
 195          * write out alloc info aren't going to work:
 196          */
 197         if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
 198                 goto nowrote_alloc;
 199
 200         bch_verbose(c, "writing alloc info");
 201         /*
 202          * This should normally just be writing the bucket read/write clocks:
 203          */
 204         ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
 205                 bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
 206         bch_verbose(c, "writing alloc info complete");
 207
             /* An error here is only unexpected if we aren't already in emergency RO: */
 208         if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
 209                 bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
 210
 211         if (ret)
 212                 goto nowrote_alloc;
 213
 214         bch_verbose(c, "flushing journal and stopping allocators");
 215
 216         bch2_journal_flush_all_pins(&c->journal);
 217         set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
 218
             /*
              * Repeat until two consecutive passes are clean.  A pass is clean
              * when neither journal flush returned nonzero and no interior
              * btree updates were pending; anything else resets the count.
              */
 219         do {
 220                 clean_passes++;
 221
 222                 if (bch2_journal_flush_all_pins(&c->journal))
 223                         clean_passes = 0;
 224
 225                 /*
 226                  * In flight interior btree updates will generate more journal
 227                  * updates and btree updates (alloc btree):
 228                  */
 229                 if (bch2_btree_interior_updates_nr_pending(c)) {
 230                         closure_wait_event(&c->btree_interior_update_wait,
 231                                            !bch2_btree_interior_updates_nr_pending(c));
 232                         clean_passes = 0;
 233                 }
 234                 flush_work(&c->btree_interior_update_work);
 235
 236                 if (bch2_journal_flush_all_pins(&c->journal))
 237                         clean_passes = 0;
 238         } while (clean_passes < 2);
 239         bch_verbose(c, "flushing journal and stopping allocators complete");
 240
 241         set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
 242 nowrote_alloc:
             /* Reached on both the success and the skip/error paths. */
 243         closure_wait_event(&c->btree_interior_update_wait,
 244                            !bch2_btree_interior_updates_nr_pending(c));
 245         flush_work(&c->btree_interior_update_work);
 246
 247         for_each_member_device(ca, c, i)
 248                 bch2_dev_allocator_stop(ca);
 249
 250         clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
 251         clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
 252
 253         bch2_fs_journal_stop(&c->journal);
 254
 255         /*
 256          * the journal kicks off btree writes via reclaim - wait for in flight
 257          * writes after stopping journal:
 258          */
 259         if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
 260                 bch2_btree_flush_all_writes(c);
 261         else
 262                 bch2_btree_verify_flushed(c);
 263
 264         /*
 265          * After stopping journal:
 266          */
 267         for_each_member_device(ca, c, i)
 268                 bch2_dev_allocator_remove(c, ca);
 269 }
 270
 271 static void bch2_writes_disabled(struct percpu_ref *writes)
 272 {
 273         struct bch_fs *c = container_of(writes, struct bch_fs, writes);
 274
 275         set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
 276         wake_up(&bch_read_only_wait);
 277 }
 278
 /*
  * bch2_fs_read_only - take filesystem @c from read-write to read-only.
  *
  * If BCH_FS_RW is not set, this only cancels the journal reclaim work and
  * returns.  Otherwise it kills c->writes, cancels the stripe-delete and
  * pd-controller work, waits until writes are disabled (or an emergency
  * read-only was requested), runs __bch2_fs_read_only(), waits until writes
  * are disabled unconditionally, marks the filesystem clean when nothing
  * went wrong, and finally clears BCH_FS_RW.
  *
  * The callers visible in this file (bch2_fs_read_only_work(),
  * bch2_fs_stop(), bch2_fs_start()) all hold c->state_lock for write.
  */
 279 void bch2_fs_read_only(struct bch_fs *c)
 280 {
 281         if (!test_bit(BCH_FS_RW, &c->flags)) {
 282                 cancel_delayed_work_sync(&c->journal.reclaim_work);
 283                 return;
 284         }
 285
 286         BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
 287
 288         /*
 289          * Block new foreground-end write operations from starting - any new
 290          * writes will return -EROFS:
 291          *
 292          * (This is really blocking new _allocations_, writes to previously
 293          * allocated space can still happen until stopping the allocator in
 294          * bch2_dev_allocator_stop()).
 295          */
 296         percpu_ref_kill(&c->writes);
 297
 298         cancel_work_sync(&c->ec_stripe_delete_work);
 299         cancel_delayed_work(&c->pd_controllers_update);
 300
 301         /*
 302          * If we're not doing an emergency shutdown, we want to wait on
 303          * outstanding writes to complete so they don't see spurious errors due
 304          * to shutting down the allocator:
 305          *
 306          * If we are doing an emergency shutdown outstanding writes may
 307          * hang until we shutdown the allocator so we don't want to wait
 308          * on outstanding writes before shutting everything down - but
 309          * we do need to wait on them before returning and signalling
 310          * that going RO is complete:
 311          */
 312         wait_event(bch_read_only_wait,
 313                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
 314                    test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
 315
 316         __bch2_fs_read_only(c);
 317
             /* Second wait: this time only write-disable completion will do. */
 318         wait_event(bch_read_only_wait,
 319                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
 320
 321         clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
 322
             /*
              * Only mark the filesystem clean when there was no journal error,
              * no error or emergency-RO flag, the filesystem had fully
              * started, alloc info was written out cleanly and recovery was
              * not skipped:
              */
 323         if (!bch2_journal_error(&c->journal) &&
 324             !test_bit(BCH_FS_ERROR, &c->flags) &&
 325             !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
 326             test_bit(BCH_FS_STARTED, &c->flags) &&
 327             test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
 328             !c->opts.norecovery) {
 329                 bch_verbose(c, "marking filesystem clean");
 330                 bch2_fs_mark_clean(c);
 331         }
 332
 333         clear_bit(BCH_FS_RW, &c->flags);
 334 }
 335
 336 static void bch2_fs_read_only_work(struct work_struct *work)
 337 {
 338         struct bch_fs *c =
 339                 container_of(work, struct bch_fs, read_only_work);
 340
 341         down_write(&c->state_lock);
 342         bch2_fs_read_only(c);
 343         up_write(&c->state_lock);
 344 }
 345
 346 static void bch2_fs_read_only_async(struct bch_fs *c)
 347 {
 348         queue_work(system_long_wq, &c->read_only_work);
 349 }
 350
 351 bool bch2_fs_emergency_read_only(struct bch_fs *c)
 352 {
 353         bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
 354
 355         bch2_journal_halt(&c->journal);
 356         bch2_fs_read_only_async(c);
 357
 358         wake_up(&bch_read_only_wait);
 359         return ret;
 360 }
 361
 362 static int bch2_fs_read_write_late(struct bch_fs *c)
 363 {
 364         int ret;
 365
 366         ret = bch2_gc_thread_start(c);
 367         if (ret) {
 368                 bch_err(c, "error starting gc thread");
 369                 return ret;
 370         }
 371
 372         ret = bch2_copygc_start(c);
 373         if (ret) {
 374                 bch_err(c, "error starting copygc thread");
 375                 return ret;
 376         }
 377
 378         ret = bch2_rebalance_start(c);
 379         if (ret) {
 380                 bch_err(c, "error starting rebalance thread");
 381                 return ret;
 382         }
 383
 384         schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
 385
 386         schedule_work(&c->ec_stripe_delete_work);
 387
 388         return 0;
 389 }
 390
 /*
  * __bch2_fs_read_write - bring filesystem @c read-write.
  * @c:     filesystem
  * @early: when true, bch2_fs_read_write_late() (gc, copygc, rebalance and
  *         the periodic work) is not run
  *
  * Returns 0 right away if BCH_FS_RW is already set, and -EROFS if the
  * norecovery/nochanges/read_only options forbid going read-write.
  * Otherwise it calls bch2_fs_mark_dirty(), writes a journal entry, adds and
  * starts the allocator for every rw member device, re-arms c->writes, sets
  * BCH_FS_RW and queues journal reclaim.
  *
  * On failure, __bch2_fs_read_only() is called to tear down whatever was
  * started and the error code is returned.
  */
 391 static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 392 {
 393         struct bch_dev *ca;
 394         unsigned i;
 395         int ret;
 396
 397         if (test_bit(BCH_FS_RW, &c->flags))
 398                 return 0;
 399
 400         /*
 401          * nochanges is used for fsck -n mode - we have to allow going rw
 402          * during recovery for that to work:
 403          */
 404         if (c->opts.norecovery ||
 405             (c->opts.nochanges &&
 406              (!early || c->opts.read_only)))
 407                 return -EROFS;
 408
 409         ret = bch2_fs_mark_dirty(c);
 410         if (ret)
 411                 goto err;
 412
 413         /*
 414          * We need to write out a journal entry before we start doing btree
 415          * updates, to ensure that on unclean shutdown new journal blacklist
 416          * entries are created:
 417          */
 418         bch2_journal_meta(&c->journal);
 419
 420         clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
 421
 422         for_each_rw_member(ca, c, i)
 423                 bch2_dev_allocator_add(c, ca);
 424         bch2_recalc_capacity(c);
 425
 426         for_each_rw_member(ca, c, i) {
 427                 ret = bch2_dev_allocator_start(ca);
 428                 if (ret) {
 429                         bch_err(c, "error starting allocator threads");
                             /*
                              * NOTE(review): presumably for_each_rw_member()
                              * holds a ref on ca->io_ref for the current
                              * device, which must be dropped when leaving the
                              * loop early - confirm against the macro.
                              */
 430                         percpu_ref_put(&ca->io_ref);
 431                         goto err;
 432                 }
 433         }
 434
 435         set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
 436
 437         if (!early) {
 438                 ret = bch2_fs_read_write_late(c);
 439                 if (ret)
 440                         goto err;
 441         }
 442
             /* c->writes was killed (or created dead); make it usable again: */
 443         percpu_ref_reinit(&c->writes);
 444         set_bit(BCH_FS_RW, &c->flags);
 445
 446         queue_delayed_work(c->journal_reclaim_wq,
 447                            &c->journal.reclaim_work, 0);
 448         return 0;
 449 err:
 450         __bch2_fs_read_only(c);
 451         return ret;
 452 }
 453
 454 int bch2_fs_read_write(struct bch_fs *c)
 455 {
 456         return __bch2_fs_read_write(c, false);
 457 }
 458
 459 int bch2_fs_read_write_early(struct bch_fs *c)
 460 {
 461         lockdep_assert_held(&c->state_lock);
 462
 463         return __bch2_fs_read_write(c, true);
 464 }
 465
 466 /* Filesystem startup/shutdown: */
 467
 /*
  * bch2_fs_free - release everything owned by @c, then @c itself.
  *
  * Tears down the subsystems and pools set up in bch2_fs_alloc(), destroys
  * the workqueues that exist, frees the superblock pages and the bch_fs
  * allocation, and drops the module reference taken in bch2_fs_alloc().
  *
  * Called from the kobject release callback (bch2_fs_release()) and from the
  * error path of bch2_fs_alloc(), so @c may be only partially initialized;
  * that is why the workqueues are checked for NULL before being destroyed.
  *
  * Member devices are not freed here - bch2_fs_stop() frees them.
  */
 468 static void bch2_fs_free(struct bch_fs *c)
 469 {
 470         unsigned i;
 471
 472         for (i = 0; i < BCH_TIME_STAT_NR; i++)
 473                 bch2_time_stats_exit(&c->times[i]);
 474
 475         bch2_fs_quota_exit(c);
 476         bch2_fs_fsio_exit(c);
 477         bch2_fs_ec_exit(c);
 478         bch2_fs_encryption_exit(c);
 479         bch2_fs_io_exit(c);
 480         bch2_fs_btree_interior_update_exit(c);
 481         bch2_fs_btree_iter_exit(c);
 482         bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
 483         bch2_fs_btree_cache_exit(c);
 484         bch2_fs_journal_exit(&c->journal);
 485         bch2_io_clock_exit(&c->io_clock[WRITE]);
 486         bch2_io_clock_exit(&c->io_clock[READ]);
 487         bch2_fs_compress_exit(c);
 488         bch2_journal_keys_free(&c->journal_keys);
 489         bch2_journal_entries_free(&c->journal_entries);
 490         percpu_free_rwsem(&c->mark_lock);
 491         free_percpu(c->online_reserved);
 492         kfree(c->usage_scratch);
 493         free_percpu(c->usage[1]);
 494         free_percpu(c->usage[0]);
 495         kfree(c->usage_base);
 496         free_percpu(c->pcpu);
 497         mempool_exit(&c->large_bkey_pool);
 498         mempool_exit(&c->btree_bounce_pool);
 499         bioset_exit(&c->btree_bio);
 500         mempool_exit(&c->fill_iter);
 501         percpu_ref_exit(&c->writes);
 502         kfree(c->replicas.entries);
 503         kfree(c->replicas_gc.entries);
 504         kfree(rcu_dereference_protected(c->disk_groups, 1));
 505         kfree(c->journal_seq_blacklist_table);
 506         free_heap(&c->copygc_heap);
 507
             /* The workqueues are NULL if bch2_fs_alloc() failed before creating them. */
 508         if (c->journal_reclaim_wq)
 509                 destroy_workqueue(c->journal_reclaim_wq);
 510         if (c->copygc_wq)
 511                 destroy_workqueue(c->copygc_wq);
 512         if (c->wq)
 513                 destroy_workqueue(c->wq);
 514
 515         free_pages((unsigned long) c->disk_sb.sb,
 516                    c->disk_sb.page_order);
 517         kvpfree(c, sizeof(*c));
             /* Pairs with __module_get() in bch2_fs_alloc(). */
 518         module_put(THIS_MODULE);
 519 }
 520
 521 static void bch2_fs_release(struct kobject *kobj)
 522 {
 523         struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
 524
 525         bch2_fs_free(c);
 526 }
 527
 /*
  * bch2_fs_stop - shut down filesystem @c and drop the reference on c->kobj.
  *
  * Sets BCH_FS_STOPPING, goes read-only under c->state_lock, removes the
  * per-device sysfs links and the filesystem's sysfs/debug/chardev objects,
  * takes @c off bch_fs_list, waits on c->cl, flushes btree reads, cancels
  * outstanding work items, frees every member device and finally calls
  * kobject_put(&c->kobj), whose release callback is bch2_fs_release().
  */
 528 void bch2_fs_stop(struct bch_fs *c)
 529 {
 530         struct bch_dev *ca;
 531         unsigned i;
 532
 533         bch_verbose(c, "shutting down");
 534
 535         set_bit(BCH_FS_STOPPING, &c->flags);
 536
 537         cancel_work_sync(&c->journal_seq_blacklist_gc_work);
 538
 539         down_write(&c->state_lock);
 540         bch2_fs_read_only(c);
 541         up_write(&c->state_lock);
 542
             /* Only devices that are in sysfs and have a block device have a link: */
 543         for_each_member_device(ca, c, i)
 544                 if (ca->kobj.state_in_sysfs &&
 545                     ca->disk_sb.bdev)
 546                         sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
 547
 548         if (c->kobj.state_in_sysfs)
 549                 kobject_del(&c->kobj);
 550
 551         bch2_fs_debug_exit(c);
 552         bch2_fs_chardev_exit(c);
 553
 554         kobject_put(&c->time_stats);
 555         kobject_put(&c->opts_dir);
 556         kobject_put(&c->internal);
 557
 558         mutex_lock(&bch_fs_list_lock);
 559         list_del(&c->list);
 560         mutex_unlock(&bch_fs_list_lock);
 561
             /*
              * NOTE(review): presumably this waits for the closure_get()
              * references handed out by bch2_dev_to_fs() / bch2_uuid_to_fs()
              * to be dropped - confirm against the closure API.
              */
 562         closure_sync(&c->cl);
 563         closure_debug_destroy(&c->cl);
 564
 565         /* btree prefetch might have kicked off reads in the background: */
 566         bch2_btree_flush_all_reads(c);
 567
 568         for_each_member_device(ca, c, i)
 569                 cancel_work_sync(&ca->io_error_work);
 570
 571         cancel_work_sync(&c->btree_write_error_work);
 572         cancel_delayed_work_sync(&c->pd_controllers_update);
 573         cancel_work_sync(&c->read_only_work);
 574
 575         for (i = 0; i < c->sb.nr_devices; i++)
 576                 if (c->devs[i])
 577                         bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
 578
 579         bch_verbose(c, "shutdown complete");
 580
             /* Final put; bch2_fs_release() frees @c once the refcount drops to zero. */
 581         kobject_put(&c->kobj);
 582 }
 583
 /*
  * bch2_fs_online - register @c (chardev, debug, sysfs) and add it to
  * bch_fs_list.
  *
  * The caller must hold bch_fs_list_lock.
  *
  * Returns NULL on success - including when @c is already linked into a
  * list, in which case nothing is done - or a static error string on failure.
  *
  * NOTE(review): on the failure returns, what this function already set up
  * (chardev, debug, sysfs objects) is not undone here.
  */
 584 static const char *bch2_fs_online(struct bch_fs *c)
 585 {
 586         struct bch_dev *ca;
 587         const char *err = NULL;
 588         unsigned i;
 589         int ret;
 590
 591         lockdep_assert_held(&bch_fs_list_lock);
 592
             /* c->list is an empty list head until list_add() below runs. */
 593         if (!list_empty(&c->list))
 594                 return NULL;
 595
 596         if (__bch2_uuid_to_fs(c->sb.uuid))
 597                 return "filesystem UUID already open";
 598
 599         ret = bch2_fs_chardev_init(c);
 600         if (ret)
 601                 return "error creating character device";
 602
 603         bch2_fs_debug_init(c);
 604
 605         if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
 606             kobject_add(&c->internal, &c->kobj, "internal") ||
 607             kobject_add(&c->opts_dir, &c->kobj, "options") ||
 608             kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
 609             bch2_opts_create_sysfs_files(&c->opts_dir))
 610                 return "error creating sysfs objects";
 611
 612         down_write(&c->state_lock);
 613
 614         err = "error creating sysfs objects";
 615         __for_each_member_device(ca, c, i, NULL)
 616                 if (bch2_dev_sysfs_online(c, ca))
 617                         goto err;
 618
 619         list_add(&c->list, &bch_fs_list);
 620         err = NULL;
 621 err:
 622         up_write(&c->state_lock);
 623         return err;
 624 }
 625
 /*
  * bch2_fs_alloc - allocate and initialize a struct bch_fs from a superblock.
  * @sb:   superblock handed to bch2_sb_to_fs() to populate the new filesystem
  * @opts: options applied after the defaults and the superblock's own options
  *
  * Initializes locks, lists and work items, allocates workqueues, pools and
  * per-subsystem state, allocates a bch_dev for every member that exists in
  * the superblock, initializes the kobjects and registers the filesystem via
  * bch2_fs_online().
  *
  * Returns the new filesystem, or NULL on any failure.  A module reference is
  * taken right after the allocation; bch2_fs_free() drops it.
  *
  * NOTE(review): the err path only calls bch2_fs_free(), which does not free
  * devices created by bch2_dev_alloc() and does not undo a partially
  * completed bch2_fs_online() - worth confirming whether that can leak.
  */
 626 static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 627 {
 628         struct bch_sb_field_members *mi;
 629         struct bch_fs *c;
 630         unsigned i, iter_size;
 631         const char *err;
 632
 633         pr_verbose_init(opts, "");
 634
 635         c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
 636         if (!c)
 637                 goto out;
 638
 639         __module_get(THIS_MODULE);
 640
 641         c->minor                = -1;
 642         c->disk_sb.fs_sb        = true;
 643
 644         init_rwsem(&c->state_lock);
 645         mutex_init(&c->sb_lock);
 646         mutex_init(&c->replicas_gc_lock);
 647         mutex_init(&c->btree_root_lock);
 648         INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
 649
 650         init_rwsem(&c->gc_lock);
 651
 652         for (i = 0; i < BCH_TIME_STAT_NR; i++)
 653                 bch2_time_stats_init(&c->times[i]);
 654
 655         bch2_fs_copygc_init(c);
 656         bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
 657         bch2_fs_allocator_background_init(c);
 658         bch2_fs_allocator_foreground_init(c);
 659         bch2_fs_rebalance_init(c);
 660         bch2_fs_quota_init(c);
 661
             /* Empty until bch2_fs_online() adds @c to bch_fs_list. */
 662         INIT_LIST_HEAD(&c->list);
 663
 664         mutex_init(&c->usage_scratch_lock);
 665
 666         mutex_init(&c->bio_bounce_pages_lock);
 667
 668         bio_list_init(&c->btree_write_error_list);
 669         spin_lock_init(&c->btree_write_error_lock);
 670         INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
 671
 672         INIT_WORK(&c->journal_seq_blacklist_gc_work,
 673                   bch2_blacklist_entries_gc);
 674
 675         INIT_LIST_HEAD(&c->journal_entries);
 676
 677         INIT_LIST_HEAD(&c->fsck_errors);
 678         mutex_init(&c->fsck_error_lock);
 679
 680         INIT_LIST_HEAD(&c->ec_stripe_head_list);
 681         mutex_init(&c->ec_stripe_head_lock);
 682
 683         INIT_LIST_HEAD(&c->ec_stripe_new_list);
 684         mutex_init(&c->ec_stripe_new_lock);
 685
 686         spin_lock_init(&c->ec_stripes_heap_lock);
 687
 688         seqcount_init(&c->gc_pos_lock);
 689
 690         seqcount_init(&c->usage_lock);
 691
 692         c->copy_gc_enabled              = 1;
 693         c->rebalance.enabled            = 1;
 694         c->promote_whole_extents        = true;
 695
             /* Point the journal's timing hooks at the matching time-stat slots. */
 696         c->journal.write_time   = &c->times[BCH_TIME_journal_write];
 697         c->journal.delay_time   = &c->times[BCH_TIME_journal_delay];
 698         c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
 699         c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
 700
 701         bch2_fs_btree_cache_init_early(&c->btree_cache);
 702
 703         mutex_init(&c->sectors_available_lock);
 704
 705         if (percpu_init_rwsem(&c->mark_lock))
 706                 goto err;
 707
 708         mutex_lock(&c->sb_lock);
 709
 710         if (bch2_sb_to_fs(c, sb)) {
 711                 mutex_unlock(&c->sb_lock);
 712                 goto err;
 713         }
 714
 715         mutex_unlock(&c->sb_lock);
 716
 717         scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
 718
             /*
              * Options are layered: built-in defaults, then the superblock's
              * options, then the caller's @opts (assuming bch2_opts_apply()
              * lets the later set win - TODO confirm):
              */
 719         c->opts = bch2_opts_default;
 720         bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
 721         bch2_opts_apply(&c->opts, opts);
 722
 723         c->block_bits           = ilog2(c->opts.block_size);
 724         c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
 725
 726         if (bch2_fs_init_fault("fs_alloc"))
 727                 goto err;
 728
 729         iter_size = sizeof(struct sort_iter) +
 730                 (btree_blocks(c) + 1) * 2 *
 731                 sizeof(struct sort_iter_set);
 732
             /*
              * One short-circuit chain: the first allocation or init step that
              * fails sends us to err.  c->writes is created dead
              * (PERCPU_REF_INIT_DEAD); __bch2_fs_read_write() re-arms it with
              * percpu_ref_reinit().
              */
 733         if (!(c->wq = alloc_workqueue("bcachefs",
 734                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
 735             !(c->copygc_wq = alloc_workqueue("bcache_copygc",
 736                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
 737             !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
 738                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
 739             percpu_ref_init(&c->writes, bch2_writes_disabled,
 740                             PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
 741             mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
 742             bioset_init(&c->btree_bio, 1,
 743                         max(offsetof(struct btree_read_bio, bio),
 744                             offsetof(struct btree_write_bio, wbio.bio)),
 745                         BIOSET_NEED_BVECS) ||
 746             !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 747             !(c->online_reserved = alloc_percpu(u64)) ||
 748             mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 749                                         btree_bytes(c)) ||
 750             mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
 751             bch2_io_clock_init(&c->io_clock[READ]) ||
 752             bch2_io_clock_init(&c->io_clock[WRITE]) ||
 753             bch2_fs_journal_init(&c->journal) ||
 754             bch2_fs_replicas_init(c) ||
 755             bch2_fs_btree_cache_init(c) ||
 756             bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
 757             bch2_fs_btree_iter_init(c) ||
 758             bch2_fs_btree_interior_update_init(c) ||
 759             bch2_fs_io_init(c) ||
 760             bch2_fs_encryption_init(c) ||
 761             bch2_fs_compress_init(c) ||
 762             bch2_fs_ec_init(c) ||
 763             bch2_fs_fsio_init(c))
 764                 goto err;
 765
             /* Allocate a bch_dev for every member slot that exists in the superblock. */
 766         mi = bch2_sb_get_members(c->disk_sb.sb);
 767         for (i = 0; i < c->sb.nr_devices; i++)
 768                 if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
 769                     bch2_dev_alloc(c, i))
 770                         goto err;
 771
 772         /*
 773          * Now that all allocations have succeeded, init various refcounty
 774          * things that let us shutdown:
 775          */
 776         closure_init(&c->cl, NULL);
 777
 778         c->kobj.kset = bcachefs_kset;
 779         kobject_init(&c->kobj, &bch2_fs_ktype);
 780         kobject_init(&c->internal, &bch2_fs_internal_ktype);
 781         kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
 782         kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
 783
 784         mutex_lock(&bch_fs_list_lock);
 785         err = bch2_fs_online(c);
 786         mutex_unlock(&bch_fs_list_lock);
 787         if (err) {
 788                 bch_err(c, "bch2_fs_online() error: %s", err);
 789                 goto err;
 790         }
 791 out:
             /* c is NULL here only when the initial allocation failed or after err. */
 792         pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
 793         return c;
 794 err:
 795         bch2_fs_free(c);
 796         c = NULL;
 797         goto out;
 798 }
 799
 800 noinline_for_stack
 801 static void print_mount_opts(struct bch_fs *c)
 802 {
 803         enum bch_opt_id i;
 804         char buf[512];
 805         struct printbuf p = PBUF(buf);
 806         bool first = true;
 807
 808         strcpy(buf, "(null)");
 809
 810         if (c->opts.read_only) {
 811                 pr_buf(&p, "ro");
 812                 first = false;
 813         }
 814
 815         for (i = 0; i < bch2_opts_nr; i++) {
 816                 const struct bch_option *opt = &bch2_opt_table[i];
 817                 u64 v = bch2_opt_get_by_id(&c->opts, i);
 818
 819                 if (!(opt->mode & OPT_MOUNT))
 820                         continue;
 821
 822                 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
 823                         continue;
 824
 825                 if (!first)
 826                         pr_buf(&p, ",");
 827                 first = false;
 828                 bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE);
 829         }
 830
 831         bch_info(c, "mounted with opts: %s", buf);
 832 }
 833
 834 int bch2_fs_start(struct bch_fs *c)
 835 {
 836         const char *err = "cannot allocate memory";
 837         struct bch_sb_field_members *mi;
 838         struct bch_dev *ca;
 839         time64_t now = ktime_get_real_seconds();
 840         unsigned i;
 841         int ret = -EINVAL;
 842
 843         down_write(&c->state_lock);
 844
 845         BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
 846
 847         mutex_lock(&c->sb_lock);
 848
 849         for_each_online_member(ca, c, i)
 850                 bch2_sb_from_fs(c, ca);
 851
 852         mi = bch2_sb_get_members(c->disk_sb.sb);
 853         for_each_online_member(ca, c, i)
 854                 mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
 855
 856         mutex_unlock(&c->sb_lock);
 857
 858         for_each_rw_member(ca, c, i)
 859                 bch2_dev_allocator_add(c, ca);
 860         bch2_recalc_capacity(c);
 861
 862         ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
 863                 ? bch2_fs_recovery(c)
 864                 : bch2_fs_initialize(c);
 865         if (ret)
 866                 goto err;
 867
 868         ret = bch2_opts_check_may_set(c);
 869         if (ret)
 870                 goto err;
 871
 872         err = "dynamic fault";
 873         ret = -EINVAL;
 874         if (bch2_fs_init_fault("fs_start"))
 875                 goto err;
 876
 877         set_bit(BCH_FS_STARTED, &c->flags);
 878
 879         if (c->opts.read_only || c->opts.nochanges) {
 880                 bch2_fs_read_only(c);
 881         } else {
 882                 err = "error going read write";
 883                 ret = !test_bit(BCH_FS_RW, &c->flags)
 884                         ? bch2_fs_read_write(c)
 885                         : bch2_fs_read_write_late(c);
 886                 if (ret)
 887                         goto err;
 888         }
 889
 890         print_mount_opts(c);
 891         ret = 0;
 892 out:
 893         up_write(&c->state_lock);
 894         return ret;
 895 err:
 896         switch (ret) {
 897         case BCH_FSCK_ERRORS_NOT_FIXED:
 898                 bch_err(c, "filesystem contains errors: please report this to the developers");
 899                 pr_cont("mount with -o fix_errors to repair\n");
 900                 err = "fsck error";
 901                 break;
 902         case BCH_FSCK_REPAIR_UNIMPLEMENTED:
 903                 bch_err(c, "filesystem contains errors: please report this to the developers");
 904                 pr_cont("repair unimplemented: inform the developers so that it can be added\n");
 905                 err = "fsck error";
 906                 break;
 907         case BCH_FSCK_REPAIR_IMPOSSIBLE:
 908                 bch_err(c, "filesystem contains errors, but repair impossible");
 909                 err = "fsck error";
 910                 break;
 911         case BCH_FSCK_UNKNOWN_VERSION:
 912                 err = "unknown metadata version";;
 913                 break;
 914         case -ENOMEM:
 915                 err = "cannot allocate memory";
 916                 break;
 917         case -EIO:
 918                 err = "IO error";
 919                 break;
 920         }
 921
 922         if (ret >= 0)
 923                 ret = -EIO;
 924         goto out;
 925 }
 926
 927 static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 928 {
 929         struct bch_sb_field_members *sb_mi;
 930
 931         sb_mi = bch2_sb_get_members(sb);
 932         if (!sb_mi)
 933                 return "Invalid superblock: member info area missing";
 934
 935         if (le16_to_cpu(sb->block_size) != c->opts.block_size)
 936                 return "mismatched block size";
 937
 938         if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
 939             BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
 940                 return "new cache bucket size is too small";
 941
 942         return NULL;
 943 }
 944
 945 static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
 946 {
 947         struct bch_sb *newest =
 948                 le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
 949         struct bch_sb_field_members *mi = bch2_sb_get_members(newest);
 950
 951         if (!uuid_equal(&fs->uuid, &sb->uuid))
 952                 return "device not a member of filesystem";
 953
 954         if (!bch2_dev_exists(newest, mi, sb->dev_idx))
 955                 return "device has been removed";
 956
 957         if (fs->block_size != sb->block_size)
 958                 return "mismatched block size";
 959
 960         return NULL;
 961 }
 962
 963 /* Device startup/shutdown: */
 964
 965 static void bch2_dev_release(struct kobject *kobj)
 966 {
 967         struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
 968
 969         kfree(ca);
 970 }
 971
 972 static void bch2_dev_free(struct bch_dev *ca)
 973 {
 974         cancel_work_sync(&ca->io_error_work);
 975
 976         if (ca->kobj.state_in_sysfs &&
 977             ca->disk_sb.bdev)
 978                 sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
 979
 980         if (ca->kobj.state_in_sysfs)
 981                 kobject_del(&ca->kobj);
 982
 983         bch2_free_super(&ca->disk_sb);
 984         bch2_dev_journal_exit(ca);
 985
 986         free_percpu(ca->io_done);
 987         bioset_exit(&ca->replica_set);
 988         bch2_dev_buckets_free(ca);
 989         free_page((unsigned long) ca->sb_read_scratch);
 990
 991         bch2_time_stats_exit(&ca->io_latency[WRITE]);
 992         bch2_time_stats_exit(&ca->io_latency[READ]);
 993
 994         percpu_ref_exit(&ca->io_ref);
 995         percpu_ref_exit(&ca->ref);
 996         kobject_put(&ca->kobj);
 997 }
 998
 999 static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
1000 {
1001
1002         lockdep_assert_held(&c->state_lock);
1003
1004         if (percpu_ref_is_zero(&ca->io_ref))
1005                 return;
1006
1007         __bch2_dev_read_only(c, ca);
1008
1009         reinit_completion(&ca->io_ref_completion);
1010         percpu_ref_kill(&ca->io_ref);
1011         wait_for_completion(&ca->io_ref_completion);
1012
1013         if (ca->kobj.state_in_sysfs) {
1014                 sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
1015                 sysfs_remove_link(&ca->kobj, "block");
1016         }
1017
1018         bch2_free_super(&ca->disk_sb);
1019         bch2_dev_journal_exit(ca);
1020 }
1021
1022 static void bch2_dev_ref_complete(struct percpu_ref *ref)
1023 {
1024         struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
1025
1026         complete(&ca->ref_completion);
1027 }
1028
1029 static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
1030 {
1031         struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
1032
1033         complete(&ca->io_ref_completion);
1034 }
1035
1036 static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
1037 {
1038         int ret;
1039
1040         if (!c->kobj.state_in_sysfs)
1041                 return 0;
1042
1043         if (!ca->kobj.state_in_sysfs) {
1044                 ret = kobject_add(&ca->kobj, &c->kobj,
1045                                   "dev-%u", ca->dev_idx);
1046                 if (ret)
1047                         return ret;
1048         }
1049
1050         if (ca->disk_sb.bdev) {
1051                 struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
1052
1053                 ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
1054                 if (ret)
1055                         return ret;
1056
1057                 ret = sysfs_create_link(&ca->kobj, block, "block");
1058                 if (ret)
1059                         return ret;
1060         }
1061
1062         return 0;
1063 }
1064
1065 static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
1066                                         struct bch_member *member)
1067 {
1068         struct bch_dev *ca;
1069
1070         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1071         if (!ca)
1072                 return NULL;
1073
1074         kobject_init(&ca->kobj, &bch2_dev_ktype);
1075         init_completion(&ca->ref_completion);
1076         init_completion(&ca->io_ref_completion);
1077
1078         init_rwsem(&ca->bucket_lock);
1079
1080         INIT_WORK(&ca->io_error_work, bch2_io_error_work);
1081
1082         bch2_time_stats_init(&ca->io_latency[READ]);
1083         bch2_time_stats_init(&ca->io_latency[WRITE]);
1084
1085         ca->mi = bch2_mi_to_cpu(member);
1086         ca->uuid = member->uuid;
1087
1088         if (opt_defined(c->opts, discard))
1089                 ca->mi.discard = opt_get(c->opts, discard);
1090
1091         if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
1092                             0, GFP_KERNEL) ||
1093             percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
1094                             PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
1095             !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
1096             bch2_dev_buckets_alloc(c, ca) ||
1097             bioset_init(&ca->replica_set, 4,
1098                         offsetof(struct bch_write_bio, bio), 0) ||
1099             !(ca->io_done       = alloc_percpu(*ca->io_done)))
1100                 goto err;
1101
1102         return ca;
1103 err:
1104         bch2_dev_free(ca);
1105         return NULL;
1106 }
1107
1108 static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
1109                             unsigned dev_idx)
1110 {
1111         ca->dev_idx = dev_idx;
1112         __set_bit(ca->dev_idx, ca->self.d);
1113         scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
1114
1115         ca->fs = c;
1116         rcu_assign_pointer(c->devs[ca->dev_idx], ca);
1117
1118         if (bch2_dev_sysfs_online(c, ca))
1119                 pr_warn("error creating sysfs objects");
1120 }
1121
1122 static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
1123 {
1124         struct bch_member *member =
1125                 bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
1126         struct bch_dev *ca = NULL;
1127         int ret = 0;
1128
1129         pr_verbose_init(c->opts, "");
1130
1131         if (bch2_fs_init_fault("dev_alloc"))
1132                 goto err;
1133
1134         ca = __bch2_dev_alloc(c, member);
1135         if (!ca)
1136                 goto err;
1137
1138         bch2_dev_attach(c, ca, dev_idx);
1139 out:
1140         pr_verbose_init(c->opts, "ret %i", ret);
1141         return ret;
1142 err:
1143         if (ca)
1144                 bch2_dev_free(ca);
1145         ret = -ENOMEM;
1146         goto out;
1147 }
1148
1149 static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
1150 {
1151         unsigned ret;
1152
1153         if (bch2_dev_is_online(ca)) {
1154                 bch_err(ca, "already have device online in slot %u",
1155                         sb->sb->dev_idx);
1156                 return -EINVAL;
1157         }
1158
1159         if (get_capacity(sb->bdev->bd_disk) <
1160             ca->mi.bucket_size * ca->mi.nbuckets) {
1161                 bch_err(ca, "cannot online: device too small");
1162                 return -EINVAL;
1163         }
1164
1165         BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
1166
1167         if (get_capacity(sb->bdev->bd_disk) <
1168             ca->mi.bucket_size * ca->mi.nbuckets) {
1169                 bch_err(ca, "device too small");
1170                 return -EINVAL;
1171         }
1172
1173         ret = bch2_dev_journal_init(ca, sb->sb);
1174         if (ret)
1175                 return ret;
1176
1177         /* Commit: */
1178         ca->disk_sb = *sb;
1179         memset(sb, 0, sizeof(*sb));
1180
1181         percpu_ref_reinit(&ca->io_ref);
1182
1183         return 0;
1184 }
1185
1186 static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
1187 {
1188         struct bch_dev *ca;
1189         int ret;
1190
1191         lockdep_assert_held(&c->state_lock);
1192
1193         if (le64_to_cpu(sb->sb->seq) >
1194             le64_to_cpu(c->disk_sb.sb->seq))
1195                 bch2_sb_to_fs(c, sb->sb);
1196
1197         BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
1198                !c->devs[sb->sb->dev_idx]);
1199
1200         ca = bch_dev_locked(c, sb->sb->dev_idx);
1201
1202         ret = __bch2_dev_attach_bdev(ca, sb);
1203         if (ret)
1204                 return ret;
1205
1206         if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
1207             !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) {
1208                 mutex_lock(&c->sb_lock);
1209                 bch2_mark_dev_superblock(ca->fs, ca, 0);
1210                 mutex_unlock(&c->sb_lock);
1211         }
1212
1213         bch2_dev_sysfs_online(c, ca);
1214
1215         if (c->sb.nr_devices == 1)
1216                 snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
1217         snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
1218
1219         rebalance_wakeup(c);
1220         return 0;
1221 }
1222
1223 /* Device management: */
1224
1225 /*
1226  * Note: this function is also used by the error paths - when a particular
1227  * device sees an error, we call it to determine whether we can just set the
1228  * device RO, or - if this function returns false - we'll set the whole
1229  * filesystem RO:
1230  *
1231  * XXX: maybe we should be more explicit about whether we're changing state
1232  * because we got an error or what have you?
1233  */
1234 bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
1235                             enum bch_member_state new_state, int flags)
1236 {
1237         struct bch_devs_mask new_online_devs;
1238         struct replicas_status s;
1239         struct bch_dev *ca2;
1240         int i, nr_rw = 0, required;
1241
1242         lockdep_assert_held(&c->state_lock);
1243
1244         switch (new_state) {
1245         case BCH_MEMBER_STATE_RW:
1246                 return true;
1247         case BCH_MEMBER_STATE_RO:
1248                 if (ca->mi.state != BCH_MEMBER_STATE_RW)
1249                         return true;
1250
1251                 /* do we have enough devices to write to?  */
1252                 for_each_member_device(ca2, c, i)
1253                         if (ca2 != ca)
1254                                 nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
1255
1256                 required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
1257                                ? c->opts.metadata_replicas
1258                                : c->opts.metadata_replicas_required,
1259                                !(flags & BCH_FORCE_IF_DATA_DEGRADED)
1260                                ? c->opts.data_replicas
1261                                : c->opts.data_replicas_required);
1262
1263                 return nr_rw >= required;
1264         case BCH_MEMBER_STATE_FAILED:
1265         case BCH_MEMBER_STATE_SPARE:
1266                 if (ca->mi.state != BCH_MEMBER_STATE_RW &&
1267                     ca->mi.state != BCH_MEMBER_STATE_RO)
1268                         return true;
1269
1270                 /* do we have enough devices to read from?  */
1271                 new_online_devs = bch2_online_devs(c);
1272                 __clear_bit(ca->dev_idx, new_online_devs.d);
1273
1274                 s = __bch2_replicas_status(c, new_online_devs);
1275
1276                 return bch2_have_enough_devs(s, flags);
1277         default:
1278                 BUG();
1279         }
1280 }
1281
1282 static bool bch2_fs_may_start(struct bch_fs *c)
1283 {
1284         struct replicas_status s;
1285         struct bch_sb_field_members *mi;
1286         struct bch_dev *ca;
1287         unsigned i, flags = c->opts.degraded
1288                 ? BCH_FORCE_IF_DEGRADED
1289                 : 0;
1290
1291         if (!c->opts.degraded) {
1292                 mutex_lock(&c->sb_lock);
1293                 mi = bch2_sb_get_members(c->disk_sb.sb);
1294
1295                 for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
1296                         if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
1297                                 continue;
1298
1299                         ca = bch_dev_locked(c, i);
1300
1301                         if (!bch2_dev_is_online(ca) &&
1302                             (ca->mi.state == BCH_MEMBER_STATE_RW ||
1303                              ca->mi.state == BCH_MEMBER_STATE_RO)) {
1304                                 mutex_unlock(&c->sb_lock);
1305                                 return false;
1306                         }
1307                 }
1308                 mutex_unlock(&c->sb_lock);
1309         }
1310
1311         s = bch2_replicas_status(c);
1312
1313         return bch2_have_enough_devs(s, flags);
1314 }
1315
1316 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
1317 {
1318         /*
1319          * Device going read only means the copygc reserve get smaller, so we
1320          * don't want that happening while copygc is in progress:
1321          */
1322         bch2_copygc_stop(c);
1323
1324         /*
1325          * The allocator thread itself allocates btree nodes, so stop it first:
1326          */
1327         bch2_dev_allocator_stop(ca);
1328         bch2_dev_allocator_remove(c, ca);
1329         bch2_dev_journal_stop(&c->journal, ca);
1330
1331         bch2_copygc_start(c);
1332 }
1333
1334 static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
1335 {
1336         lockdep_assert_held(&c->state_lock);
1337
1338         BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
1339
1340         bch2_dev_allocator_add(c, ca);
1341         bch2_recalc_capacity(c);
1342
1343         if (bch2_dev_allocator_start(ca))
1344                 return "error starting allocator thread";
1345
1346         return NULL;
1347 }
1348
1349 int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1350                          enum bch_member_state new_state, int flags)
1351 {
1352         struct bch_sb_field_members *mi;
1353         int ret = 0;
1354
1355         if (ca->mi.state == new_state)
1356                 return 0;
1357
1358         if (!bch2_dev_state_allowed(c, ca, new_state, flags))
1359                 return -EINVAL;
1360
1361         if (new_state != BCH_MEMBER_STATE_RW)
1362                 __bch2_dev_read_only(c, ca);
1363
1364         bch_notice(ca, "%s", bch2_dev_state[new_state]);
1365
1366         mutex_lock(&c->sb_lock);
1367         mi = bch2_sb_get_members(c->disk_sb.sb);
1368         SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
1369         bch2_write_super(c);
1370         mutex_unlock(&c->sb_lock);
1371
1372         if (new_state == BCH_MEMBER_STATE_RW &&
1373             __bch2_dev_read_write(c, ca))
1374                 ret = -ENOMEM;
1375
1376         rebalance_wakeup(c);
1377
1378         return ret;
1379 }
1380
1381 int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1382                        enum bch_member_state new_state, int flags)
1383 {
1384         int ret;
1385
1386         down_write(&c->state_lock);
1387         ret = __bch2_dev_set_state(c, ca, new_state, flags);
1388         up_write(&c->state_lock);
1389
1390         return ret;
1391 }
1392
1393 /* Device add/removal: */
1394
1395 int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
1396 {
1397         struct btree_trans trans;
1398         size_t i;
1399         int ret;
1400
1401         bch2_trans_init(&trans, c, 0, 0);
1402
1403         for (i = 0; i < ca->mi.nbuckets; i++) {
1404                 ret = bch2_btree_key_cache_flush(&trans,
1405                                 BTREE_ID_ALLOC, POS(ca->dev_idx, i));
1406                 if (ret)
1407                         break;
1408         }
1409         bch2_trans_exit(&trans);
1410
1411         if (ret)
1412                 return ret;
1413
1414         return bch2_btree_delete_range(c, BTREE_ID_ALLOC,
1415                                        POS(ca->dev_idx, 0),
1416                                        POS(ca->dev_idx + 1, 0),
1417                                        NULL);
1418 }
1419
1420 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
1421 {
1422         struct bch_sb_field_members *mi;
1423         unsigned dev_idx = ca->dev_idx, data;
1424         int ret = -EINVAL;
1425
1426         down_write(&c->state_lock);
1427
1428         /*
1429          * We consume a reference to ca->ref, regardless of whether we succeed
1430          * or fail:
1431          */
1432         percpu_ref_put(&ca->ref);
1433
1434         if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1435                 bch_err(ca, "Cannot remove without losing data");
1436                 goto err;
1437         }
1438
1439         __bch2_dev_read_only(c, ca);
1440
1441         ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
1442         if (ret) {
1443                 bch_err(ca, "Remove failed: error %i dropping data", ret);
1444                 goto err;
1445         }
1446
1447         ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
1448         if (ret) {
1449                 bch_err(ca, "Remove failed: error %i flushing journal", ret);
1450                 goto err;
1451         }
1452
1453         ret = bch2_dev_remove_alloc(c, ca);
1454         if (ret) {
1455                 bch_err(ca, "Remove failed, error deleting alloc info");
1456                 goto err;
1457         }
1458
1459         /*
1460          * must flush all existing journal entries, they might have
1461          * (overwritten) keys that point to the device we're removing:
1462          */
1463         bch2_journal_flush_all_pins(&c->journal);
1464         /*
1465          * hack to ensure bch2_replicas_gc2() clears out entries to this device
1466          */
1467         bch2_journal_meta(&c->journal);
1468         ret = bch2_journal_error(&c->journal);
1469         if (ret) {
1470                 bch_err(ca, "Remove failed, journal error");
1471                 goto err;
1472         }
1473
1474         ret = bch2_replicas_gc2(c);
1475         if (ret) {
1476                 bch_err(ca, "Remove failed: error %i from replicas gc", ret);
1477                 goto err;
1478         }
1479
1480         data = bch2_dev_has_data(c, ca);
1481         if (data) {
1482                 char data_has_str[100];
1483
1484                 bch2_flags_to_text(&PBUF(data_has_str),
1485                                    bch2_data_types, data);
1486                 bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
1487                 ret = -EBUSY;
1488                 goto err;
1489         }
1490
1491         __bch2_dev_offline(c, ca);
1492
1493         mutex_lock(&c->sb_lock);
1494         rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
1495         mutex_unlock(&c->sb_lock);
1496
1497         percpu_ref_kill(&ca->ref);
1498         wait_for_completion(&ca->ref_completion);
1499
1500         bch2_dev_free(ca);
1501
1502         /*
1503          * Free this device's slot in the bch_member array - all pointers to
1504          * this device must be gone:
1505          */
1506         mutex_lock(&c->sb_lock);
1507         mi = bch2_sb_get_members(c->disk_sb.sb);
1508         memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
1509
1510         bch2_write_super(c);
1511
1512         mutex_unlock(&c->sb_lock);
1513         up_write(&c->state_lock);
1514         return 0;
1515 err:
1516         if (ca->mi.state == BCH_MEMBER_STATE_RW &&
1517             !percpu_ref_is_zero(&ca->io_ref))
1518                 __bch2_dev_read_write(c, ca);
1519         up_write(&c->state_lock);
1520         return ret;
1521 }
1522
1523 static void dev_usage_clear(struct bch_dev *ca)
1524 {
1525         struct bucket_array *buckets;
1526
1527         percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
1528
1529         down_read(&ca->bucket_lock);
1530         buckets = bucket_array(ca);
1531
1532         memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
1533         up_read(&ca->bucket_lock);
1534 }
1535
1536 /* Add new device to running filesystem: */
1537 int bch2_dev_add(struct bch_fs *c, const char *path)
1538 {
1539         struct bch_opts opts = bch2_opts_empty();
1540         struct bch_sb_handle sb;
1541         const char *err;
1542         struct bch_dev *ca = NULL;
1543         struct bch_sb_field_members *mi;
1544         struct bch_member dev_mi;
1545         unsigned dev_idx, nr_devices, u64s;
1546         int ret;
1547
1548         ret = bch2_read_super(path, &opts, &sb);
1549         if (ret)
1550                 return ret;
1551
1552         err = bch2_sb_validate(&sb);
1553         if (err)
1554                 return -EINVAL;
1555
1556         dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
1557
1558         err = bch2_dev_may_add(sb.sb, c);
1559         if (err)
1560                 return -EINVAL;
1561
1562         ca = __bch2_dev_alloc(c, &dev_mi);
1563         if (!ca) {
1564                 bch2_free_super(&sb);
1565                 return -ENOMEM;
1566         }
1567
1568         ret = __bch2_dev_attach_bdev(ca, &sb);
1569         if (ret) {
1570                 bch2_dev_free(ca);
1571                 return ret;
1572         }
1573
1574         /*
1575          * We want to allocate journal on the new device before adding the new
1576          * device to the filesystem because allocating after we attach requires
1577          * spinning up the allocator thread, and the allocator thread requires
1578          * doing btree writes, which if the existing devices are RO isn't going
1579          * to work
1580          *
1581          * So we have to mark where the superblocks are, but marking allocated
1582          * data normally updates the filesystem usage too, so we have to mark,
1583          * allocate the journal, reset all the marks, then remark after we
1584          * attach...
1585          */
1586         bch2_mark_dev_superblock(ca->fs, ca, 0);
1587
1588         err = "journal alloc failed";
1589         ret = bch2_dev_journal_alloc(ca);
1590         if (ret)
1591                 goto err;
1592
1593         dev_usage_clear(ca);
1594
1595         down_write(&c->state_lock);
1596         mutex_lock(&c->sb_lock);
1597
1598         err = "insufficient space in new superblock";
1599         ret = bch2_sb_from_fs(c, ca);
1600         if (ret)
1601                 goto err_unlock;
1602
1603         mi = bch2_sb_get_members(ca->disk_sb.sb);
1604
1605         if (!bch2_sb_resize_members(&ca->disk_sb,
1606                                 le32_to_cpu(mi->field.u64s) +
1607                                 sizeof(dev_mi) / sizeof(u64))) {
1608                 ret = -ENOSPC;
1609                 goto err_unlock;
1610         }
1611
1612         if (dynamic_fault("bcachefs:add:no_slot"))
1613                 goto no_slot;
1614
1615         mi = bch2_sb_get_members(c->disk_sb.sb);
1616         for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
1617                 if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
1618                         goto have_slot;
1619 no_slot:
1620         err = "no slots available in superblock";
1621         ret = -ENOSPC;
1622         goto err_unlock;
1623
1624 have_slot:
1625         nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
1626         u64s = (sizeof(struct bch_sb_field_members) +
1627                 sizeof(struct bch_member) * nr_devices) / sizeof(u64);
1628
1629         err = "no space in superblock for member info";
1630         ret = -ENOSPC;
1631
1632         mi = bch2_sb_resize_members(&c->disk_sb, u64s);
1633         if (!mi)
1634                 goto err_unlock;
1635
1636         /* success: */
1637
1638         mi->members[dev_idx] = dev_mi;
1639         mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds());
1640         c->disk_sb.sb->nr_devices       = nr_devices;
1641
1642         ca->disk_sb.sb->dev_idx = dev_idx;
1643         bch2_dev_attach(c, ca, dev_idx);
1644
1645         bch2_mark_dev_superblock(c, ca, 0);
1646
1647         bch2_write_super(c);
1648         mutex_unlock(&c->sb_lock);
1649
1650         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1651                 err = __bch2_dev_read_write(c, ca);
1652                 if (err)
1653                         goto err_late;
1654         }
1655
1656         up_write(&c->state_lock);
1657         return 0;
1658
1659 err_unlock:
1660         mutex_unlock(&c->sb_lock);
1661         up_write(&c->state_lock);
1662 err:
1663         if (ca)
1664                 bch2_dev_free(ca);
1665         bch2_free_super(&sb);
1666         bch_err(c, "Unable to add device: %s", err);
1667         return ret;
1668 err_late:
1669         bch_err(c, "Error going rw after adding device: %s", err);
1670         return -EINVAL;
1671 }
1672
1673 /* Hot add existing device to running filesystem: */
1674 int bch2_dev_online(struct bch_fs *c, const char *path)
1675 {
1676         struct bch_opts opts = bch2_opts_empty();
1677         struct bch_sb_handle sb = { NULL };
1678         struct bch_sb_field_members *mi;
1679         struct bch_dev *ca;
1680         unsigned dev_idx;
1681         const char *err;
1682         int ret;
1683
1684         down_write(&c->state_lock);
1685
1686         ret = bch2_read_super(path, &opts, &sb);
1687         if (ret) {
1688                 up_write(&c->state_lock);
1689                 return ret;
1690         }
1691
1692         dev_idx = sb.sb->dev_idx;
1693
1694         err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
1695         if (err)
1696                 goto err;
1697
1698         if (bch2_dev_attach_bdev(c, &sb)) {
1699                 err = "bch2_dev_attach_bdev() error";
1700                 goto err;
1701         }
1702
1703         ca = bch_dev_locked(c, dev_idx);
1704         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1705                 err = __bch2_dev_read_write(c, ca);
1706                 if (err)
1707                         goto err;
1708         }
1709
1710         mutex_lock(&c->sb_lock);
1711         mi = bch2_sb_get_members(c->disk_sb.sb);
1712
1713         mi->members[ca->dev_idx].last_mount =
1714                 cpu_to_le64(ktime_get_real_seconds());
1715
1716         bch2_write_super(c);
1717         mutex_unlock(&c->sb_lock);
1718
1719         up_write(&c->state_lock);
1720         return 0;
1721 err:
1722         up_write(&c->state_lock);
1723         bch2_free_super(&sb);
1724         bch_err(c, "error bringing %s online: %s", path, err);
1725         return -EINVAL;
1726 }
1727
1728 int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
1729 {
1730         down_write(&c->state_lock);
1731
1732         if (!bch2_dev_is_online(ca)) {
1733                 bch_err(ca, "Already offline");
1734                 up_write(&c->state_lock);
1735                 return 0;
1736         }
1737
1738         if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1739                 bch_err(ca, "Cannot offline required disk");
1740                 up_write(&c->state_lock);
1741                 return -EINVAL;
1742         }
1743
1744         __bch2_dev_offline(c, ca);
1745
1746         up_write(&c->state_lock);
1747         return 0;
1748 }
1749
1750 int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1751 {
1752         struct bch_member *mi;
1753         int ret = 0;
1754
1755         down_write(&c->state_lock);
1756
1757         if (nbuckets < ca->mi.nbuckets) {
1758                 bch_err(ca, "Cannot shrink yet");
1759                 ret = -EINVAL;
1760                 goto err;
1761         }
1762
1763         if (bch2_dev_is_online(ca) &&
1764             get_capacity(ca->disk_sb.bdev->bd_disk) <
1765             ca->mi.bucket_size * nbuckets) {
1766                 bch_err(ca, "New size larger than device");
1767                 ret = -EINVAL;
1768                 goto err;
1769         }
1770
1771         ret = bch2_dev_buckets_resize(c, ca, nbuckets);
1772         if (ret) {
1773                 bch_err(ca, "Resize error: %i", ret);
1774                 goto err;
1775         }
1776
1777         mutex_lock(&c->sb_lock);
1778         mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
1779         mi->nbuckets = cpu_to_le64(nbuckets);
1780
1781         bch2_write_super(c);
1782         mutex_unlock(&c->sb_lock);
1783
1784         bch2_recalc_capacity(c);
1785 err:
1786         up_write(&c->state_lock);
1787         return ret;
1788 }
1789
1790 /* return with ref on ca->ref: */
1791 struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
1792 {
1793         struct bch_dev *ca;
1794         dev_t dev;
1795         unsigned i;
1796         int ret;
1797
1798         ret = lookup_bdev(path, &dev);
1799         if (ret)
1800                 return ERR_PTR(ret);
1801
1802         for_each_member_device(ca, c, i)
1803                 if (ca->disk_sb.bdev->bd_dev == dev)
1804                         goto found;
1805
1806         ca = ERR_PTR(-ENOENT);
1807 found:
1808         return ca;
1809 }
1810
1811 /* Filesystem open: */
1812
1813 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
1814                             struct bch_opts opts)
1815 {
1816         struct bch_sb_handle *sb = NULL;
1817         struct bch_fs *c = NULL;
1818         struct bch_sb_field_members *mi;
1819         unsigned i, best_sb = 0;
1820         const char *err;
1821         int ret = -ENOMEM;
1822
1823         pr_verbose_init(opts, "");
1824
1825         if (!nr_devices) {
1826                 c = ERR_PTR(-EINVAL);
1827                 goto out2;
1828         }
1829
1830         if (!try_module_get(THIS_MODULE)) {
1831                 c = ERR_PTR(-ENODEV);
1832                 goto out2;
1833         }
1834
1835         sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
1836         if (!sb)
1837                 goto err;
1838
1839         for (i = 0; i < nr_devices; i++) {
1840                 ret = bch2_read_super(devices[i], &opts, &sb[i]);
1841                 if (ret)
1842                         goto err;
1843
1844                 err = bch2_sb_validate(&sb[i]);
1845                 if (err)
1846                         goto err_print;
1847         }
1848
1849         for (i = 1; i < nr_devices; i++)
1850                 if (le64_to_cpu(sb[i].sb->seq) >
1851                     le64_to_cpu(sb[best_sb].sb->seq))
1852                         best_sb = i;
1853
1854         mi = bch2_sb_get_members(sb[best_sb].sb);
1855
1856         i = 0;
1857         while (i < nr_devices) {
1858                 if (i != best_sb &&
1859                     !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
1860                         pr_info("%pg has been removed, skipping", sb[i].bdev);
1861                         bch2_free_super(&sb[i]);
1862                         array_remove_item(sb, nr_devices, i);
1863                         continue;
1864                 }
1865
1866                 err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
1867                 if (err)
1868                         goto err_print;
1869                 i++;
1870         }
1871
1872         ret = -ENOMEM;
1873         c = bch2_fs_alloc(sb[best_sb].sb, opts);
1874         if (!c)
1875                 goto err;
1876
1877         err = "bch2_dev_online() error";
1878         down_write(&c->state_lock);
1879         for (i = 0; i < nr_devices; i++)
1880                 if (bch2_dev_attach_bdev(c, &sb[i])) {
1881                         up_write(&c->state_lock);
1882                         goto err_print;
1883                 }
1884         up_write(&c->state_lock);
1885
1886         err = "insufficient devices";
1887         if (!bch2_fs_may_start(c))
1888                 goto err_print;
1889
1890         if (!c->opts.nostart) {
1891                 ret = bch2_fs_start(c);
1892                 if (ret)
1893                         goto err;
1894         }
1895 out:
1896         kfree(sb);
1897         module_put(THIS_MODULE);
1898 out2:
1899         pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
1900         return c;
1901 err_print:
1902         pr_err("bch_fs_open err opening %s: %s",
1903                devices[0], err);
1904         ret = -EINVAL;
1905 err:
1906         if (c)
1907                 bch2_fs_stop(c);
1908         for (i = 0; i < nr_devices; i++)
1909                 bch2_free_super(&sb[i]);
1910         c = ERR_PTR(ret);
1911         goto out;
1912 }
1913
1914 static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
1915                                               struct bch_opts opts)
1916 {
1917         const char *err;
1918         struct bch_fs *c;
1919         bool allocated_fs = false;
1920         int ret;
1921
1922         err = bch2_sb_validate(sb);
1923         if (err)
1924                 return err;
1925
1926         mutex_lock(&bch_fs_list_lock);
1927         c = __bch2_uuid_to_fs(sb->sb->uuid);
1928         if (c) {
1929                 closure_get(&c->cl);
1930
1931                 err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
1932                 if (err)
1933                         goto err;
1934         } else {
1935                 c = bch2_fs_alloc(sb->sb, opts);
1936                 err = "cannot allocate memory";
1937                 if (!c)
1938                         goto err;
1939
1940                 allocated_fs = true;
1941         }
1942
1943         err = "bch2_dev_online() error";
1944
1945         mutex_lock(&c->sb_lock);
1946         if (bch2_dev_attach_bdev(c, sb)) {
1947                 mutex_unlock(&c->sb_lock);
1948                 goto err;
1949         }
1950         mutex_unlock(&c->sb_lock);
1951
1952         if (!c->opts.nostart && bch2_fs_may_start(c)) {
1953                 err = "error starting filesystem";
1954                 ret = bch2_fs_start(c);
1955                 if (ret)
1956                         goto err;
1957         }
1958
1959         closure_put(&c->cl);
1960         mutex_unlock(&bch_fs_list_lock);
1961
1962         return NULL;
1963 err:
1964         mutex_unlock(&bch_fs_list_lock);
1965
1966         if (allocated_fs)
1967                 bch2_fs_stop(c);
1968         else if (c)
1969                 closure_put(&c->cl);
1970
1971         return err;
1972 }
1973
1974 const char *bch2_fs_open_incremental(const char *path)
1975 {
1976         struct bch_sb_handle sb;
1977         struct bch_opts opts = bch2_opts_empty();
1978         const char *err;
1979
1980         if (bch2_read_super(path, &opts, &sb))
1981                 return "error reading superblock";
1982
1983         err = __bch2_fs_open_incremental(&sb, opts);
1984         bch2_free_super(&sb);
1985
1986         return err;
1987 }
1988
1989 /* Global interfaces/init */
1990
1991 static void bcachefs_exit(void)
1992 {
1993         bch2_debug_exit();
1994         bch2_vfs_exit();
1995         bch2_chardev_exit();
1996         if (bcachefs_kset)
1997                 kset_unregister(bcachefs_kset);
1998 }
1999
2000 static int __init bcachefs_init(void)
2001 {
2002         bch2_bkey_pack_test();
2003         bch2_inode_pack_test();
2004
2005         if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
2006             bch2_chardev_init() ||
2007             bch2_vfs_init() ||
2008             bch2_debug_init())
2009                 goto err;
2010
2011         return 0;
2012 err:
2013         bcachefs_exit();
2014         return -ENOMEM;
2015 }
2016
/*
 * For every entry listed in BCH_DEBUG_PARAMS(), define a global bool
 * bch2_<name> and expose it as the module parameter <name> (mode 0644) with
 * the given description.
 */
#define BCH_DEBUG_PARAM(name, description)                      \
        bool bch2_##name;                                       \
        module_param_named(name, bch2_##name, bool, 0644);      \
        MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

/*
 * Exposed as the module parameter "version" (mode 0400); initialized to
 * bcachefs_metadata_version_current.
 */
unsigned bch2_metadata_version = bcachefs_metadata_version_current;
module_param_named(version, bch2_metadata_version, uint, 0400);

/* Register the module's exit and init entry points */
module_exit(bcachefs_exit);
module_init(bcachefs_init);