bcachefs: Add a workqueue for journal reclaim
linux-block.git: fs/bcachefs/super.c
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * bcachefs setup/teardown code, and some metadata io - read a superblock and
4 * figure out what to do with it.
5 *
6 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
7 * Copyright 2012 Google, Inc.
8 */
9
10#include "bcachefs.h"
11#include "alloc_background.h"
12#include "alloc_foreground.h"
13#include "bkey_sort.h"
14#include "btree_cache.h"
15#include "btree_gc.h"
16#include "btree_update_interior.h"
17#include "btree_io.h"
18#include "chardev.h"
19#include "checksum.h"
20#include "clock.h"
21#include "compress.h"
22#include "debug.h"
23#include "disk_groups.h"
24#include "ec.h"
25#include "error.h"
26#include "fs.h"
27#include "fs-io.h"
28#include "fsck.h"
29#include "inode.h"
30#include "io.h"
31#include "journal.h"
32#include "journal_reclaim.h"
33#include "move.h"
34#include "migrate.h"
35#include "movinggc.h"
36#include "quota.h"
37#include "rebalance.h"
38#include "recovery.h"
39#include "replicas.h"
40#include "super.h"
41#include "super-io.h"
42#include "sysfs.h"
43#include "trace.h"
44
45#include <linux/backing-dev.h>
46#include <linux/blkdev.h>
47#include <linux/debugfs.h>
48#include <linux/device.h>
49#include <linux/idr.h>
50#include <linux/kthread.h>
51#include <linux/module.h>
52#include <linux/percpu.h>
53#include <linux/random.h>
54#include <linux/sysfs.h>
55#include <crypto/hash.h>
56
57MODULE_LICENSE("GPL");
58MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
59
60#define KTYPE(type) \
61static const struct attribute_group type ## _group = { \
62 .attrs = type ## _files \
63}; \
64 \
65static const struct attribute_group *type ## _groups[] = { \
66 &type ## _group, \
67 NULL \
68}; \
69 \
70static const struct kobj_type type ## _ktype = { \
71 .release = type ## _release, \
72 .sysfs_ops = &type ## _sysfs_ops, \
73 .default_groups = type ## _groups \
74}
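/*
 * KTYPE(foo) expands to a foo_group/foo_groups pair wrapping the foo_files
 * attribute array, plus a foo_ktype whose release hook, sysfs_ops and
 * default_groups all come from the same foo_ prefix; the instantiations
 * below assume the corresponding foo_files and foo_sysfs_ops are provided
 * by the sysfs code.
 */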
75
76static void bch2_fs_release(struct kobject *);
77static void bch2_dev_release(struct kobject *);
78
79static void bch2_fs_internal_release(struct kobject *k)
80{
81}
82
83static void bch2_fs_opts_dir_release(struct kobject *k)
84{
85}
86
87static void bch2_fs_time_stats_release(struct kobject *k)
88{
89}
90
91KTYPE(bch2_fs);
92KTYPE(bch2_fs_internal);
93KTYPE(bch2_fs_opts_dir);
94KTYPE(bch2_fs_time_stats);
95KTYPE(bch2_dev);
96
97static struct kset *bcachefs_kset;
98static LIST_HEAD(bch_fs_list);
99static DEFINE_MUTEX(bch_fs_list_lock);
100
101static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
102
103static void bch2_dev_free(struct bch_dev *);
104static int bch2_dev_alloc(struct bch_fs *, unsigned);
105static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
106static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
107
108struct bch_fs *bch2_dev_to_fs(dev_t dev)
109{
110 struct bch_fs *c;
111 struct bch_dev *ca;
112 unsigned i;
113
114 mutex_lock(&bch_fs_list_lock);
115 rcu_read_lock();
116
117 list_for_each_entry(c, &bch_fs_list, list)
118 for_each_member_device_rcu(ca, c, i, NULL)
119 if (ca->disk_sb.bdev->bd_dev == dev) {
120 closure_get(&c->cl);
121 goto found;
122 }
123 c = NULL;
124found:
125 rcu_read_unlock();
126 mutex_unlock(&bch_fs_list_lock);
127
128 return c;
129}
130
131static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
132{
133 struct bch_fs *c;
134
135 lockdep_assert_held(&bch_fs_list_lock);
136
137 list_for_each_entry(c, &bch_fs_list, list)
138 if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
139 return c;
140
141 return NULL;
142}
143
144struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
145{
146 struct bch_fs *c;
147
148 mutex_lock(&bch_fs_list_lock);
149 c = __bch2_uuid_to_fs(uuid);
150 if (c)
151 closure_get(&c->cl);
152 mutex_unlock(&bch_fs_list_lock);
153
154 return c;
155}
156
157/* Filesystem RO/RW: */
158
159/*
160 * For startup/shutdown of RW stuff, the dependencies are:
161 *
162 * - foreground writes depend on copygc and rebalance (to free up space)
163 *
164 * - copygc and rebalance depend on mark and sweep gc (they actually probably
165 * don't because they either reserve ahead of time or don't block if
166 * allocations fail, but allocations can require mark and sweep gc to run
167 * because of generation number wraparound)
168 *
169 * - all of the above depends on the allocator threads
170 *
171 * - allocator depends on the journal (when it rewrites prios and gens)
172 */
173
174static void __bch2_fs_read_only(struct bch_fs *c)
175{
176 struct bch_dev *ca;
177 unsigned i;
178
179 bch2_rebalance_stop(c);
180
181 for_each_member_device(ca, c, i)
182 bch2_copygc_stop(ca);
183
184 bch2_gc_thread_stop(c);
185
186 /*
187 * Flush journal before stopping allocators, because flushing journal
188 * blacklist entries involves allocating new btree nodes:
189 */
190 bch2_journal_flush_all_pins(&c->journal);
191
192 for_each_member_device(ca, c, i)
193 bch2_dev_allocator_stop(ca);
194
195 bch2_journal_flush_all_pins(&c->journal);
196
197 /*
198 * We need to explicitly wait on btree interior updates to complete
199 * before stopping the journal, flushing all journal pins isn't
200 * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
201 * interior updates have to drop their journal pin before they're
202 * fully complete:
203 */
204 closure_wait_event(&c->btree_interior_update_wait,
205 !bch2_btree_interior_updates_nr_pending(c));
206
207 bch2_fs_journal_stop(&c->journal);
208
209 /*
210	 * The journal kicks off btree writes via reclaim - wait for in-flight
211	 * writes after stopping the journal:
212 */
213 if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
214 bch2_btree_flush_all_writes(c);
215 else
216 bch2_btree_verify_flushed(c);
217
218 /*
219 * After stopping journal:
220 */
221 for_each_member_device(ca, c, i)
222 bch2_dev_allocator_remove(c, ca);
223}
224
225static void bch2_writes_disabled(struct percpu_ref *writes)
226{
227 struct bch_fs *c = container_of(writes, struct bch_fs, writes);
228
229 set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
230 wake_up(&bch_read_only_wait);
231}
232
233void bch2_fs_read_only(struct bch_fs *c)
234{
235 if (c->state == BCH_FS_RO)
236 return;
237
238 BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
239
240 /*
241 * Block new foreground-end write operations from starting - any new
242 * writes will return -EROFS:
243 *
244	 * (This is really blocking new _allocations_; writes to previously
245 * allocated space can still happen until stopping the allocator in
246 * bch2_dev_allocator_stop()).
247 */
248 percpu_ref_kill(&c->writes);
249
250 cancel_delayed_work(&c->pd_controllers_update);
251
252 /*
253 * If we're not doing an emergency shutdown, we want to wait on
254 * outstanding writes to complete so they don't see spurious errors due
255 * to shutting down the allocator:
256 *
257	 * If we are doing an emergency shutdown, outstanding writes may
258	 * hang until we shut down the allocator, so we don't want to wait
259 * on outstanding writes before shutting everything down - but
260 * we do need to wait on them before returning and signalling
261 * that going RO is complete:
262 */
263 wait_event(bch_read_only_wait,
264 test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
265 test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
266
267 __bch2_fs_read_only(c);
268
269 wait_event(bch_read_only_wait,
270 test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
271
272 clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
273
274 if (!bch2_journal_error(&c->journal) &&
275 !test_bit(BCH_FS_ERROR, &c->flags) &&
276 !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
277 bch2_fs_mark_clean(c, true);
278
279 if (c->state != BCH_FS_STOPPING)
280 c->state = BCH_FS_RO;
281}
282
283static void bch2_fs_read_only_work(struct work_struct *work)
284{
285 struct bch_fs *c =
286 container_of(work, struct bch_fs, read_only_work);
287
288 mutex_lock(&c->state_lock);
289 bch2_fs_read_only(c);
290 mutex_unlock(&c->state_lock);
291}
292
293static void bch2_fs_read_only_async(struct bch_fs *c)
294{
295 queue_work(system_long_wq, &c->read_only_work);
296}
297
298bool bch2_fs_emergency_read_only(struct bch_fs *c)
299{
300 bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
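	/*
	 * ret is true only for the caller that actually flipped the flag;
	 * concurrent callers still force the journal to halt and go RO below.
	 */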
301
302 bch2_fs_read_only_async(c);
303 bch2_journal_halt(&c->journal);
304
305 wake_up(&bch_read_only_wait);
306 return ret;
307}
308
309const char *bch2_fs_read_write(struct bch_fs *c)
310{
311 struct bch_dev *ca;
312 const char *err = NULL;
313 unsigned i;
314
315 if (c->state == BCH_FS_RW)
316 return NULL;
317
318 bch2_fs_mark_clean(c, false);
319
320 for_each_rw_member(ca, c, i)
321 bch2_dev_allocator_add(c, ca);
322 bch2_recalc_capacity(c);
323
324 err = "error starting allocator thread";
325 for_each_rw_member(ca, c, i)
326 if (bch2_dev_allocator_start(ca)) {
327 percpu_ref_put(&ca->io_ref);
328 goto err;
329 }
330
331 err = "error starting btree GC thread";
332 if (bch2_gc_thread_start(c))
333 goto err;
334
335 err = "error starting copygc thread";
336 for_each_rw_member(ca, c, i)
337 if (bch2_copygc_start(c, ca)) {
338 percpu_ref_put(&ca->io_ref);
339 goto err;
340 }
341
342 err = "error starting rebalance thread";
343 if (bch2_rebalance_start(c))
344 goto err;
345
346 schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
347
348 if (c->state != BCH_FS_STARTING)
349 percpu_ref_reinit(&c->writes);
350
351 c->state = BCH_FS_RW;
352 return NULL;
353err:
354 __bch2_fs_read_only(c);
355 return err;
356}
357
358/* Filesystem startup/shutdown: */
359
360static void bch2_fs_free(struct bch_fs *c)
361{
362 unsigned i;
363
364 for (i = 0; i < BCH_TIME_STAT_NR; i++)
365 bch2_time_stats_exit(&c->times[i]);
366
367 bch2_fs_quota_exit(c);
368 bch2_fs_fsio_exit(c);
369	bch2_fs_ec_exit(c);
370 bch2_fs_encryption_exit(c);
371 bch2_fs_io_exit(c);
372 bch2_fs_btree_cache_exit(c);
373 bch2_fs_journal_exit(&c->journal);
374 bch2_io_clock_exit(&c->io_clock[WRITE]);
375 bch2_io_clock_exit(&c->io_clock[READ]);
376 bch2_fs_compress_exit(c);
377	percpu_free_rwsem(&c->mark_lock);
378	free_percpu(c->usage[0]);
379	free_percpu(c->pcpu);
380	mempool_exit(&c->btree_iters_pool);
381 mempool_exit(&c->btree_bounce_pool);
382 bioset_exit(&c->btree_bio);
383 mempool_exit(&c->btree_interior_update_pool);
384 mempool_exit(&c->btree_reserve_pool);
385 mempool_exit(&c->fill_iter);
386 percpu_ref_exit(&c->writes);
387	kfree(c->replicas.entries);
388	kfree(c->replicas_gc.entries);
389 kfree(rcu_dereference_protected(c->disk_groups, 1));
390
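	/*
	 * journal_reclaim_wq is the dedicated workqueue this patch adds for
	 * journal reclaim work; like the other workqueues it may not have
	 * been allocated yet if we're tearing down a partially set up fs.
	 */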
391 if (c->journal_reclaim_wq)
392 destroy_workqueue(c->journal_reclaim_wq);
393 if (c->copygc_wq)
394 destroy_workqueue(c->copygc_wq);
395 if (c->wq)
396 destroy_workqueue(c->wq);
397
398 free_pages((unsigned long) c->disk_sb.sb,
399 c->disk_sb.page_order);
400 kvpfree(c, sizeof(*c));
401 module_put(THIS_MODULE);
402}
403
404static void bch2_fs_release(struct kobject *kobj)
405{
406 struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
407
408 bch2_fs_free(c);
409}
410
411void bch2_fs_stop(struct bch_fs *c)
412{
413 struct bch_dev *ca;
414 unsigned i;
415
416 bch_verbose(c, "shutting down");
417
418 for_each_member_device(ca, c, i)
419 if (ca->kobj.state_in_sysfs &&
420 ca->disk_sb.bdev)
421 sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
422
423 if (c->kobj.state_in_sysfs)
424 kobject_del(&c->kobj);
425
426 bch2_fs_debug_exit(c);
427 bch2_fs_chardev_exit(c);
428
429 kobject_put(&c->time_stats);
430 kobject_put(&c->opts_dir);
431 kobject_put(&c->internal);
432
433 mutex_lock(&bch_fs_list_lock);
434 list_del(&c->list);
435 mutex_unlock(&bch_fs_list_lock);
436
437 closure_sync(&c->cl);
438 closure_debug_destroy(&c->cl);
439
440 mutex_lock(&c->state_lock);
441 bch2_fs_read_only(c);
442 mutex_unlock(&c->state_lock);
443
444 /* btree prefetch might have kicked off reads in the background: */
445 bch2_btree_flush_all_reads(c);
446
447 for_each_member_device(ca, c, i)
448 cancel_work_sync(&ca->io_error_work);
449
450 cancel_work_sync(&c->btree_write_error_work);
451 cancel_delayed_work_sync(&c->pd_controllers_update);
452 cancel_work_sync(&c->read_only_work);
453
454 for (i = 0; i < c->sb.nr_devices; i++)
455 if (c->devs[i])
456 bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
457
458 bch_verbose(c, "shutdown complete");
459
460 kobject_put(&c->kobj);
461}
462
463static const char *bch2_fs_online(struct bch_fs *c)
464{
465 struct bch_dev *ca;
466 const char *err = NULL;
467 unsigned i;
468 int ret;
469
470 lockdep_assert_held(&bch_fs_list_lock);
471
472 if (!list_empty(&c->list))
473 return NULL;
474
475 if (__bch2_uuid_to_fs(c->sb.uuid))
476 return "filesystem UUID already open";
477
478 ret = bch2_fs_chardev_init(c);
479 if (ret)
480 return "error creating character device";
481
482 bch2_fs_debug_init(c);
483
484 if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
485 kobject_add(&c->internal, &c->kobj, "internal") ||
486 kobject_add(&c->opts_dir, &c->kobj, "options") ||
487 kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
488 bch2_opts_create_sysfs_files(&c->opts_dir))
489 return "error creating sysfs objects";
490
491 mutex_lock(&c->state_lock);
492
493 err = "error creating sysfs objects";
494 __for_each_member_device(ca, c, i, NULL)
495 if (bch2_dev_sysfs_online(c, ca))
496 goto err;
497
498 list_add(&c->list, &bch_fs_list);
499 err = NULL;
500err:
501 mutex_unlock(&c->state_lock);
502 return err;
503}
504
505static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
506{
507 struct bch_sb_field_members *mi;
508 struct bch_fs *c;
509 unsigned i, iter_size;
510 const char *err;
511
512 pr_verbose_init(opts, "");
513
514 c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
515 if (!c)
516 goto out;
517
518 __module_get(THIS_MODULE);
519
520 c->minor = -1;
521 c->disk_sb.fs_sb = true;
522
523 mutex_init(&c->state_lock);
524 mutex_init(&c->sb_lock);
525 mutex_init(&c->replicas_gc_lock);
526 mutex_init(&c->btree_root_lock);
527 INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
528
529 init_rwsem(&c->gc_lock);
530
531 for (i = 0; i < BCH_TIME_STAT_NR; i++)
532 bch2_time_stats_init(&c->times[i]);
533
534 bch2_fs_allocator_background_init(c);
535 bch2_fs_allocator_foreground_init(c);
536 bch2_fs_rebalance_init(c);
537 bch2_fs_quota_init(c);
538
539 INIT_LIST_HEAD(&c->list);
540
541 INIT_LIST_HEAD(&c->btree_interior_update_list);
542 mutex_init(&c->btree_reserve_cache_lock);
543 mutex_init(&c->btree_interior_update_lock);
544
545 mutex_init(&c->bio_bounce_pages_lock);
546
547 bio_list_init(&c->btree_write_error_list);
548 spin_lock_init(&c->btree_write_error_lock);
549 INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
550
551 INIT_LIST_HEAD(&c->fsck_errors);
552 mutex_init(&c->fsck_error_lock);
553
554 INIT_LIST_HEAD(&c->ec_new_stripe_list);
555 mutex_init(&c->ec_new_stripe_lock);
556	mutex_init(&c->ec_stripe_create_lock);
557 spin_lock_init(&c->ec_stripes_heap_lock);
558
559 seqcount_init(&c->gc_pos_lock);
560
561 c->copy_gc_enabled = 1;
562 c->rebalance.enabled = 1;
563 c->promote_whole_extents = true;
564
565 c->journal.write_time = &c->times[BCH_TIME_journal_write];
566 c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
567 c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
568 c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
569
570 bch2_fs_btree_cache_init_early(&c->btree_cache);
571
572 if (percpu_init_rwsem(&c->mark_lock))
573 goto err;
574
575 mutex_lock(&c->sb_lock);
576
577 if (bch2_sb_to_fs(c, sb)) {
578 mutex_unlock(&c->sb_lock);
579 goto err;
580 }
581
582 mutex_unlock(&c->sb_lock);
583
584 scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
585
586 c->opts = bch2_opts_default;
587 bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
588 bch2_opts_apply(&c->opts, opts);
589
590 c->block_bits = ilog2(c->opts.block_size);
591 c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
592
593 c->opts.nochanges |= c->opts.noreplay;
594 c->opts.read_only |= c->opts.nochanges;
595
596 if (bch2_fs_init_fault("fs_alloc"))
597 goto err;
598
599 iter_size = sizeof(struct btree_node_iter_large) +
600 (btree_blocks(c) + 1) * 2 *
601 sizeof(struct btree_node_iter_set);
602
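	/*
	 * Everything below is allocated in one if-chain: the three workqueues
	 * (including the new journal reclaim workqueue), the writes percpu
	 * ref, the mempools/biosets and the per-subsystem state; any failure
	 * drops through to the common err path, which tears down whatever was
	 * set up via bch2_fs_free().
	 */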
603 if (!(c->wq = alloc_workqueue("bcachefs",
604 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
605 !(c->copygc_wq = alloc_workqueue("bcache_copygc",
606 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
607 !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
608 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
609 percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
610 mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
611 sizeof(struct btree_reserve)) ||
612 mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
613 sizeof(struct btree_update)) ||
614 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
615 bioset_init(&c->btree_bio, 1,
616 max(offsetof(struct btree_read_bio, bio),
617 offsetof(struct btree_write_bio, wbio.bio)),
618 BIOSET_NEED_BVECS) ||
619	    !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
620	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
621 mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
622 btree_bytes(c)) ||
623 mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
624 sizeof(struct btree_iter) * BTREE_ITER_MAX) ||
625 bch2_io_clock_init(&c->io_clock[READ]) ||
626 bch2_io_clock_init(&c->io_clock[WRITE]) ||
627 bch2_fs_journal_init(&c->journal) ||
628 bch2_fs_btree_cache_init(c) ||
629 bch2_fs_io_init(c) ||
630 bch2_fs_encryption_init(c) ||
631 bch2_fs_compress_init(c) ||
632	    bch2_fs_ec_init(c) ||
633 bch2_fs_fsio_init(c))
634 goto err;
635
636 mi = bch2_sb_get_members(c->disk_sb.sb);
637 for (i = 0; i < c->sb.nr_devices; i++)
638 if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
639 bch2_dev_alloc(c, i))
640 goto err;
641
642 /*
643 * Now that all allocations have succeeded, init various refcounty
644	 * things that let us shut down:
645 */
646 closure_init(&c->cl, NULL);
647
648 c->kobj.kset = bcachefs_kset;
649 kobject_init(&c->kobj, &bch2_fs_ktype);
650 kobject_init(&c->internal, &bch2_fs_internal_ktype);
651 kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
652 kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
653
654 mutex_lock(&bch_fs_list_lock);
655 err = bch2_fs_online(c);
656 mutex_unlock(&bch_fs_list_lock);
657 if (err) {
658 bch_err(c, "bch2_fs_online() error: %s", err);
659 goto err;
660 }
661out:
662 pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
663 return c;
664err:
665 bch2_fs_free(c);
666 c = NULL;
667 goto out;
668}
669
670const char *bch2_fs_start(struct bch_fs *c)
671{
672 const char *err = "cannot allocate memory";
673 struct bch_sb_field_members *mi;
674 struct bch_dev *ca;
675	time64_t now = ktime_get_real_seconds();
676 unsigned i;
677 int ret = -EINVAL;
678
679 mutex_lock(&c->state_lock);
680
681 BUG_ON(c->state != BCH_FS_STARTING);
682
683 mutex_lock(&c->sb_lock);
684
685 for_each_online_member(ca, c, i)
686 bch2_sb_from_fs(c, ca);
687
688 mi = bch2_sb_get_members(c->disk_sb.sb);
689 for_each_online_member(ca, c, i)
690 mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
691
692 mutex_unlock(&c->sb_lock);
693
694 for_each_rw_member(ca, c, i)
695 bch2_dev_allocator_add(c, ca);
696 bch2_recalc_capacity(c);
697
698 ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
699 ? bch2_fs_recovery(c)
700 : bch2_fs_initialize(c);
701 if (ret)
702 goto err;
703
704 ret = bch2_opts_check_may_set(c);
705 if (ret)
706 goto err;
707
708 err = "dynamic fault";
709 if (bch2_fs_init_fault("fs_start"))
710 goto err;
711
712 if (c->opts.read_only) {
713 bch2_fs_read_only(c);
714 } else {
715 err = bch2_fs_read_write(c);
716 if (err)
717 goto err;
718 }
719
720 set_bit(BCH_FS_STARTED, &c->flags);
721
722 err = NULL;
723out:
724 mutex_unlock(&c->state_lock);
725 return err;
726err:
727 switch (ret) {
728 case BCH_FSCK_ERRORS_NOT_FIXED:
729 bch_err(c, "filesystem contains errors: please report this to the developers");
730 pr_cont("mount with -o fix_errors to repair\n");
731 err = "fsck error";
732 break;
733 case BCH_FSCK_REPAIR_UNIMPLEMENTED:
734 bch_err(c, "filesystem contains errors: please report this to the developers");
735 pr_cont("repair unimplemented: inform the developers so that it can be added\n");
736 err = "fsck error";
737 break;
738 case BCH_FSCK_REPAIR_IMPOSSIBLE:
739 bch_err(c, "filesystem contains errors, but repair impossible");
740 err = "fsck error";
741 break;
742 case BCH_FSCK_UNKNOWN_VERSION:
743 err = "unknown metadata version";;
744 break;
745 case -ENOMEM:
746 err = "cannot allocate memory";
747 break;
748 case -EIO:
749 err = "IO error";
750 break;
751 }
752
753 BUG_ON(!err);
754 set_bit(BCH_FS_ERROR, &c->flags);
755 goto out;
756}
757
758static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
759{
760 struct bch_sb_field_members *sb_mi;
761
762 sb_mi = bch2_sb_get_members(sb);
763 if (!sb_mi)
764 return "Invalid superblock: member info area missing";
765
766 if (le16_to_cpu(sb->block_size) != c->opts.block_size)
767 return "mismatched block size";
768
769 if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
770 BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
771 return "new cache bucket size is too small";
772
773 return NULL;
774}
775
776static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
777{
778 struct bch_sb *newest =
779 le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
780 struct bch_sb_field_members *mi = bch2_sb_get_members(newest);
781
782 if (!uuid_equal(&fs->uuid, &sb->uuid))
783 return "device not a member of filesystem";
784
785 if (!bch2_dev_exists(newest, mi, sb->dev_idx))
786 return "device has been removed";
787
788 if (fs->block_size != sb->block_size)
789 return "mismatched block size";
790
791 return NULL;
792}
793
794/* Device startup/shutdown: */
795
796static void bch2_dev_release(struct kobject *kobj)
797{
798 struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
799
800 kfree(ca);
801}
802
803static void bch2_dev_free(struct bch_dev *ca)
804{
805 cancel_work_sync(&ca->io_error_work);
806
807 if (ca->kobj.state_in_sysfs &&
808 ca->disk_sb.bdev)
809 sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
810
811 if (ca->kobj.state_in_sysfs)
812 kobject_del(&ca->kobj);
813
814 bch2_free_super(&ca->disk_sb);
815 bch2_dev_journal_exit(ca);
816
817 free_percpu(ca->io_done);
818 bioset_exit(&ca->replica_set);
819 bch2_dev_buckets_free(ca);
820
821 bch2_time_stats_exit(&ca->io_latency[WRITE]);
822 bch2_time_stats_exit(&ca->io_latency[READ]);
823
824 percpu_ref_exit(&ca->io_ref);
825 percpu_ref_exit(&ca->ref);
826 kobject_put(&ca->kobj);
827}
828
829static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
830{
831
832 lockdep_assert_held(&c->state_lock);
833
834 if (percpu_ref_is_zero(&ca->io_ref))
835 return;
836
837 __bch2_dev_read_only(c, ca);
838
839 reinit_completion(&ca->io_ref_completion);
840 percpu_ref_kill(&ca->io_ref);
841 wait_for_completion(&ca->io_ref_completion);
842
843 if (ca->kobj.state_in_sysfs) {
844 sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
845 sysfs_remove_link(&ca->kobj, "block");
846 }
847
848 bch2_free_super(&ca->disk_sb);
849 bch2_dev_journal_exit(ca);
850}
851
852static void bch2_dev_ref_complete(struct percpu_ref *ref)
853{
854 struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
855
856 complete(&ca->ref_completion);
857}
858
859static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
860{
861 struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
862
863 complete(&ca->io_ref_completion);
864}
865
866static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
867{
868 int ret;
869
870 if (!c->kobj.state_in_sysfs)
871 return 0;
872
873 if (!ca->kobj.state_in_sysfs) {
874 ret = kobject_add(&ca->kobj, &c->kobj,
875 "dev-%u", ca->dev_idx);
876 if (ret)
877 return ret;
878 }
879
880 if (ca->disk_sb.bdev) {
881 struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
882
883 ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
884 if (ret)
885 return ret;
886
887 ret = sysfs_create_link(&ca->kobj, block, "block");
888 if (ret)
889 return ret;
890 }
891
892 return 0;
893}
894
895static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
896 struct bch_member *member)
897{
898 struct bch_dev *ca;
899
900 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
901 if (!ca)
902 return NULL;
903
904 kobject_init(&ca->kobj, &bch2_dev_ktype);
905 init_completion(&ca->ref_completion);
906 init_completion(&ca->io_ref_completion);
907
908 init_rwsem(&ca->bucket_lock);
909
910 writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
911
912 spin_lock_init(&ca->freelist_lock);
913 bch2_dev_copygc_init(ca);
914
915 INIT_WORK(&ca->io_error_work, bch2_io_error_work);
916
917 bch2_time_stats_init(&ca->io_latency[READ]);
918 bch2_time_stats_init(&ca->io_latency[WRITE]);
919
920 ca->mi = bch2_mi_to_cpu(member);
921 ca->uuid = member->uuid;
922
923 if (opt_defined(c->opts, discard))
924 ca->mi.discard = opt_get(c->opts, discard);
925
926 if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
927 0, GFP_KERNEL) ||
928 percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
929 PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
930 bch2_dev_buckets_alloc(c, ca) ||
931 bioset_init(&ca->replica_set, 4,
932 offsetof(struct bch_write_bio, bio), 0) ||
933 !(ca->io_done = alloc_percpu(*ca->io_done)))
934 goto err;
935
936 return ca;
937err:
938 bch2_dev_free(ca);
939 return NULL;
940}
941
942static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
943 unsigned dev_idx)
944{
945 ca->dev_idx = dev_idx;
946 __set_bit(ca->dev_idx, ca->self.d);
947 scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
948
949 ca->fs = c;
950 rcu_assign_pointer(c->devs[ca->dev_idx], ca);
951
952 if (bch2_dev_sysfs_online(c, ca))
953 pr_warn("error creating sysfs objects");
954}
955
956static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
957{
958 struct bch_member *member =
959 bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
960 struct bch_dev *ca = NULL;
961 int ret = 0;
962
963 pr_verbose_init(c->opts, "");
964
965 if (bch2_fs_init_fault("dev_alloc"))
966 goto err;
967
968 ca = __bch2_dev_alloc(c, member);
969 if (!ca)
970 goto err;
971
972 bch2_dev_attach(c, ca, dev_idx);
973out:
974 pr_verbose_init(c->opts, "ret %i", ret);
975 return ret;
976err:
977 if (ca)
978 bch2_dev_free(ca);
979 ret = -ENOMEM;
980 goto out;
981}
982
983static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
984{
985 unsigned ret;
986
987 if (bch2_dev_is_online(ca)) {
988 bch_err(ca, "already have device online in slot %u",
989 sb->sb->dev_idx);
990 return -EINVAL;
991 }
992
993 if (get_capacity(sb->bdev->bd_disk) <
994 ca->mi.bucket_size * ca->mi.nbuckets) {
995 bch_err(ca, "cannot online: device too small");
996 return -EINVAL;
997 }
998
999 BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
1000
1001 if (get_capacity(sb->bdev->bd_disk) <
1002 ca->mi.bucket_size * ca->mi.nbuckets) {
1003 bch_err(ca, "device too small");
1004 return -EINVAL;
1005 }
1006
1007 ret = bch2_dev_journal_init(ca, sb->sb);
1008 if (ret)
1009 return ret;
1010
1011 /* Commit: */
1012 ca->disk_sb = *sb;
1013 memset(sb, 0, sizeof(*sb));
1014
1015 percpu_ref_reinit(&ca->io_ref);
1016
1017 return 0;
1018}
1019
1020static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
1021{
1022 struct bch_dev *ca;
1023 int ret;
1024
1025 lockdep_assert_held(&c->state_lock);
1026
1027 if (le64_to_cpu(sb->sb->seq) >
1028 le64_to_cpu(c->disk_sb.sb->seq))
1029 bch2_sb_to_fs(c, sb->sb);
1030
1031 BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
1032 !c->devs[sb->sb->dev_idx]);
1033
1034 ca = bch_dev_locked(c, sb->sb->dev_idx);
1035
1036 ret = __bch2_dev_attach_bdev(ca, sb);
1037 if (ret)
1038 return ret;
1039
1040	mutex_lock(&c->sb_lock);
1041	bch2_mark_dev_superblock(ca->fs, ca, 0);
1042 mutex_unlock(&c->sb_lock);
1043
1044 bch2_dev_sysfs_online(c, ca);
1045
1046 if (c->sb.nr_devices == 1)
1047 snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
1048 snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
1049
1050 rebalance_wakeup(c);
1051 return 0;
1052}
1053
1054/* Device management: */
1055
1056/*
1057 * Note: this function is also used by the error paths - when a particular
1058 * device sees an error, we call it to determine whether we can just set the
1059 * device RO, or - if this function returns false - we'll set the whole
1060 * filesystem RO:
1061 *
1062 * XXX: maybe we should be more explicit about whether we're changing state
1063 * because we got an error or what have you?
1064 */
1065bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
1066 enum bch_member_state new_state, int flags)
1067{
1068 struct bch_devs_mask new_online_devs;
1069 struct replicas_status s;
1070 struct bch_dev *ca2;
1071 int i, nr_rw = 0, required;
1072
1073 lockdep_assert_held(&c->state_lock);
1074
1075 switch (new_state) {
1076 case BCH_MEMBER_STATE_RW:
1077 return true;
1078 case BCH_MEMBER_STATE_RO:
1079 if (ca->mi.state != BCH_MEMBER_STATE_RW)
1080 return true;
1081
1082 /* do we have enough devices to write to? */
1083 for_each_member_device(ca2, c, i)
1084 if (ca2 != ca)
1085 nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
1086
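		/*
		 * Without the FORCE_IF_*_DEGRADED flags we insist on the full
		 * configured replica counts; with them we only require the
		 * *_replicas_required minimums.
		 */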
1087 required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
1088 ? c->opts.metadata_replicas
1089 : c->opts.metadata_replicas_required,
1090 !(flags & BCH_FORCE_IF_DATA_DEGRADED)
1091 ? c->opts.data_replicas
1092 : c->opts.data_replicas_required);
1093
1094 return nr_rw >= required;
1095 case BCH_MEMBER_STATE_FAILED:
1096 case BCH_MEMBER_STATE_SPARE:
1097 if (ca->mi.state != BCH_MEMBER_STATE_RW &&
1098 ca->mi.state != BCH_MEMBER_STATE_RO)
1099 return true;
1100
1101 /* do we have enough devices to read from? */
1102 new_online_devs = bch2_online_devs(c);
1103 __clear_bit(ca->dev_idx, new_online_devs.d);
1104
1105 s = __bch2_replicas_status(c, new_online_devs);
1106
1107 return bch2_have_enough_devs(s, flags);
1108 default:
1109 BUG();
1110 }
1111}
1112
1113static bool bch2_fs_may_start(struct bch_fs *c)
1114{
1115 struct replicas_status s;
1116 struct bch_sb_field_members *mi;
1117 struct bch_dev *ca;
1118 unsigned i, flags = c->opts.degraded
1119 ? BCH_FORCE_IF_DEGRADED
1120 : 0;
1121
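	/*
	 * Unless mounting in degraded mode, every member that is supposed to
	 * be RW or RO must actually be online before we allow the filesystem
	 * to start:
	 */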
1122 if (!c->opts.degraded) {
1123 mutex_lock(&c->sb_lock);
1124 mi = bch2_sb_get_members(c->disk_sb.sb);
1125
1126 for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
1127 if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
1128 continue;
1129
1130 ca = bch_dev_locked(c, i);
1131
1132 if (!bch2_dev_is_online(ca) &&
1133 (ca->mi.state == BCH_MEMBER_STATE_RW ||
1134 ca->mi.state == BCH_MEMBER_STATE_RO)) {
1135 mutex_unlock(&c->sb_lock);
1136 return false;
1137 }
1138 }
1139 mutex_unlock(&c->sb_lock);
1140 }
1141
1142 s = bch2_replicas_status(c);
1143
1144 return bch2_have_enough_devs(s, flags);
1145}
1146
1147static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
1148{
1149 bch2_copygc_stop(ca);
1150
1151 /*
1152 * The allocator thread itself allocates btree nodes, so stop it first:
1153 */
1154 bch2_dev_allocator_stop(ca);
1155 bch2_dev_allocator_remove(c, ca);
1156 bch2_dev_journal_stop(&c->journal, ca);
1157}
1158
1159static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
1160{
1161 lockdep_assert_held(&c->state_lock);
1162
1163 BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
1164
1165 bch2_dev_allocator_add(c, ca);
1166 bch2_recalc_capacity(c);
1167
1168 if (bch2_dev_allocator_start(ca))
1169 return "error starting allocator thread";
1170
1171 if (bch2_copygc_start(c, ca))
1172 return "error starting copygc thread";
1173
1174 return NULL;
1175}
1176
1177int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1178 enum bch_member_state new_state, int flags)
1179{
1180 struct bch_sb_field_members *mi;
1181 int ret = 0;
1182
1183 if (ca->mi.state == new_state)
1184 return 0;
1185
1186 if (!bch2_dev_state_allowed(c, ca, new_state, flags))
1187 return -EINVAL;
1188
1189 if (new_state != BCH_MEMBER_STATE_RW)
1190 __bch2_dev_read_only(c, ca);
1191
1192 bch_notice(ca, "%s", bch2_dev_state[new_state]);
1193
1194 mutex_lock(&c->sb_lock);
1195 mi = bch2_sb_get_members(c->disk_sb.sb);
1196 SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
1197 bch2_write_super(c);
1198 mutex_unlock(&c->sb_lock);
1199
1200 if (new_state == BCH_MEMBER_STATE_RW &&
1201 __bch2_dev_read_write(c, ca))
1202 ret = -ENOMEM;
1203
1204 rebalance_wakeup(c);
1205
1206 return ret;
1207}
1208
1209int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1210 enum bch_member_state new_state, int flags)
1211{
1212 int ret;
1213
1214 mutex_lock(&c->state_lock);
1215 ret = __bch2_dev_set_state(c, ca, new_state, flags);
1216 mutex_unlock(&c->state_lock);
1217
1218 return ret;
1219}
1220
1221/* Device add/removal: */
1222
1223int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
1224{
1225 struct bch_sb_field_members *mi;
1226 unsigned dev_idx = ca->dev_idx, data;
1227 int ret = -EINVAL;
1228
1229 mutex_lock(&c->state_lock);
1230
1231 percpu_ref_put(&ca->ref); /* XXX */
1232
1233 if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1234 bch_err(ca, "Cannot remove without losing data");
1235 goto err;
1236 }
1237
1238 __bch2_dev_read_only(c, ca);
1239
1240 /*
1241 * XXX: verify that dev_idx is really not in use anymore, anywhere
1242 *
1243 * flag_data_bad() does not check btree pointers
1244 */
1245 ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
1246 if (ret) {
1247 bch_err(ca, "Remove failed: error %i dropping data", ret);
1248 goto err;
1249 }
1250
1251 ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
1252 if (ret) {
1253 bch_err(ca, "Remove failed: error %i flushing journal", ret);
1254 goto err;
1255 }
1256
1257 data = bch2_dev_has_data(c, ca);
1258 if (data) {
1259 char data_has_str[100];
1260
1261 bch2_string_opt_to_text(&PBUF(data_has_str),
1262 bch2_data_types, data);
1263 bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
1264 ret = -EBUSY;
1265 goto err;
1266 }
1267
1268 ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
1269 POS(ca->dev_idx, 0),
1270 POS(ca->dev_idx + 1, 0),
1271				      NULL);
1272 if (ret) {
1273 bch_err(ca, "Remove failed, error deleting alloc info");
1274 goto err;
1275 }
1276
1277 /*
1278	 * We must flush all existing journal entries; they might have
1279 * (overwritten) keys that point to the device we're removing:
1280 */
1281 bch2_journal_flush_all_pins(&c->journal);
1282 ret = bch2_journal_error(&c->journal);
1283 if (ret) {
1284 bch_err(ca, "Remove failed, journal error");
1285 goto err;
1286 }
1287
1288 __bch2_dev_offline(c, ca);
1289
1290 mutex_lock(&c->sb_lock);
1291 rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
1292 mutex_unlock(&c->sb_lock);
1293
1294 percpu_ref_kill(&ca->ref);
1295 wait_for_completion(&ca->ref_completion);
1296
1297 bch2_dev_free(ca);
1298
1299 /*
1300 * Free this device's slot in the bch_member array - all pointers to
1301 * this device must be gone:
1302 */
1303 mutex_lock(&c->sb_lock);
1304 mi = bch2_sb_get_members(c->disk_sb.sb);
1305 memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
1306
1307 bch2_write_super(c);
1308
1309 mutex_unlock(&c->sb_lock);
1310 mutex_unlock(&c->state_lock);
1311 return 0;
1312err:
1313	if (ca->mi.state == BCH_MEMBER_STATE_RW &&
1314	    !percpu_ref_is_zero(&ca->io_ref))
1315 __bch2_dev_read_write(c, ca);
1316 mutex_unlock(&c->state_lock);
1317 return ret;
1318}
1319
1320static void dev_usage_clear(struct bch_dev *ca)
1321{
1322 struct bucket_array *buckets;
1323 int cpu;
1324
1325 for_each_possible_cpu(cpu) {
1326 struct bch_dev_usage *p =
1327			per_cpu_ptr(ca->usage[0], cpu);
1328 memset(p, 0, sizeof(*p));
1329 }
1330
1331 down_read(&ca->bucket_lock);
1332 buckets = bucket_array(ca);
1333
1334 memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
1335 up_read(&ca->bucket_lock);
1336}
1337
1338/* Add new device to running filesystem: */
1339int bch2_dev_add(struct bch_fs *c, const char *path)
1340{
1341 struct bch_opts opts = bch2_opts_empty();
1342 struct bch_sb_handle sb;
1343 const char *err;
1344 struct bch_dev *ca = NULL;
1345 struct bch_sb_field_members *mi;
1346 struct bch_member dev_mi;
1347 unsigned dev_idx, nr_devices, u64s;
1348 int ret;
1349
1350 ret = bch2_read_super(path, &opts, &sb);
1351 if (ret)
1352 return ret;
1353
1354 err = bch2_sb_validate(&sb);
1355 if (err)
1356 return -EINVAL;
1357
1358 dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
1359
1360 err = bch2_dev_may_add(sb.sb, c);
1361 if (err)
1362 return -EINVAL;
1363
1364 ca = __bch2_dev_alloc(c, &dev_mi);
1365 if (!ca) {
1366 bch2_free_super(&sb);
1367 return -ENOMEM;
1368 }
1369
1370 ret = __bch2_dev_attach_bdev(ca, &sb);
1371 if (ret) {
1372 bch2_dev_free(ca);
1373 return ret;
1374 }
1375
1376 /*
1377	 * We want to allocate the journal on the new device before adding the
1378	 * new device to the filesystem, because allocating after we attach
1379	 * requires spinning up the allocator thread, and the allocator thread
1380	 * requires doing btree writes, which won't work if the existing
1381	 * devices are RO.
1382 *
1383 * So we have to mark where the superblocks are, but marking allocated
1384 * data normally updates the filesystem usage too, so we have to mark,
1385 * allocate the journal, reset all the marks, then remark after we
1386 * attach...
1387 */
1388	bch2_mark_dev_superblock(ca->fs, ca, 0);
1389
1390 err = "journal alloc failed";
1391 ret = bch2_dev_journal_alloc(ca);
1392 if (ret)
1393 goto err;
1394
1395 dev_usage_clear(ca);
1396
1397 mutex_lock(&c->state_lock);
1398 mutex_lock(&c->sb_lock);
1399
1400 err = "insufficient space in new superblock";
1401 ret = bch2_sb_from_fs(c, ca);
1402 if (ret)
1403 goto err_unlock;
1404
1405 mi = bch2_sb_get_members(ca->disk_sb.sb);
1406
1407 if (!bch2_sb_resize_members(&ca->disk_sb,
1408 le32_to_cpu(mi->field.u64s) +
1409 sizeof(dev_mi) / sizeof(u64))) {
1410 ret = -ENOSPC;
1411 goto err_unlock;
1412 }
1413
1414 if (dynamic_fault("bcachefs:add:no_slot"))
1415 goto no_slot;
1416
1417 mi = bch2_sb_get_members(c->disk_sb.sb);
1418 for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
1419 if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
1420 goto have_slot;
1421no_slot:
1422 err = "no slots available in superblock";
1423 ret = -ENOSPC;
1424 goto err_unlock;
1425
1426have_slot:
1427 nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
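	/* size of the members section, in u64s, including the new device's slot: */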
1428 u64s = (sizeof(struct bch_sb_field_members) +
1429 sizeof(struct bch_member) * nr_devices) / sizeof(u64);
1430
1431 err = "no space in superblock for member info";
1432 ret = -ENOSPC;
1433
1434 mi = bch2_sb_resize_members(&c->disk_sb, u64s);
1435 if (!mi)
1436 goto err_unlock;
1437
1438 /* success: */
1439
1440 mi->members[dev_idx] = dev_mi;
1441	mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds());
1442 c->disk_sb.sb->nr_devices = nr_devices;
1443
1444 ca->disk_sb.sb->dev_idx = dev_idx;
1445 bch2_dev_attach(c, ca, dev_idx);
1446
1447	bch2_mark_dev_superblock(c, ca, 0);
1448
1449 bch2_write_super(c);
1450 mutex_unlock(&c->sb_lock);
1451
1452 if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1453 err = __bch2_dev_read_write(c, ca);
1454 if (err)
1455 goto err_late;
1456 }
1457
1458 mutex_unlock(&c->state_lock);
1459 return 0;
1460
1461err_unlock:
1462 mutex_unlock(&c->sb_lock);
1463 mutex_unlock(&c->state_lock);
1464err:
1465 if (ca)
1466 bch2_dev_free(ca);
1467 bch2_free_super(&sb);
1468 bch_err(c, "Unable to add device: %s", err);
1469 return ret;
1470err_late:
1471 bch_err(c, "Error going rw after adding device: %s", err);
1472 return -EINVAL;
1473}
1474
1475/* Hot add existing device to running filesystem: */
1476int bch2_dev_online(struct bch_fs *c, const char *path)
1477{
1478 struct bch_opts opts = bch2_opts_empty();
1479 struct bch_sb_handle sb = { NULL };
1480 struct bch_sb_field_members *mi;
1481 struct bch_dev *ca;
1482 unsigned dev_idx;
1483 const char *err;
1484 int ret;
1485
1486 mutex_lock(&c->state_lock);
1487
1488 ret = bch2_read_super(path, &opts, &sb);
1489 if (ret) {
1490 mutex_unlock(&c->state_lock);
1491 return ret;
1492 }
1493
1494 dev_idx = sb.sb->dev_idx;
1495
1496 err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
1497 if (err)
1498 goto err;
1499
1500 if (bch2_dev_attach_bdev(c, &sb)) {
1501 err = "bch2_dev_attach_bdev() error";
1502 goto err;
1503 }
1504
1505 ca = bch_dev_locked(c, dev_idx);
1506 if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1507 err = __bch2_dev_read_write(c, ca);
1508 if (err)
1509 goto err;
1510 }
1511
1512 mutex_lock(&c->sb_lock);
1513 mi = bch2_sb_get_members(c->disk_sb.sb);
1514
1515 mi->members[ca->dev_idx].last_mount =
1516		cpu_to_le64(ktime_get_real_seconds());
1517
1518 bch2_write_super(c);
1519 mutex_unlock(&c->sb_lock);
1520
1521 mutex_unlock(&c->state_lock);
1522 return 0;
1523err:
1524 mutex_unlock(&c->state_lock);
1525 bch2_free_super(&sb);
1526 bch_err(c, "error bringing %s online: %s", path, err);
1527 return -EINVAL;
1528}
1529
1530int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
1531{
1532 mutex_lock(&c->state_lock);
1533
1534 if (!bch2_dev_is_online(ca)) {
1535 bch_err(ca, "Already offline");
1536 mutex_unlock(&c->state_lock);
1537 return 0;
1538 }
1539
1540 if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1541 bch_err(ca, "Cannot offline required disk");
1542 mutex_unlock(&c->state_lock);
1543 return -EINVAL;
1544 }
1545
1546 __bch2_dev_offline(c, ca);
1547
1548 mutex_unlock(&c->state_lock);
1549 return 0;
1550}
1551
1552int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1553{
1554 struct bch_member *mi;
1555 int ret = 0;
1556
1557 mutex_lock(&c->state_lock);
1558
1559 if (nbuckets < ca->mi.nbuckets) {
1560 bch_err(ca, "Cannot shrink yet");
1561 ret = -EINVAL;
1562 goto err;
1563 }
1564
1565 if (bch2_dev_is_online(ca) &&
1566 get_capacity(ca->disk_sb.bdev->bd_disk) <
1567 ca->mi.bucket_size * nbuckets) {
1568 bch_err(ca, "New size larger than device");
1569 ret = -EINVAL;
1570 goto err;
1571 }
1572
1573 ret = bch2_dev_buckets_resize(c, ca, nbuckets);
1574 if (ret) {
1575 bch_err(ca, "Resize error: %i", ret);
1576 goto err;
1577 }
1578
1579 mutex_lock(&c->sb_lock);
1580 mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
1581 mi->nbuckets = cpu_to_le64(nbuckets);
1582
1583 bch2_write_super(c);
1584 mutex_unlock(&c->sb_lock);
1585
1586 bch2_recalc_capacity(c);
1587err:
1588 mutex_unlock(&c->state_lock);
1589 return ret;
1590}
1591
1592/* return with ref on ca->ref: */
1593struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
1594{
1595
1596 struct bch_dev *ca;
1597 dev_t dev;
1598 unsigned i;
1599 int ret;
1600
1601 ret = lookup_bdev(path, &dev);
1602 if (ret)
1603 return ERR_PTR(ret);
1604
1605 for_each_member_device(ca, c, i)
1606 if (ca->disk_sb.bdev->bd_dev == dev)
1607 goto found;
1608
1609 ca = ERR_PTR(-ENOENT);
1610found:
1611 return ca;
1612}
1613
1614/* Filesystem open: */
1615
1616struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
1617 struct bch_opts opts)
1618{
1619 struct bch_sb_handle *sb = NULL;
1620 struct bch_fs *c = NULL;
1621 unsigned i, best_sb = 0;
1622 const char *err;
1623 int ret = -ENOMEM;
1624
1625 pr_verbose_init(opts, "");
1626
1627 if (!nr_devices) {
1628 c = ERR_PTR(-EINVAL);
1629 goto out2;
1630 }
1631
1632 if (!try_module_get(THIS_MODULE)) {
1633 c = ERR_PTR(-ENODEV);
1634 goto out2;
1635 }
1636
1637 sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
1638 if (!sb)
1639 goto err;
1640
1641 for (i = 0; i < nr_devices; i++) {
1642 ret = bch2_read_super(devices[i], &opts, &sb[i]);
1643 if (ret)
1644 goto err;
1645
1646 err = bch2_sb_validate(&sb[i]);
1647 if (err)
1648 goto err_print;
1649 }
1650
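	/*
	 * Pick the superblock with the highest sequence number as the
	 * authoritative copy when assembling the filesystem:
	 */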
1651 for (i = 1; i < nr_devices; i++)
1652 if (le64_to_cpu(sb[i].sb->seq) >
1653 le64_to_cpu(sb[best_sb].sb->seq))
1654 best_sb = i;
1655
1656 for (i = 0; i < nr_devices; i++) {
1657 err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
1658 if (err)
1659 goto err_print;
1660 }
1661
1662 ret = -ENOMEM;
1663 c = bch2_fs_alloc(sb[best_sb].sb, opts);
1664 if (!c)
1665 goto err;
1666
1667 err = "bch2_dev_online() error";
1668 mutex_lock(&c->state_lock);
1669 for (i = 0; i < nr_devices; i++)
1670 if (bch2_dev_attach_bdev(c, &sb[i])) {
1671 mutex_unlock(&c->state_lock);
1672 goto err_print;
1673 }
1674 mutex_unlock(&c->state_lock);
1675
1676 err = "insufficient devices";
1677 if (!bch2_fs_may_start(c))
1678 goto err_print;
1679
1680 if (!c->opts.nostart) {
1681 err = bch2_fs_start(c);
1682 if (err)
1683 goto err_print;
1684 }
1685out:
1686 kfree(sb);
1687 module_put(THIS_MODULE);
1688out2:
1689 pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
1690 return c;
1691err_print:
1692 pr_err("bch_fs_open err opening %s: %s",
1693 devices[0], err);
1694 ret = -EINVAL;
1695err:
1696 if (c)
1697 bch2_fs_stop(c);
1698 for (i = 0; i < nr_devices; i++)
1699 bch2_free_super(&sb[i]);
1700 c = ERR_PTR(ret);
1701 goto out;
1702}
1703
1704static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
1705 struct bch_opts opts)
1706{
1707 const char *err;
1708 struct bch_fs *c;
1709 bool allocated_fs = false;
1710
1711 err = bch2_sb_validate(sb);
1712 if (err)
1713 return err;
1714
1715 mutex_lock(&bch_fs_list_lock);
1716 c = __bch2_uuid_to_fs(sb->sb->uuid);
1717 if (c) {
1718 closure_get(&c->cl);
1719
1720 err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
1721 if (err)
1722 goto err;
1723 } else {
1724 c = bch2_fs_alloc(sb->sb, opts);
1725 err = "cannot allocate memory";
1726 if (!c)
1727 goto err;
1728
1729 allocated_fs = true;
1730 }
1731
1732 err = "bch2_dev_online() error";
1733
1734 mutex_lock(&c->sb_lock);
1735 if (bch2_dev_attach_bdev(c, sb)) {
1736 mutex_unlock(&c->sb_lock);
1737 goto err;
1738 }
1739 mutex_unlock(&c->sb_lock);
1740
1741 if (!c->opts.nostart && bch2_fs_may_start(c)) {
1742 err = bch2_fs_start(c);
1743 if (err)
1744 goto err;
1745 }
1746
1747 closure_put(&c->cl);
1748 mutex_unlock(&bch_fs_list_lock);
1749
1750 return NULL;
1751err:
1752 mutex_unlock(&bch_fs_list_lock);
1753
1754 if (allocated_fs)
1755 bch2_fs_stop(c);
1756 else if (c)
1757 closure_put(&c->cl);
1758
1759 return err;
1760}
1761
1762const char *bch2_fs_open_incremental(const char *path)
1763{
1764 struct bch_sb_handle sb;
1765 struct bch_opts opts = bch2_opts_empty();
1766 const char *err;
1767
1768 if (bch2_read_super(path, &opts, &sb))
1769 return "error reading superblock";
1770
1771 err = __bch2_fs_open_incremental(&sb, opts);
1772 bch2_free_super(&sb);
1773
1774 return err;
1775}
1776
1777/* Global interfaces/init */
1778
1779static void bcachefs_exit(void)
1780{
1781 bch2_debug_exit();
1782 bch2_vfs_exit();
1783 bch2_chardev_exit();
1784 if (bcachefs_kset)
1785 kset_unregister(bcachefs_kset);
1786}
1787
1788static int __init bcachefs_init(void)
1789{
1790 bch2_bkey_pack_test();
1791 bch2_inode_pack_test();
1792
1793 if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
1794 bch2_chardev_init() ||
1795 bch2_vfs_init() ||
1796 bch2_debug_init())
1797 goto err;
1798
1799 return 0;
1800err:
1801 bcachefs_exit();
1802 return -ENOMEM;
1803}
1804
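/*
 * Each BCH_DEBUG_PARAM() entry becomes a bch2_<name> bool exposed as a
 * writable (0644) module parameter, so debug behaviour can be toggled at
 * runtime:
 */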
1805#define BCH_DEBUG_PARAM(name, description) \
1806 bool bch2_##name; \
1807 module_param_named(name, bch2_##name, bool, 0644); \
1808 MODULE_PARM_DESC(name, description);
1809BCH_DEBUG_PARAMS()
1810#undef BCH_DEBUG_PARAM
1811
1812unsigned bch2_metadata_version = bcachefs_metadata_version_current;
1813module_param_named(version, bch2_metadata_version, uint, 0400);
1814
1815module_exit(bcachefs_exit);
1816module_init(bcachefs_init);