// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>

/* bch_extent_rebalance: */

static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
        const union bch_extent_entry *entry;

        bkey_extent_entry_for_each(ptrs, entry)
                if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
                        return &entry->rebalance;

        return NULL;
}

static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
        return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}

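/*
 * Returns a bitmask of the non-cached pointers whose data is not yet
 * compressed with the background_compression algorithm, i.e. the replicas a
 * rebalance rewrite should recompress. Extents with incompressible or
 * unwritten pointers return 0, so we don't try to rewrite them repeatedly.
 */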
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
                                           struct bch_io_opts *opts,
                                           struct bkey_s_c k,
                                           struct bkey_ptrs_c ptrs)
{
        if (!opts->background_compression)
                return 0;

        unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned ptr_bit = 1;
        unsigned rewrite_ptrs = 0;

        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
                    p.ptr.unwritten)
                        return 0;

                if (!p.ptr.cached && p.crc.compression_type != compression_type)
                        rewrite_ptrs |= ptr_bit;
                ptr_bit <<= 1;
        }

        return rewrite_ptrs;
}

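/*
 * Returns a bitmask of the non-cached pointers that do not reside on the
 * configured background_target, i.e. the replicas rebalance should move.
 */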
static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
                                       struct bch_io_opts *opts,
                                       struct bkey_ptrs_c ptrs)
{
        if (!opts->background_target ||
            !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
                return 0;

        unsigned ptr_bit = 1;
        unsigned rewrite_ptrs = 0;

        guard(rcu)();
        bkey_for_each_ptr(ptrs, ptr) {
                if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
                        rewrite_ptrs |= ptr_bit;
                ptr_bit <<= 1;
        }

        return rewrite_ptrs;
}

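/*
 * Combined mask of pointers that need recompression and/or a move; poisoned
 * extents are skipped entirely.
 */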
static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
                                              struct bch_io_opts *opts,
                                              struct bkey_s_c k)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

        if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
                return 0;

        return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
                bch2_bkey_ptrs_need_move(c, opts, ptrs);
}

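/*
 * Returns how many sectors of this extent still have pending rebalance work
 * (recompression and/or migration to the background target), according to the
 * rebalance options embedded in the key itself rather than the current inode
 * options.
 */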
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

        const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
        if (!opts)
                return 0;

        if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
                return 0;

        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        u64 sectors = 0;

        if (opts->background_compression) {
                unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);

                bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
                            p.ptr.unwritten) {
                                sectors = 0;
                                goto incompressible;
                        }

                        if (!p.ptr.cached && p.crc.compression_type != compression_type)
                                sectors += p.crc.compressed_size;
                }
        }
incompressible:
        if (opts->background_target) {
                guard(rcu)();
                bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                        if (!p.ptr.cached &&
                            !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
                                sectors += p.crc.compressed_size;
        }

        return sectors;
}

static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
                                             struct bkey_s_c k)
{
        if (!bkey_extent_is_direct_data(k.k))
                return 0;

        const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);

        if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
                struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
                return old == NULL || memcmp(old, &new, sizeof(new));
        } else {
                return old != NULL;
        }
}

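/*
 * Add, update or drop the bch_extent_rebalance entry on @_k so that it matches
 * the current io options: keys that (may) need rebalance work carry a copy of
 * the options, everything else has the entry dropped. The caller must have
 * allocated room for the extra entry (bch2_get_update_rebalance_opts()
 * reserves an extra 8 bytes for this).
 */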
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
                                  struct bkey_i *_k)
{
        if (!bkey_extent_is_direct_data(&_k->k))
                return 0;

        struct bkey_s k = bkey_i_to_s(_k);
        struct bch_extent_rebalance *old =
                (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);

        if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
                if (!old) {
                        old = bkey_val_end(k);
                        k.k->u64s += sizeof(*old) / sizeof(u64);
                }

                *old = io_opts_to_rebalance_opts(c, opts);
        } else {
                if (old)
                        extent_entry_drop(k, (union bch_extent_entry *) old);
        }

        return 0;
}

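/*
 * For indirect (reflink_v) extents, inherit io options from the rebalance
 * entry stored in the extent itself; then, if the entry no longer matches the
 * options, rewrite the extent and commit, returning
 * -BCH_ERR_transaction_restart_nested so the caller restarts and sees the
 * updated key.
 */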
int bch2_get_update_rebalance_opts(struct btree_trans *trans,
                                   struct bch_io_opts *io_opts,
                                   struct btree_iter *iter,
                                   struct bkey_s_c k)
{
        BUG_ON(iter->flags & BTREE_ITER_is_extents);
        BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);

        const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
                ? bch2_bkey_rebalance_opts(k) : NULL;
        if (r) {
#define x(_name)                                                        \
                if (r->_name##_from_inode) {                            \
                        io_opts->_name = r->_name;                      \
                        io_opts->_name##_from_inode = true;             \
                }
                BCH_REBALANCE_OPTS()
#undef x
        }

        if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
                return 0;

        struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
        int ret = PTR_ERR_OR_ZERO(n);
        if (ret)
                return ret;

        bkey_reassemble(n, k);

        /* On successful transaction commit, @k was invalidated: */

        return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
                bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
                bch2_trans_commit(trans, NULL, NULL, 0) ?:
                -BCH_ERR_transaction_restart_nested;
}

#define REBALANCE_WORK_SCAN_OFFSET      (U64_MAX - 1)

static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
        BCH_REBALANCE_STATES()
        NULL
#undef x
};

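/*
 * Request a scan of an inode (or of the whole filesystem when @inum == 0) by
 * bumping a cookie at a magic offset in the rebalance_work btree; the
 * rebalance thread turns the cookie into a full extent scan and deletes it
 * once the scan it refers to has completed.
 */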
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_cookie *cookie;
        u64 v;
        int ret;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
                             SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
                             BTREE_ITER_intent);
        k = bch2_btree_iter_peek_slot(trans, &iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        v = k.k->type == KEY_TYPE_cookie
                ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
                : 0;

        cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
        ret = PTR_ERR_OR_ZERO(cookie);
        if (ret)
                goto err;

        bkey_cookie_init(&cookie->k_i);
        cookie->k.p = iter.pos;
        cookie->v.cookie = cpu_to_le64(v + 1);

        ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
        int ret = bch2_trans_commit_do(c, NULL, NULL,
                                       BCH_TRANS_COMMIT_no_enospc,
                            bch2_set_rebalance_needs_scan_trans(trans, inum));
        bch2_rebalance_wakeup(c);
        return ret;
}

int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
        return bch2_set_rebalance_needs_scan(c, 0);
}

static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 v;
        int ret;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
                             SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
                             BTREE_ITER_intent);
        k = bch2_btree_iter_peek_slot(trans, &iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        v = k.k->type == KEY_TYPE_cookie
                ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
                : 0;

        if (v == cookie)
                ret = bch2_btree_delete_at(trans, &iter, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
                                            struct btree_iter *work_iter)
{
        return !kthread_should_stop()
                ? bch2_btree_iter_peek(trans, work_iter)
                : bkey_s_c_null;
}

static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
                                           struct btree_iter *iter,
                                           struct bkey_s_c k)
{
        if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
                return 0;

        struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
        int ret = PTR_ERR_OR_ZERO(n);
        if (ret)
                return ret;

        extent_entry_drop(bkey_i_to_s(n),
                          (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
        return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

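/*
 * Look up the extent a rebalance_work entry points at (in the extents btree,
 * or the reflink btree for inode 0 positions), fill in its io and data update
 * options, and clear the entry if there turns out to be nothing left to do for
 * it.
 */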
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
                        struct bpos work_pos,
                        struct btree_iter *extent_iter,
                        struct bch_io_opts *io_opts,
                        struct data_update_opts *data_opts)
{
        struct bch_fs *c = trans->c;

        bch2_trans_iter_exit(trans, extent_iter);
        bch2_trans_iter_init(trans, extent_iter,
                             work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
                             work_pos,
                             BTREE_ITER_all_snapshots);
        struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
        if (bkey_err(k))
                return k;

        int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
        if (ret)
                return bkey_s_c_err(ret);

        memset(data_opts, 0, sizeof(*data_opts));
        data_opts->rewrite_ptrs         = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
        data_opts->target               = io_opts->background_target;
        data_opts->write_flags          |= BCH_WRITE_only_specified_devs;

        if (!data_opts->rewrite_ptrs) {
                /*
                 * Is the device we'd want to write to offline? Did the devices
                 * in the target change?
                 *
                 * Either way, we'll now need a full scan before this extent is
                 * picked up again:
                 */
                int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
                if (ret)
                        return bkey_s_c_err(ret);
                return bkey_s_c_null;
        }

        if (trace_rebalance_extent_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                prt_newline(&buf);

                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

                unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
                if (p) {
                        prt_str(&buf, "compression=");
                        bch2_compression_opt_to_text(&buf, io_opts->background_compression);
                        prt_str(&buf, " ");
                        bch2_prt_u64_base2(&buf, p);
                        prt_newline(&buf);
                }

                p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
                if (p) {
                        prt_str(&buf, "move=");
                        bch2_target_to_text(&buf, c, io_opts->background_target);
                        prt_str(&buf, " ");
                        bch2_prt_u64_base2(&buf, p);
                        prt_newline(&buf);
                }

                trace_rebalance_extent(c, buf.buf);
                printbuf_exit(&buf);
        }

        return k;
}

noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
                               struct bpos work_pos,
                               struct btree_iter *extent_iter)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bch_fs_rebalance *r = &trans->c->rebalance;
        struct data_update_opts data_opts;
        struct bch_io_opts io_opts;
        struct bkey_s_c k;
        struct bkey_buf sk;
        int ret;

        ctxt->stats = &r->work_stats;
        r->state = BCH_REBALANCE_working;

        bch2_bkey_buf_init(&sk);

        ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
                                extent_iter, &io_opts, &data_opts));
        if (ret || !k.k)
                goto out;

        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);

        /*
         * The iterator gets unlocked by __bch2_read_extent - need to
         * save a copy of @k elsewhere:
         */
        bch2_bkey_buf_reassemble(&sk, c, k);
        k = bkey_i_to_s_c(sk.k);

        ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
        if (ret) {
                if (bch2_err_matches(ret, ENOMEM)) {
                        /* memory allocation failure, wait for some IO to finish */
                        bch2_move_ctxt_wait_for_io(ctxt);
                        ret = bch_err_throw(c, transaction_restart_nested);
                }

                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto out;

                /* skip it and continue, XXX signal failure */
                ret = 0;
        }
out:
        bch2_bkey_buf_exit(&sk, c);
        return ret;
}

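/*
 * Walk every extent of @inum (or the entire extents btree when @inum is 0),
 * re-evaluating io options for each key; this rewrites extents whose rebalance
 * entry is stale, which in turn regenerates rebalance_work entries for them,
 * then drops the scan cookie that requested the scan.
 */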
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bch_fs_rebalance *r = &trans->c->rebalance;

        bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
        ctxt->stats = &r->scan_stats;

        if (!inum) {
                r->scan_start   = BBPOS_MIN;
                r->scan_end     = BBPOS_MAX;
        } else {
                r->scan_start   = BBPOS(BTREE_ID_extents, POS(inum, 0));
                r->scan_end     = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
        }

        r->state = BCH_REBALANCE_scanning;

        struct per_snapshot_io_opts snapshot_io_opts;
        per_snapshot_io_opts_init(&snapshot_io_opts, c);

        int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
                                      r->scan_start.pos, r->scan_end.pos,
                                      BTREE_ITER_all_snapshots|
                                      BTREE_ITER_not_extents|
                                      BTREE_ITER_prefetch, k, ({
                ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

                struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
                                        &snapshot_io_opts, iter.pos, &iter, k);
                PTR_ERR_OR_ZERO(io_opts);
        })) ?:
        commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                  bch2_clear_rebalance_needs_scan(trans, inum, cookie));

        per_snapshot_io_opts_exit(&snapshot_io_opts);
        bch2_move_stats_exit(&r->scan_stats, trans->c);

        /*
         * Ensure that the rebalance_work entries we created are seen by the
         * next iteration of do_rebalance(), so we don't end up stuck in
         * rebalance_wait():
         */
        atomic64_inc(&r->scan_stats.sectors_seen);
        bch2_btree_write_buffer_flush_sync(trans);

        return ret;
}

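/*
 * Nothing left to do: sleep on the write io clock, i.e. until roughly 1/64th
 * of the smallest rw member device's capacity has been written, before
 * checking for new rebalance work.
 */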
static void rebalance_wait(struct bch_fs *c)
{
        struct bch_fs_rebalance *r = &c->rebalance;
        struct io_clock *clock = &c->io_clock[WRITE];
        u64 now = atomic64_read(&clock->now);
        u64 min_member_capacity = bch2_min_rw_member_capacity(c);

        if (min_member_capacity == U64_MAX)
                min_member_capacity = 128 * 2048;

        r->wait_iotime_end              = now + (min_member_capacity >> 6);

        if (r->state != BCH_REBALANCE_waiting) {
                r->wait_iotime_start    = now;
                r->wait_wallclock_start = ktime_get_real_ns();
                r->state                = BCH_REBALANCE_waiting;
        }

        bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}

static bool bch2_rebalance_enabled(struct bch_fs *c)
{
        return c->opts.rebalance_enabled &&
                !(c->opts.rebalance_on_ac_only &&
                  c->rebalance.on_battery);
}

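/*
 * Main work loop: walk the rebalance_work btree and process each entry, either
 * as a scan request (cookie keys) or as a single extent to rewrite; when the
 * btree is empty and nothing kicked us, go to sleep in rebalance_wait().
 */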
static int do_rebalance(struct moving_context *ctxt)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bch_fs_rebalance *r = &c->rebalance;
        struct btree_iter rebalance_work_iter, extent_iter = {};
        struct bkey_s_c k;
        u32 kick = r->kick;
        int ret = 0;

        bch2_trans_begin(trans);

        bch2_move_stats_init(&r->work_stats, "rebalance_work");
        bch2_move_stats_init(&r->scan_stats, "rebalance_scan");

        bch2_trans_iter_init(trans, &rebalance_work_iter,
                             BTREE_ID_rebalance_work, POS_MIN,
                             BTREE_ITER_all_snapshots);

        while (!bch2_move_ratelimit(ctxt)) {
                if (!bch2_rebalance_enabled(c)) {
                        bch2_moving_ctxt_flush_all(ctxt);
                        kthread_wait_freezable(bch2_rebalance_enabled(c) ||
                                               kthread_should_stop());
                }

                if (kthread_should_stop())
                        break;

                bch2_trans_begin(trans);

                ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret || !k.k)
                        break;

                ret = k.k->type == KEY_TYPE_cookie
                        ? do_rebalance_scan(ctxt, k.k->p.inode,
                                            le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
                        : do_rebalance_extent(ctxt, k.k->p, &extent_iter);

                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                bch2_btree_iter_advance(trans, &rebalance_work_iter);
        }

        bch2_trans_iter_exit(trans, &extent_iter);
        bch2_trans_iter_exit(trans, &rebalance_work_iter);
        bch2_move_stats_exit(&r->scan_stats, c);

        if (!ret &&
            !kthread_should_stop() &&
            !atomic64_read(&r->work_stats.sectors_seen) &&
            !atomic64_read(&r->scan_stats.sectors_seen) &&
            kick == r->kick) {
                bch2_moving_ctxt_flush_all(ctxt);
                bch2_trans_unlock_long(trans);
                rebalance_wait(c);
        }

        if (!bch2_err_matches(ret, EROFS))
                bch_err_fn(c, ret);
        return ret;
}

static int bch2_rebalance_thread(void *arg)
{
        struct bch_fs *c = arg;
        struct bch_fs_rebalance *r = &c->rebalance;
        struct moving_context ctxt;

        set_freezable();

        /*
         * Data move operations can't run until after check_snapshots has
         * completed, and bch2_snapshot_is_ancestor() is available.
         */
        kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
                               kthread_should_stop());

        bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
                              writepoint_ptr(&c->rebalance_write_point),
                              true);

        while (!kthread_should_stop() && !do_rebalance(&ctxt))
                ;

        bch2_moving_ctxt_exit(&ctxt);

        return 0;
}

void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
        printbuf_tabstop_push(out, 32);

        struct bch_fs_rebalance *r = &c->rebalance;

        /* print pending work */
        struct disk_accounting_pos acc;
        disk_accounting_key_init(acc, rebalance_work);
        u64 v;
        bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);

        prt_printf(out, "pending work:\t");
        prt_human_readable_u64(out, v << 9);
        prt_printf(out, "\n\n");

        prt_str(out, bch2_rebalance_state_strs[r->state]);
        prt_newline(out);
        printbuf_indent_add(out, 2);

        switch (r->state) {
        case BCH_REBALANCE_waiting: {
                u64 now = atomic64_read(&c->io_clock[WRITE].now);

                prt_printf(out, "io wait duration:\t");
                bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
                prt_newline(out);

                prt_printf(out, "io wait remaining:\t");
                bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
                prt_newline(out);

                prt_printf(out, "duration waited:\t");
                bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
                prt_newline(out);
                break;
        }
        case BCH_REBALANCE_working:
                bch2_move_stats_to_text(out, &r->work_stats);
                break;
        case BCH_REBALANCE_scanning:
                bch2_move_stats_to_text(out, &r->scan_stats);
                break;
        }
        prt_newline(out);

        struct task_struct *t;
        scoped_guard(rcu) {
                t = rcu_dereference(c->rebalance.thread);
                if (t)
                        get_task_struct(t);
        }

        if (t) {
                bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
                put_task_struct(t);
        }

        printbuf_indent_sub(out, 2);
}

void bch2_rebalance_stop(struct bch_fs *c)
{
        struct task_struct *p;

        c->rebalance.pd.rate.rate = UINT_MAX;
        bch2_ratelimit_reset(&c->rebalance.pd.rate);

        p = rcu_dereference_protected(c->rebalance.thread, 1);
        c->rebalance.thread = NULL;

        if (p) {
                /* for synchronizing with bch2_rebalance_wakeup() */
                synchronize_rcu();

                kthread_stop(p);
                put_task_struct(p);
        }
}

int bch2_rebalance_start(struct bch_fs *c)
{
        struct task_struct *p;
        int ret;

        if (c->rebalance.thread)
                return 0;

        if (c->opts.nochanges)
                return 0;

        p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
        ret = PTR_ERR_OR_ZERO(p);
        bch_err_msg(c, ret, "creating rebalance thread");
        if (ret)
                return ret;

        get_task_struct(p);
        rcu_assign_pointer(c->rebalance.thread, p);
        wake_up_process(p);
        return 0;
}

#ifdef CONFIG_POWER_SUPPLY
#include <linux/power_supply.h>

static int bch2_rebalance_power_notifier(struct notifier_block *nb,
                                         unsigned long event, void *data)
{
        struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);

        c->rebalance.on_battery = !power_supply_is_system_supplied();
        bch2_rebalance_wakeup(c);
        return NOTIFY_OK;
}
#endif

void bch2_fs_rebalance_exit(struct bch_fs *c)
{
#ifdef CONFIG_POWER_SUPPLY
        power_supply_unreg_notifier(&c->rebalance.power_notifier);
#endif
}

int bch2_fs_rebalance_init(struct bch_fs *c)
{
        struct bch_fs_rebalance *r = &c->rebalance;

        bch2_pd_controller_init(&r->pd);

#ifdef CONFIG_POWER_SUPPLY
        r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
        int ret = power_supply_reg_notifier(&r->power_notifier);
        if (ret)
                return ret;

        r->on_battery = !power_supply_is_system_supplied();
#endif
        return 0;
}

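/*
 * fsck helper: walk the extents/reflink and rebalance_work btrees in lockstep
 * and check that an extent has a rebalance_work entry iff
 * bch2_bkey_sectors_need_rebalance() says it has pending work, fixing up any
 * mismatches found.
 */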
static int check_rebalance_work_one(struct btree_trans *trans,
                                    struct btree_iter *extent_iter,
                                    struct btree_iter *rebalance_iter,
                                    struct bkey_buf *last_flushed)
{
        struct bch_fs *c = trans->c;
        struct bkey_s_c extent_k, rebalance_k;
        struct printbuf buf = PRINTBUF;

        int ret = bkey_err(extent_k     = bch2_btree_iter_peek(trans, extent_iter)) ?:
                  bkey_err(rebalance_k  = bch2_btree_iter_peek(trans, rebalance_iter));
        if (ret)
                return ret;

        if (!extent_k.k &&
            extent_iter->btree_id == BTREE_ID_reflink &&
            (!rebalance_k.k ||
             rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
                bch2_trans_iter_exit(trans, extent_iter);
                bch2_trans_iter_init(trans, extent_iter,
                                     BTREE_ID_extents, POS_MIN,
                                     BTREE_ITER_prefetch|
                                     BTREE_ITER_all_snapshots);
                return bch_err_throw(c, transaction_restart_nested);
        }

        if (!extent_k.k && !rebalance_k.k)
                return 1;

        int cmp = bpos_cmp(extent_k.k    ? extent_k.k->p    : SPOS_MAX,
                           rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);

        struct bkey deleted;
        bkey_init(&deleted);

        if (cmp < 0) {
                deleted.p = extent_k.k->p;
                rebalance_k.k = &deleted;
        } else if (cmp > 0) {
                deleted.p = rebalance_k.k->p;
                extent_k.k = &deleted;
        }

        bool should_have_rebalance =
                bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
        bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;

        if (should_have_rebalance != have_rebalance) {
                ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
                if (ret)
                        return ret;

                bch2_bkey_val_to_text(&buf, c, extent_k);
        }

        if (fsck_err_on(!should_have_rebalance && have_rebalance,
                        trans, rebalance_work_incorrectly_set,
                        "rebalance work incorrectly set\n%s", buf.buf)) {
                ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
                                                  extent_k.k->p, false);
                if (ret)
                        goto err;
        }

        if (fsck_err_on(should_have_rebalance && !have_rebalance,
                        trans, rebalance_work_incorrectly_unset,
                        "rebalance work incorrectly unset\n%s", buf.buf)) {
                ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
                                                  extent_k.k->p, true);
                if (ret)
                        goto err;
        }

        if (cmp <= 0)
                bch2_btree_iter_advance(trans, extent_iter);
        if (cmp >= 0)
                bch2_btree_iter_advance(trans, rebalance_iter);
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

int bch2_check_rebalance_work(struct bch_fs *c)
{
        struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter rebalance_iter, extent_iter;
        int ret = 0;

        bch2_trans_iter_init(trans, &extent_iter,
                             BTREE_ID_reflink, POS_MIN,
                             BTREE_ITER_prefetch);
        bch2_trans_iter_init(trans, &rebalance_iter,
                             BTREE_ID_rebalance_work, POS_MIN,
                             BTREE_ITER_prefetch);

        struct bkey_buf last_flushed;
        bch2_bkey_buf_init(&last_flushed);
        bkey_init(&last_flushed.k->k);

        while (!ret) {
                bch2_trans_begin(trans);

                ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);

                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        ret = 0;
        }

        bch2_bkey_buf_exit(&last_flushed, c);
        bch2_trans_iter_exit(trans, &extent_iter);
        bch2_trans_iter_exit(trans, &rebalance_iter);
        bch2_trans_put(trans);
        return ret < 0 ? ret : 0;
}