// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>

/* bch_extent_rebalance: */

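/*
 * Look up the bch_extent_rebalance entry embedded in an extent's pointer
 * list, if present; it records the io options to apply when the extent is
 * rewritten by rebalance.
 */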
static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
	const union bch_extent_entry *entry;

	bkey_extent_entry_for_each(ptrs, entry)
		if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
			return &entry->rebalance;

	return NULL;
}

static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
	return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}

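/*
 * Returns a bitmask of the pointers whose data should be rewritten because
 * it isn't compressed with opts->background_compression; 0 if the extent is
 * incompressible or has unwritten pointers.
 */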
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
						    struct bch_io_opts *opts,
						    struct bkey_s_c k,
						    struct bkey_ptrs_c ptrs)
{
	if (!opts->background_compression)
		return 0;

	unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
		    p.ptr.unwritten)
			return 0;

		if (!p.ptr.cached && p.crc.compression_type != compression_type)
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}

	return rewrite_ptrs;
}

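/*
 * Returns a bitmask of the non-cached pointers that lie outside
 * opts->background_target and should be migrated onto it.
 */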
static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
						struct bch_io_opts *opts,
						struct bkey_ptrs_c ptrs)
{
	if (!opts->background_target ||
	    !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
		return 0;

	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	guard(rcu)();
	bkey_for_each_ptr(ptrs, ptr) {
		if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}

	return rewrite_ptrs;
}

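/* Union of the compress and move bitmasks; poisoned extents are skipped: */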
static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
					      struct bch_io_opts *opts,
					      struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
		bch2_bkey_ptrs_need_move(c, opts, ptrs);
}

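/*
 * Count the sectors of this extent that rebalance still has to rewrite,
 * according to the io options stashed in its bch_extent_rebalance entry.
 */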
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
	if (!opts)
		return 0;

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 sectors = 0;

	if (opts->background_compression) {
		unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				sectors = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				sectors += p.crc.compressed_size;
		}
	}
incompressible:
	if (opts->background_target) {
		guard(rcu)();
		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (!p.ptr.cached &&
			    !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
				sectors += p.crc.compressed_size;
	}

	return sectors;
}

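/*
 * Does the extent's embedded bch_extent_rebalance entry need to be added,
 * updated to match the current io options, or dropped?
 */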
static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
					     struct bkey_s_c k)
{
	if (!bkey_extent_is_direct_data(k.k))
		return 0;

	const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);

	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
		struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
		return old == NULL || memcmp(old, &new, sizeof(new));
	} else {
		return old != NULL;
	}
}

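/*
 * Add, update or drop the bch_extent_rebalance entry on @_k to match @opts.
 * Appending a new entry grows the value, so the caller must have allocated
 * the key with enough room for it.
 */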
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
				  struct bkey_i *_k)
{
	if (!bkey_extent_is_direct_data(&_k->k))
		return 0;

	struct bkey_s k = bkey_i_to_s(_k);
	struct bch_extent_rebalance *old =
		(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);

	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
		if (!old) {
			old = bkey_val_end(k);
			k.k->u64s += sizeof(*old) / sizeof(u64);
		}

		*old = io_opts_to_rebalance_opts(c, opts);
	} else {
		if (old)
			extent_entry_drop(k, (union bch_extent_entry *) old);
	}

	return 0;
}

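/*
 * Propagate per-extent rebalance options into @io_opts (for indirect
 * extents, which have no owning inode), then rewrite the extent's rebalance
 * entry if it's out of date - committing and returning
 * -BCH_ERR_transaction_restart_nested, since @k is invalidated.
 */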
int bch2_get_update_rebalance_opts(struct btree_trans *trans,
				   struct bch_io_opts *io_opts,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	BUG_ON(iter->flags & BTREE_ITER_is_extents);
	BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);

	const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
		? bch2_bkey_rebalance_opts(k) : NULL;
	if (r) {
#define x(_name)						\
		if (r->_name##_from_inode) {			\
			io_opts->_name = r->_name;		\
			io_opts->_name##_from_inode = true;	\
		}
		BCH_REBALANCE_OPTS()
#undef x
	}

	if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
		return 0;

	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	bkey_reassemble(n, k);

	/* On successful transaction commit, @k was invalidated: */

	return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0) ?:
		-BCH_ERR_transaction_restart_nested;
}

#define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)

static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
	BCH_REBALANCE_STATES()
	NULL
#undef x
};

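/*
 * Request a scan: bump the cookie at a magic offset (inode 0 means the
 * whole filesystem) so the rebalance thread walks that range and
 * regenerates rebalance_work entries.
 */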
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_cookie *cookie;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
	ret = PTR_ERR_OR_ZERO(cookie);
	if (ret)
		goto err;

	bkey_cookie_init(&cookie->k_i);
	cookie->k.p = iter.pos;
	cookie->v.cookie = cpu_to_le64(v + 1);

	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL,
				       BCH_TRANS_COMMIT_no_enospc,
				       bch2_set_rebalance_needs_scan_trans(trans, inum));
	bch2_rebalance_wakeup(c);
	return ret;
}

int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
	return bch2_set_rebalance_needs_scan(c, 0);
}

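/*
 * A scan finished: delete the request cookie, but only if it still matches
 * the value we saw when the scan started - a new request may have raced in.
 */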
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	if (v == cookie)
		ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
					    struct btree_iter *work_iter)
{
	return !kthread_should_stop()
		? bch2_btree_iter_peek(trans, work_iter)
		: bkey_s_c_null;
}

static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
					   struct btree_iter *iter,
					   struct bkey_s_c k)
{
	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
		return 0;

	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	extent_entry_drop(bkey_i_to_s(n),
			  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
	return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

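/*
 * Look up the extent a rebalance_work entry points at (inode 0 maps to the
 * reflink btree) and fill in the data update options for rewriting it; if
 * nothing needs rewriting anymore, drop its stale rebalance entry instead.
 */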
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
					     struct bpos work_pos,
					     struct btree_iter *extent_iter,
					     struct bch_io_opts *io_opts,
					     struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;

	bch2_trans_iter_exit(trans, extent_iter);
	bch2_trans_iter_init(trans, extent_iter,
			     work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
			     work_pos,
			     BTREE_ITER_all_snapshots);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
	if (bkey_err(k))
		return k;

	int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
	if (ret)
		return bkey_s_c_err(ret);

	memset(data_opts, 0, sizeof(*data_opts));
	data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
	data_opts->target = io_opts->background_target;
	data_opts->write_flags |= BCH_WRITE_only_specified_devs;

	if (!data_opts->rewrite_ptrs) {
		/*
		 * device we would want to write to offline? devices in target
		 * changed?
		 *
		 * We'll now need a full scan before this extent is picked up
		 * again:
		 */
		int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
		if (ret)
			return bkey_s_c_err(ret);
		return bkey_s_c_null;
	}

	if (trace_rebalance_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

		unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
		if (p) {
			prt_str(&buf, "compression=");
			bch2_compression_opt_to_text(&buf, io_opts->background_compression);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
		if (p) {
			prt_str(&buf, "move=");
			bch2_target_to_text(&buf, c, io_opts->background_target);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		trace_rebalance_extent(c, buf.buf);
		printbuf_exit(&buf);
	}

	return k;
}

noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
			       struct bpos work_pos,
			       struct btree_iter *extent_iter)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	struct data_update_opts data_opts;
	struct bch_io_opts io_opts;
	struct bkey_s_c k;
	struct bkey_buf sk;
	int ret;

	ctxt->stats = &r->work_stats;
	r->state = BCH_REBALANCE_working;

	bch2_bkey_buf_init(&sk);

	ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
				extent_iter, &io_opts, &data_opts));
	if (ret || !k.k)
		goto out;

	atomic64_add(k.k->size, &ctxt->stats->sectors_seen);

	/*
	 * The iterator gets unlocked by __bch2_read_extent - need to
	 * save a copy of @k elsewhere:
	 */
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
	if (ret) {
		if (bch2_err_matches(ret, ENOMEM)) {
			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(ctxt);
			ret = bch_err_throw(c, transaction_restart_nested);
		}

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto out;

		/* skip it and continue, XXX signal failure */
		ret = 0;
	}
out:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

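/*
 * Handle a scan cookie: walk the requested extents, refreshing their io
 * options (and hence their rebalance_work entries) from the owning inodes,
 * then clear the cookie.
 */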
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;

	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
	ctxt->stats = &r->scan_stats;

	if (!inum) {
		r->scan_start = BBPOS_MIN;
		r->scan_end = BBPOS_MAX;
	} else {
		r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
		r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
	}

	r->state = BCH_REBALANCE_scanning;

	struct per_snapshot_io_opts snapshot_io_opts;
	per_snapshot_io_opts_init(&snapshot_io_opts, c);

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
					 r->scan_start.pos, r->scan_end.pos,
					 BTREE_ITER_all_snapshots|
					 BTREE_ITER_not_extents|
					 BTREE_ITER_prefetch, k, ({
		ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
					&snapshot_io_opts, iter.pos, &iter, k);
		PTR_ERR_OR_ZERO(io_opts);
	})) ?:
	commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		  bch2_clear_rebalance_needs_scan(trans, inum, cookie));

	per_snapshot_io_opts_exit(&snapshot_io_opts);
	bch2_move_stats_exit(&r->scan_stats, trans->c);

	/*
	 * Ensure that the rebalance_work entries we created are seen by the
	 * next iteration of do_rebalance(), so we don't end up stuck in
	 * rebalance_wait():
	 */
	atomic64_inc(&r->scan_stats.sectors_seen);
	bch2_btree_write_buffer_flush_sync(trans);

	return ret;
}

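/*
 * Nothing to do: sleep until the write clock advances by ~1/64th of the
 * smallest rw member device's capacity, then check for new work.
 */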
static void rebalance_wait(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;
	struct io_clock *clock = &c->io_clock[WRITE];
	u64 now = atomic64_read(&clock->now);
	u64 min_member_capacity = bch2_min_rw_member_capacity(c);

	if (min_member_capacity == U64_MAX)
		min_member_capacity = 128 * 2048;

	r->wait_iotime_end = now + (min_member_capacity >> 6);

	if (r->state != BCH_REBALANCE_waiting) {
		r->wait_iotime_start = now;
		r->wait_wallclock_start = ktime_get_real_ns();
		r->state = BCH_REBALANCE_waiting;
	}

	bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}

static bool bch2_rebalance_enabled(struct bch_fs *c)
{
	return c->opts.rebalance_enabled &&
		!(c->opts.rebalance_on_ac_only &&
		  c->rebalance.on_battery);
}

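/*
 * Main work loop: cookies in the rebalance_work btree trigger scans, other
 * entries point at extents to move or recompress; when the btree yields
 * nothing, wait on the io clock.
 */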
static int do_rebalance(struct moving_context *ctxt)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct btree_iter rebalance_work_iter, extent_iter = {};
	struct bkey_s_c k;
	u32 kick = r->kick;
	int ret = 0;

	bch2_trans_begin(trans);

	bch2_move_stats_init(&r->work_stats, "rebalance_work");
	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");

	bch2_trans_iter_init(trans, &rebalance_work_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_all_snapshots);

	while (!bch2_move_ratelimit(ctxt)) {
		if (!bch2_rebalance_enabled(c)) {
			bch2_moving_ctxt_flush_all(ctxt);
			kthread_wait_freezable(bch2_rebalance_enabled(c) ||
					       kthread_should_stop());
		}

		if (kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret || !k.k)
			break;

		ret = k.k->type == KEY_TYPE_cookie
			? do_rebalance_scan(ctxt, k.k->p.inode,
					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_btree_iter_advance(trans, &rebalance_work_iter);
	}

	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_work_iter);
	bch2_move_stats_exit(&r->scan_stats, c);

	if (!ret &&
	    !kthread_should_stop() &&
	    !atomic64_read(&r->work_stats.sectors_seen) &&
	    !atomic64_read(&r->scan_stats.sectors_seen) &&
	    kick == r->kick) {
		bch2_moving_ctxt_flush_all(ctxt);
		bch2_trans_unlock_long(trans);
		rebalance_wait(c);
	}

	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

static int bch2_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct moving_context ctxt;

	set_freezable();

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 */
	kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
			       kthread_should_stop());

	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
			      writepoint_ptr(&c->rebalance_write_point),
			      true);

	while (!kthread_should_stop() && !do_rebalance(&ctxt))
		;

	bch2_moving_ctxt_exit(&ctxt);

	return 0;
}

void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 32);

	struct bch_fs_rebalance *r = &c->rebalance;

	/* print pending work */
	struct disk_accounting_pos acc;
	disk_accounting_key_init(acc, rebalance_work);
	u64 v;
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);

	prt_printf(out, "pending work:\t");
	prt_human_readable_u64(out, v << 9);
	prt_printf(out, "\n\n");

	prt_str(out, bch2_rebalance_state_strs[r->state]);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	switch (r->state) {
	case BCH_REBALANCE_waiting: {
		u64 now = atomic64_read(&c->io_clock[WRITE].now);

		prt_printf(out, "io wait duration:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
		prt_newline(out);

		prt_printf(out, "io wait remaining:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
		prt_newline(out);

		prt_printf(out, "duration waited:\t");
		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
		prt_newline(out);
		break;
	}
	case BCH_REBALANCE_working:
		bch2_move_stats_to_text(out, &r->work_stats);
		break;
	case BCH_REBALANCE_scanning:
		bch2_move_stats_to_text(out, &r->scan_stats);
		break;
	}
	prt_newline(out);

	struct task_struct *t;
	scoped_guard(rcu) {
		t = rcu_dereference(c->rebalance.thread);
		if (t)
			get_task_struct(t);
	}

	if (t) {
		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
		put_task_struct(t);
	}

	printbuf_indent_sub(out, 2);
}

void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	c->rebalance.pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with bch2_rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;
	int ret;

	if (c->rebalance.thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	bch_err_msg(c, ret, "creating rebalance thread");
	if (ret)
		return ret;

	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}

#ifdef CONFIG_POWER_SUPPLY
#include <linux/power_supply.h>

static int bch2_rebalance_power_notifier(struct notifier_block *nb,
					 unsigned long event, void *data)
{
	struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);

	c->rebalance.on_battery = !power_supply_is_system_supplied();
	bch2_rebalance_wakeup(c);
	return NOTIFY_OK;
}
#endif

void bch2_fs_rebalance_exit(struct bch_fs *c)
{
#ifdef CONFIG_POWER_SUPPLY
	power_supply_unreg_notifier(&c->rebalance.power_notifier);
#endif
}

int bch2_fs_rebalance_init(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;

	bch2_pd_controller_init(&r->pd);

#ifdef CONFIG_POWER_SUPPLY
	r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
	int ret = power_supply_reg_notifier(&r->power_notifier);
	if (ret)
		return ret;

	r->on_battery = !power_supply_is_system_supplied();
#endif
	return 0;
}

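/*
 * fsck: walk the extent and rebalance_work btrees in lockstep, checking
 * that a rebalance_work entry exists iff the extent has pending work.
 */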
static int check_rebalance_work_one(struct btree_trans *trans,
				    struct btree_iter *extent_iter,
				    struct btree_iter *rebalance_iter,
				    struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c extent_k, rebalance_k;
	struct printbuf buf = PRINTBUF;

	int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
		bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
	if (ret)
		return ret;

	if (!extent_k.k &&
	    extent_iter->btree_id == BTREE_ID_reflink &&
	    (!rebalance_k.k ||
	     rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
		bch2_trans_iter_exit(trans, extent_iter);
		bch2_trans_iter_init(trans, extent_iter,
				     BTREE_ID_extents, POS_MIN,
				     BTREE_ITER_prefetch|
				     BTREE_ITER_all_snapshots);
		return bch_err_throw(c, transaction_restart_nested);
	}

	if (!extent_k.k && !rebalance_k.k)
		return 1;

	int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
			   rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);

	struct bkey deleted;
	bkey_init(&deleted);

	if (cmp < 0) {
		deleted.p = extent_k.k->p;
		rebalance_k.k = &deleted;
	} else if (cmp > 0) {
		deleted.p = rebalance_k.k->p;
		extent_k.k = &deleted;
	}

	bool should_have_rebalance =
		bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
	bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;

	if (should_have_rebalance != have_rebalance) {
		ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
		if (ret)
			return ret;

		bch2_bkey_val_to_text(&buf, c, extent_k);
	}

	if (fsck_err_on(!should_have_rebalance && have_rebalance,
			trans, rebalance_work_incorrectly_set,
			"rebalance work incorrectly set\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, false);
		if (ret)
			goto err;
	}

	if (fsck_err_on(should_have_rebalance && !have_rebalance,
			trans, rebalance_work_incorrectly_unset,
			"rebalance work incorrectly unset\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, true);
		if (ret)
			goto err;
	}

	if (cmp <= 0)
		bch2_btree_iter_advance(trans, extent_iter);
	if (cmp >= 0)
		bch2_btree_iter_advance(trans, rebalance_iter);
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_rebalance_work(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter rebalance_iter, extent_iter;
	int ret = 0;

	bch2_trans_iter_init(trans, &extent_iter,
			     BTREE_ID_reflink, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &rebalance_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_prefetch);

	struct bkey_buf last_flushed;
	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	while (!ret) {
		bch2_trans_begin(trans);

		ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
	}

	bch2_bkey_buf_exit(&last_flushed, c);
	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_iter);
	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}