Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
1f7056b7 | 4 | #include "alloc_background.h" |
7b3f84ea | 5 | #include "alloc_foreground.h" |
1c6fdbd8 | 6 | #include "btree_iter.h" |
fb3f57bb KO |
7 | #include "btree_update.h" |
8 | #include "btree_write_buffer.h" | |
1c6fdbd8 KO |
9 | #include "buckets.h" |
10 | #include "clock.h" | |
986e9842 | 11 | #include "compress.h" |
1c6fdbd8 | 12 | #include "disk_groups.h" |
d4bf5eec | 13 | #include "errcode.h" |
fb3f57bb KO |
14 | #include "error.h" |
15 | #include "inode.h" | |
d90c8acd | 16 | #include "io_write.h" |
1c6fdbd8 KO |
17 | #include "move.h" |
18 | #include "rebalance.h" | |
fb3f57bb | 19 | #include "subvolume.h" |
1c6fdbd8 KO |
20 | #include "super-io.h" |
21 | #include "trace.h" | |
22 | ||
23 | #include <linux/freezer.h> | |
24 | #include <linux/kthread.h> | |
25 | #include <linux/sched/cputime.h> | |
26 | ||
161d1383 KO |
27 | /* bch_extent_rebalance: */ |
28 | ||
9ec00891 | 29 | static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) |
161d1383 | 30 | { |
161d1383 KO |
31 | const union bch_extent_entry *entry; |
32 | ||
33 | bkey_extent_entry_for_each(ptrs, entry) | |
34 | if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) | |
35 | return &entry->rebalance; | |
36 | ||
37 | return NULL; | |
38 | } | |
39 | ||
9ec00891 KO |
40 | static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) |
41 | { | |
42 | return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); | |
43 | } | |
44 | ||
161d1383 KO |
/*
 * Returns a bitmask of pointers that should be rewritten to satisfy the
 * background_compression option, or 0 if nothing needs doing.
 *
 * Bit i of the result corresponds to the i'th pointer in the key.
 */
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
						    struct bch_io_opts *opts,
						    struct bkey_s_c k,
						    struct bkey_ptrs_c ptrs)
{
	if (!opts->background_compression)
		return 0;

	unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		/*
		 * If any part of the extent is marked incompressible or
		 * unwritten, compressing it is pointless - bail out entirely:
		 */
		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
		    p.ptr.unwritten)
			return 0;

		/* Cached pointers are never rewritten for compression: */
		if (!p.ptr.cached && p.crc.compression_type != compression_type)
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}

	return rewrite_ptrs;
}
71 | ||
/*
 * Returns a bitmask of pointers that live outside the background_target and
 * thus should be moved, or 0 if there is no usable target.
 */
static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
						struct bch_io_opts *opts,
						struct bkey_ptrs_c ptrs)
{
	if (!opts->background_target ||
	    !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
		return 0;

	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	/* RCU protects the device/target membership lookups below: */
	guard(rcu)();
	bkey_for_each_ptr(ptrs, ptr) {
		if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}

	return rewrite_ptrs;
}
92 | ||
93 | static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, | |
94 | struct bch_io_opts *opts, | |
95 | struct bkey_s_c k) | |
96 | { | |
97 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); | |
98 | ||
8c087d2d KO |
99 | if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) |
100 | return 0; | |
101 | ||
161d1383 KO |
102 | return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | |
103 | bch2_bkey_ptrs_need_move(c, opts, ptrs); | |
104 | } | |
105 | ||
/*
 * Returns the number of (compressed) sectors in @k that rebalance still has
 * work to do on, based on the rebalance options embedded in the key itself.
 */
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
	if (!opts)
		return 0;

	/* Poisoned extents are never rebalanced: */
	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 sectors = 0;

	if (opts->background_compression) {
		unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			/*
			 * Incompressible/unwritten data voids any compression
			 * work counted so far for this key:
			 */
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				sectors = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				sectors += p.crc.compressed_size;
		}
	}
incompressible:
	if (opts->background_target) {
		/* RCU for the device/target membership lookup: */
		guard(rcu)();
		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (!p.ptr.cached &&
			    !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
				sectors += p.crc.compressed_size;
	}

	return sectors;
}
146 | ||
147 | static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, | |
148 | struct bkey_s_c k) | |
149 | { | |
150 | if (!bkey_extent_is_direct_data(k.k)) | |
151 | return 0; | |
152 | ||
153 | const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); | |
154 | ||
155 | if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { | |
4be214c2 | 156 | struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); |
161d1383 KO |
157 | return old == NULL || memcmp(old, &new, sizeof(new)); |
158 | } else { | |
159 | return old != NULL; | |
160 | } | |
161 | } | |
162 | ||
/*
 * Add, refresh or drop the rebalance entry on @_k so it reflects @opts.
 *
 * Note: may grow the key value in place (appends a rebalance entry at
 * bkey_val_end()); the caller must have allocated enough room.
 */
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
				  struct bkey_i *_k)
{
	if (!bkey_extent_is_direct_data(&_k->k))
		return 0;

	struct bkey_s k = bkey_i_to_s(_k);
	struct bch_extent_rebalance *old =
		(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);

	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
		if (!old) {
			/* Append a new rebalance entry to the key's value: */
			old = bkey_val_end(k);
			k.k->u64s += sizeof(*old) / sizeof(u64);
		}

		*old = io_opts_to_rebalance_opts(c, opts);
	} else {
		/* No work needed - drop any stale entry: */
		if (old)
			extent_entry_drop(k, (union bch_extent_entry *) old);
	}

	return 0;
}
187 | ||
/*
 * Fold per-extent rebalance options (for indirect extents) into @io_opts,
 * then - if the key's embedded rebalance entry is out of date - rewrite the
 * key and commit, returning a transaction restart.
 */
int bch2_get_update_rebalance_opts(struct btree_trans *trans,
				   struct bch_io_opts *io_opts,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	BUG_ON(iter->flags & BTREE_ITER_is_extents);
	BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);

	/*
	 * Indirect extents carry their own option overrides (there's no inode
	 * to take them from):
	 */
	const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
		? bch2_bkey_rebalance_opts(k) : NULL;
	if (r) {
#define x(_name)							\
		if (r->_name##_from_inode) {				\
			io_opts->_name = r->_name;			\
			io_opts->_name##_from_inode = true;		\
		}
		BCH_REBALANCE_OPTS()
#undef x
	}

	if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
		return 0;

	/* +8 bytes: room for appending a bch_extent_rebalance entry: */
	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	bkey_reassemble(n, k);

	/* On successful transaction commit, @k was invalidated: */

	return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0) ?:
		-BCH_ERR_transaction_restart_nested;
}
225 | ||
/* Sentinel offset in the rebalance_work btree used for "scan needed" cookies: */
#define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)
7f5c5d20 | 227 | |
fb3f57bb KO |
/* Human-readable names for BCH_REBALANCE_STATES(), indexed by state enum: */
static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
	BCH_REBALANCE_STATES()
	NULL
#undef x
};
7f5c5d20 | 234 | |
/*
 * Mark inode @inum (0 == whole filesystem) as needing a rebalance scan by
 * bumping a cookie in the rebalance_work btree at the scan sentinel offset.
 */
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_cookie *cookie;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	/* Current cookie value, or 0 if none present yet: */
	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
	ret = PTR_ERR_OR_ZERO(cookie);
	if (ret)
		goto err;

	bkey_cookie_init(&cookie->k_i);
	cookie->k.p = iter.pos;
	cookie->v.cookie = cpu_to_le64(v + 1);

	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
269 | ||
/*
 * Commit wrapper around bch2_set_rebalance_needs_scan_trans(), then poke the
 * rebalance thread. (@trans is supplied implicitly by bch2_trans_commit_do.)
 */
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL,
				       BCH_TRANS_COMMIT_no_enospc,
				       bch2_set_rebalance_needs_scan_trans(trans, inum));
	bch2_rebalance_wakeup(c);
	return ret;
}
1c6fdbd8 | 278 | |
fb3f57bb KO |
279 | int bch2_set_fs_needs_rebalance(struct bch_fs *c) |
280 | { | |
281 | return bch2_set_rebalance_needs_scan(c, 0); | |
182084e3 | 282 | } |
5055b509 | 283 | |
/*
 * Delete the scan cookie for @inum, but only if it still holds @cookie -
 * i.e. only if no new scan was requested while we were scanning.
 */
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	/* Only delete if unchanged since the scan started: */
	if (v == cookie)
		ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
309 | ||
fb3f57bb KO |
310 | static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, |
311 | struct btree_iter *work_iter) | |
312 | { | |
313 | return !kthread_should_stop() | |
9180ad2e | 314 | ? bch2_btree_iter_peek(trans, work_iter) |
fb3f57bb KO |
315 | : bkey_s_c_null; |
316 | } | |
1c6fdbd8 | 317 | |
fb3f57bb KO |
/*
 * Drop the rebalance entry from @k and commit. Indirect extents keep their
 * entry (it stores their option overrides), so they're skipped.
 */
static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
					   struct btree_iter *iter,
					   struct bkey_s_c k)
{
	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
		return 0;

	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	extent_entry_drop(bkey_i_to_s(n),
			  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
	return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
334 | ||
/*
 * Look up the extent (or indirect extent, for inode 0) at @work_pos, fill in
 * its io opts and data update opts, and return it.
 *
 * Returns a null key - after clearing the extent's rebalance entry - if the
 * extent turns out to need no work after all (e.g. target membership changed).
 */
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
					     struct bpos work_pos,
					     struct btree_iter *extent_iter,
					     struct bch_io_opts *io_opts,
					     struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;

	bch2_trans_iter_exit(trans, extent_iter);
	/* inode 0 in rebalance_work refers to the reflink btree: */
	bch2_trans_iter_init(trans, extent_iter,
			     work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
			     work_pos,
			     BTREE_ITER_all_snapshots);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
	if (bkey_err(k))
		return k;

	int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
	if (ret)
		return bkey_s_c_err(ret);

	memset(data_opts, 0, sizeof(*data_opts));
	data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
	data_opts->target = io_opts->background_target;
	data_opts->write_flags |= BCH_WRITE_only_specified_devs;

	if (!data_opts->rewrite_ptrs) {
		/*
		 * device we would want to write to offline? devices in target
		 * changed?
		 *
		 * We'll now need a full scan before this extent is picked up
		 * again:
		 */
		int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
		if (ret)
			return bkey_s_c_err(ret);
		return bkey_s_c_null;
	}

	if (trace_rebalance_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

		/* Log which pointers are rewritten, and why: */
		unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
		if (p) {
			prt_str(&buf, "compression=");
			bch2_compression_opt_to_text(&buf, io_opts->background_compression);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
		if (p) {
			prt_str(&buf, "move=");
			bch2_target_to_text(&buf, c, io_opts->background_target);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		trace_rebalance_extent(c, buf.buf);
		printbuf_exit(&buf);
	}

	return k;
}
407 | ||
fb3f57bb KO |
/*
 * Process one rebalance_work entry: fetch the extent at @work_pos and move
 * it. Per-extent failures (other than transaction restarts) are skipped so
 * the work loop can make progress.
 */
noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
			       struct bpos work_pos,
			       struct btree_iter *extent_iter)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	struct data_update_opts data_opts;
	struct bch_io_opts io_opts;
	struct bkey_s_c k;
	struct bkey_buf sk;
	int ret;

	ctxt->stats = &r->work_stats;
	r->state = BCH_REBALANCE_working;

	bch2_bkey_buf_init(&sk);

	ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
						 extent_iter, &io_opts, &data_opts));
	if (ret || !k.k)
		goto out;

	atomic64_add(k.k->size, &ctxt->stats->sectors_seen);

	/*
	 * The iterator gets unlocked by __bch2_read_extent - need to
	 * save a copy of @k elsewhere:
	 */
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
	if (ret) {
		if (bch2_err_matches(ret, ENOMEM)) {
			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(ctxt);
			ret = bch_err_throw(c, transaction_restart_nested);
		}

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto out;

		/* skip it and continue, XXX signal failure */
		ret = 0;
	}
out:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}
459 | ||
fb3f57bb | 460 | static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) |
1c6fdbd8 | 461 | { |
fb3f57bb | 462 | struct btree_trans *trans = ctxt->trans; |
84b9f171 | 463 | struct bch_fs *c = trans->c; |
fb3f57bb | 464 | struct bch_fs_rebalance *r = &trans->c->rebalance; |
fb3f57bb KO |
465 | |
466 | bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); | |
467 | ctxt->stats = &r->scan_stats; | |
1c6fdbd8 | 468 | |
fb3f57bb KO |
469 | if (!inum) { |
470 | r->scan_start = BBPOS_MIN; | |
471 | r->scan_end = BBPOS_MAX; | |
472 | } else { | |
473 | r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); | |
474 | r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); | |
475 | } | |
476 | ||
477 | r->state = BCH_REBALANCE_scanning; | |
478 | ||
84b9f171 KO |
479 | struct per_snapshot_io_opts snapshot_io_opts; |
480 | per_snapshot_io_opts_init(&snapshot_io_opts, c); | |
481 | ||
482 | int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, | |
483 | r->scan_start.pos, r->scan_end.pos, | |
484 | BTREE_ITER_all_snapshots| | |
485 | BTREE_ITER_not_extents| | |
486 | BTREE_ITER_prefetch, k, ({ | |
487 | ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); | |
fb3f57bb | 488 | |
84b9f171 KO |
489 | struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, |
490 | &snapshot_io_opts, iter.pos, &iter, k); | |
491 | PTR_ERR_OR_ZERO(io_opts); | |
492 | })) ?: | |
493 | commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, | |
494 | bch2_clear_rebalance_needs_scan(trans, inum, cookie)); | |
495 | ||
496 | per_snapshot_io_opts_exit(&snapshot_io_opts); | |
fb3f57bb | 497 | bch2_move_stats_exit(&r->scan_stats, trans->c); |
84b9f171 KO |
498 | |
499 | /* | |
500 | * Ensure that the rebalance_work entries we created are seen by the | |
501 | * next iteration of do_rebalance(), so we don't end up stuck in | |
502 | * rebalance_wait(): | |
503 | */ | |
504 | atomic64_inc(&r->scan_stats.sectors_seen); | |
505 | bch2_btree_write_buffer_flush_sync(trans); | |
506 | ||
fb3f57bb | 507 | return ret; |
1c6fdbd8 KO |
508 | } |
509 | ||
fb3f57bb | 510 | static void rebalance_wait(struct bch_fs *c) |
1c6fdbd8 | 511 | { |
1c6fdbd8 KO |
512 | struct bch_fs_rebalance *r = &c->rebalance; |
513 | struct io_clock *clock = &c->io_clock[WRITE]; | |
fb3f57bb | 514 | u64 now = atomic64_read(&clock->now); |
1f7056b7 | 515 | u64 min_member_capacity = bch2_min_rw_member_capacity(c); |
1c6fdbd8 | 516 | |
1f7056b7 KO |
517 | if (min_member_capacity == U64_MAX) |
518 | min_member_capacity = 128 * 2048; | |
fb3f57bb KO |
519 | |
520 | r->wait_iotime_end = now + (min_member_capacity >> 6); | |
521 | ||
522 | if (r->state != BCH_REBALANCE_waiting) { | |
523 | r->wait_iotime_start = now; | |
524 | r->wait_wallclock_start = ktime_get_real_ns(); | |
525 | r->state = BCH_REBALANCE_waiting; | |
526 | } | |
1c6fdbd8 | 527 | |
9e2c3c2e | 528 | bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); |
fb3f57bb | 529 | } |
1c6fdbd8 | 530 | |
96fc7d8a KO |
531 | static bool bch2_rebalance_enabled(struct bch_fs *c) |
532 | { | |
533 | return c->opts.rebalance_enabled && | |
534 | !(c->opts.rebalance_on_ac_only && | |
535 | c->rebalance.on_battery); | |
536 | } | |
537 | ||
fb3f57bb KO |
/*
 * Main rebalance work loop: iterate the rebalance_work btree, dispatching
 * scan cookies to do_rebalance_scan() and extent entries to
 * do_rebalance_extent(), then wait when there's nothing left to do.
 */
static int do_rebalance(struct moving_context *ctxt)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct btree_iter rebalance_work_iter, extent_iter = {};
	struct bkey_s_c k;
	/* Snapshot the kick counter so new wakeups skip the wait below: */
	u32 kick = r->kick;
	int ret = 0;

	bch2_trans_begin(trans);

	bch2_move_stats_init(&r->work_stats, "rebalance_work");
	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");

	bch2_trans_iter_init(trans, &rebalance_work_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_all_snapshots);

	while (!bch2_move_ratelimit(ctxt)) {
		/* Block (flushing in-flight moves first) while disabled: */
		if (!bch2_rebalance_enabled(c)) {
			bch2_moving_ctxt_flush_all(ctxt);
			kthread_wait_freezable(bch2_rebalance_enabled(c) ||
					       kthread_should_stop());
		}

		if (kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret || !k.k)
			break;

		/* Cookie keys request a scan; anything else is an extent: */
		ret = k.k->type == KEY_TYPE_cookie
			? do_rebalance_scan(ctxt, k.k->p.inode,
					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_btree_iter_advance(trans, &rebalance_work_iter);
	}

	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_work_iter);
	bch2_move_stats_exit(&r->scan_stats, c);

	/* Only sleep if nothing happened this pass and nobody kicked us: */
	if (!ret &&
	    !kthread_should_stop() &&
	    !atomic64_read(&r->work_stats.sectors_seen) &&
	    !atomic64_read(&r->scan_stats.sectors_seen) &&
	    kick == r->kick) {
		bch2_moving_ctxt_flush_all(ctxt);
		bch2_trans_unlock_long(trans);
		rebalance_wait(c);
	}

	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}
606 | ||
fb3f57bb | 607 | static int bch2_rebalance_thread(void *arg) |
1c6fdbd8 | 608 | { |
fb3f57bb | 609 | struct bch_fs *c = arg; |
1c6fdbd8 | 610 | struct bch_fs_rebalance *r = &c->rebalance; |
fb3f57bb | 611 | struct moving_context ctxt; |
1c6fdbd8 | 612 | |
fb3f57bb | 613 | set_freezable(); |
1c6fdbd8 | 614 | |
387df331 KO |
615 | /* |
616 | * Data move operations can't run until after check_snapshots has | |
617 | * completed, and bch2_snapshot_is_ancestor() is available. | |
618 | */ | |
68708efc | 619 | kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || |
387df331 KO |
620 | kthread_should_stop()); |
621 | ||
fb3f57bb KO |
622 | bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, |
623 | writepoint_ptr(&c->rebalance_write_point), | |
624 | true); | |
1c6fdbd8 | 625 | |
aead3428 | 626 | while (!kthread_should_stop() && !do_rebalance(&ctxt)) |
fb3f57bb | 627 | ; |
fa8e94fa | 628 | |
fb3f57bb | 629 | bch2_moving_ctxt_exit(&ctxt); |
fa8e94fa | 630 | |
fb3f57bb KO |
631 | return 0; |
632 | } | |
633 | ||
/*
 * Sysfs/debug helper: print pending rebalance work, current state with
 * state-specific detail, and a backtrace of the rebalance thread if running.
 */
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 32);

	struct bch_fs_rebalance *r = &c->rebalance;

	/* print pending work */
	struct disk_accounting_pos acc;
	disk_accounting_key_init(acc, rebalance_work);
	u64 v;
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);

	prt_printf(out, "pending work:\t");
	prt_human_readable_u64(out, v << 9);	/* sectors -> bytes */
	prt_printf(out, "\n\n");

	prt_str(out, bch2_rebalance_state_strs[r->state]);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	switch (r->state) {
	case BCH_REBALANCE_waiting: {
		u64 now = atomic64_read(&c->io_clock[WRITE].now);

		prt_printf(out, "io wait duration:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
		prt_newline(out);

		prt_printf(out, "io wait remaining:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
		prt_newline(out);

		prt_printf(out, "duration waited:\t");
		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
		prt_newline(out);
		break;
	}
	case BCH_REBALANCE_working:
		bch2_move_stats_to_text(out, &r->work_stats);
		break;
	case BCH_REBALANCE_scanning:
		bch2_move_stats_to_text(out, &r->scan_stats);
		break;
	}
	prt_newline(out);

	/* Take a ref on the thread under RCU so the backtrace is safe: */
	struct task_struct *t;
	scoped_guard(rcu) {
		t = rcu_dereference(c->rebalance.thread);
		if (t)
			get_task_struct(t);
	}

	if (t) {
		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
		put_task_struct(t);
	}

	printbuf_indent_sub(out, 2);
}
694 | ||
/* Stop the rebalance thread, letting any in-flight work drain at full rate. */
void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	/* Remove the rate limit so remaining work finishes quickly: */
	c->rebalance.pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with bch2_rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}
713 | ||
/*
 * Create and start the rebalance thread; no-op if already running or if the
 * filesystem was mounted with nochanges.
 */
int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;
	int ret;

	if (c->rebalance.thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	bch_err_msg(c, ret, "creating rebalance thread");
	if (ret)
		return ret;

	/* Hold a ref for as long as c->rebalance.thread points at it: */
	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}
736 | ||
96fc7d8a KO |
#ifdef CONFIG_POWER_SUPPLY
#include <linux/power_supply.h>

/*
 * Power-supply notifier: track AC/battery status so the rebalance_on_ac_only
 * option can pause rebalance on battery, and wake the thread on transitions.
 */
static int bch2_rebalance_power_notifier(struct notifier_block *nb,
					 unsigned long event, void *data)
{
	struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);

	c->rebalance.on_battery = !power_supply_is_system_supplied();
	bch2_rebalance_wakeup(c);
	return NOTIFY_OK;
}
#endif
750 | ||
/* Teardown counterpart of bch2_fs_rebalance_init(). */
void bch2_fs_rebalance_exit(struct bch_fs *c)
{
#ifdef CONFIG_POWER_SUPPLY
	power_supply_unreg_notifier(&c->rebalance.power_notifier);
#endif
}
757 | ||
/*
 * Early per-fs init: set up the pd (rate) controller and, when available,
 * register for power-supply notifications and seed the on_battery state.
 */
int bch2_fs_rebalance_init(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;

	bch2_pd_controller_init(&r->pd);

#ifdef CONFIG_POWER_SUPPLY
	r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
	int ret = power_supply_reg_notifier(&r->power_notifier);
	if (ret)
		return ret;

	r->on_battery = !power_supply_is_system_supplied();
#endif
	return 0;
}
834f9475 KO |
774 | |
/*
 * fsck step: advance the extent and rebalance_work iterators in lockstep
 * (merge-join on position) and fix any key whose rebalance_work bit doesn't
 * match whether the extent actually needs rebalancing.
 *
 * Returns 1 when both iterators are exhausted, 0/-err otherwise.
 */
static int check_rebalance_work_one(struct btree_trans *trans,
				    struct btree_iter *extent_iter,
				    struct btree_iter *rebalance_iter,
				    struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c extent_k, rebalance_k;
	struct printbuf buf = PRINTBUF;

	int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
		bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
	if (ret)
		return ret;

	/*
	 * The extent iterator starts on the reflink btree (inode range below
	 * BCACHEFS_ROOT_INO); once it's exhausted, switch it over to the
	 * extents btree and restart:
	 */
	if (!extent_k.k &&
	    extent_iter->btree_id == BTREE_ID_reflink &&
	    (!rebalance_k.k ||
	     rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
		bch2_trans_iter_exit(trans, extent_iter);
		bch2_trans_iter_init(trans, extent_iter,
				     BTREE_ID_extents, POS_MIN,
				     BTREE_ITER_prefetch|
				     BTREE_ITER_all_snapshots);
		return bch_err_throw(c, transaction_restart_nested);
	}

	if (!extent_k.k && !rebalance_k.k)
		return 1;

	int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
			   rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);

	/* Substitute a deleted key on whichever side has no entry here: */
	struct bkey deleted;
	bkey_init(&deleted);

	if (cmp < 0) {
		deleted.p = extent_k.k->p;
		rebalance_k.k = &deleted;
	} else if (cmp > 0) {
		deleted.p = rebalance_k.k->p;
		extent_k.k = &deleted;
	}

	bool should_have_rebalance =
		bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
	bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;

	if (should_have_rebalance != have_rebalance) {
		/* Flush the write buffer before trusting the mismatch: */
		ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
		if (ret)
			return ret;

		bch2_bkey_val_to_text(&buf, c, extent_k);
	}

	if (fsck_err_on(!should_have_rebalance && have_rebalance,
			trans, rebalance_work_incorrectly_set,
			"rebalance work incorrectly set\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, false);
		if (ret)
			goto err;
	}

	if (fsck_err_on(should_have_rebalance && !have_rebalance,
			trans, rebalance_work_incorrectly_unset,
			"rebalance work incorrectly unset\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, true);
		if (ret)
			goto err;
	}

	/* Advance whichever side(s) we consumed this round: */
	if (cmp <= 0)
		bch2_btree_iter_advance(trans, extent_iter);
	if (cmp >= 0)
		bch2_btree_iter_advance(trans, rebalance_iter);
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
857 | ||
/*
 * fsck entry point: verify the rebalance_work btree against the reflink and
 * extents btrees, repairing mismatched bits as we go.
 */
int bch2_check_rebalance_work(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter rebalance_iter, extent_iter;
	int ret = 0;

	/* Start on reflink; check_rebalance_work_one() switches to extents: */
	bch2_trans_iter_init(trans, &extent_iter,
			     BTREE_ID_reflink, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &rebalance_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_prefetch);

	struct bkey_buf last_flushed;
	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	/* Loop until check_rebalance_work_one() returns 1 (done) or an error: */
	while (!ret) {
		bch2_trans_begin(trans);

		ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
	}

	bch2_bkey_buf_exit(&last_flushed, c);
	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_iter);
	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}