bcachefs: Fix check for if extent update is allocating
[linux-block.git] / fs / bcachefs / move.c
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "disk_groups.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

#define SECTORS_IN_FLIGHT_PER_DEVICE	2048

struct moving_io {
	struct list_head	list;
	struct closure		cl;
	bool			read_completed;

	unsigned		read_sectors;
	unsigned		write_sectors;

	struct bch_read_bio	rbio;

	struct migrate_write	write;
	/* Must be last since it is variable size */
	struct bio_vec		bi_inline_vecs[0];
};

struct moving_context {
	/* Closure for waiting on all reads and writes to complete */
	struct closure		cl;

	struct bch_move_stats	*stats;

	struct list_head	reads;

	/* in flight sectors: */
	atomic_t		read_sectors;
	atomic_t		write_sectors;

	wait_queue_head_t	wait;
};

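/*
 * Index update hook for data moves, installed as op->index_update_fn by
 * bch2_migrate_write_init() and called by the write path once the new copy
 * has been written: for each key the write produced, check that the existing
 * extent still matches what we read (same version, still points at
 * m->ptr/m->offset), splice in the new pointers, drop the old device on
 * DATA_REWRITE, and insert the result. Extents that changed underneath us are
 * counted as raced and skipped.
 */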
static int bch2_migrate_index_update(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct migrate_write *m =
		container_of(op, struct migrate_write, op);
	struct keylist *keys = &op->insert_keys;
	struct btree_iter iter;
	int ret = 0;

	bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
			     bkey_start_pos(&bch2_keylist_front(keys)->k),
			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	while (1) {
		struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
		struct bkey_i_extent *insert, *new =
			bkey_i_to_extent(bch2_keylist_front(keys));
		BKEY_PADDED(k) _new, _insert;
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		bool did_work = false;
		int nr;

		if (btree_iter_err(k)) {
			ret = bch2_btree_iter_unlock(&iter);
			break;
		}

		if (bversion_cmp(k.k->version, new->k.version) ||
		    !bkey_extent_is_data(k.k) ||
		    !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
					     m->ptr, m->offset))
			goto nomatch;

		if (m->data_cmd == DATA_REWRITE &&
		    !bch2_extent_has_device(bkey_s_c_to_extent(k),
					    m->data_opts.rewrite_dev))
			goto nomatch;

		bkey_reassemble(&_insert.k, k);
		insert = bkey_i_to_extent(&_insert.k);

		bkey_copy(&_new.k, bch2_keylist_front(keys));
		new = bkey_i_to_extent(&_new.k);

		bch2_cut_front(iter.pos, &insert->k_i);
		bch2_cut_back(new->k.p, &insert->k);
		bch2_cut_back(insert->k.p, &new->k);

		if (m->data_cmd == DATA_REWRITE)
			bch2_bkey_drop_device(extent_i_to_s(insert).s,
					      m->data_opts.rewrite_dev);

		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
			if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) {
				/*
				 * raced with another move op? extent already
				 * has a pointer to the device we just wrote
				 * data to
				 */
				continue;
			}

			bch2_extent_ptr_decoded_append(insert, &p);
			did_work = true;
		}

		if (!did_work)
			goto nomatch;

		bch2_extent_narrow_crcs(insert,
				(struct bch_extent_crc_unpacked) { 0 });
		bch2_extent_normalize(c, extent_i_to_s(insert).s);
		bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
						 op->opts.background_target,
						 op->opts.data_replicas);

		/*
		 * If we're not fully overwriting @k, and it's compressed, we
		 * need a reservation for all the pointers in @insert
		 */
		nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
			m->nr_ptrs_reserved;

		if (insert->k.size < k.k->size &&
		    bch2_extent_is_compressed(k) &&
		    nr > 0) {
			/*
			 * can't call bch2_disk_reservation_add() with btree
			 * locks held, at least not without a song and dance
			 */
			bch2_btree_iter_unlock(&iter);

			ret = bch2_disk_reservation_add(c, &op->res,
					keylist_sectors(keys) * nr, 0);
			if (ret)
				goto out;

			m->nr_ptrs_reserved += nr;
			goto next;
		}

		ret = bch2_btree_insert_at(c, &op->res,
				op_journal_seq(op),
				BTREE_INSERT_ATOMIC|
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_USE_RESERVE|
				m->data_opts.btree_insert_flags,
				BTREE_INSERT_ENTRY(&iter, &insert->k_i));
		if (!ret)
			atomic_long_inc(&c->extent_migrate_done);
		if (ret == -EINTR)
			ret = 0;
		if (ret)
			break;
next:
		while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
			bch2_keylist_pop_front(keys);
			if (bch2_keylist_empty(keys))
				goto out;
		}

		bch2_cut_front(iter.pos, bch2_keylist_front(keys));
		continue;
nomatch:
		if (m->ctxt)
			atomic64_add(k.k->p.offset - iter.pos.offset,
				     &m->ctxt->stats->sectors_raced);
		atomic_long_inc(&c->extent_migrate_raced);
		trace_move_race(&new->k);
		bch2_btree_iter_next_slot(&iter);
		goto next;
	}
out:
	bch2_btree_iter_unlock(&iter);
	return ret;
}

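/*
 * Called when the read half of a move completes: copy the position, version,
 * device list and checksum/compression state from the read bio into the write
 * op, so the (still encoded) data can be written back out as-is - see
 * BCH_WRITE_DATA_ENCODED in bch2_migrate_write_init().
 */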
void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
{
	/* write bio must own pages: */
	BUG_ON(!m->op.wbio.bio.bi_vcnt);

	m->ptr = rbio->pick.ptr;
	m->offset = rbio->pos.offset - rbio->pick.crc.offset;
	m->op.devs_have = rbio->devs_have;
	m->op.pos = rbio->pos;
	m->op.version = rbio->version;
	m->op.crc = rbio->pick.crc;
	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;

	if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
		m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
		m->op.csum_type = m->op.crc.csum_type;
	}

	if (m->data_cmd == DATA_REWRITE)
		bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
}

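/*
 * Set up the write half of a data move: initialize the write op with the
 * inode's io options and the requested target/write point, and - depending on
 * data_cmd - take any disk reservation needed up front (full replicas for
 * DATA_ADD_REPLICAS, space for compressed data that may be rewritten on
 * DATA_REWRITE). Promotes are written as cached data and must not block on
 * allocation.
 */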
int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
			    struct write_point_specifier wp,
			    struct bch_io_opts io_opts,
			    enum data_cmd data_cmd,
			    struct data_opts data_opts,
			    struct bkey_s_c k)
{
	int ret;

	m->data_cmd = data_cmd;
	m->data_opts = data_opts;
	m->nr_ptrs_reserved = 0;

	bch2_write_op_init(&m->op, c, io_opts);
	m->op.compression_type =
		bch2_compression_opt_to_type[io_opts.background_compression ?:
					     io_opts.compression];
	m->op.target = data_opts.target;
	m->op.write_point = wp;

	if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
		m->op.alloc_reserve = RESERVE_MOVINGGC;

	m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
		BCH_WRITE_PAGES_STABLE|
		BCH_WRITE_PAGES_OWNED|
		BCH_WRITE_DATA_ENCODED;

	m->op.nr_replicas = 1;
	m->op.nr_replicas_required = 1;
	m->op.index_update_fn = bch2_migrate_index_update;

	switch (data_cmd) {
	case DATA_ADD_REPLICAS: {
		/*
		 * DATA_ADD_REPLICAS is used for moving data to a different
		 * device in the background, and due to compression the new copy
		 * might take up more space than the old copy:
		 */
#if 0
		int nr = (int) io_opts.data_replicas -
			bch2_bkey_nr_dirty_ptrs(k);
#endif
		int nr = (int) io_opts.data_replicas;

		if (nr > 0) {
			m->op.nr_replicas = m->nr_ptrs_reserved = nr;

			ret = bch2_disk_reservation_get(c, &m->op.res,
					k.k->size, m->op.nr_replicas, 0);
			if (ret)
				return ret;
		}
		break;
	}
	case DATA_REWRITE: {
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		unsigned compressed_sectors = 0;

		extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry)
			if (!p.ptr.cached &&
			    p.crc.compression_type != BCH_COMPRESSION_NONE &&
			    bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
				compressed_sectors += p.crc.compressed_size;

		if (compressed_sectors) {
			ret = bch2_disk_reservation_add(c, &m->op.res,
					compressed_sectors,
					BCH_DISK_RESERVATION_NOFAIL);
			if (ret)
				return ret;
		}
		break;
	}
	case DATA_PROMOTE:
		m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
		m->op.flags |= BCH_WRITE_CACHED;
		break;
	default:
		BUG();
	}

	return 0;
}

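/*
 * Write-side completion path: move_write() kicks off the write once the read
 * has finished (or frees everything if the read failed or hit a hole),
 * move_write_done() drops the in-flight write sectors, and move_free()
 * releases the disk reservation, the bio pages and the moving_io itself.
 */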
static void move_free(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, cl);
	struct moving_context *ctxt = io->write.ctxt;
	struct bvec_iter_all iter;
	struct bio_vec *bv;

	bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);

	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
		if (bv->bv_page)
			__free_page(bv->bv_page);

	wake_up(&ctxt->wait);

	kfree(io);
}

static void move_write_done(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, cl);

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	closure_return_with_destructor(cl, move_free);
}

static void move_write(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, cl);

	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		closure_return_with_destructor(cl, move_free);
		return;
	}

	bch2_migrate_read_done(&io->write, &io->rbio);

	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	closure_call(&io->write.op.cl, bch2_write, NULL, cl);
	continue_at(cl, move_write_done, NULL);
}

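/*
 * Reads and writes are pipelined: reads are queued on ctxt->reads in
 * submission order, move_read_endio() marks them completed, and
 * do_pending_writes() turns completed reads into writes in that same order.
 */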
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, list);

	return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	io->read_completed = true;

	if (next_pending_write(ctxt))
		wake_up(&ctxt->wait);

	closure_put(&ctxt->cl);
}

static void do_pending_writes(struct moving_context *ctxt)
{
	struct moving_io *io;

	while ((io = next_pending_write(ctxt))) {
		list_del(&io->list);
		closure_call(&io->cl, move_write, NULL, &ctxt->cl);
	}
}

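/*
 * Issue any pending writes, then wait for @_cond to become true, re-checking
 * whenever another read completes.
 */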
#define move_ctxt_wait_event(_ctxt, _cond)			\
do {								\
	do_pending_writes(_ctxt);				\
								\
	if (_cond)						\
		break;						\
	__wait_event((_ctxt)->wait,				\
		     next_pending_write(_ctxt) || (_cond));	\
} while (1)

static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

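/*
 * Read one extent and queue it for rewriting: throttle against the in-flight
 * read/write sector limits, allocate a moving_io with enough pages for the
 * uncompressed data (the write path may have to decompress), then issue a
 * BCH_READ_NODECODE read; move_read_endio() hands it to the write side.
 */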
static int bch2_move_extent(struct bch_fs *c,
			    struct moving_context *ctxt,
			    struct write_point_specifier wp,
			    struct bch_io_opts io_opts,
			    struct bkey_s_c_extent e,
			    enum data_cmd data_cmd,
			    struct data_opts data_opts)
{
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = e.k->size, pages;
	int ret = -ENOMEM;

	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->write_sectors) <
		SECTORS_IN_FLIGHT_PER_DEVICE);

	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->read_sectors) <
		SECTORS_IN_FLIGHT_PER_DEVICE);

	/* write path might have to decompress data: */
	extent_for_each_ptr_decode(e, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	io->write.ctxt = ctxt;
	io->read_sectors = e.k->size;
	io->write_sectors = e.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.opts = io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	io->rbio.bio.bi_opf = REQ_OP_READ;
	io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
	io->rbio.bio.bi_end_io = move_read_endio;

	ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
				      data_cmd, data_opts, e.s_c);
	if (ret)
		goto err_free_pages;

	atomic64_inc(&ctxt->stats->keys_moved);
	atomic64_add(e.k->size, &ctxt->stats->sectors_moved);

	trace_move_extent(e.k);

	atomic_add(io->read_sectors, &ctxt->read_sectors);
	list_add_tail(&io->list, &ctxt->reads);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(c, &io->rbio, e.s_c,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	trace_move_alloc_fail(e.k);
	return ret;
}

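/*
 * Main data move loop: walk extents from @start to @end, look up per-inode io
 * options as the inode changes, ask @pred what to do with each extent, and
 * queue the chosen moves via bch2_move_extent(), honouring the optional rate
 * limit and stopping cleanly if the calling kthread is told to stop.
 */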
int bch2_move_data(struct bch_fs *c,
		   struct bch_ratelimit *rate,
		   struct write_point_specifier wp,
		   struct bpos start,
		   struct bpos end,
		   move_pred_fn pred, void *arg,
		   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct moving_context ctxt = { .stats = stats };
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	BKEY_PADDED(k) tmp;
	struct bkey_s_c k;
	struct data_opts data_opts;
	enum data_cmd data_cmd;
	u64 delay, cur_inum = U64_MAX;
	int ret = 0, ret2;

	closure_init_stack(&ctxt.cl);
	INIT_LIST_HEAD(&ctxt.reads);
	init_waitqueue_head(&ctxt.wait);

	stats->data_type = BCH_DATA_USER;
	bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
			     BTREE_ITER_PREFETCH);

	if (rate)
		bch2_ratelimit_reset(rate);

	while (1) {
		do {
			delay = rate ? bch2_ratelimit_delay(rate) : 0;

			if (delay) {
				bch2_btree_iter_unlock(&stats->iter);
				set_current_state(TASK_INTERRUPTIBLE);
			}

			if (kthread && (ret = kthread_should_stop())) {
				__set_current_state(TASK_RUNNING);
				goto out;
			}

			if (delay)
				schedule_timeout(delay);

			if (unlikely(freezing(current))) {
				bch2_btree_iter_unlock(&stats->iter);
				move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
				try_to_freeze();
			}
		} while (delay);
peek:
		k = bch2_btree_iter_peek(&stats->iter);
		if (!k.k)
			break;
		ret = btree_iter_err(k);
		if (ret)
			break;
		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
			break;

		if (!bkey_extent_is_data(k.k))
			goto next_nondata;

		if (cur_inum != k.k->p.inode) {
			struct bch_inode_unpacked inode;

			/* don't hold btree locks while looking up inode: */
			bch2_btree_iter_unlock(&stats->iter);

			io_opts = bch2_opts_to_inode_opts(c->opts);
			if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
				bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
			cur_inum = k.k->p.inode;
			goto peek;
		}

		switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
		case DATA_SKIP:
			goto next;
		case DATA_SCRUB:
			BUG();
		case DATA_ADD_REPLICAS:
		case DATA_REWRITE:
		case DATA_PROMOTE:
			break;
		default:
			BUG();
		}

		/* unlock before doing IO: */
		bkey_reassemble(&tmp.k, k);
		k = bkey_i_to_s_c(&tmp.k);
		bch2_btree_iter_unlock(&stats->iter);

		ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
					bkey_s_c_to_extent(k),
					data_cmd, data_opts);
		if (ret2) {
			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(&ctxt);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}

		if (rate)
			bch2_ratelimit_increment(rate, k.k->size);
next:
		atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k),
			     &stats->sectors_seen);
next_nondata:
		bch2_btree_iter_next(&stats->iter);
		bch2_btree_iter_cond_resched(&stats->iter);
	}
out:
	bch2_btree_iter_unlock(&stats->iter);

	move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
	closure_sync(&ctxt.cl);

	EBUG_ON(atomic_read(&ctxt.write_sectors));

	trace_move_data(c,
			atomic64_read(&stats->sectors_moved),
			atomic64_read(&stats->keys_moved));

	return ret;
}

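/*
 * After moving data around, walk all extent keys (or all btree node keys) and
 * re-mark their replica sets between bch2_replicas_gc_start() and
 * bch2_replicas_gc_end().
 */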
static int bch2_gc_data_replicas(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
			   BTREE_ITER_PREFETCH, k) {
		ret = bch2_mark_bkey_replicas(c, k);
		if (ret)
			break;
	}
	ret = bch2_btree_iter_unlock(&iter) ?: ret;

	bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}

static int bch2_gc_btree_replicas(struct bch_fs *c)
{
	struct btree_iter iter;
	struct btree *b;
	unsigned id;
	int ret = 0;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);

	for (id = 0; id < BTREE_ID_NR; id++) {
		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
			ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));

			bch2_btree_iter_cond_resched(&iter);
		}

		ret = bch2_btree_iter_unlock(&iter) ?: ret;
	}

	bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}

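/*
 * Btree counterpart of bch2_move_data(): walk every node of every btree and
 * rewrite the nodes @pred selects via bch2_btree_node_rewrite().
 */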
static int bch2_move_btree(struct bch_fs *c,
			   move_pred_fn pred,
			   void *arg,
			   struct bch_move_stats *stats)
{
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree *b;
	unsigned id;
	struct data_opts data_opts;
	enum data_cmd cmd;
	int ret = 0;

	stats->data_type = BCH_DATA_BTREE;

	for (id = 0; id < BTREE_ID_NR; id++) {
		for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
			switch ((cmd = pred(c, arg,
					    bkey_i_to_s_c(&b->key),
					    &io_opts, &data_opts))) {
			case DATA_SKIP:
				goto next;
			case DATA_SCRUB:
				BUG();
			case DATA_ADD_REPLICAS:
			case DATA_REWRITE:
				break;
			default:
				BUG();
			}

			ret = bch2_btree_node_rewrite(c, &stats->iter,
					b->data->keys.seq, 0) ?: ret;
next:
			bch2_btree_iter_cond_resched(&stats->iter);
		}

		ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
	}

	return ret;
}

#if 0
static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
				struct bkey_s_c k,
				struct bch_io_opts *io_opts,
				struct data_opts *data_opts)
{
	return DATA_SCRUB;
}
#endif

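/*
 * Move predicates used by bch2_data_job(): rereplicate_pred() selects keys
 * whose durability is below the configured number of replicas, and
 * migrate_pred() selects keys with a pointer to the device being migrated
 * away from.
 */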
static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
				      struct bkey_s_c k,
				      struct bch_io_opts *io_opts,
				      struct data_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
		replicas = c->opts.metadata_replicas;
		break;
	case KEY_TYPE_extent:
		replicas = io_opts->data_replicas;
		break;
	}

	if (!nr_good || nr_good >= replicas)
		return DATA_SKIP;

	data_opts->target = 0;
	data_opts->btree_insert_flags = 0;
	return DATA_ADD_REPLICAS;
}

static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
				  struct bkey_s_c k,
				  struct bch_io_opts *io_opts,
				  struct data_opts *data_opts)
{
	struct bch_ioctl_data *op = arg;

	if (!bch2_bkey_has_device(k, op->migrate.dev))
		return DATA_SKIP;

	data_opts->target = 0;
	data_opts->btree_insert_flags = 0;
	data_opts->rewrite_dev = op->migrate.dev;
	return DATA_REWRITE;
}

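/*
 * Run a data job described by struct bch_ioctl_data: flush journal pins for
 * the affected device(s), rewrite btree nodes via bch2_move_btree(), move user
 * data via bch2_move_data() with the appropriate predicate, and garbage
 * collect the replicas tables afterwards.
 */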
int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	int ret = 0;

	switch (op.op) {
	case BCH_DATA_OP_REREPLICATE:
		stats->data_type = BCH_DATA_JOURNAL;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);

		ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;

		while (1) {
			closure_wait_event(&c->btree_interior_update_wait,
					   !bch2_btree_interior_updates_nr_pending(c) ||
					   c->btree_roots_dirty);
			if (!bch2_btree_interior_updates_nr_pending(c))
				break;
			bch2_journal_meta(&c->journal);
		}

		ret = bch2_gc_btree_replicas(c) ?: ret;

		ret = bch2_move_data(c, NULL,
				     writepoint_hashed((unsigned long) current),
				     op.start,
				     op.end,
				     rereplicate_pred, c, stats) ?: ret;
		ret = bch2_gc_data_replicas(c) ?: ret;
		break;
	case BCH_DATA_OP_MIGRATE:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		stats->data_type = BCH_DATA_JOURNAL;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);

		ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
		ret = bch2_gc_btree_replicas(c) ?: ret;

		ret = bch2_move_data(c, NULL,
				     writepoint_hashed((unsigned long) current),
				     op.start,
				     op.end,
				     migrate_pred, &op, stats) ?: ret;
		ret = bch2_gc_data_replicas(c) ?: ret;
		break;
	default:
		ret = -EINVAL;
	}

	return ret;
}