Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | |
4 | * Copyright (C) 2014 Datera Inc. | |
5 | */ | |
6 | ||
7 | #include "bcachefs.h" | |
7b3f84ea | 8 | #include "alloc_background.h" |
ef337c54 | 9 | #include "alloc_foreground.h" |
1c6fdbd8 | 10 | #include "bkey_methods.h" |
07a1006a | 11 | #include "bkey_buf.h" |
1c6fdbd8 KO |
12 | #include "btree_locking.h" |
13 | #include "btree_update_interior.h" | |
14 | #include "btree_io.h" | |
15 | #include "btree_gc.h" | |
16 | #include "buckets.h" | |
17 | #include "clock.h" | |
18 | #include "debug.h" | |
cd575ddf | 19 | #include "ec.h" |
1c6fdbd8 KO |
20 | #include "error.h" |
21 | #include "extents.h" | |
22 | #include "journal.h" | |
23 | #include "keylist.h" | |
24 | #include "move.h" | |
d0734356 | 25 | #include "recovery.h" |
1c6fdbd8 KO |
26 | #include "replicas.h" |
27 | #include "super-io.h" | |
28 | #include "trace.h" | |
29 | ||
30 | #include <linux/slab.h> | |
31 | #include <linux/bitops.h> | |
32 | #include <linux/freezer.h> | |
33 | #include <linux/kthread.h> | |
34 | #include <linux/preempt.h> | |
35 | #include <linux/rcupdate.h> | |
36 | #include <linux/sched/task.h> | |
37 | ||
2252aa27 KO |
38 | static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) |
39 | { | |
40 | preempt_disable(); | |
41 | write_seqcount_begin(&c->gc_pos_lock); | |
42 | c->gc_pos = new_pos; | |
43 | write_seqcount_end(&c->gc_pos_lock); | |
44 | preempt_enable(); | |
45 | } | |
46 | ||
47 | static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) | |
48 | { | |
49 | BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); | |
50 | __gc_pos_set(c, new_pos); | |
51 | } | |
52 | ||
d06c1a0c KO |
53 | static int bch2_gc_check_topology(struct bch_fs *c, |
54 | struct bkey_s_c k, | |
55 | struct bpos *expected_start, | |
56 | struct bpos expected_end, | |
57 | bool is_last) | |
1c6fdbd8 | 58 | { |
d06c1a0c | 59 | int ret = 0; |
1c6fdbd8 | 60 | |
d06c1a0c KO |
61 | if (k.k->type == KEY_TYPE_btree_ptr_v2) { |
62 | struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); | |
1c6fdbd8 | 63 | |
d06c1a0c KO |
64 | if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, |
65 | "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", | |
66 | bp.v->min_key.inode, | |
67 | bp.v->min_key.offset, | |
68 | expected_start->inode, | |
69 | expected_start->offset)) { | |
70 | BUG(); | |
71 | } | |
72 | } | |
1c6fdbd8 | 73 | |
d06c1a0c KO |
74 | *expected_start = bkey_cmp(k.k->p, POS_MAX) |
75 | ? bkey_successor(k.k->p) | |
76 | : k.k->p; | |
77 | ||
78 | if (fsck_err_on(is_last && | |
79 | bkey_cmp(k.k->p, expected_end), c, | |
80 | "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", | |
81 | k.k->p.inode, | |
82 | k.k->p.offset, | |
83 | expected_end.inode, | |
84 | expected_end.offset)) { | |
85 | BUG(); | |
1c6fdbd8 | 86 | } |
d06c1a0c KO |
87 | fsck_err: |
88 | return ret; | |
1c6fdbd8 KO |
89 | } |
90 | ||
2252aa27 KO |
91 | /* marking of btree keys/nodes: */ |
92 | ||
26609b61 | 93 | static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, |
d034c09b | 94 | u8 *max_stale, bool initial) |
2252aa27 | 95 | { |
26609b61 KO |
96 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); |
97 | const struct bch_extent_ptr *ptr; | |
47799326 | 98 | unsigned flags = |
2d594dfb KO |
99 | BTREE_TRIGGER_GC| |
100 | (initial ? BTREE_TRIGGER_NOATOMIC : 0); | |
2252aa27 KO |
101 | int ret = 0; |
102 | ||
91f8b567 | 103 | if (initial) { |
29364f34 | 104 | BUG_ON(bch2_journal_seq_verify && |
91f8b567 KO |
105 | k.k->version.lo > journal_cur_seq(&c->journal)); |
106 | ||
a9bc0a51 KO |
107 | /* XXX change to fsck check */ |
108 | if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, | |
109 | "key version number higher than recorded: %llu > %llu", | |
110 | k.k->version.lo, | |
111 | atomic64_read(&c->key_version))) | |
91f8b567 KO |
112 | atomic64_set(&c->key_version, k.k->version.lo); |
113 | ||
114 | if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || | |
988e98cf | 115 | fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, |
91f8b567 | 116 | "superblock not marked as containing replicas (type %u)", |
26609b61 KO |
117 | k.k->type)) { |
118 | ret = bch2_mark_bkey_replicas(c, k); | |
91f8b567 | 119 | if (ret) |
2252aa27 KO |
120 | return ret; |
121 | } | |
47799326 | 122 | |
26609b61 KO |
123 | bkey_for_each_ptr(ptrs, ptr) { |
124 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); | |
39fbc5a4 | 125 | struct bucket *g = PTR_BUCKET(ca, ptr, true); |
28062d32 | 126 | struct bucket *g2 = PTR_BUCKET(ca, ptr, false); |
26609b61 | 127 | |
90541a74 | 128 | if (mustfix_fsck_err_on(!g->gen_valid, c, |
06ab329c KO |
129 | "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", |
130 | ptr->dev, PTR_BUCKET_NR(ca, ptr), | |
131 | bch2_data_types[ptr_data_type(k.k, ptr)], | |
132 | ptr->gen)) { | |
28062d32 | 133 | g2->_mark.gen = g->_mark.gen = ptr->gen; |
28062d32 | 134 | g2->gen_valid = g->gen_valid = true; |
4291a331 | 135 | set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); |
26609b61 KO |
136 | } |
137 | ||
138 | if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, | |
06ab329c KO |
139 | "bucket %u:%zu data type %s ptr gen in the future: %u > %u", |
140 | ptr->dev, PTR_BUCKET_NR(ca, ptr), | |
141 | bch2_data_types[ptr_data_type(k.k, ptr)], | |
142 | ptr->gen, g->mark.gen)) { | |
28062d32 | 143 | g2->_mark.gen = g->_mark.gen = ptr->gen; |
28062d32 | 144 | g2->gen_valid = g->gen_valid = true; |
06ab329c KO |
145 | g2->_mark.data_type = 0; |
146 | g2->_mark.dirty_sectors = 0; | |
147 | g2->_mark.cached_sectors = 0; | |
26609b61 | 148 | set_bit(BCH_FS_FIXED_GENS, &c->flags); |
4291a331 | 149 | set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); |
26609b61 KO |
150 | } |
151 | } | |
2252aa27 KO |
152 | } |
153 | ||
26609b61 KO |
154 | bkey_for_each_ptr(ptrs, ptr) { |
155 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); | |
39fbc5a4 | 156 | struct bucket *g = PTR_BUCKET(ca, ptr, true); |
26609b61 | 157 | |
76f4c7b0 KO |
158 | if (gen_after(g->oldest_gen, ptr->gen)) |
159 | g->oldest_gen = ptr->gen; | |
26609b61 KO |
160 | |
161 | *max_stale = max(*max_stale, ptr_stale(ca, ptr)); | |
162 | } | |
91f8b567 | 163 | |
2cbe5cfe | 164 | bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); |
91f8b567 | 165 | fsck_err: |
2252aa27 KO |
166 | return ret; |
167 | } | |
168 | ||
e3e464ac | 169 | static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, |
e62d65f2 | 170 | bool initial) |
1c6fdbd8 | 171 | { |
d06c1a0c | 172 | struct bpos next_node_start = b->data->min_key; |
1c6fdbd8 KO |
173 | struct btree_node_iter iter; |
174 | struct bkey unpacked; | |
175 | struct bkey_s_c k; | |
d034c09b KO |
176 | int ret = 0; |
177 | ||
178 | *max_stale = 0; | |
1c6fdbd8 | 179 | |
26609b61 | 180 | if (!btree_node_type_needs_gc(btree_node_type(b))) |
2252aa27 | 181 | return 0; |
1c6fdbd8 | 182 | |
d06c1a0c KO |
183 | bch2_btree_node_iter_init_from_start(&iter, b); |
184 | ||
185 | while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { | |
2252aa27 | 186 | bch2_bkey_debugcheck(c, b, k); |
1c6fdbd8 | 187 | |
26609b61 | 188 | ret = bch2_gc_mark_key(c, k, max_stale, initial); |
d034c09b KO |
189 | if (ret) |
190 | break; | |
d06c1a0c KO |
191 | |
192 | bch2_btree_node_iter_advance(&iter, b); | |
193 | ||
194 | if (b->c.level) { | |
195 | ret = bch2_gc_check_topology(c, k, | |
196 | &next_node_start, | |
197 | b->data->max_key, | |
198 | bch2_btree_node_iter_end(&iter)); | |
199 | if (ret) | |
200 | break; | |
201 | } | |
2252aa27 KO |
202 | } |
203 | ||
d034c09b | 204 | return ret; |
1c6fdbd8 KO |
205 | } |
206 | ||
2252aa27 | 207 | static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, |
a1d58243 | 208 | bool initial, bool metadata_only) |
1c6fdbd8 | 209 | { |
424eb881 KO |
210 | struct btree_trans trans; |
211 | struct btree_iter *iter; | |
1c6fdbd8 | 212 | struct btree *b; |
4881fdb7 | 213 | unsigned depth = metadata_only ? 1 |
29364f34 | 214 | : bch2_expensive_debug_checks ? 0 |
4881fdb7 KO |
215 | : !btree_node_type_needs_gc(btree_id) ? 1 |
216 | : 0; | |
f7c0fcdd | 217 | u8 max_stale = 0; |
1c6fdbd8 KO |
218 | int ret = 0; |
219 | ||
20bceecb | 220 | bch2_trans_init(&trans, c, 0, 0); |
424eb881 | 221 | |
1c6fdbd8 KO |
222 | gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); |
223 | ||
424eb881 | 224 | __for_each_btree_node(&trans, iter, btree_id, POS_MIN, |
1c6fdbd8 | 225 | 0, depth, BTREE_ITER_PREFETCH, b) { |
1c6fdbd8 KO |
226 | bch2_verify_btree_nr_keys(b); |
227 | ||
8777210b KO |
228 | gc_pos_set(c, gc_pos_btree_node(b)); |
229 | ||
e62d65f2 | 230 | ret = btree_gc_mark_node(c, b, &max_stale, initial); |
d034c09b KO |
231 | if (ret) |
232 | break; | |
1c6fdbd8 | 233 | |
2252aa27 KO |
234 | if (!initial) { |
235 | if (max_stale > 64) | |
424eb881 | 236 | bch2_btree_node_rewrite(c, iter, |
2252aa27 | 237 | b->data->keys.seq, |
2252aa27 KO |
238 | BTREE_INSERT_NOWAIT| |
239 | BTREE_INSERT_GC_LOCK_HELD); | |
29364f34 KO |
240 | else if (!bch2_btree_gc_rewrite_disabled && |
241 | (bch2_btree_gc_always_rewrite || max_stale > 16)) | |
424eb881 | 242 | bch2_btree_node_rewrite(c, iter, |
2252aa27 KO |
243 | b->data->keys.seq, |
244 | BTREE_INSERT_NOWAIT| | |
245 | BTREE_INSERT_GC_LOCK_HELD); | |
246 | } | |
1c6fdbd8 | 247 | |
424eb881 | 248 | bch2_trans_cond_resched(&trans); |
1c6fdbd8 | 249 | } |
424eb881 | 250 | ret = bch2_trans_exit(&trans) ?: ret; |
1c6fdbd8 KO |
251 | if (ret) |
252 | return ret; | |
253 | ||
254 | mutex_lock(&c->btree_root_lock); | |
1c6fdbd8 KO |
255 | b = c->btree_roots[btree_id].b; |
256 | if (!btree_node_fake(b)) | |
8b2b9d11 KO |
257 | ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), |
258 | &max_stale, initial); | |
c43a6ef9 | 259 | gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); |
1c6fdbd8 | 260 | mutex_unlock(&c->btree_root_lock); |
8b2b9d11 KO |
261 | |
262 | return ret; | |
1c6fdbd8 KO |
263 | } |
264 | ||
e62d65f2 | 265 | static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, |
d06c1a0c KO |
266 | struct journal_keys *journal_keys, |
267 | unsigned target_depth) | |
e62d65f2 KO |
268 | { |
269 | struct btree_and_journal_iter iter; | |
270 | struct bkey_s_c k; | |
d06c1a0c | 271 | struct bpos next_node_start = b->data->min_key; |
07a1006a | 272 | struct bkey_buf tmp; |
e62d65f2 KO |
273 | u8 max_stale = 0; |
274 | int ret = 0; | |
275 | ||
276 | bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); | |
07a1006a | 277 | bch2_bkey_buf_init(&tmp); |
e62d65f2 KO |
278 | |
279 | while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { | |
280 | bch2_bkey_debugcheck(c, b, k); | |
281 | ||
d06c1a0c KO |
282 | BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); |
283 | BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); | |
284 | ||
e62d65f2 KO |
285 | ret = bch2_gc_mark_key(c, k, &max_stale, true); |
286 | if (ret) | |
287 | break; | |
288 | ||
d06c1a0c | 289 | if (b->c.level) { |
e62d65f2 | 290 | struct btree *child; |
e62d65f2 | 291 | |
07a1006a KO |
292 | bch2_bkey_buf_reassemble(&tmp, c, k); |
293 | k = bkey_i_to_s_c(tmp.k); | |
d06c1a0c KO |
294 | |
295 | bch2_btree_and_journal_iter_advance(&iter); | |
e62d65f2 | 296 | |
d06c1a0c KO |
297 | ret = bch2_gc_check_topology(c, k, |
298 | &next_node_start, | |
299 | b->data->max_key, | |
300 | !bch2_btree_and_journal_iter_peek(&iter).k); | |
e62d65f2 KO |
301 | if (ret) |
302 | break; | |
303 | ||
d06c1a0c | 304 | if (b->c.level > target_depth) { |
07a1006a | 305 | child = bch2_btree_node_get_noiter(c, tmp.k, |
d06c1a0c KO |
306 | b->c.btree_id, b->c.level - 1); |
307 | ret = PTR_ERR_OR_ZERO(child); | |
308 | if (ret) | |
309 | break; | |
e62d65f2 | 310 | |
d06c1a0c KO |
311 | ret = bch2_gc_btree_init_recurse(c, child, |
312 | journal_keys, target_depth); | |
313 | six_unlock_read(&child->c.lock); | |
314 | ||
315 | if (ret) | |
316 | break; | |
317 | } | |
318 | } else { | |
319 | bch2_btree_and_journal_iter_advance(&iter); | |
320 | } | |
e62d65f2 KO |
321 | } |
322 | ||
07a1006a | 323 | bch2_bkey_buf_exit(&tmp, c); |
e62d65f2 KO |
324 | return ret; |
325 | } | |
326 | ||
327 | static int bch2_gc_btree_init(struct bch_fs *c, | |
328 | struct journal_keys *journal_keys, | |
329 | enum btree_id btree_id, | |
330 | bool metadata_only) | |
331 | { | |
332 | struct btree *b; | |
333 | unsigned target_depth = metadata_only ? 1 | |
29364f34 | 334 | : bch2_expensive_debug_checks ? 0 |
e62d65f2 KO |
335 | : !btree_node_type_needs_gc(btree_id) ? 1 |
336 | : 0; | |
337 | u8 max_stale = 0; | |
338 | int ret = 0; | |
339 | ||
340 | b = c->btree_roots[btree_id].b; | |
341 | ||
342 | if (btree_node_fake(b)) | |
343 | return 0; | |
344 | ||
345 | six_lock_read(&b->c.lock, NULL, NULL); | |
d06c1a0c KO |
346 | if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, |
347 | "btree root with incorrect min_key: %llu:%llu", | |
348 | b->data->min_key.inode, | |
349 | b->data->min_key.offset)) { | |
350 | BUG(); | |
351 | } | |
352 | ||
353 | if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, | |
354 | "btree root with incorrect min_key: %llu:%llu", | |
355 | b->data->max_key.inode, | |
356 | b->data->max_key.offset)) { | |
357 | BUG(); | |
358 | } | |
359 | ||
e62d65f2 KO |
360 | if (b->c.level >= target_depth) |
361 | ret = bch2_gc_btree_init_recurse(c, b, | |
362 | journal_keys, target_depth); | |
363 | ||
364 | if (!ret) | |
365 | ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), | |
366 | &max_stale, true); | |
d06c1a0c | 367 | fsck_err: |
e62d65f2 KO |
368 | six_unlock_read(&b->c.lock); |
369 | ||
370 | return ret; | |
371 | } | |
372 | ||
cd575ddf KO |
373 | static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) |
374 | { | |
375 | return (int) btree_id_to_gc_phase(l) - | |
376 | (int) btree_id_to_gc_phase(r); | |
377 | } | |
378 | ||
d0734356 | 379 | static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, |
a1d58243 | 380 | bool initial, bool metadata_only) |
2252aa27 | 381 | { |
cd575ddf | 382 | enum btree_id ids[BTREE_ID_NR]; |
2252aa27 KO |
383 | unsigned i; |
384 | ||
cd575ddf KO |
385 | for (i = 0; i < BTREE_ID_NR; i++) |
386 | ids[i] = i; | |
387 | bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); | |
388 | ||
2252aa27 | 389 | for (i = 0; i < BTREE_ID_NR; i++) { |
cd575ddf | 390 | enum btree_id id = ids[i]; |
e62d65f2 KO |
391 | int ret = initial |
392 | ? bch2_gc_btree_init(c, journal_keys, | |
393 | id, metadata_only) | |
394 | : bch2_gc_btree(c, id, initial, metadata_only); | |
2252aa27 KO |
395 | if (ret) |
396 | return ret; | |
2252aa27 KO |
397 | } |
398 | ||
399 | return 0; | |
400 | } | |
401 | ||
1c6fdbd8 KO |
402 | static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, |
403 | u64 start, u64 end, | |
404 | enum bch_data_type type, | |
405 | unsigned flags) | |
406 | { | |
407 | u64 b = sector_to_bucket(ca, start); | |
408 | ||
409 | do { | |
410 | unsigned sectors = | |
411 | min_t(u64, bucket_to_sector(ca, b + 1), end) - start; | |
412 | ||
413 | bch2_mark_metadata_bucket(c, ca, b, type, sectors, | |
414 | gc_phase(GC_PHASE_SB), flags); | |
415 | b++; | |
416 | start += sectors; | |
417 | } while (start < end); | |
418 | } | |
419 | ||
420 | void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, | |
421 | unsigned flags) | |
422 | { | |
423 | struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; | |
424 | unsigned i; | |
425 | u64 b; | |
426 | ||
97446a24 KO |
427 | /* |
428 | * This conditional is kind of gross, but we may be called from the | |
429 | * device add path, before the new device has actually been added to the | |
430 | * running filesystem: | |
431 | */ | |
1c6fdbd8 KO |
432 | if (c) { |
433 | lockdep_assert_held(&c->sb_lock); | |
9166b41d | 434 | percpu_down_read(&c->mark_lock); |
1c6fdbd8 KO |
435 | } |
436 | ||
437 | for (i = 0; i < layout->nr_superblocks; i++) { | |
438 | u64 offset = le64_to_cpu(layout->sb_offset[i]); | |
439 | ||
440 | if (offset == BCH_SB_SECTOR) | |
441 | mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, | |
89fd25be | 442 | BCH_DATA_sb, flags); |
1c6fdbd8 KO |
443 | |
444 | mark_metadata_sectors(c, ca, offset, | |
445 | offset + (1 << layout->sb_max_size_bits), | |
89fd25be | 446 | BCH_DATA_sb, flags); |
1c6fdbd8 KO |
447 | } |
448 | ||
1c6fdbd8 KO |
449 | for (i = 0; i < ca->journal.nr; i++) { |
450 | b = ca->journal.buckets[i]; | |
89fd25be | 451 | bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, |
1c6fdbd8 KO |
452 | ca->mi.bucket_size, |
453 | gc_phase(GC_PHASE_SB), flags); | |
454 | } | |
455 | ||
3a0e06db | 456 | if (c) |
9166b41d | 457 | percpu_up_read(&c->mark_lock); |
1c6fdbd8 KO |
458 | } |
459 | ||
460 | static void bch2_mark_superblocks(struct bch_fs *c) | |
461 | { | |
462 | struct bch_dev *ca; | |
463 | unsigned i; | |
464 | ||
465 | mutex_lock(&c->sb_lock); | |
466 | gc_pos_set(c, gc_phase(GC_PHASE_SB)); | |
467 | ||
468 | for_each_online_member(ca, c, i) | |
2d594dfb | 469 | bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); |
1c6fdbd8 KO |
470 | mutex_unlock(&c->sb_lock); |
471 | } | |
472 | ||
00b8ccf7 | 473 | #if 0 |
1c6fdbd8 KO |
474 | /* Also see bch2_pending_btree_node_free_insert_done() */ |
475 | static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) | |
476 | { | |
1c6fdbd8 KO |
477 | struct btree_update *as; |
478 | struct pending_btree_node_free *d; | |
479 | ||
480 | mutex_lock(&c->btree_interior_update_lock); | |
481 | gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); | |
482 | ||
483 | for_each_pending_btree_node_free(c, as, d) | |
484 | if (d->index_update_done) | |
2cbe5cfe KO |
485 | bch2_mark_key(c, bkey_i_to_s_c(&d->key), |
486 | 0, 0, NULL, 0, | |
2d594dfb | 487 | BTREE_TRIGGER_GC); |
1c6fdbd8 KO |
488 | |
489 | mutex_unlock(&c->btree_interior_update_lock); | |
490 | } | |
00b8ccf7 | 491 | #endif |
1c6fdbd8 KO |
492 | |
493 | static void bch2_mark_allocator_buckets(struct bch_fs *c) | |
494 | { | |
495 | struct bch_dev *ca; | |
496 | struct open_bucket *ob; | |
497 | size_t i, j, iter; | |
498 | unsigned ci; | |
499 | ||
9166b41d | 500 | percpu_down_read(&c->mark_lock); |
1c6fdbd8 KO |
501 | |
502 | spin_lock(&c->freelist_lock); | |
503 | gc_pos_set(c, gc_pos_alloc(c, NULL)); | |
504 | ||
505 | for_each_member_device(ca, c, ci) { | |
506 | fifo_for_each_entry(i, &ca->free_inc, iter) | |
507 | bch2_mark_alloc_bucket(c, ca, i, true, | |
508 | gc_pos_alloc(c, NULL), | |
2d594dfb | 509 | BTREE_TRIGGER_GC); |
1c6fdbd8 KO |
510 | |
511 | ||
512 | ||
513 | for (j = 0; j < RESERVE_NR; j++) | |
514 | fifo_for_each_entry(i, &ca->free[j], iter) | |
515 | bch2_mark_alloc_bucket(c, ca, i, true, | |
516 | gc_pos_alloc(c, NULL), | |
2d594dfb | 517 | BTREE_TRIGGER_GC); |
1c6fdbd8 KO |
518 | } |
519 | ||
520 | spin_unlock(&c->freelist_lock); | |
521 | ||
522 | for (ob = c->open_buckets; | |
523 | ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); | |
524 | ob++) { | |
525 | spin_lock(&ob->lock); | |
526 | if (ob->valid) { | |
527 | gc_pos_set(c, gc_pos_alloc(c, ob)); | |
528 | ca = bch_dev_bkey_exists(c, ob->ptr.dev); | |
529 | bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, | |
530 | gc_pos_alloc(c, ob), | |
2d594dfb | 531 | BTREE_TRIGGER_GC); |
1c6fdbd8 KO |
532 | } |
533 | spin_unlock(&ob->lock); | |
534 | } | |
535 | ||
9166b41d | 536 | percpu_up_read(&c->mark_lock); |
1c6fdbd8 KO |
537 | } |
538 | ||
9ca53b55 KO |
539 | static void bch2_gc_free(struct bch_fs *c) |
540 | { | |
541 | struct bch_dev *ca; | |
542 | unsigned i; | |
543 | ||
dfe9bfb3 KO |
544 | genradix_free(&c->stripes[1]); |
545 | ||
9ca53b55 KO |
546 | for_each_member_device(ca, c, i) { |
547 | kvpfree(rcu_dereference_protected(ca->buckets[1], 1), | |
548 | sizeof(struct bucket_array) + | |
549 | ca->mi.nbuckets * sizeof(struct bucket)); | |
550 | ca->buckets[1] = NULL; | |
551 | ||
552 | free_percpu(ca->usage[1]); | |
553 | ca->usage[1] = NULL; | |
554 | } | |
555 | ||
5e82a9a1 KO |
556 | free_percpu(c->usage_gc); |
557 | c->usage_gc = NULL; | |
06b7345c KO |
558 | } |
559 | ||
a1d58243 KO |
560 | static int bch2_gc_done(struct bch_fs *c, |
561 | bool initial, bool metadata_only) | |
9ca53b55 KO |
562 | { |
563 | struct bch_dev *ca; | |
a1d58243 KO |
564 | bool verify = !metadata_only && |
565 | (!initial || | |
566 | (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); | |
9ca53b55 | 567 | unsigned i; |
cccf4e6d | 568 | int ret = 0; |
9ca53b55 KO |
569 | |
570 | #define copy_field(_f, _msg, ...) \ | |
23f80d2b | 571 | if (dst->_f != src->_f) { \ |
76f4c7b0 | 572 | if (verify) \ |
cccf4e6d | 573 | fsck_err(c, _msg ": got %llu, should be %llu" \ |
76f4c7b0 | 574 | , ##__VA_ARGS__, dst->_f, src->_f); \ |
23f80d2b | 575 | dst->_f = src->_f; \ |
4291a331 | 576 | set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ |
9ca53b55 | 577 | } |
dfe9bfb3 KO |
578 | #define copy_stripe_field(_f, _msg, ...) \ |
579 | if (dst->_f != src->_f) { \ | |
76f4c7b0 | 580 | if (verify) \ |
cccf4e6d KO |
581 | fsck_err(c, "stripe %zu has wrong "_msg \ |
582 | ": got %u, should be %u", \ | |
76f4c7b0 KO |
583 | dst_iter.pos, ##__VA_ARGS__, \ |
584 | dst->_f, src->_f); \ | |
dfe9bfb3 | 585 | dst->_f = src->_f; \ |
61c8d7c8 | 586 | dst->dirty = true; \ |
4291a331 | 587 | set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ |
dfe9bfb3 | 588 | } |
9ca53b55 KO |
589 | #define copy_bucket_field(_f) \ |
590 | if (dst->b[b].mark._f != src->b[b].mark._f) { \ | |
76f4c7b0 | 591 | if (verify) \ |
aafcf9bc | 592 | fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ |
cccf4e6d | 593 | ": got %u, should be %u", i, b, \ |
aafcf9bc KO |
594 | dst->b[b].mark.gen, \ |
595 | bch2_data_types[dst->b[b].mark.data_type],\ | |
76f4c7b0 | 596 | dst->b[b].mark._f, src->b[b].mark._f); \ |
9ca53b55 | 597 | dst->b[b]._mark._f = src->b[b].mark._f; \ |
4291a331 | 598 | set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ |
9ca53b55 KO |
599 | } |
600 | #define copy_dev_field(_f, _msg, ...) \ | |
601 | copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) | |
602 | #define copy_fs_field(_f, _msg, ...) \ | |
603 | copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) | |
604 | ||
a1d58243 | 605 | if (!metadata_only) { |
dfe9bfb3 KO |
606 | struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); |
607 | struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); | |
608 | struct stripe *dst, *src; | |
dfe9bfb3 KO |
609 | |
610 | c->ec_stripes_heap.used = 0; | |
611 | ||
612 | while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && | |
613 | (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { | |
61c8d7c8 KO |
614 | BUG_ON(src_iter.pos != dst_iter.pos); |
615 | ||
dfe9bfb3 KO |
616 | copy_stripe_field(alive, "alive"); |
617 | copy_stripe_field(sectors, "sectors"); | |
618 | copy_stripe_field(algorithm, "algorithm"); | |
619 | copy_stripe_field(nr_blocks, "nr_blocks"); | |
620 | copy_stripe_field(nr_redundant, "nr_redundant"); | |
61c8d7c8 | 621 | copy_stripe_field(blocks_nonempty, |
dfe9bfb3 KO |
622 | "blocks_nonempty"); |
623 | ||
624 | for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) | |
61c8d7c8 | 625 | copy_stripe_field(block_sectors[i], |
dfe9bfb3 KO |
626 | "block_sectors[%u]", i); |
627 | ||
ba6dd1dd KO |
628 | if (dst->alive) { |
629 | spin_lock(&c->ec_stripes_heap_lock); | |
dfe9bfb3 | 630 | bch2_stripes_heap_insert(c, dst, dst_iter.pos); |
ba6dd1dd KO |
631 | spin_unlock(&c->ec_stripes_heap_lock); |
632 | } | |
dfe9bfb3 KO |
633 | |
634 | genradix_iter_advance(&dst_iter, &c->stripes[0]); | |
635 | genradix_iter_advance(&src_iter, &c->stripes[1]); | |
636 | } | |
637 | } | |
638 | ||
9ca53b55 KO |
639 | for_each_member_device(ca, c, i) { |
640 | struct bucket_array *dst = __bucket_array(ca, 0); | |
641 | struct bucket_array *src = __bucket_array(ca, 1); | |
642 | size_t b; | |
643 | ||
9ca53b55 KO |
644 | for (b = 0; b < src->nbuckets; b++) { |
645 | copy_bucket_field(gen); | |
646 | copy_bucket_field(data_type); | |
647 | copy_bucket_field(owned_by_allocator); | |
648 | copy_bucket_field(stripe); | |
649 | copy_bucket_field(dirty_sectors); | |
650 | copy_bucket_field(cached_sectors); | |
76f4c7b0 | 651 | |
6671a708 | 652 | dst->b[b].oldest_gen = src->b[b].oldest_gen; |
9ca53b55 KO |
653 | } |
654 | }; | |
655 | ||
5e82a9a1 KO |
656 | for (i = 0; i < ARRAY_SIZE(c->usage); i++) |
657 | bch2_fs_usage_acc_to_base(c, i); | |
658 | ||
a1d58243 | 659 | bch2_dev_usage_from_buckets(c); |
9ca53b55 KO |
660 | |
661 | { | |
ecf37a4a | 662 | unsigned nr = fs_usage_u64s(c); |
5e82a9a1 | 663 | struct bch_fs_usage *dst = c->usage_base; |
23f80d2b | 664 | struct bch_fs_usage *src = (void *) |
5e82a9a1 | 665 | bch2_acc_percpu_u64s((void *) c->usage_gc, nr); |
9ca53b55 | 666 | |
768ac639 | 667 | copy_fs_field(hidden, "hidden"); |
a1d58243 | 668 | copy_fs_field(btree, "btree"); |
06b7345c | 669 | |
a1d58243 KO |
670 | if (!metadata_only) { |
671 | copy_fs_field(data, "data"); | |
672 | copy_fs_field(cached, "cached"); | |
673 | copy_fs_field(reserved, "reserved"); | |
674 | copy_fs_field(nr_inodes,"nr_inodes"); | |
675 | ||
676 | for (i = 0; i < BCH_REPLICAS_MAX; i++) | |
677 | copy_fs_field(persistent_reserved[i], | |
678 | "persistent_reserved[%i]", i); | |
679 | } | |
9ca53b55 | 680 | |
7ef2a73a | 681 | for (i = 0; i < c->replicas.nr; i++) { |
8777210b KO |
682 | struct bch_replicas_entry *e = |
683 | cpu_replicas_entry(&c->replicas, i); | |
684 | char buf[80]; | |
685 | ||
a1d58243 | 686 | if (metadata_only && |
89fd25be KO |
687 | (e->data_type == BCH_DATA_user || |
688 | e->data_type == BCH_DATA_cached)) | |
a1d58243 KO |
689 | continue; |
690 | ||
8777210b KO |
691 | bch2_replicas_entry_to_text(&PBUF(buf), e); |
692 | ||
768ac639 | 693 | copy_fs_field(replicas[i], "%s", buf); |
7ef2a73a | 694 | } |
9ca53b55 | 695 | } |
76f4c7b0 | 696 | |
9ca53b55 KO |
697 | #undef copy_fs_field |
698 | #undef copy_dev_field | |
699 | #undef copy_bucket_field | |
dfe9bfb3 KO |
700 | #undef copy_stripe_field |
701 | #undef copy_field | |
cccf4e6d KO |
702 | fsck_err: |
703 | return ret; | |
9ca53b55 KO |
704 | } |
705 | ||
a1d58243 KO |
706 | static int bch2_gc_start(struct bch_fs *c, |
707 | bool metadata_only) | |
9ca53b55 KO |
708 | { |
709 | struct bch_dev *ca; | |
710 | unsigned i; | |
0741d378 | 711 | int ret; |
dfe9bfb3 | 712 | |
5e82a9a1 | 713 | BUG_ON(c->usage_gc); |
9ca53b55 | 714 | |
5e82a9a1 | 715 | c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), |
ecf37a4a | 716 | sizeof(u64), GFP_KERNEL); |
1e1a31c4 KO |
717 | if (!c->usage_gc) { |
718 | bch_err(c, "error allocating c->usage_gc"); | |
9ca53b55 | 719 | return -ENOMEM; |
1e1a31c4 | 720 | } |
9ca53b55 | 721 | |
1c6fdbd8 | 722 | for_each_member_device(ca, c, i) { |
9ca53b55 KO |
723 | BUG_ON(ca->buckets[1]); |
724 | BUG_ON(ca->usage[1]); | |
725 | ||
726 | ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + | |
727 | ca->mi.nbuckets * sizeof(struct bucket), | |
728 | GFP_KERNEL|__GFP_ZERO); | |
729 | if (!ca->buckets[1]) { | |
730 | percpu_ref_put(&ca->ref); | |
1e1a31c4 | 731 | bch_err(c, "error allocating ca->buckets[gc]"); |
9ca53b55 KO |
732 | return -ENOMEM; |
733 | } | |
734 | ||
735 | ca->usage[1] = alloc_percpu(struct bch_dev_usage); | |
736 | if (!ca->usage[1]) { | |
1e1a31c4 | 737 | bch_err(c, "error allocating ca->usage[gc]"); |
9ca53b55 KO |
738 | percpu_ref_put(&ca->ref); |
739 | return -ENOMEM; | |
1c6fdbd8 | 740 | } |
1c6fdbd8 | 741 | } |
9ca53b55 | 742 | |
0741d378 | 743 | ret = bch2_ec_mem_alloc(c, true); |
1e1a31c4 KO |
744 | if (ret) { |
745 | bch_err(c, "error allocating ec gc mem"); | |
0741d378 | 746 | return ret; |
1e1a31c4 | 747 | } |
0741d378 KO |
748 | |
749 | percpu_down_write(&c->mark_lock); | |
750 | ||
751 | /* | |
752 | * indicate to stripe code that we need to allocate for the gc stripes | |
753 | * radix tree, too | |
754 | */ | |
755 | gc_pos_set(c, gc_phase(GC_PHASE_START)); | |
756 | ||
9ca53b55 KO |
757 | for_each_member_device(ca, c, i) { |
758 | struct bucket_array *dst = __bucket_array(ca, 1); | |
759 | struct bucket_array *src = __bucket_array(ca, 0); | |
760 | size_t b; | |
761 | ||
762 | dst->first_bucket = src->first_bucket; | |
763 | dst->nbuckets = src->nbuckets; | |
764 | ||
28062d32 | 765 | for (b = 0; b < src->nbuckets; b++) { |
a1d58243 KO |
766 | struct bucket *d = &dst->b[b]; |
767 | struct bucket *s = &src->b[b]; | |
768 | ||
769 | d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; | |
770 | d->gen_valid = s->gen_valid; | |
771 | ||
772 | if (metadata_only && | |
89fd25be KO |
773 | (s->mark.data_type == BCH_DATA_user || |
774 | s->mark.data_type == BCH_DATA_cached)) { | |
a1d58243 KO |
775 | d->_mark = s->mark; |
776 | d->_mark.owned_by_allocator = 0; | |
777 | } | |
28062d32 | 778 | } |
9ca53b55 KO |
779 | }; |
780 | ||
0741d378 KO |
781 | percpu_up_write(&c->mark_lock); |
782 | ||
783 | return 0; | |
1c6fdbd8 KO |
784 | } |
785 | ||
786 | /** | |
9ca53b55 KO |
787 | * bch2_gc - walk _all_ references to buckets, and recompute them: |
788 | * | |
789 | * Order matters here: | |
790 | * - Concurrent GC relies on the fact that we have a total ordering for | |
791 | * everything that GC walks - see gc_will_visit_node(), | |
792 | * gc_will_visit_root() | |
793 | * | |
794 | * - also, references move around in the course of index updates and | |
795 | * various other crap: everything needs to agree on the ordering | |
796 | * references are allowed to move around in - e.g., we're allowed to | |
797 | * start with a reference owned by an open_bucket (the allocator) and | |
798 | * move it to the btree, but not the reverse. | |
799 | * | |
800 | * This is necessary to ensure that gc doesn't miss references that | |
801 | * move around - if references move backwards in the ordering GC | |
802 | * uses, GC could skip past them | |
1c6fdbd8 | 803 | */ |
d0734356 | 804 | int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, |
a1d58243 | 805 | bool initial, bool metadata_only) |
1c6fdbd8 KO |
806 | { |
807 | struct bch_dev *ca; | |
808 | u64 start_time = local_clock(); | |
9ca53b55 | 809 | unsigned i, iter = 0; |
2252aa27 | 810 | int ret; |
1c6fdbd8 | 811 | |
1ada1606 | 812 | lockdep_assert_held(&c->state_lock); |
1c6fdbd8 KO |
813 | trace_gc_start(c); |
814 | ||
1c6fdbd8 | 815 | down_write(&c->gc_lock); |
00b8ccf7 KO |
816 | |
817 | /* flush interior btree updates: */ | |
818 | closure_wait_event(&c->btree_interior_update_wait, | |
819 | !bch2_btree_interior_updates_nr_pending(c)); | |
9ca53b55 | 820 | again: |
a1d58243 | 821 | ret = bch2_gc_start(c, metadata_only); |
9ca53b55 | 822 | if (ret) |
1c6fdbd8 KO |
823 | goto out; |
824 | ||
1c6fdbd8 KO |
825 | bch2_mark_superblocks(c); |
826 | ||
d0734356 | 827 | ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); |
9ca53b55 | 828 | if (ret) |
2252aa27 | 829 | goto out; |
1c6fdbd8 | 830 | |
00b8ccf7 | 831 | #if 0 |
1c6fdbd8 | 832 | bch2_mark_pending_btree_node_frees(c); |
00b8ccf7 | 833 | #endif |
1c6fdbd8 KO |
834 | bch2_mark_allocator_buckets(c); |
835 | ||
1c6fdbd8 KO |
836 | c->gc_count++; |
837 | out: | |
6122ab63 KO |
838 | if (!ret && |
839 | (test_bit(BCH_FS_FIXED_GENS, &c->flags) || | |
29364f34 | 840 | (!iter && bch2_test_restart_gc))) { |
9ca53b55 KO |
841 | /* |
842 | * XXX: make sure gens we fixed got saved | |
843 | */ | |
844 | if (iter++ <= 2) { | |
845 | bch_info(c, "Fixed gens, restarting mark and sweep:"); | |
846 | clear_bit(BCH_FS_FIXED_GENS, &c->flags); | |
28062d32 | 847 | __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); |
05b3d5ac KO |
848 | |
849 | percpu_down_write(&c->mark_lock); | |
28062d32 | 850 | bch2_gc_free(c); |
05b3d5ac | 851 | percpu_up_write(&c->mark_lock); |
89b05118 KO |
852 | /* flush fsck errors, reset counters */ |
853 | bch2_flush_fsck_errs(c); | |
05b3d5ac | 854 | |
9ca53b55 KO |
855 | goto again; |
856 | } | |
857 | ||
858 | bch_info(c, "Unable to fix bucket gens, looping"); | |
859 | ret = -EINVAL; | |
860 | } | |
861 | ||
5e82a9a1 KO |
862 | if (!ret) { |
863 | bch2_journal_block(&c->journal); | |
05b3d5ac | 864 | |
5e82a9a1 | 865 | percpu_down_write(&c->mark_lock); |
a1d58243 | 866 | ret = bch2_gc_done(c, initial, metadata_only); |
9ca53b55 | 867 | |
5e82a9a1 KO |
868 | bch2_journal_unblock(&c->journal); |
869 | } else { | |
870 | percpu_down_write(&c->mark_lock); | |
871 | } | |
872 | ||
9ca53b55 | 873 | /* Indicates that gc is no longer in progress: */ |
dfe9bfb3 | 874 | __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); |
9ca53b55 KO |
875 | |
876 | bch2_gc_free(c); | |
05b3d5ac KO |
877 | percpu_up_write(&c->mark_lock); |
878 | ||
1c6fdbd8 | 879 | up_write(&c->gc_lock); |
9ca53b55 | 880 | |
1c6fdbd8 KO |
881 | trace_gc_end(c); |
882 | bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); | |
883 | ||
884 | /* | |
885 | * Wake up allocator in case it was waiting for buckets | |
886 | * because of not being able to inc gens | |
887 | */ | |
888 | for_each_member_device(ca, c, i) | |
889 | bch2_wake_allocator(ca); | |
890 | ||
891 | /* | |
892 | * At startup, allocations can happen directly instead of via the | |
893 | * allocator thread - issue wakeup in case they blocked on gc_lock: | |
894 | */ | |
895 | closure_wake_up(&c->freelist_wait); | |
9ca53b55 | 896 | return ret; |
1c6fdbd8 KO |
897 | } |
898 | ||
c47c50f8 KO |
899 | static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) |
900 | { | |
901 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); | |
902 | const struct bch_extent_ptr *ptr; | |
903 | ||
904 | percpu_down_read(&c->mark_lock); | |
905 | bkey_for_each_ptr(ptrs, ptr) { | |
906 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); | |
907 | struct bucket *g = PTR_BUCKET(ca, ptr, false); | |
908 | ||
909 | if (gen_after(g->mark.gen, ptr->gen) > 16) { | |
910 | percpu_up_read(&c->mark_lock); | |
911 | return true; | |
912 | } | |
913 | } | |
914 | ||
915 | bkey_for_each_ptr(ptrs, ptr) { | |
916 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); | |
917 | struct bucket *g = PTR_BUCKET(ca, ptr, false); | |
918 | ||
919 | if (gen_after(g->gc_gen, ptr->gen)) | |
920 | g->gc_gen = ptr->gen; | |
921 | } | |
922 | percpu_up_read(&c->mark_lock); | |
923 | ||
924 | return false; | |
925 | } | |
926 | ||
451570a5 KO |
/*
 * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
 * node pointers currently never have cached pointers that can become stale:
 */
static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct bkey_buf sk;	/* scratch copy of the key being rewritten */
	int ret = 0;

	bch2_bkey_buf_init(&sk);
	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
				   BTREE_ITER_PREFETCH);

	/* Walk every key in the btree; stop on iteration error: */
	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k))) {
		if (gc_btree_gens_key(c, k)) {
			/*
			 * Key has pointers that are badly stale: rewrite it
			 * (normalized, which drops stale cached pointers) in
			 * its own transaction commit.
			 */
			bch2_bkey_buf_reassemble(&sk, c, k);
			bch2_extent_normalize(c, bkey_i_to_s(sk.k));

			bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));

			bch2_trans_update(&trans, iter, sk.k, 0);

			ret = bch2_trans_commit(&trans, NULL, NULL,
						BTREE_INSERT_NOFAIL);
			/*
			 * -EINTR == transaction restart: loop back and re-peek
			 * the same position without advancing the iterator.
			 */
			if (ret == -EINTR)
				continue;
			if (ret) {
				break;
			}
		}

		bch2_btree_iter_next(iter);
	}

	bch2_trans_exit(&trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}
972 | ||
973 | int bch2_gc_gens(struct bch_fs *c) | |
974 | { | |
975 | struct bch_dev *ca; | |
976 | struct bucket_array *buckets; | |
977 | struct bucket *g; | |
978 | unsigned i; | |
979 | int ret; | |
980 | ||
b9c3d139 KO |
981 | /* |
982 | * Ideally we would be using state_lock and not gc_lock here, but that | |
983 | * introduces a deadlock in the RO path - we currently take the state | |
984 | * lock at the start of going RO, thus the gc thread may get stuck: | |
985 | */ | |
986 | down_read(&c->gc_lock); | |
451570a5 KO |
987 | |
988 | for_each_member_device(ca, c, i) { | |
989 | down_read(&ca->bucket_lock); | |
990 | buckets = bucket_array(ca); | |
991 | ||
992 | for_each_bucket(g, buckets) | |
993 | g->gc_gen = g->mark.gen; | |
994 | up_read(&ca->bucket_lock); | |
995 | } | |
996 | ||
997 | for (i = 0; i < BTREE_ID_NR; i++) | |
998 | if (btree_node_type_needs_gc(i)) { | |
999 | ret = bch2_gc_btree_gens(c, i); | |
74ed7e56 KO |
1000 | if (ret) { |
1001 | bch_err(c, "error recalculating oldest_gen: %i", ret); | |
451570a5 | 1002 | goto err; |
74ed7e56 | 1003 | } |
451570a5 KO |
1004 | } |
1005 | ||
1006 | for_each_member_device(ca, c, i) { | |
1007 | down_read(&ca->bucket_lock); | |
1008 | buckets = bucket_array(ca); | |
1009 | ||
1010 | for_each_bucket(g, buckets) | |
1011 | g->oldest_gen = g->gc_gen; | |
1012 | up_read(&ca->bucket_lock); | |
1013 | } | |
74ed7e56 KO |
1014 | |
1015 | c->gc_count++; | |
451570a5 | 1016 | err: |
b9c3d139 | 1017 | up_read(&c->gc_lock); |
451570a5 KO |
1018 | return ret; |
1019 | } | |
1020 | ||
1c6fdbd8 KO |
1021 | /* Btree coalescing */ |
1022 | ||
1023 | static void recalc_packed_keys(struct btree *b) | |
1024 | { | |
216c9fac | 1025 | struct bset *i = btree_bset_first(b); |
1c6fdbd8 KO |
1026 | struct bkey_packed *k; |
1027 | ||
1028 | memset(&b->nr, 0, sizeof(b->nr)); | |
1029 | ||
1030 | BUG_ON(b->nsets != 1); | |
1031 | ||
216c9fac | 1032 | vstruct_for_each(i, k) |
1c6fdbd8 KO |
1033 | btree_keys_account_key_add(&b->nr, 0, k); |
1034 | } | |
1035 | ||
/*
 * Try to coalesce the (up to GC_MERGE_NODES) adjacent sibling nodes in
 * @old_nodes into one fewer node.  Bails out (without error) if the keys
 * wouldn't fit, no common packed format exists, or the btree_update
 * reservation can't be taken.  On success, updates @old_nodes as a sliding
 * window for the caller: slot 0 becomes the first new node, the rest NULL.
 */
static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
				struct btree *old_nodes[GC_MERGE_NODES])
{
	struct btree *parent = btree_node_parent(iter, old_nodes[0]);
	unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
	/* target fill: merge only if result nodes stay under 2/3 full */
	unsigned blocks = btree_blocks(c) * 2 / 3;
	struct btree *new_nodes[GC_MERGE_NODES];
	struct btree_update *as;
	struct keylist keylist;
	struct bkey_format_state format_state;
	struct bkey_format new_format;

	memset(new_nodes, 0, sizeof(new_nodes));
	bch2_keylist_init(&keylist, NULL);

	/* Count keys that are not deleted */
	for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
		u64s += old_nodes[i]->nr.live_u64s;

	nr_old_nodes = nr_new_nodes = i;

	/* Check if all keys in @old_nodes could fit in one fewer node */
	if (nr_old_nodes <= 1 ||
	    __vstruct_blocks(struct btree_node, c->block_bits,
			     DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
		return;

	/* Find a format that all keys in @old_nodes can pack into */
	bch2_bkey_format_init(&format_state);

	for (i = 0; i < nr_old_nodes; i++)
		__bch2_btree_calc_format(&format_state, old_nodes[i]);

	new_format = bch2_bkey_format_done(&format_state);

	/* Check if repacking would make any nodes too big to fit */
	for (i = 0; i < nr_old_nodes; i++)
		if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
			trace_btree_gc_coalesce_fail(c,
					BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
			return;
		}

	if (bch2_keylist_realloc(&keylist, NULL, 0,
			BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) {
		trace_btree_gc_coalesce_fail(c,
				BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
		return;
	}

	/* Reserve journal/alloc resources for rewriting all these nodes: */
	as = bch2_btree_update_start(iter->trans, iter->btree_id,
			btree_update_reserve_required(c, parent) + nr_old_nodes,
			BTREE_INSERT_NOFAIL|
			BTREE_INSERT_USE_RESERVE,
			NULL);
	if (IS_ERR(as)) {
		trace_btree_gc_coalesce_fail(c,
				BTREE_GC_COALESCE_FAIL_RESERVE_GET);
		bch2_keylist_free(&keylist, NULL);
		return;
	}

	trace_btree_gc_coalesce(c, old_nodes[0]);

	for (i = 0; i < nr_old_nodes; i++)
		bch2_btree_interior_update_will_free_node(as, old_nodes[i]);

	/* Repack everything with @new_format and sort down to one bset */
	for (i = 0; i < nr_old_nodes; i++)
		new_nodes[i] =
			__bch2_btree_node_alloc_replacement(as, old_nodes[i],
							    new_format);

	/*
	 * Conceptually we concatenate the nodes together and slice them
	 * up at different boundaries.
	 */
	for (i = nr_new_nodes - 1; i > 0; --i) {
		struct btree *n1 = new_nodes[i];
		struct btree *n2 = new_nodes[i - 1];

		struct bset *s1 = btree_bset_first(n1);
		struct bset *s2 = btree_bset_first(n2);
		struct bkey_packed *k, *last = NULL;

		/* Calculate how many keys from @n2 we could fit inside @n1 */
		u64s = 0;

		for (k = s2->start;
		     k < vstruct_last(s2) &&
		     vstruct_blocks_plus(n1->data, c->block_bits,
					 u64s + k->u64s) <= blocks;
		     k = bkey_next_skip_noops(k, vstruct_last(s2))) {
			last = k;
			u64s += k->u64s;
		}

		if (u64s == le16_to_cpu(s2->u64s)) {
			/* n2 fits entirely in n1 */
			n1->key.k.p = n1->data->max_key = n2->data->max_key;

			memcpy_u64s(vstruct_last(s1),
				    s2->start,
				    le16_to_cpu(s2->u64s));
			le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));

			set_btree_bset_end(n1, n1->set);

			/* n2 is now empty - free it and drop it from the array: */
			six_unlock_write(&n2->c.lock);
			bch2_btree_node_free_never_inserted(c, n2);
			six_unlock_intent(&n2->c.lock);

			memmove(new_nodes + i - 1,
				new_nodes + i,
				sizeof(new_nodes[0]) * (nr_new_nodes - i));
			new_nodes[--nr_new_nodes] = NULL;
		} else if (u64s) {
			/* move part of n2 into n1 */
			n1->key.k.p = n1->data->max_key =
				bkey_unpack_pos(n1, last);

			n2->data->min_key = bkey_successor(n1->data->max_key);

			memcpy_u64s(vstruct_last(s1),
				    s2->start, u64s);
			le16_add_cpu(&s1->u64s, u64s);

			/* Shift the remaining keys in n2 down to its start: */
			memmove(s2->start,
				vstruct_idx(s2, u64s),
				(le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
			s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);

			set_btree_bset_end(n1, n1->set);
			set_btree_bset_end(n2, n2->set);
		}
	}

	/* Finalize each surviving new node and start writing it out: */
	for (i = 0; i < nr_new_nodes; i++) {
		struct btree *n = new_nodes[i];

		recalc_packed_keys(n);
		btree_node_reset_sib_u64s(n);

		bch2_btree_build_aux_trees(n);

		bch2_btree_update_add_new_node(as, n);
		six_unlock_write(&n->c.lock);

		bch2_btree_node_write(c, n, SIX_LOCK_intent);
	}

	/*
	 * The keys for the old nodes get deleted. We don't want to insert keys
	 * that compare equal to the keys for the new nodes we'll also be
	 * inserting - we can't because keys on a keylist must be strictly
	 * greater than the previous keys, and we also don't need to since the
	 * key for the new node will serve the same purpose (overwriting the key
	 * for the old node).
	 */
	for (i = 0; i < nr_old_nodes; i++) {
		struct bkey_i delete;
		unsigned j;

		for (j = 0; j < nr_new_nodes; j++)
			if (!bkey_cmp(old_nodes[i]->key.k.p,
				      new_nodes[j]->key.k.p))
				goto next;

		bkey_init(&delete.k);
		delete.k.p = old_nodes[i]->key.k.p;
		bch2_keylist_add_in_order(&keylist, &delete);
next:
		i = i;	/* no-op: a label must precede a statement (pre-C23) */
	}

	/*
	 * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
	 * does the lookup once and thus expects the keys to be in sorted order
	 * so we have to make sure the new keys are correctly ordered with
	 * respect to the deleted keys added in the previous loop
	 */
	for (i = 0; i < nr_new_nodes; i++)
		bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);

	/* Insert the newly coalesced nodes */
	bch2_btree_insert_node(as, parent, iter, &keylist, 0);

	BUG_ON(!bch2_keylist_empty(&keylist));

	BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]);

	bch2_btree_iter_node_replace(iter, new_nodes[0]);

	for (i = 0; i < nr_new_nodes; i++)
		bch2_btree_update_get_open_buckets(as, new_nodes[i]);

	/* Free the old nodes and update our sliding window */
	for (i = 0; i < nr_old_nodes; i++) {
		bch2_btree_node_free_inmem(c, old_nodes[i], iter);

		/*
		 * the index update might have triggered a split, in which case
		 * the nodes we coalesced - the new nodes we just created -
		 * might not be sibling nodes anymore - don't add them to the
		 * sliding window (except the first):
		 */
		if (!i) {
			old_nodes[i] = new_nodes[i];
		} else {
			old_nodes[i] = NULL;
		}
	}

	for (i = 0; i < nr_new_nodes; i++)
		six_unlock_intent(&new_nodes[i]->c.lock);

	bch2_btree_update_done(as);
	bch2_keylist_free(&keylist, NULL);
}
1255 | ||
/*
 * Walk every node of @btree_id, maintaining a sliding window of up to
 * GC_MERGE_NODES previously-visited adjacent nodes, and attempt to coalesce
 * the window at each step.  Returns 0, -ESHUTDOWN if the kthread was asked
 * to stop, or a transaction error from bch2_trans_exit().
 */
static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct btree *b;
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	unsigned i;

	/* Sliding window of adjacent btree nodes */
	struct btree *merge[GC_MERGE_NODES];
	/* six-lock sequence numbers used to re-take locks on window entries */
	u32 lock_seq[GC_MERGE_NODES];

	bch2_trans_init(&trans, c, 0, 0);

	/*
	 * XXX: We don't have a good way of positively matching on sibling nodes
	 * that have the same parent - this code works by handling the cases
	 * where they might not have the same parent, and is thus fragile. Ugh.
	 *
	 * Perhaps redo this to use multiple linked iterators?
	 */
	memset(merge, 0, sizeof(merge));

	__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
			      BTREE_MAX_DEPTH, 0,
			      BTREE_ITER_PREFETCH, b) {
		/* Shift the window: newest node goes in slot 0: */
		memmove(merge + 1, merge,
			sizeof(merge) - sizeof(merge[0]));
		memmove(lock_seq + 1, lock_seq,
			sizeof(lock_seq) - sizeof(lock_seq[0]));

		merge[0] = b;

		/*
		 * Re-take intent locks on the older window entries; drop any
		 * that can't be relocked (node changed) or are at a different
		 * level than the current node:
		 */
		for (i = 1; i < GC_MERGE_NODES; i++) {
			if (!merge[i] ||
			    !six_relock_intent(&merge[i]->c.lock, lock_seq[i]))
				break;

			if (merge[i]->c.level != merge[0]->c.level) {
				six_unlock_intent(&merge[i]->c.lock);
				break;
			}
		}
		memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));

		bch2_coalesce_nodes(c, iter, merge);

		/* Record lock sequence numbers, then drop the extra locks: */
		for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
			lock_seq[i] = merge[i]->c.lock.state.seq;
			six_unlock_intent(&merge[i]->c.lock);
		}

		lock_seq[0] = merge[0]->c.lock.state.seq;

		if (kthread && kthread_should_stop()) {
			bch2_trans_exit(&trans);
			return -ESHUTDOWN;
		}

		bch2_trans_cond_resched(&trans);

		/*
		 * If the parent node wasn't relocked, it might have been split
		 * and the nodes in our sliding window might not have the same
		 * parent anymore - blow away the sliding window:
		 */
		if (btree_iter_node(iter, iter->level + 1) &&
		    !btree_node_intent_locked(iter, iter->level + 1))
			memset(merge + 1, 0,
			       (GC_MERGE_NODES - 1) * sizeof(merge[0]));
	}
	return bch2_trans_exit(&trans);
}
1329 | ||
1330 | /** | |
1331 | * bch_coalesce - coalesce adjacent nodes with low occupancy | |
1332 | */ | |
1333 | void bch2_coalesce(struct bch_fs *c) | |
1334 | { | |
1335 | enum btree_id id; | |
1336 | ||
1c6fdbd8 KO |
1337 | down_read(&c->gc_lock); |
1338 | trace_gc_coalesce_start(c); | |
1339 | ||
1340 | for (id = 0; id < BTREE_ID_NR; id++) { | |
1341 | int ret = c->btree_roots[id].b | |
1342 | ? bch2_coalesce_btree(c, id) | |
1343 | : 0; | |
1344 | ||
1345 | if (ret) { | |
1346 | if (ret != -ESHUTDOWN) | |
1347 | bch_err(c, "btree coalescing failed: %d", ret); | |
1c6fdbd8 KO |
1348 | return; |
1349 | } | |
1350 | } | |
1351 | ||
1352 | trace_gc_coalesce_end(c); | |
1353 | up_read(&c->gc_lock); | |
1354 | } | |
1355 | ||
/*
 * GC kthread main loop: sleeps until either a kick (c->kick_gc changes) or,
 * when periodic gc is enabled, the filesystem's IO clock advances far enough;
 * then runs gen gc.  Exits when kthread_should_stop().
 */
static int bch2_gc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last = atomic_long_read(&clock->now);
	unsigned last_kick = atomic_read(&c->kick_gc);
	int ret;

	set_freezable();

	while (1) {
		/*
		 * Wait loop: set TASK_INTERRUPTIBLE *before* checking the
		 * wakeup conditions so a wakeup between check and schedule()
		 * isn't lost.
		 */
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				return 0;
			}

			if (atomic_read(&c->kick_gc) != last_kick)
				break;

			if (c->btree_gc_periodic) {
				/* run gc after c->capacity / 16 of IO: */
				unsigned long next = last + c->capacity / 16;

				if (atomic_long_read(&clock->now) >= next)
					break;

				bch2_io_clock_schedule_timeout(clock, next);
			} else {
				schedule();
			}

			try_to_freeze();
		}
		__set_current_state(TASK_RUNNING);

		last = atomic_long_read(&clock->now);
		last_kick = atomic_read(&c->kick_gc);

		/*
		 * Full gc is currently incompatible with btree key cache:
		 */
#if 0
		ret = bch2_gc(c, NULL, false, false);
#else
		ret = bch2_gc_gens(c);
#endif
		if (ret < 0)
			bch_err(c, "btree gc failed: %i", ret);

		debug_check_no_locks_held();
	}

	return 0;
}
1412 | ||
1413 | void bch2_gc_thread_stop(struct bch_fs *c) | |
1414 | { | |
1415 | struct task_struct *p; | |
1416 | ||
1417 | p = c->gc_thread; | |
1418 | c->gc_thread = NULL; | |
1419 | ||
1420 | if (p) { | |
1421 | kthread_stop(p); | |
1422 | put_task_struct(p); | |
1423 | } | |
1424 | } | |
1425 | ||
1426 | int bch2_gc_thread_start(struct bch_fs *c) | |
1427 | { | |
1428 | struct task_struct *p; | |
1429 | ||
1430 | BUG_ON(c->gc_thread); | |
1431 | ||
b7a9bbfc | 1432 | p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); |
1c6fdbd8 KO |
1433 | if (IS_ERR(p)) |
1434 | return PTR_ERR(p); | |
1435 | ||
1436 | get_task_struct(p); | |
1437 | c->gc_thread = p; | |
1438 | wake_up_process(p); | |
1439 | return 0; | |
1440 | } |