// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

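/*
 * GC walk position: updates to c->gc_pos are published under the gc_pos_lock
 * seqcount, with preemption disabled, so the marking paths can consistently
 * ask how far GC has gotten - see the ordering comments above bch2_gc().
 */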
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
	__gc_pos_set(c, new_pos);
}

static int bch2_gc_check_topology(struct bch_fs *c,
				  struct bkey_s_c k,
				  struct bpos *expected_start,
				  struct bpos expected_end,
				  bool is_last)
{
	int ret = 0;

	if (k.k->type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);

		if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c,
				"btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu",
				bp.v->min_key.inode,
				bp.v->min_key.offset,
				expected_start->inode,
				expected_start->offset)) {
			BUG();
		}
	}

	*expected_start = bkey_cmp(k.k->p, POS_MAX)
		? bkey_successor(k.k->p)
		: k.k->p;

	if (fsck_err_on(is_last &&
			bkey_cmp(k.k->p, expected_end), c,
			"btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu",
			k.k->p.inode,
			k.k->p.offset,
			expected_end.inode,
			expected_end.offset)) {
		BUG();
	}
fsck_err:
	return ret;
}

/* marking of btree keys/nodes: */

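/*
 * Mark a single key: on the initial (fsck/recovery) pass this also repairs
 * bucket gens from the pointers we find, setting BCH_FS_FIXED_GENS if
 * anything changed; it then updates each bucket's oldest_gen, tracks the
 * most stale pointer seen via @max_stale, and accounts the key with
 * bch2_mark_key(..., BTREE_TRIGGER_GC):
 */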
static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
			    u8 *max_stale, bool initial)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	unsigned flags =
		BTREE_TRIGGER_GC|
		(initial ? BTREE_TRIGGER_NOATOMIC : 0);
	int ret = 0;

	if (initial) {
		BUG_ON(journal_seq_verify(c) &&
		       k.k->version.lo > journal_cur_seq(&c->journal));

		/* XXX change to fsck check */
		if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
				"key version number higher than recorded: %llu > %llu",
				k.k->version.lo,
				atomic64_read(&c->key_version)))
			atomic64_set(&c->key_version, k.k->version.lo);

		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
		    fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
				"superblock not marked as containing replicas (type %u)",
				k.k->type)) {
			ret = bch2_mark_bkey_replicas(c, k);
			if (ret)
				return ret;
		}

		bkey_for_each_ptr(ptrs, ptr) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
			struct bucket *g = PTR_BUCKET(ca, ptr, true);
			struct bucket *g2 = PTR_BUCKET(ca, ptr, false);

			if (mustfix_fsck_err_on(!g->gen_valid, c,
					"bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
					ptr->dev, PTR_BUCKET_NR(ca, ptr),
					bch2_data_types[ptr_data_type(k.k, ptr)],
					ptr->gen)) {
				g2->_mark.gen	= g->_mark.gen	= ptr->gen;
				g2->gen_valid	= g->gen_valid	= true;
			}

			if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
					"bucket %u:%zu data type %s ptr gen in the future: %u > %u",
					ptr->dev, PTR_BUCKET_NR(ca, ptr),
					bch2_data_types[ptr_data_type(k.k, ptr)],
					ptr->gen, g->mark.gen)) {
				g2->_mark.gen	= g->_mark.gen	= ptr->gen;
				g2->gen_valid	= g->gen_valid	= true;
				g2->_mark.data_type		= 0;
				g2->_mark.dirty_sectors		= 0;
				g2->_mark.cached_sectors	= 0;
				set_bit(BCH_FS_FIXED_GENS, &c->flags);
			}
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		struct bucket *g = PTR_BUCKET(ca, ptr, true);

		if (gen_after(g->oldest_gen, ptr->gen))
			g->oldest_gen = ptr->gen;

		*max_stale = max(*max_stale, ptr_stale(ca, ptr));
	}

	bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
fsck_err:
	return ret;
}

static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
			      bool initial)
{
	struct bpos next_node_start = b->data->min_key;
	struct btree_node_iter iter;
	struct bkey unpacked;
	struct bkey_s_c k;
	int ret = 0;

	*max_stale = 0;

	if (!btree_node_type_needs_gc(btree_node_type(b)))
		return 0;

	bch2_btree_node_iter_init_from_start(&iter, b);

	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
		bch2_bkey_debugcheck(c, b, k);

		ret = bch2_gc_mark_key(c, k, max_stale, initial);
		if (ret)
			break;

		bch2_btree_node_iter_advance(&iter, b);

		if (b->c.level) {
			ret = bch2_gc_check_topology(c, k,
					&next_node_start,
					b->data->max_key,
					bch2_btree_node_iter_end(&iter));
			if (ret)
				break;
		}
	}

	return ret;
}

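/*
 * Walk one btree, marking every key; outside of the initial pass, nodes
 * with too many stale pointers are also opportunistically rewritten so
 * their bucket gens can advance (see the max_stale thresholds below):
 */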
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
			 bool initial, bool metadata_only)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct btree *b;
	unsigned depth = metadata_only			? 1
		: expensive_debug_checks(c)		? 0
		: !btree_node_type_needs_gc(btree_id)	? 1
		: 0;
	u8 max_stale = 0;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));

	__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
			      0, depth, BTREE_ITER_PREFETCH, b) {
		bch2_verify_btree_nr_keys(b);

		gc_pos_set(c, gc_pos_btree_node(b));

		ret = btree_gc_mark_node(c, b, &max_stale, initial);
		if (ret)
			break;

		if (!initial) {
			if (max_stale > 64)
				bch2_btree_node_rewrite(c, iter,
						b->data->keys.seq,
						BTREE_INSERT_USE_RESERVE|
						BTREE_INSERT_NOWAIT|
						BTREE_INSERT_GC_LOCK_HELD);
			else if (!btree_gc_rewrite_disabled(c) &&
				 (btree_gc_always_rewrite(c) || max_stale > 16))
				bch2_btree_node_rewrite(c, iter,
						b->data->keys.seq,
						BTREE_INSERT_NOWAIT|
						BTREE_INSERT_GC_LOCK_HELD);
		}

		bch2_trans_cond_resched(&trans);
	}
	ret = bch2_trans_exit(&trans) ?: ret;
	if (ret)
		return ret;

	mutex_lock(&c->btree_root_lock);
	b = c->btree_roots[btree_id].b;
	if (!btree_node_fake(b))
		ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
				       &max_stale, initial);
	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
	mutex_unlock(&c->btree_root_lock);

	return ret;
}

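/*
 * The initial pass walks nodes from the btree root using
 * btree_and_journal_iter, so that keys which so far exist only in the
 * journal (not yet flushed into btree nodes) get marked too:
 */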
static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
				      struct journal_keys *journal_keys,
				      unsigned target_depth)
{
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bpos next_node_start = b->data->min_key;
	u8 max_stale = 0;
	int ret = 0;

	bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_debugcheck(c, b, k);

		BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
		BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);

		ret = bch2_gc_mark_key(c, k, &max_stale, true);
		if (ret)
			break;

		if (b->c.level) {
			struct btree *child;
			BKEY_PADDED(k) tmp;

			bkey_reassemble(&tmp.k, k);
			k = bkey_i_to_s_c(&tmp.k);

			bch2_btree_and_journal_iter_advance(&iter);

			ret = bch2_gc_check_topology(c, k,
					&next_node_start,
					b->data->max_key,
					!bch2_btree_and_journal_iter_peek(&iter).k);
			if (ret)
				break;

			if (b->c.level > target_depth) {
				child = bch2_btree_node_get_noiter(c, &tmp.k,
						b->c.btree_id, b->c.level - 1);
				ret = PTR_ERR_OR_ZERO(child);
				if (ret)
					break;

				ret = bch2_gc_btree_init_recurse(c, child,
						journal_keys, target_depth);
				six_unlock_read(&child->c.lock);

				if (ret)
					break;
			}
		} else {
			bch2_btree_and_journal_iter_advance(&iter);
		}
	}

	return ret;
}

static int bch2_gc_btree_init(struct bch_fs *c,
			      struct journal_keys *journal_keys,
			      enum btree_id btree_id,
			      bool metadata_only)
{
	struct btree *b;
	unsigned target_depth = metadata_only		? 1
		: expensive_debug_checks(c)		? 0
		: !btree_node_type_needs_gc(btree_id)	? 1
		: 0;
	u8 max_stale = 0;
	int ret = 0;

	b = c->btree_roots[btree_id].b;

	if (btree_node_fake(b))
		return 0;

	six_lock_read(&b->c.lock, NULL, NULL);
	if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
			"btree root with incorrect min_key: %llu:%llu",
			b->data->min_key.inode,
			b->data->min_key.offset)) {
		BUG();
	}

	if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
			"btree root with incorrect max_key: %llu:%llu",
			b->data->max_key.inode,
			b->data->max_key.offset)) {
		BUG();
	}

	if (b->c.level >= target_depth)
		ret = bch2_gc_btree_init_recurse(c, b,
				journal_keys, target_depth);

	if (!ret)
		ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
				       &max_stale, true);
fsck_err:
	six_unlock_read(&b->c.lock);

	return ret;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return  (int) btree_id_to_gc_phase(l) -
		(int) btree_id_to_gc_phase(r);
}

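/*
 * Mark the btrees in gc phase order, so that c->gc_pos only ever moves
 * forward (gc_pos_set() asserts monotonicity):
 */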
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
			  bool initial, bool metadata_only)
{
	enum btree_id ids[BTREE_ID_NR];
	unsigned i;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < BTREE_ID_NR; i++) {
		enum btree_id id = ids[i];
		int ret = initial
			? bch2_gc_btree_init(c, journal_keys,
					     id, metadata_only)
			: bch2_gc_btree(c, id, initial, metadata_only);
		if (ret)
			return ret;
	}

	return 0;
}

static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
				  u64 start, u64 end,
				  enum bch_data_type type,
				  unsigned flags)
{
	u64 b = sector_to_bucket(ca, start);

	do {
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		bch2_mark_metadata_bucket(c, ca, b, type, sectors,
					  gc_phase(GC_PHASE_SB), flags);
		b++;
		start += sectors;
	} while (start < end);
}

void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
			      unsigned flags)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	unsigned i;
	u64 b;

	/*
	 * This conditional is kind of gross, but we may be called from the
	 * device add path, before the new device has actually been added to
	 * the running filesystem:
	 */
	if (c) {
		lockdep_assert_held(&c->sb_lock);
		percpu_down_read(&c->mark_lock);
	}

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR)
			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
					      BCH_DATA_SB, flags);

		mark_metadata_sectors(c, ca, offset,
				      offset + (1 << layout->sb_max_size_bits),
				      BCH_DATA_SB, flags);
	}

	for (i = 0; i < ca->journal.nr; i++) {
		b = ca->journal.buckets[i];
		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
					  ca->mi.bucket_size,
					  gc_phase(GC_PHASE_SB), flags);
	}

	if (c)
		percpu_up_read(&c->mark_lock);
}

static void bch2_mark_superblocks(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	mutex_lock(&c->sb_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_SB));

	for_each_online_member(ca, c, i)
		bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
	mutex_unlock(&c->sb_lock);
}

#if 0
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
	struct btree_update *as;
	struct pending_btree_node_free *d;

	mutex_lock(&c->btree_interior_update_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));

	for_each_pending_btree_node_free(c, as, d)
		if (d->index_update_done)
			bch2_mark_key(c, bkey_i_to_s_c(&d->key),
				      0, 0, NULL, 0,
				      BTREE_TRIGGER_GC);

	mutex_unlock(&c->btree_interior_update_lock);
}
#endif

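/*
 * Buckets owned by the allocator - free_inc, the freelists, and open
 * buckets - aren't referenced by any btree key, so mark them directly:
 */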
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
	struct bch_dev *ca;
	struct open_bucket *ob;
	size_t i, j, iter;
	unsigned ci;

	percpu_down_read(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	gc_pos_set(c, gc_pos_alloc(c, NULL));

	for_each_member_device(ca, c, ci) {
		fifo_for_each_entry(i, &ca->free_inc, iter)
			bch2_mark_alloc_bucket(c, ca, i, true,
					       gc_pos_alloc(c, NULL),
					       BTREE_TRIGGER_GC);

		for (j = 0; j < RESERVE_NR; j++)
			fifo_for_each_entry(i, &ca->free[j], iter)
				bch2_mark_alloc_bucket(c, ca, i, true,
						       gc_pos_alloc(c, NULL),
						       BTREE_TRIGGER_GC);
	}

	spin_unlock(&c->freelist_lock);

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid) {
			gc_pos_set(c, gc_pos_alloc(c, ob));
			ca = bch_dev_bkey_exists(c, ob->ptr.dev);
			bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
					       gc_pos_alloc(c, ob),
					       BTREE_TRIGGER_GC);
		}
		spin_unlock(&ob->lock);
	}

	percpu_up_read(&c->mark_lock);
}

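/* Free the shadow ([1]/gc) copies of bucket and usage info that GC marked into: */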
static void bch2_gc_free(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	genradix_free(&c->stripes[1]);

	for_each_member_device(ca, c, i) {
		kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
			sizeof(struct bucket_array) +
			ca->mi.nbuckets * sizeof(struct bucket));
		ca->buckets[1] = NULL;

		free_percpu(ca->usage[1]);
		ca->usage[1] = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;
}

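/*
 * GC finished: compare what GC recomputed (the gc copies) against the live
 * in-memory state, report differences as fsck errors when @verify is set,
 * and install the recomputed values:
 */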
static int bch2_gc_done(struct bch_fs *c,
			bool initial, bool metadata_only)
{
	struct bch_dev *ca;
	bool verify = !metadata_only &&
		(!initial ||
		 (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
	unsigned i;
	int ret = 0;

#define copy_field(_f, _msg, ...)					\
	if (dst->_f != src->_f) {					\
		if (verify)						\
			fsck_err(c, _msg ": got %llu, should be %llu"	\
				, ##__VA_ARGS__, dst->_f, src->_f);	\
		dst->_f = src->_f;					\
	}
#define copy_stripe_field(_f, _msg, ...)				\
	if (dst->_f != src->_f) {					\
		if (verify)						\
			fsck_err(c, "stripe %zu has wrong "_msg		\
				": got %u, should be %u",		\
				dst_iter.pos, ##__VA_ARGS__,		\
				dst->_f, src->_f);			\
		dst->_f = src->_f;					\
		dst->dirty = true;					\
	}
#define copy_bucket_field(_f)						\
	if (dst->b[b].mark._f != src->b[b].mark._f) {			\
		if (verify)						\
			fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
				": got %u, should be %u", i, b,		\
				dst->b[b].mark.gen,			\
				bch2_data_types[dst->b[b].mark.data_type],\
				dst->b[b].mark._f, src->b[b].mark._f);	\
		dst->b[b]._mark._f = src->b[b].mark._f;			\
	}
#define copy_dev_field(_f, _msg, ...)					\
	copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...)					\
	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)

	if (!metadata_only) {
		struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
		struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
		struct stripe *dst, *src;
		unsigned i;

		c->ec_stripes_heap.used = 0;

		while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
		       (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
			BUG_ON(src_iter.pos != dst_iter.pos);

			copy_stripe_field(alive,	"alive");
			copy_stripe_field(sectors,	"sectors");
			copy_stripe_field(algorithm,	"algorithm");
			copy_stripe_field(nr_blocks,	"nr_blocks");
			copy_stripe_field(nr_redundant,	"nr_redundant");
			copy_stripe_field(blocks_nonempty,
					  "blocks_nonempty");

			for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
				copy_stripe_field(block_sectors[i],
						  "block_sectors[%u]", i);

			if (dst->alive)
				bch2_stripes_heap_insert(c, dst, dst_iter.pos);

			genradix_iter_advance(&dst_iter, &c->stripes[0]);
			genradix_iter_advance(&src_iter, &c->stripes[1]);
		}
	}

	for_each_member_device(ca, c, i) {
		struct bucket_array *dst = __bucket_array(ca, 0);
		struct bucket_array *src = __bucket_array(ca, 1);
		size_t b;

		for (b = 0; b < src->nbuckets; b++) {
			copy_bucket_field(gen);
			copy_bucket_field(data_type);
			copy_bucket_field(owned_by_allocator);
			copy_bucket_field(stripe);
			copy_bucket_field(dirty_sectors);
			copy_bucket_field(cached_sectors);

			dst->b[b].oldest_gen = src->b[b].oldest_gen;
		}
	}

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	bch2_dev_usage_from_buckets(c);

	{
		unsigned nr = fs_usage_u64s(c);
		struct bch_fs_usage *dst = c->usage_base;
		struct bch_fs_usage *src = (void *)
			bch2_acc_percpu_u64s((void *) c->usage_gc, nr);

		copy_fs_field(hidden,		"hidden");
		copy_fs_field(btree,		"btree");

		if (!metadata_only) {
			copy_fs_field(data,	"data");
			copy_fs_field(cached,	"cached");
			copy_fs_field(reserved,	"reserved");
			copy_fs_field(nr_inodes,"nr_inodes");

			for (i = 0; i < BCH_REPLICAS_MAX; i++)
				copy_fs_field(persistent_reserved[i],
					      "persistent_reserved[%i]", i);
		}

		for (i = 0; i < c->replicas.nr; i++) {
			struct bch_replicas_entry *e =
				cpu_replicas_entry(&c->replicas, i);
			char buf[80];

			if (metadata_only &&
			    (e->data_type == BCH_DATA_USER ||
			     e->data_type == BCH_DATA_CACHED))
				continue;

			bch2_replicas_entry_to_text(&PBUF(buf), e);

			copy_fs_field(replicas[i], "%s", buf);
		}
	}

#undef copy_fs_field
#undef copy_dev_field
#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
	return ret;
}

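/*
 * Allocate the shadow copies GC will mark into, seeding them with the live
 * bucket gens (and, in metadata_only mode, the existing user/cached data
 * counters) so the recomputation starts from a consistent baseline:
 */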
static int bch2_gc_start(struct bch_fs *c,
			 bool metadata_only)
{
	struct bch_dev *ca;
	unsigned i;
	int ret;

	BUG_ON(c->usage_gc);

	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
					 sizeof(u64), GFP_KERNEL);
	if (!c->usage_gc) {
		bch_err(c, "error allocating c->usage_gc");
		return -ENOMEM;
	}

	for_each_member_device(ca, c, i) {
		BUG_ON(ca->buckets[1]);
		BUG_ON(ca->usage[1]);

		ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
				ca->mi.nbuckets * sizeof(struct bucket),
				GFP_KERNEL|__GFP_ZERO);
		if (!ca->buckets[1]) {
			percpu_ref_put(&ca->ref);
			bch_err(c, "error allocating ca->buckets[gc]");
			return -ENOMEM;
		}

		ca->usage[1] = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage[1]) {
			bch_err(c, "error allocating ca->usage[gc]");
			percpu_ref_put(&ca->ref);
			return -ENOMEM;
		}
	}

	ret = bch2_ec_mem_alloc(c, true);
	if (ret) {
		bch_err(c, "error allocating ec gc mem");
		return ret;
	}

	percpu_down_write(&c->mark_lock);

	/*
	 * indicate to stripe code that we need to allocate for the gc stripes
	 * radix tree, too
	 */
	gc_pos_set(c, gc_phase(GC_PHASE_START));

	for_each_member_device(ca, c, i) {
		struct bucket_array *dst = __bucket_array(ca, 1);
		struct bucket_array *src = __bucket_array(ca, 0);
		size_t b;

		dst->first_bucket	= src->first_bucket;
		dst->nbuckets		= src->nbuckets;

		for (b = 0; b < src->nbuckets; b++) {
			struct bucket *d = &dst->b[b];
			struct bucket *s = &src->b[b];

			d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
			d->gen_valid = s->gen_valid;

			if (metadata_only &&
			    (s->mark.data_type == BCH_DATA_USER ||
			     s->mark.data_type == BCH_DATA_CACHED)) {
				d->_mark = s->mark;
				d->_mark.owned_by_allocator = 0;
			}
		}
	}

	percpu_up_write(&c->mark_lock);

	return 0;
}

/**
 * bch2_gc - walk _all_ references to buckets, and recompute them:
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
	    bool initial, bool metadata_only)
{
	struct bch_dev *ca;
	u64 start_time = local_clock();
	unsigned i, iter = 0;
	int ret;

	lockdep_assert_held(&c->state_lock);
	trace_gc_start(c);

	down_write(&c->gc_lock);

	/* flush interior btree updates: */
	closure_wait_event(&c->btree_interior_update_wait,
			   !bch2_btree_interior_updates_nr_pending(c));
again:
	ret = bch2_gc_start(c, metadata_only);
	if (ret)
		goto out;

	bch2_mark_superblocks(c);

	ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
	if (ret)
		goto out;

#if 0
	bch2_mark_pending_btree_node_frees(c);
#endif
	bch2_mark_allocator_buckets(c);

	c->gc_count++;
out:
	if (!ret &&
	    (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
	     (!iter && test_restart_gc(c)))) {
		/*
		 * XXX: make sure gens we fixed got saved
		 */
		if (iter++ <= 2) {
			bch_info(c, "Fixed gens, restarting mark and sweep:");
			clear_bit(BCH_FS_FIXED_GENS, &c->flags);
			__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

			percpu_down_write(&c->mark_lock);
			bch2_gc_free(c);
			percpu_up_write(&c->mark_lock);
			/* flush fsck errors, reset counters */
			bch2_flush_fsck_errs(c);

			goto again;
		}

		bch_info(c, "Unable to fix bucket gens, looping");
		ret = -EINVAL;
	}

	if (!ret) {
		bch2_journal_block(&c->journal);

		percpu_down_write(&c->mark_lock);
		ret = bch2_gc_done(c, initial, metadata_only);

		bch2_journal_unblock(&c->journal);
	} else {
		percpu_down_write(&c->mark_lock);
	}

	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	trace_gc_end(c);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);

	/*
	 * Wake up allocator in case it was waiting for buckets
	 * because of not being able to inc gens
	 */
	for_each_member_device(ca, c, i)
		bch2_wake_allocator(ca);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);
	return ret;
}

/*
 * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
 * node pointers currently never have cached pointers that can become stale:
 */
static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const struct bch_extent_ptr *ptr;

		percpu_down_read(&c->mark_lock);
		bkey_for_each_ptr(ptrs, ptr) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
			struct bucket *g = PTR_BUCKET(ca, ptr, false);

			if (gen_after(g->gc_gen, ptr->gen))
				g->gc_gen = ptr->gen;

			if (gen_after(g->mark.gen, ptr->gen) > 32) {
				/* rewrite btree node */

			}
		}
		percpu_up_read(&c->mark_lock);
	}

	bch2_trans_exit(&trans);
	return ret;
}

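/*
 * Recompute each bucket's oldest_gen in the gc_gen scratch field, and only
 * publish the result once the walk is done - a cheaper alternative to full
 * mark-and-sweep for letting bucket gens advance:
 */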
int bch2_gc_gens(struct bch_fs *c)
{
	struct bch_dev *ca;
	struct bucket_array *buckets;
	struct bucket *g;
	unsigned i;
	int ret = 0;

	/*
	 * Ideally we would be using state_lock and not gc_lock here, but that
	 * introduces a deadlock in the RO path - we currently take the state
	 * lock at the start of going RO, thus the gc thread may get stuck:
	 */
	down_read(&c->gc_lock);

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets)
			g->gc_gen = g->mark.gen;
		up_read(&ca->bucket_lock);
	}

	for (i = 0; i < BTREE_ID_NR; i++)
		if (btree_node_type_needs_gc(i)) {
			ret = bch2_gc_btree_gens(c, i);
			if (ret)
				goto err;
		}

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets)
			g->oldest_gen = g->gc_gen;
		up_read(&ca->bucket_lock);
	}
err:
	up_read(&c->gc_lock);
	return ret;
}

/* Btree coalescing */

static void recalc_packed_keys(struct btree *b)
{
	struct bset *i = btree_bset_first(b);
	struct bkey_packed *k;

	memset(&b->nr, 0, sizeof(b->nr));

	BUG_ON(b->nsets != 1);

	vstruct_for_each(i, k)
		btree_keys_account_key_add(&b->nr, 0, k);
}

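/*
 * Try to merge a window of up to GC_MERGE_NODES adjacent sibling nodes:
 * only worth doing if all their live keys would fit in one fewer node,
 * at roughly 2/3 occupancy (see @blocks below):
 */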
static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
				struct btree *old_nodes[GC_MERGE_NODES])
{
	struct btree *parent = btree_node_parent(iter, old_nodes[0]);
	unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
	unsigned blocks = btree_blocks(c) * 2 / 3;
	struct btree *new_nodes[GC_MERGE_NODES];
	struct btree_update *as;
	struct keylist keylist;
	struct bkey_format_state format_state;
	struct bkey_format new_format;

	memset(new_nodes, 0, sizeof(new_nodes));
	bch2_keylist_init(&keylist, NULL);

	/* Count keys that are not deleted */
	for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
		u64s += old_nodes[i]->nr.live_u64s;

	nr_old_nodes = nr_new_nodes = i;

	/* Check if all keys in @old_nodes could fit in one fewer node */
	if (nr_old_nodes <= 1 ||
	    __vstruct_blocks(struct btree_node, c->block_bits,
			     DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
		return;

	/* Find a format that all keys in @old_nodes can pack into */
	bch2_bkey_format_init(&format_state);

	for (i = 0; i < nr_old_nodes; i++)
		__bch2_btree_calc_format(&format_state, old_nodes[i]);

	new_format = bch2_bkey_format_done(&format_state);

	/* Check if repacking would make any nodes too big to fit */
	for (i = 0; i < nr_old_nodes; i++)
		if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
			trace_btree_gc_coalesce_fail(c,
					BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
			return;
		}

	if (bch2_keylist_realloc(&keylist, NULL, 0,
			(BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
		trace_btree_gc_coalesce_fail(c,
				BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
		return;
	}

	as = bch2_btree_update_start(iter->trans, iter->btree_id,
			btree_update_reserve_required(c, parent) + nr_old_nodes,
			BTREE_INSERT_NOFAIL|
			BTREE_INSERT_USE_RESERVE,
			NULL);
	if (IS_ERR(as)) {
		trace_btree_gc_coalesce_fail(c,
				BTREE_GC_COALESCE_FAIL_RESERVE_GET);
		bch2_keylist_free(&keylist, NULL);
		return;
	}

	trace_btree_gc_coalesce(c, old_nodes[0]);

	for (i = 0; i < nr_old_nodes; i++)
		bch2_btree_interior_update_will_free_node(as, old_nodes[i]);

	/* Repack everything with @new_format and sort down to one bset */
	for (i = 0; i < nr_old_nodes; i++)
		new_nodes[i] =
			__bch2_btree_node_alloc_replacement(as, old_nodes[i],
							    new_format);

	/*
	 * Conceptually we concatenate the nodes together and slice them
	 * up at different boundaries.
	 */
	for (i = nr_new_nodes - 1; i > 0; --i) {
		struct btree *n1 = new_nodes[i];
		struct btree *n2 = new_nodes[i - 1];

		struct bset *s1 = btree_bset_first(n1);
		struct bset *s2 = btree_bset_first(n2);
		struct bkey_packed *k, *last = NULL;

		/* Calculate how many keys from @n2 we could fit inside @n1 */
		u64s = 0;

		for (k = s2->start;
		     k < vstruct_last(s2) &&
		     vstruct_blocks_plus(n1->data, c->block_bits,
					 u64s + k->u64s) <= blocks;
		     k = bkey_next_skip_noops(k, vstruct_last(s2))) {
			last = k;
			u64s += k->u64s;
		}

		if (u64s == le16_to_cpu(s2->u64s)) {
			/* n2 fits entirely in n1 */
			n1->key.k.p = n1->data->max_key = n2->data->max_key;

			memcpy_u64s(vstruct_last(s1),
				    s2->start,
				    le16_to_cpu(s2->u64s));
			le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));

			set_btree_bset_end(n1, n1->set);

			six_unlock_write(&n2->c.lock);
			bch2_btree_node_free_never_inserted(c, n2);
			six_unlock_intent(&n2->c.lock);

			memmove(new_nodes + i - 1,
				new_nodes + i,
				sizeof(new_nodes[0]) * (nr_new_nodes - i));
			new_nodes[--nr_new_nodes] = NULL;
		} else if (u64s) {
			/* move part of n2 into n1 */
			n1->key.k.p = n1->data->max_key =
				bkey_unpack_pos(n1, last);

			n2->data->min_key = bkey_successor(n1->data->max_key);

			memcpy_u64s(vstruct_last(s1),
				    s2->start, u64s);
			le16_add_cpu(&s1->u64s, u64s);

			memmove(s2->start,
				vstruct_idx(s2, u64s),
				(le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
			s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);

			set_btree_bset_end(n1, n1->set);
			set_btree_bset_end(n2, n2->set);
		}
	}

	for (i = 0; i < nr_new_nodes; i++) {
		struct btree *n = new_nodes[i];

		recalc_packed_keys(n);
		btree_node_reset_sib_u64s(n);

		bch2_btree_build_aux_trees(n);

		bch2_btree_update_add_new_node(as, n);
		six_unlock_write(&n->c.lock);

		bch2_btree_node_write(c, n, SIX_LOCK_intent);
	}

	/*
	 * The keys for the old nodes get deleted. We don't want to insert keys
	 * that compare equal to the keys for the new nodes we'll also be
	 * inserting - we can't because keys on a keylist must be strictly
	 * greater than the previous keys, and we also don't need to since the
	 * key for the new node will serve the same purpose (overwriting the
	 * key for the old node).
	 */
	for (i = 0; i < nr_old_nodes; i++) {
		struct bkey_i delete;
		unsigned j;

		for (j = 0; j < nr_new_nodes; j++)
			if (!bkey_cmp(old_nodes[i]->key.k.p,
				      new_nodes[j]->key.k.p))
				goto next;

		bkey_init(&delete.k);
		delete.k.p = old_nodes[i]->key.k.p;
		bch2_keylist_add_in_order(&keylist, &delete);
next:
		i = i;
	}

	/*
	 * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
	 * does the lookup once and thus expects the keys to be in sorted order
	 * so we have to make sure the new keys are correctly ordered with
	 * respect to the deleted keys added in the previous loop
	 */
	for (i = 0; i < nr_new_nodes; i++)
		bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);

	/* Insert the newly coalesced nodes */
	bch2_btree_insert_node(as, parent, iter, &keylist, 0);

	BUG_ON(!bch2_keylist_empty(&keylist));

	BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]);

	bch2_btree_iter_node_replace(iter, new_nodes[0]);

	for (i = 0; i < nr_new_nodes; i++)
		bch2_btree_update_get_open_buckets(as, new_nodes[i]);

	/* Free the old nodes and update our sliding window */
	for (i = 0; i < nr_old_nodes; i++) {
		bch2_btree_node_free_inmem(c, old_nodes[i], iter);

		/*
		 * the index update might have triggered a split, in which case
		 * the nodes we coalesced - the new nodes we just created -
		 * might not be sibling nodes anymore - don't add them to the
		 * sliding window (except the first):
		 */
		if (!i) {
			old_nodes[i] = new_nodes[i];
		} else {
			old_nodes[i] = NULL;
		}
	}

	for (i = 0; i < nr_new_nodes; i++)
		six_unlock_intent(&new_nodes[i]->c.lock);

	bch2_btree_update_done(as);
	bch2_keylist_free(&keylist, NULL);
}

static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct btree *b;
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	unsigned i;

	/* Sliding window of adjacent btree nodes */
	struct btree *merge[GC_MERGE_NODES];
	u32 lock_seq[GC_MERGE_NODES];

	bch2_trans_init(&trans, c, 0, 0);

	/*
	 * XXX: We don't have a good way of positively matching on sibling
	 * nodes that have the same parent - this code works by handling the
	 * cases where they might not have the same parent, and is thus
	 * fragile. Ugh.
	 *
	 * Perhaps redo this to use multiple linked iterators?
	 */
	memset(merge, 0, sizeof(merge));

	__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
			      BTREE_MAX_DEPTH, 0,
			      BTREE_ITER_PREFETCH, b) {
		memmove(merge + 1, merge,
			sizeof(merge) - sizeof(merge[0]));
		memmove(lock_seq + 1, lock_seq,
			sizeof(lock_seq) - sizeof(lock_seq[0]));

		merge[0] = b;

		for (i = 1; i < GC_MERGE_NODES; i++) {
			if (!merge[i] ||
			    !six_relock_intent(&merge[i]->c.lock, lock_seq[i]))
				break;

			if (merge[i]->c.level != merge[0]->c.level) {
				six_unlock_intent(&merge[i]->c.lock);
				break;
			}
		}
		memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));

		bch2_coalesce_nodes(c, iter, merge);

		for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
			lock_seq[i] = merge[i]->c.lock.state.seq;
			six_unlock_intent(&merge[i]->c.lock);
		}

		lock_seq[0] = merge[0]->c.lock.state.seq;

		if (kthread && kthread_should_stop()) {
			bch2_trans_exit(&trans);
			return -ESHUTDOWN;
		}

		bch2_trans_cond_resched(&trans);

		/*
		 * If the parent node wasn't relocked, it might have been split
		 * and the nodes in our sliding window might not have the same
		 * parent anymore - blow away the sliding window:
		 */
		if (btree_iter_node(iter, iter->level + 1) &&
		    !btree_node_intent_locked(iter, iter->level + 1))
			memset(merge + 1, 0,
			       (GC_MERGE_NODES - 1) * sizeof(merge[0]));
	}
	return bch2_trans_exit(&trans);
}

/**
 * bch2_coalesce - coalesce adjacent nodes with low occupancy
 */
void bch2_coalesce(struct bch_fs *c)
{
	enum btree_id id;

	down_read(&c->gc_lock);
	trace_gc_coalesce_start(c);

	for (id = 0; id < BTREE_ID_NR; id++) {
		int ret = c->btree_roots[id].b
			? bch2_coalesce_btree(c, id)
			: 0;

		if (ret) {
			if (ret != -ESHUTDOWN)
				bch_err(c, "btree coalescing failed: %d", ret);
			up_read(&c->gc_lock);
			return;
		}
	}

	trace_gc_coalesce_end(c);
	up_read(&c->gc_lock);
}

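/*
 * GC thread: when periodic gc is enabled it runs based on the amount of IO
 * done (every 1/16th of device capacity written, per the write io clock),
 * and it can also be kicked explicitly via c->kick_gc:
 */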
static int bch2_gc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last = atomic_long_read(&clock->now);
	unsigned last_kick = atomic_read(&c->kick_gc);
	int ret;

	set_freezable();

	while (1) {
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				return 0;
			}

			if (atomic_read(&c->kick_gc) != last_kick)
				break;

			if (c->btree_gc_periodic) {
				unsigned long next = last + c->capacity / 16;

				if (atomic_long_read(&clock->now) >= next)
					break;

				bch2_io_clock_schedule_timeout(clock, next);
			} else {
				schedule();
			}

			try_to_freeze();
		}
		__set_current_state(TASK_RUNNING);

		last = atomic_long_read(&clock->now);
		last_kick = atomic_read(&c->kick_gc);

		/*
		 * Full gc is currently incompatible with btree key cache:
		 */
#if 0
		ret = bch2_gc(c, NULL, false, false);
#else
		ret = bch2_gc_gens(c);
#endif
		if (ret)
			bch_err(c, "btree gc failed: %i", ret);

		debug_check_no_locks_held();
	}

	return 0;
}

void bch2_gc_thread_stop(struct bch_fs *c)
{
	struct task_struct *p;

	p = c->gc_thread;
	c->gc_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_gc_thread_start(struct bch_fs *c)
{
	struct task_struct *p;

	BUG_ON(c->gc_thread);

	p = kthread_create(bch2_gc_thread, c, "bch_gc");
	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);
	c->gc_thread = p;
	wake_up_process(p);
	return 0;
}