// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_io.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
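
/*
 * c->gc_pos tracks how far mark and sweep has gotten; it's protected by a
 * seqcount so readers get a consistent snapshot without blocking the GC
 * thread. A minimal read-side sketch (illustrative only - the real read
 * helpers live elsewhere):
 *
 *	unsigned seq;
 *	struct gc_pos pos;
 *
 *	do {
 *		seq = read_seqcount_begin(&c->gc_pos_lock);
 *		pos = c->gc_pos;
 *	} while (read_seqcount_retry(&c->gc_pos_lock, seq));
 */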
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
	__gc_pos_set(c, new_pos);
}

/* range_checks - for validating min/max pos of each btree node: */

struct range_checks {
	struct range_level {
		struct bpos	min;
		struct bpos	max;
	}			l[BTREE_MAX_DEPTH];
	unsigned		depth;
};

static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
{
	unsigned i;

	for (i = 0; i < BTREE_MAX_DEPTH; i++)
		r->l[i].min = r->l[i].max = POS_MIN;
	r->depth = depth;
}

static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
				    struct range_checks *r)
{
	struct range_level *l = &r->l[b->level];

	struct bpos expected_min = bkey_cmp(l->min, l->max)
		? btree_type_successor(b->btree_id, l->max)
		: l->max;

	bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
		"btree node has incorrect min key: %llu:%llu != %llu:%llu",
		b->data->min_key.inode,
		b->data->min_key.offset,
		expected_min.inode,
		expected_min.offset);

	l->max = b->data->max_key;

	if (b->level > r->depth) {
		l = &r->l[b->level - 1];

		bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
			"btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
			b->data->min_key.inode,
			b->data->min_key.offset,
			l->min.inode,
			l->min.offset);

		bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
			"btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
			b->data->max_key.inode,
			b->data->max_key.offset,
			l->max.inode,
			l->max.offset);

		if (bkey_cmp(b->data->max_key, POS_MAX))
			l->min = l->max =
				btree_type_successor(b->btree_id,
						     b->data->max_key);
	}
}
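
/*
 * Example of the invariant enforced above: if a node at some level has
 * max_key 10:255, the next node at that level must have min_key equal to
 * the successor of 10:255 (e.g. 10:256 for extent btrees) - any gap or
 * overlap is reported as a filesystem inconsistency. (Illustrative key
 * values.)
 */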

/* marking of btree keys/nodes: */

static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
			    u8 *max_stale, bool initial)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	struct gc_pos pos = { 0 };
	unsigned flags =
		BCH_BUCKET_MARK_GC|
		(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
	int ret = 0;

	if (initial) {
		BUG_ON(journal_seq_verify(c) &&
		       k.k->version.lo > journal_cur_seq(&c->journal));

		if (k.k->version.lo > atomic64_read(&c->key_version))
			atomic64_set(&c->key_version, k.k->version.lo);

		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
		    fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
				"superblock not marked as containing replicas (type %u)",
				k.k->type)) {
			ret = bch2_mark_bkey_replicas(c, k);
			if (ret)
				return ret;
		}

		bkey_for_each_ptr(ptrs, ptr) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
			struct bucket *g = PTR_BUCKET(ca, ptr, true);
			struct bucket *g2 = PTR_BUCKET(ca, ptr, false);

			if (mustfix_fsck_err_on(!g->gen_valid, c,
					"found ptr with missing gen in alloc btree,\n"
					"type %u gen %u",
					k.k->type, ptr->gen)) {
				g2->_mark.gen	= g->_mark.gen		= ptr->gen;
				g2->_mark.dirty	= g->_mark.dirty	= true;
				g2->gen_valid	= g->gen_valid		= true;
			}

			if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
					"%u ptr gen in the future: %u > %u",
					k.k->type, ptr->gen, g->mark.gen)) {
				g2->_mark.gen	= g->_mark.gen		= ptr->gen;
				g2->_mark.dirty	= g->_mark.dirty	= true;
				g2->gen_valid	= g->gen_valid		= true;
				set_bit(BCH_FS_FIXED_GENS, &c->flags);
			}
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		struct bucket *g = PTR_BUCKET(ca, ptr, true);

		if (gen_after(g->oldest_gen, ptr->gen))
			g->oldest_gen = ptr->gen;

		*max_stale = max(*max_stale, ptr_stale(ca, ptr));
	}

	bch2_mark_key(c, k, true, k.k->size, pos, NULL, 0, flags);
fsck_err:
	return ret;
}
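
/*
 * Note: *max_stale, filled in above, is the largest difference seen between
 * a pointer's gen and its bucket's current gen; bch2_gc_btree() uses it to
 * decide when a node has accumulated enough stale pointers to be worth
 * rewriting.
 */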

static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
			      u8 *max_stale, bool initial)
{
	struct btree_node_iter iter;
	struct bkey unpacked;
	struct bkey_s_c k;
	int ret = 0;

	*max_stale = 0;

	if (!btree_node_type_needs_gc(btree_node_type(b)))
		return 0;

	for_each_btree_node_key_unpack(b, k, &iter,
				       &unpacked) {
		bch2_bkey_debugcheck(c, b, k);

		ret = bch2_gc_mark_key(c, k, max_stale, initial);
		if (ret)
			break;
	}

	return ret;
}

static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
			 bool initial)
{
	struct btree_iter iter;
	struct btree *b;
	struct range_checks r;
	unsigned depth = btree_node_type_needs_gc(btree_id) ? 0 : 1;
	u8 max_stale;
	int ret = 0;

	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));

	/*
	 * if expensive_debug_checks is on, run range_checks on all leaf nodes:
	 *
	 * and on startup, we have to read every btree node (XXX: only if it was
	 * an unclean shutdown)
	 */
	if (initial || expensive_debug_checks(c))
		depth = 0;

	btree_node_range_checks_init(&r, depth);

	__for_each_btree_node(&iter, c, btree_id, POS_MIN,
			      0, depth, BTREE_ITER_PREFETCH, b) {
		btree_node_range_checks(c, b, &r);

		bch2_verify_btree_nr_keys(b);

		gc_pos_set(c, gc_pos_btree_node(b));

		ret = btree_gc_mark_node(c, b, &max_stale, initial);
		if (ret)
			break;

		if (!initial) {
			if (max_stale > 64)
				bch2_btree_node_rewrite(c, &iter,
						b->data->keys.seq,
						BTREE_INSERT_USE_RESERVE|
						BTREE_INSERT_NOWAIT|
						BTREE_INSERT_GC_LOCK_HELD);
			else if (!btree_gc_rewrite_disabled(c) &&
				 (btree_gc_always_rewrite(c) || max_stale > 16))
				bch2_btree_node_rewrite(c, &iter,
						b->data->keys.seq,
						BTREE_INSERT_NOWAIT|
						BTREE_INSERT_GC_LOCK_HELD);
		}

		bch2_btree_iter_cond_resched(&iter);
	}
	ret = bch2_btree_iter_unlock(&iter) ?: ret;
	if (ret)
		return ret;

	mutex_lock(&c->btree_root_lock);

	b = c->btree_roots[btree_id].b;
	if (!btree_node_fake(b))
		bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
				 &max_stale, initial);
	gc_pos_set(c, gc_pos_btree_root(b->btree_id));

	mutex_unlock(&c->btree_root_lock);
	return 0;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return  (int) btree_id_to_gc_phase(l) -
		(int) btree_id_to_gc_phase(r);
}
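
/*
 * Btrees are walked in gc phase order rather than btree_id order, so the
 * positions passed to gc_pos_set() stay monotonically increasing - the
 * total ordering the comment above bch2_gc() relies on.
 */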

static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
			  bool initial)
{
	enum btree_id ids[BTREE_ID_NR];
	u8 max_stale;
	unsigned i;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < BTREE_ID_NR; i++) {
		enum btree_id id = ids[i];
		enum btree_node_type type = __btree_node_type(0, id);

		int ret = bch2_gc_btree(c, id, initial);
		if (ret)
			return ret;

		if (journal && btree_node_type_needs_gc(type)) {
			struct bkey_i *k, *n;
			struct jset_entry *j;
			struct journal_replay *r;
			int ret;

			list_for_each_entry(r, journal, list)
				for_each_jset_key(k, n, j, &r->j) {
					if (type == __btree_node_type(j->level, j->btree_id)) {
						ret = bch2_gc_mark_key(c,
							bkey_i_to_s_c(k),
							&max_stale, initial);
						if (ret)
							return ret;
					}
				}
		}
	}

	return 0;
}

static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
				  u64 start, u64 end,
				  enum bch_data_type type,
				  unsigned flags)
{
	u64 b = sector_to_bucket(ca, start);

	do {
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		bch2_mark_metadata_bucket(c, ca, b, type, sectors,
					  gc_phase(GC_PHASE_SB), flags);
		b++;
		start += sectors;
	} while (start < end);
}
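
/*
 * Worked example (illustrative numbers): with 256-sector buckets, marking
 * sectors [8, 520) touches buckets 0, 1 and 2, crediting them with 248, 256
 * and 8 sectors respectively - a bucket straddling a range boundary is only
 * credited for the sectors actually inside [start, end).
 */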

void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
			      unsigned flags)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	unsigned i;
	u64 b;

	/*
	 * This conditional is kind of gross, but we may be called from the
	 * device add path, before the new device has actually been added to the
	 * running filesystem:
	 */
	if (c) {
		lockdep_assert_held(&c->sb_lock);
		percpu_down_read(&c->mark_lock);
	} else {
		preempt_disable();
	}

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR)
			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
					      BCH_DATA_SB, flags);

		mark_metadata_sectors(c, ca, offset,
				      offset + (1 << layout->sb_max_size_bits),
				      BCH_DATA_SB, flags);
	}

	for (i = 0; i < ca->journal.nr; i++) {
		b = ca->journal.buckets[i];
		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
					  ca->mi.bucket_size,
					  gc_phase(GC_PHASE_SB), flags);
	}

	if (c) {
		percpu_up_read(&c->mark_lock);
	} else {
		preempt_enable();
	}
}

static void bch2_mark_superblocks(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	mutex_lock(&c->sb_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_SB));

	for_each_online_member(ca, c, i)
		bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
	mutex_unlock(&c->sb_lock);
}

/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
	struct gc_pos pos = { 0 };
	struct btree_update *as;
	struct pending_btree_node_free *d;

	mutex_lock(&c->btree_interior_update_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));

	for_each_pending_btree_node_free(c, as, d)
		if (d->index_update_done)
			bch2_mark_key(c, bkey_i_to_s_c(&d->key),
				      true, 0,
				      pos, NULL, 0,
				      BCH_BUCKET_MARK_GC);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
	struct bch_dev *ca;
	struct open_bucket *ob;
	size_t i, j, iter;
	unsigned ci;

	percpu_down_read(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	gc_pos_set(c, gc_pos_alloc(c, NULL));

	for_each_member_device(ca, c, ci) {
		fifo_for_each_entry(i, &ca->free_inc, iter)
			bch2_mark_alloc_bucket(c, ca, i, true,
					       gc_pos_alloc(c, NULL),
					       BCH_BUCKET_MARK_GC);

		for (j = 0; j < RESERVE_NR; j++)
			fifo_for_each_entry(i, &ca->free[j], iter)
				bch2_mark_alloc_bucket(c, ca, i, true,
						       gc_pos_alloc(c, NULL),
						       BCH_BUCKET_MARK_GC);
	}

	spin_unlock(&c->freelist_lock);

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid) {
			gc_pos_set(c, gc_pos_alloc(c, ob));
			ca = bch_dev_bkey_exists(c, ob->ptr.dev);
			bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
					       gc_pos_alloc(c, ob),
					       BCH_BUCKET_MARK_GC);
		}
		spin_unlock(&ob->lock);
	}

	percpu_up_read(&c->mark_lock);
}
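
/*
 * Buckets owned by the allocator - on the freelists, in free_inc, or
 * referenced by open_buckets - have no btree pointers into them, so GC must
 * walk the allocator's own structures above or it would see those buckets
 * as unused.
 */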

static void bch2_gc_free(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	genradix_free(&c->stripes[1]);

	for_each_member_device(ca, c, i) {
		kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
			sizeof(struct bucket_array) +
			ca->mi.nbuckets * sizeof(struct bucket));
		ca->buckets[1] = NULL;

		free_percpu(ca->usage[1]);
		ca->usage[1] = NULL;
	}

	percpu_down_write(&c->mark_lock);

	free_percpu(c->usage[1]);
	c->usage[1] = NULL;

	percpu_up_write(&c->mark_lock);
}
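
/*
 * Convention used below and in bch2_gc_start(): index [0] of c->usage,
 * ca->usage, ca->buckets and c->stripes is the live copy, index [1] is the
 * shadow copy GC accumulates into; bch2_gc_done() compares the two and
 * copies the GC results over the live set.
 */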

static void bch2_gc_done(struct bch_fs *c, bool initial)
{
	struct bch_dev *ca;
	bool verify = !initial ||
		(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
	unsigned i;

#define copy_field(_f, _msg, ...)					\
	if (dst->_f != src->_f) {					\
		if (verify)						\
			bch_err(c, _msg ": got %llu, should be %llu, fixing"\
				, ##__VA_ARGS__, dst->_f, src->_f);	\
		dst->_f = src->_f;					\
	}
#define copy_stripe_field(_f, _msg, ...)				\
	if (dst->_f != src->_f) {					\
		if (verify)						\
			bch_err_ratelimited(c, "stripe %zu has wrong "_msg\
				": got %u, should be %u, fixing",	\
				dst_iter.pos, ##__VA_ARGS__,		\
				dst->_f, src->_f);			\
		dst->_f = src->_f;					\
		dst->dirty = true;					\
	}
#define copy_bucket_field(_f)						\
	if (dst->b[b].mark._f != src->b[b].mark._f) {			\
		if (verify)						\
			bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
				": got %u, should be %u, fixing", i, b,	\
				dst->b[b].mark._f, src->b[b].mark._f);	\
		dst->b[b]._mark._f = src->b[b].mark._f;			\
		dst->b[b]._mark.dirty = true;				\
	}
#define copy_dev_field(_f, _msg, ...)					\
	copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...)					\
	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)

	percpu_down_write(&c->mark_lock);

	{
		struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
		struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
		struct stripe *dst, *src;
		unsigned i;

		c->ec_stripes_heap.used = 0;

		while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
		       (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
			BUG_ON(src_iter.pos != dst_iter.pos);

			copy_stripe_field(alive,	"alive");
			copy_stripe_field(sectors,	"sectors");
			copy_stripe_field(algorithm,	"algorithm");
			copy_stripe_field(nr_blocks,	"nr_blocks");
			copy_stripe_field(nr_redundant,	"nr_redundant");
			copy_stripe_field(blocks_nonempty,
					  "blocks_nonempty");

			for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
				copy_stripe_field(block_sectors[i],
						  "block_sectors[%u]", i);

			if (dst->alive)
				bch2_stripes_heap_insert(c, dst, dst_iter.pos);

			genradix_iter_advance(&dst_iter, &c->stripes[0]);
			genradix_iter_advance(&src_iter, &c->stripes[1]);
		}
	}

	for_each_member_device(ca, c, i) {
		struct bucket_array *dst = __bucket_array(ca, 0);
		struct bucket_array *src = __bucket_array(ca, 1);
		size_t b;

		if (initial) {
			memcpy(dst, src,
			       sizeof(struct bucket_array) +
			       sizeof(struct bucket) * dst->nbuckets);
		}

		for (b = 0; b < src->nbuckets; b++) {
			copy_bucket_field(gen);
			copy_bucket_field(data_type);
			copy_bucket_field(owned_by_allocator);
			copy_bucket_field(stripe);
			copy_bucket_field(dirty_sectors);
			copy_bucket_field(cached_sectors);

			if (dst->b[b].oldest_gen != src->b[b].oldest_gen) {
				dst->b[b].oldest_gen = src->b[b].oldest_gen;
				dst->b[b]._mark.dirty = true;
			}
		}
	}

	for_each_member_device(ca, c, i) {
		unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
		struct bch_dev_usage *dst = (void *)
			bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
		struct bch_dev_usage *src = (void *)
			bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
		unsigned b;

		for (b = 0; b < BCH_DATA_NR; b++)
			copy_dev_field(buckets[b], "buckets[%s]",
				       bch2_data_types[b]);
		copy_dev_field(buckets_alloc,		"buckets_alloc");
		copy_dev_field(buckets_ec,		"buckets_ec");
		copy_dev_field(buckets_unavailable,	"buckets_unavailable");

		for (b = 0; b < BCH_DATA_NR; b++)
			copy_dev_field(sectors[b], "sectors[%s]",
				       bch2_data_types[b]);
		copy_dev_field(sectors_fragmented,	"sectors_fragmented");
	}

	{
		unsigned nr = fs_usage_u64s(c);
		struct bch_fs_usage *dst = (void *)
			bch2_acc_percpu_u64s((void *) c->usage[0], nr);
		struct bch_fs_usage *src = (void *)
			bch2_acc_percpu_u64s((void *) c->usage[1], nr);

		copy_fs_field(hidden,		"hidden");
		copy_fs_field(data,		"data");
		copy_fs_field(cached,		"cached");
		copy_fs_field(reserved,		"reserved");
		copy_fs_field(nr_inodes,	"nr_inodes");

		for (i = 0; i < BCH_REPLICAS_MAX; i++)
			copy_fs_field(persistent_reserved[i],
				      "persistent_reserved[%i]", i);

		for (i = 0; i < c->replicas.nr; i++) {
			struct bch_replicas_entry *e =
				cpu_replicas_entry(&c->replicas, i);
			char buf[80];

			bch2_replicas_entry_to_text(&PBUF(buf), e);

			copy_fs_field(replicas[i], "%s", buf);
		}
	}

	percpu_up_write(&c->mark_lock);

#undef copy_fs_field
#undef copy_dev_field
#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
}
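
/*
 * For reference, copy_bucket_field(gen) in bch2_gc_done() expands to
 * (roughly):
 *
 *	if (dst->b[b].mark.gen != src->b[b].mark.gen) {
 *		if (verify)
 *			bch_err_ratelimited(c, "dev %u bucket %zu has wrong gen"
 *				": got %u, should be %u, fixing", i, b,
 *				dst->b[b].mark.gen, src->b[b].mark.gen);
 *		dst->b[b]._mark.gen = src->b[b].mark.gen;
 *		dst->b[b]._mark.dirty = true;
 *	}
 */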

static int bch2_gc_start(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	percpu_down_write(&c->mark_lock);

	/*
	 * indicate to stripe code that we need to allocate for the gc stripes
	 * radix tree, too
	 */
	gc_pos_set(c, gc_phase(GC_PHASE_START));

	BUG_ON(c->usage[1]);

	c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
					 sizeof(u64), GFP_KERNEL);
	percpu_up_write(&c->mark_lock);

	if (!c->usage[1])
		return -ENOMEM;

	for_each_member_device(ca, c, i) {
		BUG_ON(ca->buckets[1]);
		BUG_ON(ca->usage[1]);

		ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
				ca->mi.nbuckets * sizeof(struct bucket),
				GFP_KERNEL|__GFP_ZERO);
		if (!ca->buckets[1]) {
			percpu_ref_put(&ca->ref);
			return -ENOMEM;
		}

		ca->usage[1] = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage[1]) {
			percpu_ref_put(&ca->ref);
			return -ENOMEM;
		}
	}

	percpu_down_write(&c->mark_lock);

	for_each_member_device(ca, c, i) {
		struct bucket_array *dst = __bucket_array(ca, 1);
		struct bucket_array *src = __bucket_array(ca, 0);
		size_t b;

		dst->first_bucket	= src->first_bucket;
		dst->nbuckets		= src->nbuckets;

		for (b = 0; b < src->nbuckets; b++) {
			dst->b[b]._mark.gen =
				dst->b[b].oldest_gen =
				src->b[b].mark.gen;
			dst->b[b].gen_valid = src->b[b].gen_valid;
		}
	}

	percpu_up_write(&c->mark_lock);

	return bch2_ec_mem_alloc(c, true);
}

/**
 * bch2_gc - walk _all_ references to buckets, and recompute them:
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
{
	struct bch_dev *ca;
	u64 start_time = local_clock();
	unsigned i, iter = 0;
	int ret;

	trace_gc_start(c);

	down_write(&c->gc_lock);
again:
	ret = bch2_gc_start(c);
	if (ret)
		goto out;

	bch2_mark_superblocks(c);

	ret = bch2_gc_btrees(c, journal, initial);
	if (ret)
		goto out;

	bch2_mark_pending_btree_node_frees(c);
	bch2_mark_allocator_buckets(c);

	c->gc_count++;
out:
	if (!ret &&
	    (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
	     (!iter && test_restart_gc(c)))) {
		/*
		 * XXX: make sure gens we fixed got saved
		 */
		if (iter++ <= 2) {
			bch_info(c, "Fixed gens, restarting mark and sweep:");
			clear_bit(BCH_FS_FIXED_GENS, &c->flags);
			__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
			bch2_gc_free(c);
			goto again;
		}

		bch_info(c, "Unable to fix bucket gens, looping");
		ret = -EINVAL;
	}

	if (!ret)
		bch2_gc_done(c, initial);

	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

	bch2_gc_free(c);
	up_write(&c->gc_lock);

	trace_gc_end(c);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);

	/*
	 * Wake up allocator in case it was waiting for buckets
	 * because of not being able to inc gens
	 */
	for_each_member_device(ca, c, i)
		bch2_wake_allocator(ca);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);
	return ret;
}
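
/*
 * Note on the retry above: if GC had to fix bucket gens (BCH_FS_FIXED_GENS),
 * the mark and sweep restarts from scratch a bounded number of times; if
 * the gens still can't be made consistent, bch2_gc() gives up with -EINVAL
 * rather than looping forever.
 */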

/* Btree coalescing */

static void recalc_packed_keys(struct btree *b)
{
	struct bset *i = btree_bset_first(b);
	struct bkey_packed *k;

	memset(&b->nr, 0, sizeof(b->nr));

	BUG_ON(b->nsets != 1);

	vstruct_for_each(i, k)
		btree_keys_account_key_add(&b->nr, 0, k);
}

static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
				struct btree *old_nodes[GC_MERGE_NODES])
{
	struct btree *parent = btree_node_parent(iter, old_nodes[0]);
	unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
	unsigned blocks = btree_blocks(c) * 2 / 3;
	struct btree *new_nodes[GC_MERGE_NODES];
	struct btree_update *as;
	struct keylist keylist;
	struct bkey_format_state format_state;
	struct bkey_format new_format;

	memset(new_nodes, 0, sizeof(new_nodes));
	bch2_keylist_init(&keylist, NULL);

	/* Count keys that are not deleted */
	for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
		u64s += old_nodes[i]->nr.live_u64s;

	nr_old_nodes = nr_new_nodes = i;

	/* Check if all keys in @old_nodes could fit in one fewer node */
	if (nr_old_nodes <= 1 ||
	    __vstruct_blocks(struct btree_node, c->block_bits,
			     DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
		return;

	/* Find a format that all keys in @old_nodes can pack into */
	bch2_bkey_format_init(&format_state);

	for (i = 0; i < nr_old_nodes; i++)
		__bch2_btree_calc_format(&format_state, old_nodes[i]);

	new_format = bch2_bkey_format_done(&format_state);

	/* Check if repacking would make any nodes too big to fit */
	for (i = 0; i < nr_old_nodes; i++)
		if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
			trace_btree_gc_coalesce_fail(c,
					BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
			return;
		}

	if (bch2_keylist_realloc(&keylist, NULL, 0,
			(BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
		trace_btree_gc_coalesce_fail(c,
				BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
		return;
	}

	as = bch2_btree_update_start(c, iter->btree_id,
			btree_update_reserve_required(c, parent) + nr_old_nodes,
			BTREE_INSERT_NOFAIL|
			BTREE_INSERT_USE_RESERVE,
			NULL);
	if (IS_ERR(as)) {
		trace_btree_gc_coalesce_fail(c,
				BTREE_GC_COALESCE_FAIL_RESERVE_GET);
		bch2_keylist_free(&keylist, NULL);
		return;
	}

	trace_btree_gc_coalesce(c, old_nodes[0]);

	for (i = 0; i < nr_old_nodes; i++)
		bch2_btree_interior_update_will_free_node(as, old_nodes[i]);

	/* Repack everything with @new_format and sort down to one bset */
	for (i = 0; i < nr_old_nodes; i++)
		new_nodes[i] =
			__bch2_btree_node_alloc_replacement(as, old_nodes[i],
							    new_format);

	/*
	 * Conceptually we concatenate the nodes together and slice them
	 * up at different boundaries.
	 */
	for (i = nr_new_nodes - 1; i > 0; --i) {
		struct btree *n1 = new_nodes[i];
		struct btree *n2 = new_nodes[i - 1];

		struct bset *s1 = btree_bset_first(n1);
		struct bset *s2 = btree_bset_first(n2);
		struct bkey_packed *k, *last = NULL;

		/* Calculate how many keys from @n2 we could fit inside @n1 */
		u64s = 0;

		for (k = s2->start;
		     k < vstruct_last(s2) &&
		     vstruct_blocks_plus(n1->data, c->block_bits,
					 u64s + k->u64s) <= blocks;
		     k = bkey_next(k)) {
			last = k;
			u64s += k->u64s;
		}

		if (u64s == le16_to_cpu(s2->u64s)) {
			/* n2 fits entirely in n1 */
			n1->key.k.p = n1->data->max_key = n2->data->max_key;

			memcpy_u64s(vstruct_last(s1),
				    s2->start,
				    le16_to_cpu(s2->u64s));
			le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));

			set_btree_bset_end(n1, n1->set);

			six_unlock_write(&n2->lock);
			bch2_btree_node_free_never_inserted(c, n2);
			six_unlock_intent(&n2->lock);

			memmove(new_nodes + i - 1,
				new_nodes + i,
				sizeof(new_nodes[0]) * (nr_new_nodes - i));
			new_nodes[--nr_new_nodes] = NULL;
		} else if (u64s) {
			/* move part of n2 into n1 */
			n1->key.k.p = n1->data->max_key =
				bkey_unpack_pos(n1, last);

			n2->data->min_key =
				btree_type_successor(iter->btree_id,
						     n1->data->max_key);

			memcpy_u64s(vstruct_last(s1),
				    s2->start, u64s);
			le16_add_cpu(&s1->u64s, u64s);

			memmove(s2->start,
				vstruct_idx(s2, u64s),
				(le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
			s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);

			set_btree_bset_end(n1, n1->set);
			set_btree_bset_end(n2, n2->set);
		}
	}

	for (i = 0; i < nr_new_nodes; i++) {
		struct btree *n = new_nodes[i];

		recalc_packed_keys(n);
		btree_node_reset_sib_u64s(n);

		bch2_btree_build_aux_trees(n);
		six_unlock_write(&n->lock);

		bch2_btree_node_write(c, n, SIX_LOCK_intent);
	}

	/*
	 * The keys for the old nodes get deleted. We don't want to insert keys
	 * that compare equal to the keys for the new nodes we'll also be
	 * inserting - we can't because keys on a keylist must be strictly
	 * greater than the previous keys, and we also don't need to since the
	 * key for the new node will serve the same purpose (overwriting the key
	 * for the old node).
	 */
	for (i = 0; i < nr_old_nodes; i++) {
		struct bkey_i delete;
		unsigned j;

		for (j = 0; j < nr_new_nodes; j++)
			if (!bkey_cmp(old_nodes[i]->key.k.p,
				      new_nodes[j]->key.k.p))
				goto next;

		bkey_init(&delete.k);
		delete.k.p = old_nodes[i]->key.k.p;
		bch2_keylist_add_in_order(&keylist, &delete);
next:
		i = i;
	}

	/*
	 * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
	 * does the lookup once and thus expects the keys to be in sorted order
	 * so we have to make sure the new keys are correctly ordered with
	 * respect to the deleted keys added in the previous loop
	 */
	for (i = 0; i < nr_new_nodes; i++)
		bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);

	/* Insert the newly coalesced nodes */
	bch2_btree_insert_node(as, parent, iter, &keylist, 0);

	BUG_ON(!bch2_keylist_empty(&keylist));

	BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);

	bch2_btree_iter_node_replace(iter, new_nodes[0]);

	for (i = 0; i < nr_new_nodes; i++)
		bch2_open_buckets_put(c, &new_nodes[i]->ob);

	/* Free the old nodes and update our sliding window */
	for (i = 0; i < nr_old_nodes; i++) {
		bch2_btree_node_free_inmem(c, old_nodes[i], iter);

		/*
		 * the index update might have triggered a split, in which case
		 * the nodes we coalesced - the new nodes we just created -
		 * might not be sibling nodes anymore - don't add them to the
		 * sliding window (except the first):
		 */
		if (!i) {
			old_nodes[i] = new_nodes[i];
		} else {
			old_nodes[i] = NULL;
			if (new_nodes[i])
				six_unlock_intent(&new_nodes[i]->lock);
		}
	}

	bch2_btree_update_done(as);
	bch2_keylist_free(&keylist, NULL);
}

static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
	struct btree_iter iter;
	struct btree *b;
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	unsigned i;

	/* Sliding window of adjacent btree nodes */
	struct btree *merge[GC_MERGE_NODES];
	u32 lock_seq[GC_MERGE_NODES];

	/*
	 * XXX: We don't have a good way of positively matching on sibling nodes
	 * that have the same parent - this code works by handling the cases
	 * where they might not have the same parent, and is thus fragile. Ugh.
	 *
	 * Perhaps redo this to use multiple linked iterators?
	 */
	memset(merge, 0, sizeof(merge));

	__for_each_btree_node(&iter, c, btree_id, POS_MIN,
			      BTREE_MAX_DEPTH, 0,
			      BTREE_ITER_PREFETCH, b) {
		memmove(merge + 1, merge,
			sizeof(merge) - sizeof(merge[0]));
		memmove(lock_seq + 1, lock_seq,
			sizeof(lock_seq) - sizeof(lock_seq[0]));

		merge[0] = b;

		for (i = 1; i < GC_MERGE_NODES; i++) {
			if (!merge[i] ||
			    !six_relock_intent(&merge[i]->lock, lock_seq[i]))
				break;

			if (merge[i]->level != merge[0]->level) {
				six_unlock_intent(&merge[i]->lock);
				break;
			}
		}
		memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));

		bch2_coalesce_nodes(c, &iter, merge);

		for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
			lock_seq[i] = merge[i]->lock.state.seq;
			six_unlock_intent(&merge[i]->lock);
		}

		lock_seq[0] = merge[0]->lock.state.seq;

		if (kthread && kthread_should_stop()) {
			bch2_btree_iter_unlock(&iter);
			return -ESHUTDOWN;
		}

		bch2_btree_iter_cond_resched(&iter);

		/*
		 * If the parent node wasn't relocked, it might have been split
		 * and the nodes in our sliding window might not have the same
		 * parent anymore - blow away the sliding window:
		 */
		if (btree_iter_node(&iter, iter.level + 1) &&
		    !btree_node_intent_locked(&iter, iter.level + 1))
			memset(merge + 1, 0,
			       (GC_MERGE_NODES - 1) * sizeof(merge[0]));
	}
	return bch2_btree_iter_unlock(&iter);
}
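
/*
 * Example of the sliding window above, assuming GC_MERGE_NODES == 4: after
 * visiting sibling leaves A, B, C, D the window holds [D, C, B, A], and
 * bch2_coalesce_nodes() tries to repack those nodes' live keys into one
 * fewer node.
 */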

/**
 * bch_coalesce - coalesce adjacent nodes with low occupancy
 */
void bch2_coalesce(struct bch_fs *c)
{
	enum btree_id id;

	down_read(&c->gc_lock);
	trace_gc_coalesce_start(c);

	for (id = 0; id < BTREE_ID_NR; id++) {
		int ret = c->btree_roots[id].b
			? bch2_coalesce_btree(c, id)
			: 0;

		if (ret) {
			if (ret != -ESHUTDOWN)
				bch_err(c, "btree coalescing failed: %d", ret);
			up_read(&c->gc_lock);
			return;
		}
	}

	trace_gc_coalesce_end(c);
	up_read(&c->gc_lock);
}

static int bch2_gc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last = atomic_long_read(&clock->now);
	unsigned last_kick = atomic_read(&c->kick_gc);
	int ret;

	set_freezable();

	while (1) {
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				return 0;
			}

			if (atomic_read(&c->kick_gc) != last_kick)
				break;

			if (c->btree_gc_periodic) {
				unsigned long next = last + c->capacity / 16;

				if (atomic_long_read(&clock->now) >= next)
					break;

				bch2_io_clock_schedule_timeout(clock, next);
			} else {
				schedule();
			}

			try_to_freeze();
		}
		__set_current_state(TASK_RUNNING);

		last = atomic_long_read(&clock->now);
		last_kick = atomic_read(&c->kick_gc);

		ret = bch2_gc(c, NULL, false);
		if (ret)
			bch_err(c, "btree gc failed: %i", ret);

		debug_check_no_locks_held();
	}

	return 0;
}
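
/*
 * Worked example for the periodic mode above (illustrative numbers): the
 * io_clock advances with writes, not wall time, so last + c->capacity / 16
 * schedules the next GC pass after roughly 1/16th of the filesystem's
 * capacity has been written since the previous pass.
 */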

void bch2_gc_thread_stop(struct bch_fs *c)
{
	struct task_struct *p;

	p = c->gc_thread;
	c->gc_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_gc_thread_start(struct bch_fs *c)
{
	struct task_struct *p;

	BUG_ON(c->gc_thread);

	p = kthread_create(bch2_gc_thread, c, "bch_gc");
	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);
	c->gc_thread = p;
	wake_up_process(p);
	return 0;
}