// SPDX-License-Identifier: GPL-2.0
/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 *
 * Bucket states:
 * - free bucket: mark == 0
 *   The bucket contains no data and will not be read
 *
 * - allocator bucket: owned_by_allocator == 1
 *   The bucket is on a free list, or it is an open bucket
 *
 * - cached bucket: owned_by_allocator == 0 &&
 *   dirty_sectors == 0 &&
 *   cached_sectors > 0
 *   The bucket contains data but may be safely discarded as there are
 *   enough replicas of the data on other cache devices, or it has been
 *   written back to the backing device
 *
 * - dirty bucket: owned_by_allocator == 0 &&
 *   dirty_sectors > 0
 *   The bucket contains data that we must not discard (either only copy,
 *   or one of the 'main copies' for data requiring multiple replicas)
 *
 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
 *   This is a btree node, journal or gen/prio bucket
 *
 * Lifecycle:
 *
 * bucket invalidated => bucket on freelist => open bucket =>
 *     [dirty bucket =>] cached bucket => bucket invalidated => ...
 *
 * Note that cache promotion can skip the dirty bucket step, as data
 * is copied from a deeper tier to a shallower tier, onto a cached
 * bucket.
 * Note also that a cached bucket can spontaneously become dirty --
 * see below.
 *
 * Only a traversal of the key space can determine whether a bucket is
 * truly dirty or cached.
 *
 * Transitions:
 *
 * - free => allocator: bucket was invalidated
 * - cached => allocator: bucket was invalidated
 *
 * - allocator => dirty: open bucket was filled up
 * - allocator => cached: open bucket was filled up
 * - allocator => metadata: metadata was allocated
 *
 * - dirty => cached: dirty sectors were copied to a deeper tier
 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
 * - cached => free: cached sectors were overwritten
 *
 * - metadata => free: metadata was freed
 *
 * Oddities:
 * - cached => dirty: a device was removed so formerly replicated data
 *   is no longer sufficiently replicated
 * - free => cached: cannot happen
 * - free => dirty: cannot happen
 * - free => metadata: cannot happen
 */
65 | ||
66 | #include "bcachefs.h" | |
7b3f84ea | 67 | #include "alloc_background.h" |
b35b1925 | 68 | #include "bset.h" |
1c6fdbd8 | 69 | #include "btree_gc.h" |
b35b1925 | 70 | #include "btree_update.h" |
1c6fdbd8 | 71 | #include "buckets.h" |
cd575ddf | 72 | #include "ec.h" |
1c6fdbd8 KO |
73 | #include "error.h" |
74 | #include "movinggc.h" | |
7ef2a73a | 75 | #include "replicas.h" |
1c6fdbd8 KO |
76 | #include "trace.h" |
77 | ||
78 | #include <linux/preempt.h> | |
79 | ||
1c6fdbd8 KO |
/*
 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
 * wraparound:
 *
 * bucket_mark.journal_seq is only BUCKET_JOURNAL_SEQ_BITS wide, so it would
 * eventually alias with a future sequence number; periodically clearing the
 * valid bit for buckets that no longer need a journal commit avoids that.
 */
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
	u64 journal_seq = atomic64_read(&c->journal.seq);
	u16 last_seq_ondisk = c->journal.last_seq_ondisk;
	struct bch_dev *ca;
	struct bucket_array *buckets;
	struct bucket *g;
	struct bucket_mark m;
	unsigned i;

	/* Rate limit: only sweep once per quarter of the seq space: */
	if (journal_seq - c->last_bucket_seq_cleanup <
	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
		return;

	c->last_bucket_seq_cleanup = journal_seq;

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets) {
			/*
			 * The break inside the statement expression aborts the
			 * cmpxchg retry loop for this bucket without writing:
			 */
			bucket_cmpxchg(g, m, ({
				if (!m.journal_seq_valid ||
				    bucket_needs_journal_commit(m, last_seq_ondisk))
					break;

				m.journal_seq_valid = 0;
			}));
		}
		up_read(&ca->bucket_lock);
	}
}
116 | ||
3e0745e2 KO |
/*
 * Recompute the summarized usage counters (s.reserved, s.data, s.cached) from
 * the per-replicas-entry counters, after accumulating all percpu counters into
 * CPU 0's copy. Called with no concurrent marking (takes mark_lock for write).
 */
void bch2_fs_usage_initialize(struct bch_fs *c)
{
	struct bch_fs_usage *usage;
	unsigned i, nr;

	percpu_down_write(&c->mark_lock);
	/* Struct is followed by one u64 counter per replicas entry: */
	nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr;
	usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr);

	for (i = 0; i < BCH_REPLICAS_MAX; i++)
		usage->s.reserved += usage->persistent_reserved[i];

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		/* Only data-bearing types feed the summary counters: */
		switch (e->data_type) {
		case BCH_DATA_BTREE:
		case BCH_DATA_USER:
			usage->s.data += usage->data[i];
			break;
		case BCH_DATA_CACHED:
			usage->s.cached += usage->data[i];
			break;
		}
	}

	percpu_up_write(&c->mark_lock);
}
146 | ||
1c6fdbd8 KO |
/*
 * Sum a percpu usage structure into a stack copy and return it by value.
 * @_stats must point to a percpu struct whose size is a multiple of u64;
 * it is accumulated element-wise as an array of u64s.
 */
#define bch2_usage_read_raw(_stats)					\
({									\
	typeof(*this_cpu_ptr(_stats)) _acc;				\
									\
	memset(&_acc, 0, sizeof(_acc));					\
	acc_u64s_percpu((u64 *) &_acc,					\
			(u64 __percpu *) _stats,			\
			sizeof(_acc) / sizeof(u64));			\
									\
	_acc;								\
})
158 | ||
1c6fdbd8 KO |
/* Return a summed-over-CPUs snapshot of @ca's usage counters. */
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
	return bch2_usage_read_raw(ca->usage[0]);
}
163 | ||
/*
 * Allocate and return a snapshot of filesystem usage, including the
 * variable-length per-replicas-entry counters.
 *
 * Returns NULL on allocation failure.
 *
 * NOTE(review): on success this returns with c->mark_lock held for read —
 * the caller appears responsible for percpu_up_read(); confirm against
 * callers before relying on this.
 */
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
	struct bch_fs_usage *ret;
	unsigned nr = READ_ONCE(c->replicas.nr);
retry:
	ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
	if (unlikely(!ret))
		return NULL;

	percpu_down_read(&c->mark_lock);

	/*
	 * replicas.nr may have grown between the unlocked read and taking
	 * the lock; if so our buffer is too small — retry with the new size:
	 */
	if (unlikely(nr < c->replicas.nr)) {
		nr = c->replicas.nr;
		percpu_up_read(&c->mark_lock);
		kfree(ret);
		goto retry;
	}

	acc_u64s_percpu((u64 *) ret,
			(u64 __percpu *) c->usage[0],
			sizeof(*ret) / sizeof(u64) + nr);

	return ret;
}
188 | ||
1c6fdbd8 KO |
#define RESERVE_FACTOR	6

/*
 * Inflate @r by ~1/64th (rounded up to the next multiple of 64), accounting
 * for sectors the allocator keeps in reserve on top of actual usage.
 */
static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
195 | ||
196 | static u64 avail_factor(u64 r) | |
197 | { | |
5b650fd1 | 198 | return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); |
1c6fdbd8 KO |
199 | } |
200 | ||
5663a415 | 201 | u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) |
1c6fdbd8 | 202 | { |
7ef2a73a KO |
203 | return min(fs_usage.s.hidden + |
204 | fs_usage.s.data + | |
205 | reserve_factor(fs_usage.s.reserved + | |
206 | fs_usage.s.online_reserved), | |
207 | c->capacity); | |
5663a415 KO |
208 | } |
209 | ||
/*
 * Cheap summary of filesystem usage (capacity/used/nr_inodes) built from the
 * summarized counters only — no allocation, no per-replicas detail.
 */
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_summarized usage =
		bch2_usage_read_raw(&c->usage[0]->s);
	struct bch_fs_usage_short ret;

	/* Hidden (sb/journal) sectors aren't part of user-visible capacity: */
	ret.capacity	= READ_ONCE(c->capacity) - usage.hidden;
	ret.used	= min(ret.capacity, usage.data +
			      reserve_factor(usage.reserved +
					     usage.online_reserved));
	ret.nr_inodes	= usage.nr_inodes;

	return ret;
}
225 | ||
1c6fdbd8 KO |
/* Convenience negation of is_available_bucket(), for counter deltas below. */
static inline int is_unavailable_bucket(struct bucket_mark m)
{
	return !is_available_bucket(m);
}
230 | ||
231 | static inline int is_fragmented_bucket(struct bucket_mark m, | |
232 | struct bch_dev *ca) | |
233 | { | |
234 | if (!m.owned_by_allocator && | |
235 | m.data_type == BCH_DATA_USER && | |
236 | bucket_sectors_used(m)) | |
237 | return max_t(int, 0, (int) ca->mi.bucket_size - | |
238 | bucket_sectors_used(m)); | |
239 | return 0; | |
240 | } | |
241 | ||
242 | static inline enum bch_data_type bucket_type(struct bucket_mark m) | |
243 | { | |
244 | return m.cached_sectors && !m.dirty_sectors | |
dfe9bfb3 | 245 | ? BCH_DATA_CACHED |
1c6fdbd8 KO |
246 | : m.data_type; |
247 | } | |
248 | ||
9ca53b55 | 249 | static bool bucket_became_unavailable(struct bucket_mark old, |
1c6fdbd8 KO |
250 | struct bucket_mark new) |
251 | { | |
252 | return is_available_bucket(old) && | |
9ca53b55 | 253 | !is_available_bucket(new); |
1c6fdbd8 KO |
254 | } |
255 | ||
7ef2a73a KO |
/*
 * Fold a usage delta (@fs_usage) into the filesystem's percpu counters,
 * consuming sectors from @disk_res for any net increase in data + reserved.
 *
 * Returns 0 on success, -1 if usage increased beyond what the reservation
 * covered (the excess is clawed back from sectors_available).
 *
 * Caller must hold c->mark_lock.
 */
int bch2_fs_usage_apply(struct bch_fs *c,
			struct bch_fs_usage *fs_usage,
			struct disk_reservation *disk_res)
{
	s64 added = fs_usage->s.data + fs_usage->s.reserved;
	s64 should_not_have_added;
	int ret = 0;

	percpu_rwsem_assert_held(&c->mark_lock);

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
	if (WARN_ONCE(should_not_have_added > 0,
		      "disk usage increased without a reservation")) {
		atomic64_sub(should_not_have_added, &c->sectors_available);
		added -= should_not_have_added;
		ret = -1;
	}

	/*
	 * If disk_res was NULL, the branch above already reduced @added to
	 * <= 0, so disk_res is never dereferenced here with a NULL pointer:
	 */
	if (added > 0) {
		disk_res->sectors		-= added;
		fs_usage->s.online_reserved	-= added;
	}

	preempt_disable();
	acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
		 (u64 *) fs_usage,
		 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
	preempt_enable();

	return ret;
}
291 | ||
06b7345c KO |
292 | static inline void account_bucket(struct bch_fs_usage *fs_usage, |
293 | struct bch_dev_usage *dev_usage, | |
294 | enum bch_data_type type, | |
295 | int nr, s64 size) | |
296 | { | |
297 | if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) | |
298 | fs_usage->s.hidden += size; | |
299 | ||
06b7345c KO |
300 | dev_usage->buckets[type] += nr; |
301 | } | |
302 | ||
1c6fdbd8 | 303 | static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, |
9ca53b55 KO |
304 | struct bch_fs_usage *fs_usage, |
305 | struct bucket_mark old, struct bucket_mark new, | |
306 | bool gc) | |
1c6fdbd8 KO |
307 | { |
308 | struct bch_dev_usage *dev_usage; | |
309 | ||
9166b41d | 310 | percpu_rwsem_assert_held(&c->mark_lock); |
1c6fdbd8 | 311 | |
6eac2c2e KO |
312 | bch2_fs_inconsistent_on(old.data_type && new.data_type && |
313 | old.data_type != new.data_type, c, | |
314 | "different types of data in same bucket: %s, %s", | |
315 | bch2_data_types[old.data_type], | |
316 | bch2_data_types[new.data_type]); | |
1c6fdbd8 KO |
317 | |
318 | preempt_disable(); | |
9ca53b55 | 319 | dev_usage = this_cpu_ptr(ca->usage[gc]); |
1c6fdbd8 | 320 | |
06b7345c KO |
321 | if (bucket_type(old)) |
322 | account_bucket(fs_usage, dev_usage, bucket_type(old), | |
323 | -1, -ca->mi.bucket_size); | |
dfe9bfb3 | 324 | |
06b7345c KO |
325 | if (bucket_type(new)) |
326 | account_bucket(fs_usage, dev_usage, bucket_type(new), | |
327 | 1, ca->mi.bucket_size); | |
1c6fdbd8 KO |
328 | |
329 | dev_usage->buckets_alloc += | |
330 | (int) new.owned_by_allocator - (int) old.owned_by_allocator; | |
cd575ddf KO |
331 | dev_usage->buckets_ec += |
332 | (int) new.stripe - (int) old.stripe; | |
1c6fdbd8 KO |
333 | dev_usage->buckets_unavailable += |
334 | is_unavailable_bucket(new) - is_unavailable_bucket(old); | |
335 | ||
336 | dev_usage->sectors[old.data_type] -= old.dirty_sectors; | |
337 | dev_usage->sectors[new.data_type] += new.dirty_sectors; | |
338 | dev_usage->sectors[BCH_DATA_CACHED] += | |
339 | (int) new.cached_sectors - (int) old.cached_sectors; | |
340 | dev_usage->sectors_fragmented += | |
341 | is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); | |
342 | preempt_enable(); | |
343 | ||
344 | if (!is_available_bucket(old) && is_available_bucket(new)) | |
345 | bch2_wake_allocator(ca); | |
1c6fdbd8 KO |
346 | } |
347 | ||
eeb83e25 KO |
/*
 * Rebuild @ca's usage counters from scratch by replaying every in-memory
 * bucket mark as a transition from the zero mark.
 */
void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	/* Zero mark: every bucket's counters are applied as a fresh delta: */
	struct bucket_mark old = { .v.counter = 0 };
	struct bch_fs_usage *fs_usage;
	struct bucket_array *buckets;
	struct bucket *g;

	percpu_down_read(&c->mark_lock);
	fs_usage = this_cpu_ptr(c->usage[0]);
	buckets = bucket_array(ca);

	for_each_bucket(g, buckets)
		if (g->mark.data_type)
			bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
	percpu_up_read(&c->mark_lock);
}
364 | ||
/*
 * bucket_cmpxchg() plus usage-counter update: atomically transform @g's mark
 * via @expr, then account the old->new delta. Evaluates to the old mark.
 * Relies on a local variable named 'gc' being in scope at the use site.
 */
#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr)	\
({								\
	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);	\
								\
	bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc);	\
	_old;							\
})
372 | ||
7ef2a73a KO |
373 | static inline void update_replicas(struct bch_fs *c, |
374 | struct bch_fs_usage *fs_usage, | |
375 | struct bch_replicas_entry *r, | |
376 | s64 sectors) | |
377 | { | |
378 | int idx = bch2_replicas_entry_idx(c, r); | |
379 | ||
380 | BUG_ON(idx < 0); | |
381 | BUG_ON(!sectors); | |
382 | ||
383 | if (r->data_type == BCH_DATA_CACHED) | |
384 | fs_usage->s.cached += sectors; | |
385 | else | |
386 | fs_usage->s.data += sectors; | |
387 | fs_usage->data[idx] += sectors; | |
388 | } | |
389 | ||
390 | static inline void update_cached_sectors(struct bch_fs *c, | |
391 | struct bch_fs_usage *fs_usage, | |
392 | unsigned dev, s64 sectors) | |
393 | { | |
394 | struct bch_replicas_padded r; | |
395 | ||
396 | bch2_replicas_entry_cached(&r.e, dev); | |
397 | ||
398 | update_replicas(c, fs_usage, &r.e, sectors); | |
399 | } | |
400 | ||
8777210b KO |
/*
 * Run @fn against the normal counters, the gc counters, or both:
 * the non-gc pass runs unless BCH_BUCKET_MARK_GC is set; the gc pass runs
 * when the gc flag is set or gc has already walked past @pos. Evaluates to
 * the last fn() return value (0 if no pass ran).
 */
#define do_mark_fn(fn, c, pos, flags, ...)				\
({									\
	int gc, ret = 0;						\
									\
	percpu_rwsem_assert_held(&c->mark_lock);			\
									\
	for (gc = 0; gc < 2 && !ret; gc++)				\
		if (!gc == !(flags & BCH_BUCKET_MARK_GC) ||		\
		    (gc && gc_visited(c, pos)))				\
			ret = fn(c, __VA_ARGS__, gc);			\
	ret;								\
})
413 | ||
/*
 * Invalidate bucket @b: hand it to the allocator, discard its contents and
 * bump its generation so stale pointers to it are detectable. Any cached
 * sectors it held are uncharged. The pre-transition mark is stored in *@ret
 * (non-gc pass only).
 */
static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
				    size_t b, struct bucket_mark *ret,
				    bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		BUG_ON(!is_available_bucket(new));

		new.owned_by_allocator	= true;
		new.dirty		= true;
		new.data_type		= 0;
		new.cached_sectors	= 0;
		new.dirty_sectors	= 0;
		new.gen++;		/* invalidates outstanding pointers */
	}));

	if (old.cached_sectors)
		update_cached_sectors(c, fs_usage, ca->dev_idx,
				      -((s64) old.cached_sectors));

	if (!gc)
		*ret = old;
	return 0;
}
441 | ||
/*
 * Public wrapper for bucket invalidation; *@old receives the mark prior to
 * invalidation (always written — flags are 0, so the non-gc pass runs).
 */
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, struct bucket_mark *old)
{
	do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
		   ca, b, old);

	if (!old->owned_by_allocator && old->cached_sectors)
		trace_invalidate(ca, bucket_to_sector(ca, b),
				 old->cached_sectors);
}
452 | ||
8777210b KO |
/*
 * Set or clear the owned_by_allocator bit on bucket @b. A bucket may only be
 * released (owned_by_allocator cleared) if it was previously owned — except
 * during gc, which reconstructs state and may see either order.
 */
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
				    size_t b, bool owned_by_allocator,
				    bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		new.owned_by_allocator	= owned_by_allocator;
	}));

	BUG_ON(!gc &&
	       !owned_by_allocator && !old.owned_by_allocator);

	return 0;
}
470 | ||
/* Public wrapper: mark allocator ownership in normal and/or gc counters. */
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, bool owned_by_allocator,
			    struct gc_pos pos, unsigned flags)
{
	do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
		   ca, b, owned_by_allocator);
}
478 | ||
8fe826f9 KO |
/*
 * Mark a KEY_TYPE_alloc key: copy the unpacked alloc info into the in-memory
 * bucket. Currently only reached from the bucket invalidate path, hence the
 * BUG_ONs on the expected bucket state.
 */
static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
			   bool inserting,
			   struct bch_fs_usage *fs_usage,
			   unsigned journal_seq, unsigned flags,
			   bool gc)
{
	struct bkey_alloc_unpacked u;
	struct bch_dev *ca;
	struct bucket *g;
	struct bucket_mark old, m;

	if (!inserting)
		return 0;

	/*
	 * alloc btree is read in by bch2_alloc_read, not gc:
	 */
	if (flags & BCH_BUCKET_MARK_GC)
		return 0;

	u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
	ca = bch_dev_bkey_exists(c, k.k->p.inode);
	g = __bucket(ca, k.k->p.offset, gc);

	/*
	 * this should currently only be getting called from the bucket
	 * invalidate path:
	 */
	BUG_ON(u.dirty_sectors);
	BUG_ON(u.cached_sectors);
	BUG_ON(!g->mark.owned_by_allocator);

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
		m.gen			= u.gen;
		m.data_type		= u.data_type;
		m.dirty_sectors		= u.dirty_sectors;
		m.cached_sectors	= u.cached_sectors;
	}));

	/* Fields outside bucket_mark aren't updated atomically with it: */
	g->io_time[READ]	= u.read_time;
	g->io_time[WRITE]	= u.write_time;
	g->oldest_gen		= u.oldest_gen;
	g->gen_valid		= 1;

	if (old.cached_sectors) {
		update_cached_sectors(c, fs_usage, ca->dev_idx,
				      -old.cached_sectors);
		trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
				 old.cached_sectors);
	}

	return 0;
}
532 | ||
b2be7c8b | 533 | #define checked_add(a, b) \ |
1c6fdbd8 | 534 | do { \ |
b2be7c8b KO |
535 | unsigned _res = (unsigned) (a) + (b); \ |
536 | (a) = _res; \ | |
537 | BUG_ON((a) != _res); \ | |
1c6fdbd8 KO |
538 | } while (0) |
539 | ||
8777210b KO |
/*
 * Mark @sectors of superblock/journal metadata in bucket @b. Only the two
 * non-btree metadata types are allowed here; btree nodes go through the
 * extent marking path.
 */
static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
				       size_t b, enum bch_data_type type,
				       unsigned sectors, bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark new;

	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		new.dirty	= true;
		new.data_type	= type;
		checked_add(new.dirty_sectors, sectors);
	}));

	return 0;
}
559 | ||
1c6fdbd8 KO |
/*
 * Public entry for marking sb/journal buckets. @c may be NULL early in
 * startup (before the filesystem object exists); in that case only the
 * bucket mark itself is updated, with no usage accounting.
 */
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
			       size_t b, enum bch_data_type type,
			       unsigned sectors, struct gc_pos pos,
			       unsigned flags)
{
	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	preempt_disable();

	if (likely(c)) {
		do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
			   ca, b, type, sectors);
	} else {
		struct bucket *g;
		struct bucket_mark new;

		/* No bch_fs: update the raw bucket mark under RCU only: */
		rcu_read_lock();

		g = bucket(ca, b);
		bucket_cmpxchg(g, new, ({
			new.dirty	= true;
			new.data_type	= type;
			checked_add(new.dirty_sectors, sectors);
		}));

		rcu_read_unlock();
	}

	preempt_enable();
}
591 | ||
641ab736 KO |
/*
 * Convert a live-sectors delta into an on-disk sectors delta for pointer @p,
 * accounting for checksummed/compressed extents where disk size differs from
 * live size.
 */
static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
				  s64 delta)
{
	if (delta > 0) {
		/*
		 * marking a new extent, which _will have size_ @delta
		 *
		 * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
		 * case, we haven't actually created the key we'll be inserting
		 * yet (for the split) - so we don't want to be using
		 * k->size/crc.live_size here:
		 */
		return __ptr_disk_sectors(p, delta);
	} else {
		BUG_ON(-delta > p.crc.live_size);

		/* Shrinking: difference between new and current disk sizes: */
		return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
			(s64) ptr_disk_sectors(p);
	}
}
612 | ||
/*
 * Checking against gc's position has to be done here, inside the cmpxchg()
 * loop, to avoid racing with the start of gc clearing all the marks - GC does
 * that with the gc pos seqlock held.
 */
/*
 * Mark one extent pointer: adjust the pointed-to bucket's dirty/cached sector
 * counts by @sectors. Returns true if the pointer is stale (bucket gen moved
 * past it) and nothing was marked.
 */
static bool bch2_mark_pointer(struct bch_fs *c,
			      struct extent_ptr_decoded p,
			      s64 sectors, enum bch_data_type data_type,
			      struct bch_fs_usage *fs_usage,
			      unsigned journal_seq, unsigned flags,
			      bool gc)
{
	struct bucket_mark old, new;
	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
	size_t b = PTR_BUCKET_NR(ca, &p.ptr);
	struct bucket *g = __bucket(ca, b, gc);
	u64 v;

	v = atomic64_read(&g->_mark.v);
	do {
		new.v.counter = old.v.counter = v;

		new.dirty = true;

		/*
		 * Check this after reading bucket mark to guard against
		 * the allocator invalidating a bucket after we've already
		 * checked the gen
		 */
		if (gen_after(new.gen, p.ptr.gen)) {
			BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
			EBUG_ON(!p.ptr.cached &&
				test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
			return true;
		}

		if (!p.ptr.cached)
			checked_add(new.dirty_sectors, sectors);
		else
			checked_add(new.cached_sectors, sectors);

		if (!new.dirty_sectors &&
		    !new.cached_sectors) {
			new.data_type	= 0;

			/*
			 * Bucket just emptied: remember which journal seq
			 * must hit disk before it may be reused:
			 */
			if (journal_seq) {
				new.journal_seq_valid = 1;
				new.journal_seq = journal_seq;
			}
		} else {
			new.data_type = data_type;
		}

		if (flags & BCH_BUCKET_MARK_NOATOMIC) {
			g->_mark = new;
			break;
		}
	} while ((v = atomic64_cmpxchg(&g->_mark.v,
			      old.v.counter,
			      new.v.counter)) != old.v.counter);

	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);

	BUG_ON(!gc && bucket_became_unavailable(old, new));

	return false;
}
680 | ||
dfe9bfb3 KO |
/*
 * Mark an erasure-coded pointer: charge @sectors (plus the proportional share
 * of parity) to the stripe block it lives in, and to the stripe's replicas
 * entry. Returns -1 if the stripe doesn't exist.
 */
static int bch2_mark_stripe_ptr(struct bch_fs *c,
				struct bch_extent_stripe_ptr p,
				enum bch_data_type data_type,
				struct bch_fs_usage *fs_usage,
				s64 sectors, unsigned flags,
				bool gc)
{
	struct stripe *m;
	unsigned old, new, nr_data;
	int blocks_nonempty_delta;
	s64 parity_sectors;

	BUG_ON(!sectors);

	m = genradix_ptr(&c->stripes[gc], p.idx);

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || !m->alive) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
				    (u64) p.idx);
		return -1;
	}

	BUG_ON(m->r.e.data_type != data_type);

	nr_data = m->nr_blocks - m->nr_redundant;

	/* Parity overhead, spread proportionally across data blocks: */
	parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);

	if (sectors < 0)
		parity_sectors = -parity_sectors;
	sectors += parity_sectors;

	old = m->block_sectors[p.block];
	m->block_sectors[p.block] += sectors;
	new = m->block_sectors[p.block];

	blocks_nonempty_delta = (int) !!new - (int) !!old;
	if (blocks_nonempty_delta) {
		m->blocks_nonempty += blocks_nonempty_delta;

		if (!gc)
			bch2_stripes_heap_update(c, m, p.idx);
	}

	m->dirty = true;

	spin_unlock(&c->ec_stripes_heap_lock);

	update_replicas(c, fs_usage, &m->r.e, sectors);

	return 0;
}
736 | ||
dfe9bfb3 KO |
/*
 * Mark all pointers of an extent (or btree node) key: per-pointer bucket
 * accounting plus a replicas entry for the set of non-cached, non-EC devices.
 * Returns 0 or a negative error from stripe marking.
 */
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
			    s64 sectors, enum bch_data_type data_type,
			    struct bch_fs_usage *fs_usage,
			    unsigned journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	s64 dirty_sectors = 0;
	unsigned i;
	int ret;

	r.e.data_type	= data_type;
	r.e.nr_devs	= 0;
	r.e.nr_required	= 1;

	BUG_ON(!sectors);

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		/* Btree nodes are never compressed; disk size == live size: */
		s64 disk_sectors = data_type == BCH_DATA_BTREE
			? sectors
			: ptr_disk_sectors_delta(p, sectors);
		bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
					       fs_usage, journal_seq, flags, gc);

		if (p.ptr.cached) {
			/* Stale cached pointers carry no usage: */
			if (disk_sectors && !stale)
				update_cached_sectors(c, fs_usage, p.ptr.dev,
						      disk_sectors);
		} else if (!p.ec_nr) {
			dirty_sectors	       += disk_sectors;
			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
		} else {
			for (i = 0; i < p.ec_nr; i++) {
				ret = bch2_mark_stripe_ptr(c, p.ec[i],
						data_type, fs_usage,
						disk_sectors, flags, gc);
				if (ret)
					return ret;
			}

			/* EC pointers account durability via the stripe: */
			r.e.nr_required = 0;
		}
	}

	if (dirty_sectors)
		update_replicas(c, fs_usage, &r.e, dirty_sectors);

	return 0;
}
5b650fd1 | 789 | |
cd575ddf KO |
/*
 * Set or clear the stripe bit on every bucket referenced by stripe @v,
 * updating usage counters via bucket_data_cmpxchg().
 */
static void bucket_set_stripe(struct bch_fs *c,
			      const struct bch_stripe *v,
			      bool enabled,
			      struct bch_fs_usage *fs_usage,
			      u64 journal_seq,
			      bool gc)
{
	unsigned i;

	for (i = 0; i < v->nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		size_t b = PTR_BUCKET_NR(ca, ptr);
		struct bucket *g = __bucket(ca, b, gc);
		struct bucket_mark new, old;

		/* Stripe blocks must reference live buckets: */
		BUG_ON(ptr_stale(ca, ptr));

		old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
			new.dirty			= true;
			new.stripe			= enabled;
			if (journal_seq) {
				new.journal_seq_valid	= 1;
				new.journal_seq		= journal_seq;
			}
		}));
	}
}
818 | ||
dfe9bfb3 KO |
/*
 * Mark a KEY_TYPE_stripe key: (re)build the in-memory struct stripe from the
 * key on insert, or tear it down on delete, keeping the stripes heap and the
 * per-bucket stripe bits in sync. Returns -1 on a dangling stripe reference.
 */
static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
			    bool inserting,
			    struct bch_fs_usage *fs_usage,
			    u64 journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	size_t idx = s.k->p.offset;
	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
	unsigned i;

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || (!inserting && !m->alive)) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
				    idx);
		return -1;
	}

	if (m->alive)
		bch2_stripes_heap_del(c, m, idx);

	memset(m, 0, sizeof(*m));

	if (inserting) {
		m->sectors	= le16_to_cpu(s.v->sectors);
		m->algorithm	= s.v->algorithm;
		m->nr_blocks	= s.v->nr_blocks;
		m->nr_redundant	= s.v->nr_redundant;

		memset(&m->r, 0, sizeof(m->r));

		m->r.e.data_type	= BCH_DATA_USER;
		m->r.e.nr_devs		= s.v->nr_blocks;
		m->r.e.nr_required	= s.v->nr_blocks - s.v->nr_redundant;

		for (i = 0; i < s.v->nr_blocks; i++)
			m->r.e.devs[i] = s.v->ptrs[i].dev;

	/*
	 * XXX: account for stripes somehow here
	 */
#if 0
	update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif

		/* gc recalculates these fields: */
		if (!(flags & BCH_BUCKET_MARK_GC)) {
			for (i = 0; i < s.v->nr_blocks; i++) {
				m->block_sectors[i] =
					stripe_blockcount_get(s.v, i);
				m->blocks_nonempty += !!m->block_sectors[i];
			}
		}

		if (!gc)
			bch2_stripes_heap_insert(c, m, idx);
		else
			m->alive = true;
	}

	spin_unlock(&c->ec_stripes_heap_lock);

	bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
	return 0;
}
886 | ||
/*
 * Dispatch usage accounting for a single key, by key type.
 *
 * @inserting:	true if the key is being added, false if it is being removed
 * @sectors:	signed sector delta, used for extent-style keys
 * @fs_usage:	stats struct to update; if NULL, or when running from gc,
 *		the per-cpu usage counters for the current mode are used
 * @gc:		true when called from the garbage collection pass
 *
 * Returns 0 on success, or a negative error from the type-specific helper.
 */
static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
			   bool inserting, s64 sectors,
			   struct bch_fs_usage *fs_usage,
			   unsigned journal_seq, unsigned flags,
			   bool gc)
{
	/* fall back to the per-cpu counters for this mode (0 = normal, 1 = gc) */
	if (!fs_usage || gc)
		fs_usage = this_cpu_ptr(c->usage[gc]);

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		return bch2_mark_alloc(c, k, inserting,
				fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_btree_ptr:
		/*
		 * Btree nodes are always a whole btree_node_size; the sign
		 * encodes insert vs. delete:
		 */
		return bch2_mark_extent(c, k, inserting
					?  c->opts.btree_node_size
					: -c->opts.btree_node_size,
					BCH_DATA_BTREE,
					fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_extent:
		return bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
				fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_stripe:
		return bch2_mark_stripe(c, k, inserting,
				fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_inode:
		if (inserting)
			fs_usage->s.nr_inodes++;
		else
			fs_usage->s.nr_inodes--;
		return 0;
	case KEY_TYPE_reservation: {
		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;

		sectors *= replicas;
		/* clamp so the array index below stays in bounds: */
		replicas = clamp_t(unsigned, replicas, 1,
				   ARRAY_SIZE(fs_usage->persistent_reserved));

		fs_usage->s.reserved				+= sectors;
		fs_usage->persistent_reserved[replicas - 1]	+= sectors;
		return 0;
	}
	default:
		/* key types with no usage accounting: */
		return 0;
	}
}
933 | ||
/*
 * Mark a key with c->mark_lock already held by the caller; runs
 * __bch2_mark_key through do_mark_fn (see bch2_mark_key for the
 * locking wrapper).
 */
int bch2_mark_key_locked(struct bch_fs *c,
		   struct bkey_s_c k,
		   bool inserting, s64 sectors,
		   struct gc_pos pos,
		   struct bch_fs_usage *fs_usage,
		   u64 journal_seq, unsigned flags)
{
	return do_mark_fn(__bch2_mark_key, c, pos, flags,
			  k, inserting, sectors, fs_usage,
			  journal_seq, flags);
}
945 | ||
/*
 * Public entry point for marking a key: takes c->mark_lock for read
 * around bch2_mark_key_locked().
 */
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
		  bool inserting, s64 sectors,
		  struct gc_pos pos,
		  struct bch_fs_usage *fs_usage,
		  u64 journal_seq, unsigned flags)
{
	int ret;

	percpu_down_read(&c->mark_lock);
	ret = bch2_mark_key_locked(c, k, inserting, sectors,
				   pos, fs_usage, journal_seq, flags);
	percpu_up_read(&c->mark_lock);

	return ret;
}
961 | ||
b35b1925 KO |
/*
 * Mark the usage changes for a btree insert: marks the new key being
 * inserted, then walks the existing keys it overlaps and marks the
 * overwritten portions as deleted.
 */
void bch2_mark_update(struct btree_insert *trans,
		      struct btree_insert_entry *insert)
{
	struct bch_fs		*c = trans->c;
	struct btree_iter	*iter = insert->iter;
	struct btree		*b = iter->l[0].b;
	struct btree_node_iter	node_iter = iter->l[0].iter;
	struct bch_fs_usage	*fs_usage;
	struct gc_pos		pos = gc_pos_btree_node(b);
	struct bkey_packed	*_k;
	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	/* one-shot warning flag, shared across all callers (xchg'd below) */
	static int warned_disk_usage = 0;

	if (!btree_node_type_needs_gc(iter->btree_id))
		return;

	percpu_down_read(&c->mark_lock);
	/* scratch usage buffer is per-cpu, so preemption must stay off: */
	preempt_disable();
	fs_usage = bch2_fs_usage_get_scratch(c);

	/* mark the new key (clipped to this node's range) as inserted: */
	if (!(trans->flags & BTREE_INSERT_NOMARK))
		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
			bpos_min(insert->k->k.p, b->key.k.p).offset -
			bkey_start_offset(&insert->k->k),
			pos, fs_usage, trans->journal_res.seq, 0);

	/* walk existing keys overlapping the insert and mark the overwrites: */
	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
						      KEY_TYPE_discard))) {
		struct bkey		unpacked;
		struct bkey_s_c		k;
		s64			sectors = 0;

		k = bkey_disassemble(b, _k, &unpacked);

		/* stop once we're past the end of the inserted key: */
		if (btree_node_is_extents(b)
		    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
		    : bkey_cmp(insert->k->k.p, k.k->p))
			break;

		if (btree_node_is_extents(b)) {
			/* sectors is the (negative) overwritten amount: */
			switch (bch2_extent_overlap(&insert->k->k, k.k)) {
			case BCH_EXTENT_OVERLAP_ALL:
				sectors = -((s64) k.k->size);
				break;
			case BCH_EXTENT_OVERLAP_BACK:
				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			case BCH_EXTENT_OVERLAP_FRONT:
				sectors = bkey_start_offset(k.k) -
					insert->k->k.p.offset;
				break;
			case BCH_EXTENT_OVERLAP_MIDDLE:
				/*
				 * The old key is split in two: mark the
				 * surviving back half as (re)inserted, then
				 * mark the middle as overwritten below.
				 */
				sectors = k.k->p.offset - insert->k->k.p.offset;
				BUG_ON(sectors <= 0);

				bch2_mark_key_locked(c, k, true, sectors,
					pos, fs_usage, trans->journal_res.seq, 0);

				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			}

			BUG_ON(sectors >= 0);
		}

		bch2_mark_key_locked(c, k, false, sectors,
			pos, fs_usage, trans->journal_res.seq, 0);

		bch2_btree_node_iter_advance(&node_iter, b);
	}

	/*
	 * If applying the deltas shows we used more than was reserved,
	 * dump the insert and the keys it overlapped (once only):
	 */
	if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) &&
	    !warned_disk_usage &&
	    !xchg(&warned_disk_usage, 1)) {
		char buf[200];

		pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);

		pr_err("while inserting");
		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
		pr_err("%s", buf);
		pr_err("overlapping with");

		/* re-walk the same range purely for the error report: */
		node_iter = iter->l[0].iter;
		while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
							      KEY_TYPE_discard))) {
			struct bkey	unpacked;
			struct bkey_s_c	k;

			k = bkey_disassemble(b, _k, &unpacked);

			if (btree_node_is_extents(b)
			    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
			    : bkey_cmp(insert->k->k.p, k.k->p))
				break;

			bch2_bkey_val_to_text(&PBUF(buf), c, k);
			pr_err("%s", buf);

			bch2_btree_node_iter_advance(&node_iter, b);
		}
	}

	preempt_enable();
	percpu_up_read(&c->mark_lock);
}
1070 | ||
1c6fdbd8 KO |
1071 | /* Disk reservations: */ |
1072 | ||
/*
 * Recompute the number of available sectors from scratch.
 *
 * Called with c->mark_lock held for write (see bch2_disk_reservation_add);
 * zeroes every cpu's cached sectors_available so stale per-cpu reserves
 * aren't double counted against the fresh total.
 */
static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
	percpu_u64_set(&c->pcpu->sectors_available, 0);

	return avail_factor(bch2_fs_sectors_free(c));
}
1079 | ||
1c6fdbd8 KO |
/*
 * Release a disk reservation: return its sectors to the filesystem's
 * online_reserved count and zero the reservation.
 */
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
	percpu_down_read(&c->mark_lock);
	this_cpu_sub(c->usage[0]->s.online_reserved, res->sectors);
	percpu_up_read(&c->mark_lock);

	res->sectors = 0;
}
1088 | ||
1089 | #define SECTORS_CACHE 1024 | |
1090 | ||
/*
 * Reserve @sectors sectors of disk space.
 *
 * Fast path: take sectors from this cpu's cached sectors_available,
 * refilling the cache (plus SECTORS_CACHE slack) from the global
 * atomic counter when it runs low.
 *
 * Slow path: if the global counter can't cover the request, retake
 * mark_lock for write and recompute availability from scratch.
 *
 * Returns 0 on success, -ENOSPC if the space isn't there and
 * BCH_DISK_RESERVATION_NOFAIL wasn't set.
 */
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
			      unsigned sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, v, get;
	s64 sectors_available;
	int ret;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	pcpu = this_cpu_ptr(c->pcpu);

	/* fast path: the per-cpu cache already covers the request */
	if (sectors <= pcpu->sectors_available)
		goto out;

	/* refill the per-cpu cache from the global counter: */
	v = atomic64_read(&c->sectors_available);
	do {
		old = v;
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			/* global counter can't cover us — recompute: */
			preempt_enable();
			percpu_up_read(&c->mark_lock);
			goto recalculate;
		}
	} while ((v = atomic64_cmpxchg(&c->sectors_available,
				       old, old - get)) != old);

	pcpu->sectors_available		+= get;

out:
	pcpu->sectors_available		-= sectors;
	this_cpu_add(c->usage[0]->s.online_reserved, sectors);
	res->sectors			+= sectors;

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return 0;

recalculate:
	/* write lock excludes all markers while we recount free space: */
	percpu_down_write(&c->mark_lock);

	sectors_available = bch2_recalc_sectors_available(c);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(c->usage[0]->s.online_reserved, sectors);
		res->sectors			+= sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -ENOSPC;
	}

	percpu_up_write(&c->mark_lock);

	return ret;
}
1151 | ||
1152 | /* Startup/shutdown: */ | |
1153 | ||
1154 | static void buckets_free_rcu(struct rcu_head *rcu) | |
1155 | { | |
1156 | struct bucket_array *buckets = | |
1157 | container_of(rcu, struct bucket_array, rcu); | |
1158 | ||
1159 | kvpfree(buckets, | |
1160 | sizeof(struct bucket_array) + | |
1161 | buckets->nbuckets * sizeof(struct bucket)); | |
1162 | } | |
1163 | ||
/*
 * Allocate (or reallocate) all per-device bucket state for @nbuckets
 * buckets: the bucket array, the nouse/written bitmaps, the free-list
 * fifos and the alloc/copygc heaps.
 *
 * On resize, existing contents are copied across under gc_lock,
 * bucket_lock and mark_lock; the old bucket array is freed via RCU.
 * Copygc is stopped for the duration and restarted if it was running.
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_array *buckets = NULL, *old_buckets = NULL;
	unsigned long *buckets_nouse	= NULL;
	unsigned long *buckets_written	= NULL;
	alloc_fifo	free[RESERVE_NR];
	alloc_fifo	free_inc;
	alloc_heap	alloc_heap;
	copygc_heap	copygc_heap;

	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / c->opts.btree_node_size);
	/* XXX: these should be tunable */
	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 7);
	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
				      btree_reserve * 2);
	bool resize = ca->buckets[0] != NULL,
	     start_copygc = ca->copygc_thread != NULL;
	int ret = -ENOMEM;
	unsigned i;

	/* zero so the err: path can safely free whatever was allocated: */
	memset(&free, 0, sizeof(free));
	memset(&free_inc, 0, sizeof(free_inc));
	memset(&alloc_heap, 0, sizeof(alloc_heap));
	memset(&copygc_heap, 0, sizeof(copygc_heap));

	/* allocate everything up front, before taking any locks: */
	if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
				  nbuckets * sizeof(struct bucket),
				  GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
					sizeof(unsigned long),
					GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) *
					  sizeof(unsigned long),
					  GFP_KERNEL|__GFP_ZERO)) ||
	    !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_MOVINGGC],
		       copygc_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
	    !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
	    !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
	    !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
		goto err;

	buckets->first_bucket	= ca->mi.first_bucket;
	buckets->nbuckets	= nbuckets;

	bch2_copygc_stop(ca);

	if (resize) {
		down_write(&c->gc_lock);
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_buckets = bucket_array(ca);

	if (resize) {
		/* copy over the old contents (truncated if shrinking): */
		size_t n = min(buckets->nbuckets, old_buckets->nbuckets);

		memcpy(buckets->b,
		       old_buckets->b,
		       n * sizeof(struct bucket));
		memcpy(buckets_nouse,
		       ca->buckets_nouse,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
		memcpy(buckets_written,
		       ca->buckets_written,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
	}

	rcu_assign_pointer(ca->buckets[0], buckets);
	/* the err: path now frees the *old* array (via RCU below): */
	buckets = old_buckets;

	swap(ca->buckets_nouse, buckets_nouse);
	swap(ca->buckets_written, buckets_written);

	if (resize)
		percpu_up_write(&c->mark_lock);

	/* preserve the current free lists across the swap: */
	spin_lock(&c->freelist_lock);
	for (i = 0; i < RESERVE_NR; i++) {
		fifo_move(&free[i], &ca->free[i]);
		swap(ca->free[i], free[i]);
	}
	fifo_move(&free_inc, &ca->free_inc);
	swap(ca->free_inc, free_inc);
	spin_unlock(&c->freelist_lock);

	/* with gc lock held, alloc_heap can't be in use: */
	swap(ca->alloc_heap, alloc_heap);

	/* and we shut down copygc: */
	swap(ca->copygc_heap, copygc_heap);

	/* free the *old* bitmaps/fifos with the old size below: */
	nbuckets = ca->mi.nbuckets;

	if (resize) {
		up_write(&ca->bucket_lock);
		up_write(&c->gc_lock);
	}

	if (start_copygc &&
	    bch2_copygc_start(c, ca))
		bch_err(ca, "error restarting copygc thread");

	ret = 0;
err:
	/* frees either the failed new allocations or the swapped-out old ones */
	free_heap(&copygc_heap);
	free_heap(&alloc_heap);
	free_fifo(&free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&free[i]);
	kvpfree(buckets_nouse,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	kvpfree(buckets_written,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	if (buckets)
		call_rcu(&old_buckets->rcu, buckets_free_rcu);

	return ret;
}
1287 | ||
/*
 * Free all per-device bucket state allocated by bch2_dev_buckets_alloc /
 * bch2_dev_buckets_resize (heaps, fifos, bitmaps, bucket array, and the
 * per-cpu usage counters). Teardown is in reverse allocation order.
 */
void bch2_dev_buckets_free(struct bch_dev *ca)
{
	unsigned i;

	free_heap(&ca->copygc_heap);
	free_heap(&ca->alloc_heap);
	free_fifo(&ca->free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);
	kvpfree(ca->buckets_written,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(ca->buckets_nouse,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	/* no RCU grace period needed: device is being torn down */
	kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
		sizeof(struct bucket_array) +
		ca->mi.nbuckets * sizeof(struct bucket));

	free_percpu(ca->usage[0]);
}
1307 | ||
1308 | int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) | |
1309 | { | |
9ca53b55 | 1310 | if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) |
1c6fdbd8 KO |
1311 | return -ENOMEM; |
1312 | ||
1313 | return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; | |
1314 | } |