// SPDX-License-Identifier: GPL-2.0
/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 *
 * Bucket states:
 * - free bucket: mark == 0
 *   The bucket contains no data and will not be read
 *
 * - allocator bucket: owned_by_allocator == 1
 *   The bucket is on a free list, or it is an open bucket
 *
 * - cached bucket: owned_by_allocator == 0 &&
 *                  dirty_sectors == 0 &&
 *                  cached_sectors > 0
 *   The bucket contains data but may be safely discarded as there are
 *   enough replicas of the data on other cache devices, or it has been
 *   written back to the backing device
 *
 * - dirty bucket: owned_by_allocator == 0 &&
 *                 dirty_sectors > 0
 *   The bucket contains data that we must not discard (either the only copy,
 *   or one of the 'main copies' for data requiring multiple replicas)
 *
 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
 *   This is a btree node, journal or gen/prio bucket
 *
 * Lifecycle:
 *
 * bucket invalidated => bucket on freelist => open bucket =>
 *     [dirty bucket =>] cached bucket => bucket invalidated => ...
 *
 * Note that cache promotion can skip the dirty bucket step, as data
 * is copied from a deeper tier to a shallower tier, onto a cached
 * bucket.
 * Note also that a cached bucket can spontaneously become dirty --
 * see below.
 *
 * Only a traversal of the key space can determine whether a bucket is
 * truly dirty or cached.
 *
 * Transitions:
 *
 * - free => allocator: bucket was invalidated
 * - cached => allocator: bucket was invalidated
 *
 * - allocator => dirty: open bucket was filled up
 * - allocator => cached: open bucket was filled up
 * - allocator => metadata: metadata was allocated
 *
 * - dirty => cached: dirty sectors were copied to a deeper tier
 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
 * - cached => free: cached sectors were overwritten
 *
 * - metadata => free: metadata was freed
 *
 * Oddities:
 * - cached => dirty: a device was removed so formerly replicated data
 *                    is no longer sufficiently replicated
 * - free => cached: cannot happen
 * - free => dirty: cannot happen
 * - free => metadata: cannot happen
 */
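
/*
 * These states appear to be encoded by the owned_by_allocator, data_type,
 * dirty_sectors and cached_sectors fields of struct bucket_mark; helpers
 * such as is_available_bucket(), used below, test those fields when deciding
 * whether a bucket may be reused.
 */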

#include "bcachefs.h"
#include "alloc_background.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "ec.h"
#include "error.h"
#include "movinggc.h"
#include "replicas.h"
#include "trace.h"

#include <linux/preempt.h>

/*
 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
 * wraparound:
 */
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
	u64 journal_seq = atomic64_read(&c->journal.seq);
	u16 last_seq_ondisk = c->journal.last_seq_ondisk;
	struct bch_dev *ca;
	struct bucket_array *buckets;
	struct bucket *g;
	struct bucket_mark m;
	unsigned i;

	if (journal_seq - c->last_bucket_seq_cleanup <
	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
		return;

	c->last_bucket_seq_cleanup = journal_seq;

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets) {
			bucket_cmpxchg(g, m, ({
				if (!m.journal_seq_valid ||
				    bucket_needs_journal_commit(m, last_seq_ondisk))
					break;

				m.journal_seq_valid = 0;
			}));
		}
		up_read(&ca->bucket_lock);
	}
}

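/*
 * Rebuild the summarized totals (fs_usage->s) from the persistent reserved
 * and per-replicas-entry counters, presumably at mount time after the on-disk
 * usage has been read in:
 */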
void bch2_fs_usage_initialize(struct bch_fs *c)
{
	struct bch_fs_usage *usage;
	unsigned i, nr;

	percpu_down_write(&c->mark_lock);
	nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr;
	usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr);

	for (i = 0; i < BCH_REPLICAS_MAX; i++)
		usage->s.reserved += usage->persistent_reserved[i];

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		switch (e->data_type) {
		case BCH_DATA_BTREE:
		case BCH_DATA_USER:
			usage->s.data += usage->data[i];
			break;
		case BCH_DATA_CACHED:
			usage->s.cached += usage->data[i];
			break;
		}
	}

	percpu_up_write(&c->mark_lock);
}

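/*
 * Sum a set of percpu counters into a single on-stack copy; the percpu
 * counters themselves are not modified:
 */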
#define bch2_usage_read_raw(_stats)					\
({									\
	typeof(*this_cpu_ptr(_stats)) _acc;				\
									\
	memset(&_acc, 0, sizeof(_acc));					\
	acc_u64s_percpu((u64 *) &_acc,					\
			(u64 __percpu *) _stats,			\
			sizeof(_acc) / sizeof(u64));			\
									\
	_acc;								\
})

struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
	return bch2_usage_read_raw(ca->usage[0]);
}

struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
	struct bch_fs_usage *ret;
	unsigned nr = READ_ONCE(c->replicas.nr);
retry:
	ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
	if (unlikely(!ret))
		return NULL;

	percpu_down_read(&c->mark_lock);

	if (unlikely(nr < c->replicas.nr)) {
		nr = c->replicas.nr;
		percpu_up_read(&c->mark_lock);
		kfree(ret);
		goto retry;
	}

	acc_u64s_percpu((u64 *) ret,
			(u64 __percpu *) c->usage[0],
			sizeof(*ret) / sizeof(u64) + nr);

	return ret;
}

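/*
 * With RESERVE_FACTOR = 6, reserve_factor(r) inflates r to roughly r * 65/64
 * and avail_factor(r) scales back by 64/65 - i.e. about 1/64th of capacity is
 * held back (e.g. reserve_factor(6400) == 6500, avail_factor(6500) == 6400).
 */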
#define RESERVE_FACTOR	6

static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}

static u64 avail_factor(u64 r)
{
	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}

u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
{
	return min(fs_usage.s.hidden +
		   fs_usage.s.data +
		   reserve_factor(fs_usage.s.reserved +
				  fs_usage.s.online_reserved),
		   c->capacity);
}

struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_summarized usage =
		bch2_usage_read_raw(&c->usage[0]->s);
	struct bch_fs_usage_short ret;

	ret.capacity = READ_ONCE(c->capacity) - usage.hidden;
	ret.used = min(ret.capacity, usage.data +
		       reserve_factor(usage.reserved +
				      usage.online_reserved));
	ret.nr_inodes = usage.nr_inodes;

	return ret;
}

static inline int is_unavailable_bucket(struct bucket_mark m)
{
	return !is_available_bucket(m);
}

static inline int is_fragmented_bucket(struct bucket_mark m,
				       struct bch_dev *ca)
{
	if (!m.owned_by_allocator &&
	    m.data_type == BCH_DATA_USER &&
	    bucket_sectors_used(m))
		return max_t(int, 0, (int) ca->mi.bucket_size -
			     bucket_sectors_used(m));
	return 0;
}

static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
	return m.cached_sectors && !m.dirty_sectors
		? BCH_DATA_CACHED
		: m.data_type;
}

static bool bucket_became_unavailable(struct bucket_mark old,
				      struct bucket_mark new)
{
	return is_available_bucket(old) &&
	       !is_available_bucket(new);
}

int bch2_fs_usage_apply(struct bch_fs *c,
			struct bch_fs_usage *fs_usage,
			struct disk_reservation *disk_res)
{
	s64 added = fs_usage->s.data + fs_usage->s.reserved;
	s64 should_not_have_added;
	int ret = 0;

	percpu_rwsem_assert_held(&c->mark_lock);

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
	if (WARN_ONCE(should_not_have_added > 0,
		      "disk usage increased without a reservation")) {
		atomic64_sub(should_not_have_added, &c->sectors_available);
		added -= should_not_have_added;
		ret = -1;
	}

	if (added > 0) {
		disk_res->sectors -= added;
		fs_usage->s.online_reserved -= added;
	}

	preempt_disable();
	acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
		 (u64 *) fs_usage,
		 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
	preempt_enable();

	return ret;
}

static inline void account_bucket(struct bch_fs_usage *fs_usage,
				  struct bch_dev_usage *dev_usage,
				  enum bch_data_type type,
				  int nr, s64 size)
{
	if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
		fs_usage->s.hidden += size;

	dev_usage->buckets[type] += nr;
}

static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
				  struct bch_fs_usage *fs_usage,
				  struct bucket_mark old, struct bucket_mark new,
				  bool gc)
{
	struct bch_dev_usage *dev_usage;

	percpu_rwsem_assert_held(&c->mark_lock);

	bch2_fs_inconsistent_on(old.data_type && new.data_type &&
				old.data_type != new.data_type, c,
		"different types of data in same bucket: %s, %s",
		bch2_data_types[old.data_type],
		bch2_data_types[new.data_type]);

	preempt_disable();
	dev_usage = this_cpu_ptr(ca->usage[gc]);

	if (bucket_type(old))
		account_bucket(fs_usage, dev_usage, bucket_type(old),
			       -1, -ca->mi.bucket_size);

	if (bucket_type(new))
		account_bucket(fs_usage, dev_usage, bucket_type(new),
			       1, ca->mi.bucket_size);

	dev_usage->buckets_alloc +=
		(int) new.owned_by_allocator - (int) old.owned_by_allocator;
	dev_usage->buckets_ec +=
		(int) new.stripe - (int) old.stripe;
	dev_usage->buckets_unavailable +=
		is_unavailable_bucket(new) - is_unavailable_bucket(old);

	dev_usage->sectors[old.data_type] -= old.dirty_sectors;
	dev_usage->sectors[new.data_type] += new.dirty_sectors;
	dev_usage->sectors[BCH_DATA_CACHED] +=
		(int) new.cached_sectors - (int) old.cached_sectors;
	dev_usage->sectors_fragmented +=
		is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
	preempt_enable();

	if (!is_available_bucket(old) && is_available_bucket(new))
		bch2_wake_allocator(ca);
}

void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_mark old = { .v.counter = 0 };
	struct bch_fs_usage *fs_usage;
	struct bucket_array *buckets;
	struct bucket *g;

	percpu_down_read(&c->mark_lock);
	fs_usage = this_cpu_ptr(c->usage[0]);
	buckets = bucket_array(ca);

	for_each_bucket(g, buckets)
		if (g->mark.data_type)
			bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
	percpu_up_read(&c->mark_lock);
}

#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr)		\
({									\
	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);	\
									\
	bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc);		\
	_old;								\
})

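/*
 * Account @sectors against the replicas entry @r: the entry must already
 * exist in c->replicas (hence the BUG_ON() on a negative index), and the
 * summarized data/cached totals are updated to match.
 */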
static inline void update_replicas(struct bch_fs *c,
				   struct bch_fs_usage *fs_usage,
				   struct bch_replicas_entry *r,
				   s64 sectors)
{
	int idx = bch2_replicas_entry_idx(c, r);

	BUG_ON(idx < 0);
	BUG_ON(!sectors);

	if (r->data_type == BCH_DATA_CACHED)
		fs_usage->s.cached += sectors;
	else
		fs_usage->s.data += sectors;
	fs_usage->data[idx] += sectors;
}

static inline void update_cached_sectors(struct bch_fs *c,
					 struct bch_fs_usage *fs_usage,
					 unsigned dev, s64 sectors)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	update_replicas(c, fs_usage, &r.e, sectors);
}

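/*
 * Run a mark update against the in-memory usage counters: the normal
 * (usage[0]) copy is updated when BCH_BUCKET_MARK_GC is not set, and the
 * gc (usage[1]) copy is updated when it is set, or when gc has already
 * visited @pos and so needs to see the update as well.
 */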
#define do_mark_fn(fn, c, pos, flags, ...)				\
({									\
	int gc, ret = 0;						\
									\
	percpu_rwsem_assert_held(&c->mark_lock);			\
									\
	for (gc = 0; gc < 2 && !ret; gc++)				\
		if (!gc == !(flags & BCH_BUCKET_MARK_GC) ||		\
		    (gc && gc_visited(c, pos)))				\
			ret = fn(c, __VA_ARGS__, gc);			\
	ret;								\
})

static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
				    size_t b, struct bucket_mark *ret,
				    bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		BUG_ON(!is_available_bucket(new));

		new.owned_by_allocator = true;
		new.dirty = true;
		new.data_type = 0;
		new.cached_sectors = 0;
		new.dirty_sectors = 0;
		new.gen++;
	}));

	if (old.cached_sectors)
		update_cached_sectors(c, fs_usage, ca->dev_idx,
				      -((s64) old.cached_sectors));

	if (!gc)
		*ret = old;
	return 0;
}

void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, struct bucket_mark *old)
{
	do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
		   ca, b, old);

	if (!old->owned_by_allocator && old->cached_sectors)
		trace_invalidate(ca, bucket_to_sector(ca, b),
				 old->cached_sectors);
}

static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
				    size_t b, bool owned_by_allocator,
				    bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		new.owned_by_allocator = owned_by_allocator;
	}));

	BUG_ON(!gc &&
	       !owned_by_allocator && !old.owned_by_allocator);

	return 0;
}

void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, bool owned_by_allocator,
			    struct gc_pos pos, unsigned flags)
{
	do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
		   ca, b, owned_by_allocator);
}

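/*
 * Mark a bucket from a KEY_TYPE_alloc key: per the BUG_ON()s below this is
 * currently only reached from the bucket invalidate path, which writes out an
 * alloc key for the freshly invalidated (empty, allocator-owned) bucket.
 */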
static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
			   bool inserting,
			   struct bch_fs_usage *fs_usage,
			   unsigned journal_seq, unsigned flags,
			   bool gc)
{
	struct bkey_alloc_unpacked u;
	struct bch_dev *ca;
	struct bucket *g;
	struct bucket_mark old, m;

	if (!inserting)
		return 0;

	/*
	 * alloc btree is read in by bch2_alloc_read, not gc:
	 */
	if (flags & BCH_BUCKET_MARK_GC)
		return 0;

	u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
	ca = bch_dev_bkey_exists(c, k.k->p.inode);
	g = __bucket(ca, k.k->p.offset, gc);

	/*
	 * this should currently only be getting called from the bucket
	 * invalidate path:
	 */
	BUG_ON(u.dirty_sectors);
	BUG_ON(u.cached_sectors);
	BUG_ON(!g->mark.owned_by_allocator);

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
		m.gen = u.gen;
		m.data_type = u.data_type;
		m.dirty_sectors = u.dirty_sectors;
		m.cached_sectors = u.cached_sectors;
	}));

	g->io_time[READ] = u.read_time;
	g->io_time[WRITE] = u.write_time;
	g->oldest_gen = u.oldest_gen;
	g->gen_valid = 1;

	if (old.cached_sectors) {
		update_cached_sectors(c, fs_usage, ca->dev_idx,
				      -old.cached_sectors);
		trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
				 old.cached_sectors);
	}

	return 0;
}

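/*
 * Add @b to @a, BUG()ing on overflow - @a is typically a narrow bitfield, so
 * comparing against the wider unsigned temporary also catches truncation:
 */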
#define checked_add(a, b)					\
do {								\
	unsigned _res = (unsigned) (a) + (b);			\
	(a) = _res;						\
	BUG_ON((a) != _res);					\
} while (0)

static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
				       size_t b, enum bch_data_type type,
				       unsigned sectors, bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark new;

	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		new.dirty = true;
		new.data_type = type;
		checked_add(new.dirty_sectors, sectors);
	}));

	return 0;
}

void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
			       size_t b, enum bch_data_type type,
			       unsigned sectors, struct gc_pos pos,
			       unsigned flags)
{
	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	preempt_disable();

	if (likely(c)) {
		do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
			   ca, b, type, sectors);
	} else {
		struct bucket *g;
		struct bucket_mark new;

		rcu_read_lock();

		g = bucket(ca, b);
		bucket_cmpxchg(g, new, ({
			new.dirty = true;
			new.data_type = type;
			checked_add(new.dirty_sectors, sectors);
		}));

		rcu_read_unlock();
	}

	preempt_enable();
}

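/*
 * Convert a change in extent size (in sectors) into the change in on-disk
 * sectors for a single pointer, via __ptr_disk_sectors(): a positive delta
 * sizes a new extent, a negative delta shrinks the pointer's live size.
 */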
static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
				  s64 delta)
{
	if (delta > 0) {
		/*
		 * marking a new extent, which _will have size_ @delta
		 *
		 * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
		 * case, we haven't actually created the key we'll be inserting
		 * yet (for the split) - so we don't want to be using
		 * k->size/crc.live_size here:
		 */
		return __ptr_disk_sectors(p, delta);
	} else {
		BUG_ON(-delta > p.crc.live_size);

		return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
			(s64) ptr_disk_sectors(p);
	}
}

/*
 * Checking against gc's position has to be done here, inside the cmpxchg()
 * loop, to avoid racing with the start of gc clearing all the marks - GC does
 * that with the gc pos seqlock held.
 */
static bool bch2_mark_pointer(struct bch_fs *c,
			      struct extent_ptr_decoded p,
			      s64 sectors, enum bch_data_type data_type,
			      struct bch_fs_usage *fs_usage,
			      unsigned journal_seq, unsigned flags,
			      bool gc)
{
	struct bucket_mark old, new;
	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
	size_t b = PTR_BUCKET_NR(ca, &p.ptr);
	struct bucket *g = __bucket(ca, b, gc);
	u64 v;

	v = atomic64_read(&g->_mark.v);
	do {
		new.v.counter = old.v.counter = v;

		new.dirty = true;

		/*
		 * Check this after reading bucket mark to guard against
		 * the allocator invalidating a bucket after we've already
		 * checked the gen
		 */
		if (gen_after(new.gen, p.ptr.gen)) {
			BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
			EBUG_ON(!p.ptr.cached &&
				test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
			return true;
		}

		if (!p.ptr.cached)
			checked_add(new.dirty_sectors, sectors);
		else
			checked_add(new.cached_sectors, sectors);

		if (!new.dirty_sectors &&
		    !new.cached_sectors) {
			new.data_type = 0;

			if (journal_seq) {
				new.journal_seq_valid = 1;
				new.journal_seq = journal_seq;
			}
		} else {
			new.data_type = data_type;
		}

		if (flags & BCH_BUCKET_MARK_NOATOMIC) {
			g->_mark = new;
			break;
		}
	} while ((v = atomic64_cmpxchg(&g->_mark.v,
				       old.v.counter,
				       new.v.counter)) != old.v.counter);

	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);

	BUG_ON(!gc && bucket_became_unavailable(old, new));

	return false;
}

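/*
 * Account sectors referenced through an erasure coded (stripe) pointer:
 * parity overhead is added proportionally, the stripe's per-block sector
 * counts are updated under ec_stripes_heap_lock, and the sectors are charged
 * against the stripe's own replicas entry.
 */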
static int bch2_mark_stripe_ptr(struct bch_fs *c,
				struct bch_extent_stripe_ptr p,
				enum bch_data_type data_type,
				struct bch_fs_usage *fs_usage,
				s64 sectors, unsigned flags,
				bool gc)
{
	struct stripe *m;
	unsigned old, new, nr_data;
	int blocks_nonempty_delta;
	s64 parity_sectors;

	BUG_ON(!sectors);

	m = genradix_ptr(&c->stripes[gc], p.idx);

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || !m->alive) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
				    (u64) p.idx);
		return -1;
	}

	BUG_ON(m->r.e.data_type != data_type);

	nr_data = m->nr_blocks - m->nr_redundant;

	parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);

	if (sectors < 0)
		parity_sectors = -parity_sectors;
	sectors += parity_sectors;

	old = m->block_sectors[p.block];
	m->block_sectors[p.block] += sectors;
	new = m->block_sectors[p.block];

	blocks_nonempty_delta = (int) !!new - (int) !!old;
	if (blocks_nonempty_delta) {
		m->blocks_nonempty += blocks_nonempty_delta;

		if (!gc)
			bch2_stripes_heap_update(c, m, p.idx);
	}

	m->dirty = true;

	spin_unlock(&c->ec_stripes_heap_lock);

	update_replicas(c, fs_usage, &m->r.e, sectors);

	return 0;
}

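/*
 * Mark an extent (or btree pointer): each pointer's bucket is updated, cached
 * pointers are accounted as cached sectors, and dirty pointers are accumulated
 * into a replicas entry which is charged once at the end.
 */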
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
			    s64 sectors, enum bch_data_type data_type,
			    struct bch_fs_usage *fs_usage,
			    unsigned journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	s64 dirty_sectors = 0;
	unsigned i;
	int ret;

	r.e.data_type = data_type;
	r.e.nr_devs = 0;
	r.e.nr_required = 1;

	BUG_ON(!sectors);

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = data_type == BCH_DATA_BTREE
			? sectors
			: ptr_disk_sectors_delta(p, sectors);
		bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
					       fs_usage, journal_seq, flags, gc);

		if (p.ptr.cached) {
			if (disk_sectors && !stale)
				update_cached_sectors(c, fs_usage, p.ptr.dev,
						      disk_sectors);
		} else if (!p.ec_nr) {
			dirty_sectors += disk_sectors;
			r.e.devs[r.e.nr_devs++] = p.ptr.dev;
		} else {
			for (i = 0; i < p.ec_nr; i++) {
				ret = bch2_mark_stripe_ptr(c, p.ec[i],
						data_type, fs_usage,
						disk_sectors, flags, gc);
				if (ret)
					return ret;
			}

			r.e.nr_required = 0;
		}
	}

	if (dirty_sectors)
		update_replicas(c, fs_usage, &r.e, dirty_sectors);

	return 0;
}

static void bucket_set_stripe(struct bch_fs *c,
			      const struct bch_stripe *v,
			      bool enabled,
			      struct bch_fs_usage *fs_usage,
			      u64 journal_seq,
			      bool gc)
{
	unsigned i;

	for (i = 0; i < v->nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		size_t b = PTR_BUCKET_NR(ca, ptr);
		struct bucket *g = __bucket(ca, b, gc);
		struct bucket_mark new, old;

		BUG_ON(ptr_stale(ca, ptr));

		old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
			new.dirty = true;
			new.stripe = enabled;
			if (journal_seq) {
				new.journal_seq_valid = 1;
				new.journal_seq = journal_seq;
			}
		}));
	}
}

static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
			    bool inserting,
			    struct bch_fs_usage *fs_usage,
			    u64 journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	size_t idx = s.k->p.offset;
	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
	unsigned i;

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || (!inserting && !m->alive)) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
				    idx);
		return -1;
	}

	if (m->alive)
		bch2_stripes_heap_del(c, m, idx);

	memset(m, 0, sizeof(*m));

	if (inserting) {
		m->sectors = le16_to_cpu(s.v->sectors);
		m->algorithm = s.v->algorithm;
		m->nr_blocks = s.v->nr_blocks;
		m->nr_redundant = s.v->nr_redundant;

		memset(&m->r, 0, sizeof(m->r));

		m->r.e.data_type = BCH_DATA_USER;
		m->r.e.nr_devs = s.v->nr_blocks;
		m->r.e.nr_required = s.v->nr_blocks - s.v->nr_redundant;

		for (i = 0; i < s.v->nr_blocks; i++)
			m->r.e.devs[i] = s.v->ptrs[i].dev;

		/*
		 * XXX: account for stripes somehow here
		 */
#if 0
		update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif

		/* gc recalculates these fields: */
		if (!(flags & BCH_BUCKET_MARK_GC)) {
			for (i = 0; i < s.v->nr_blocks; i++) {
				m->block_sectors[i] =
					stripe_blockcount_get(s.v, i);
				m->blocks_nonempty += !!m->block_sectors[i];
			}
		}

		if (!gc)
			bch2_stripes_heap_insert(c, m, idx);
		else
			m->alive = true;
	}

	spin_unlock(&c->ec_stripes_heap_lock);

	bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
	return 0;
}

static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
			   bool inserting, s64 sectors,
			   struct bch_fs_usage *fs_usage,
			   unsigned journal_seq, unsigned flags,
			   bool gc)
{
	if (!fs_usage || gc)
		fs_usage = this_cpu_ptr(c->usage[gc]);

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		return bch2_mark_alloc(c, k, inserting,
				fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_btree_ptr:
		return bch2_mark_extent(c, k, inserting
				?  c->opts.btree_node_size
				: -c->opts.btree_node_size,
				BCH_DATA_BTREE,
				fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_extent:
		return bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
				fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_stripe:
		return bch2_mark_stripe(c, k, inserting,
				fs_usage, journal_seq, flags, gc);
	case KEY_TYPE_inode:
		if (inserting)
			fs_usage->s.nr_inodes++;
		else
			fs_usage->s.nr_inodes--;
		return 0;
	case KEY_TYPE_reservation: {
		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;

		sectors *= replicas;
		replicas = clamp_t(unsigned, replicas, 1,
				   ARRAY_SIZE(fs_usage->persistent_reserved));

		fs_usage->s.reserved += sectors;
		fs_usage->persistent_reserved[replicas - 1] += sectors;
		return 0;
	}
	default:
		return 0;
	}
}

int bch2_mark_key_locked(struct bch_fs *c,
			 struct bkey_s_c k,
			 bool inserting, s64 sectors,
			 struct gc_pos pos,
			 struct bch_fs_usage *fs_usage,
			 u64 journal_seq, unsigned flags)
{
	return do_mark_fn(__bch2_mark_key, c, pos, flags,
			  k, inserting, sectors, fs_usage,
			  journal_seq, flags);
}

int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
		  bool inserting, s64 sectors,
		  struct gc_pos pos,
		  struct bch_fs_usage *fs_usage,
		  u64 journal_seq, unsigned flags)
{
	int ret;

	percpu_down_read(&c->mark_lock);
	ret = bch2_mark_key_locked(c, k, inserting, sectors,
				   pos, fs_usage, journal_seq, flags);
	percpu_up_read(&c->mark_lock);

	return ret;
}

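/*
 * Mark a key being inserted by the btree update path: the new key is marked,
 * then the node iterator is walked to unmark (or trim) the portions of
 * existing keys that the insert overwrites, all against a scratch
 * bch_fs_usage that is applied to the transaction's disk reservation at the
 * end.
 */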
void bch2_mark_update(struct btree_insert *trans,
		      struct btree_insert_entry *insert)
{
	struct bch_fs *c = trans->c;
	struct btree_iter *iter = insert->iter;
	struct btree *b = iter->l[0].b;
	struct btree_node_iter node_iter = iter->l[0].iter;
	struct bch_fs_usage *fs_usage;
	struct gc_pos pos = gc_pos_btree_node(b);
	struct bkey_packed *_k;
	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	static int warned_disk_usage = 0;

	if (!btree_node_type_needs_gc(iter->btree_id))
		return;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	fs_usage = bch2_fs_usage_get_scratch(c);

	if (!(trans->flags & BTREE_INSERT_NOMARK))
		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
			bpos_min(insert->k->k.p, b->key.k.p).offset -
			bkey_start_offset(&insert->k->k),
			pos, fs_usage, trans->journal_res.seq, 0);

	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
						      KEY_TYPE_discard))) {
		struct bkey unpacked;
		struct bkey_s_c k;
		s64 sectors = 0;

		k = bkey_disassemble(b, _k, &unpacked);

		if (btree_node_is_extents(b)
		    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
		    : bkey_cmp(insert->k->k.p, k.k->p))
			break;

		if (btree_node_is_extents(b)) {
			switch (bch2_extent_overlap(&insert->k->k, k.k)) {
			case BCH_EXTENT_OVERLAP_ALL:
				sectors = -((s64) k.k->size);
				break;
			case BCH_EXTENT_OVERLAP_BACK:
				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			case BCH_EXTENT_OVERLAP_FRONT:
				sectors = bkey_start_offset(k.k) -
					insert->k->k.p.offset;
				break;
			case BCH_EXTENT_OVERLAP_MIDDLE:
				sectors = k.k->p.offset - insert->k->k.p.offset;
				BUG_ON(sectors <= 0);

				bch2_mark_key_locked(c, k, true, sectors,
					pos, fs_usage, trans->journal_res.seq, 0);

				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			}

			BUG_ON(sectors >= 0);
		}

		bch2_mark_key_locked(c, k, false, sectors,
			pos, fs_usage, trans->journal_res.seq, 0);

		bch2_btree_node_iter_advance(&node_iter, b);
	}

	if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) &&
	    !warned_disk_usage &&
	    !xchg(&warned_disk_usage, 1)) {
		char buf[200];

		pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);

		pr_err("while inserting");
		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
		pr_err("%s", buf);
		pr_err("overlapping with");

		node_iter = iter->l[0].iter;
		while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
							      KEY_TYPE_discard))) {
			struct bkey unpacked;
			struct bkey_s_c k;

			k = bkey_disassemble(b, _k, &unpacked);

			if (btree_node_is_extents(b)
			    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
			    : bkey_cmp(insert->k->k.p, k.k->p))
				break;

			bch2_bkey_val_to_text(&PBUF(buf), c, k);
			pr_err("%s", buf);

			bch2_btree_node_iter_advance(&node_iter, b);
		}
	}

	preempt_enable();
	percpu_up_read(&c->mark_lock);
}

/* Disk reservations: */

static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
	percpu_u64_set(&c->pcpu->sectors_available, 0);

	return avail_factor(bch2_fs_sectors_free(c));
}

void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
	percpu_down_read(&c->mark_lock);
	this_cpu_sub(c->usage[0]->s.online_reserved, res->sectors);
	percpu_up_read(&c->mark_lock);

	res->sectors = 0;
}

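/*
 * Reservations are taken from a per-cpu cache of sectors, refilled from
 * c->sectors_available in batches of SECTORS_CACHE, so the common case never
 * touches the shared atomic; only when neither the per-cpu cache nor the
 * global pool can satisfy the request do we take mark_lock for writing and
 * recalculate.
 */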
#define SECTORS_CACHE	1024

int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
			      unsigned sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, v, get;
	s64 sectors_available;
	int ret;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	pcpu = this_cpu_ptr(c->pcpu);

	if (sectors <= pcpu->sectors_available)
		goto out;

	v = atomic64_read(&c->sectors_available);
	do {
		old = v;
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			preempt_enable();
			percpu_up_read(&c->mark_lock);
			goto recalculate;
		}
	} while ((v = atomic64_cmpxchg(&c->sectors_available,
				       old, old - get)) != old);

	pcpu->sectors_available += get;

out:
	pcpu->sectors_available -= sectors;
	this_cpu_add(c->usage[0]->s.online_reserved, sectors);
	res->sectors += sectors;

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return 0;

recalculate:
	percpu_down_write(&c->mark_lock);

	sectors_available = bch2_recalc_sectors_available(c);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(c->usage[0]->s.online_reserved, sectors);
		res->sectors += sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -ENOSPC;
	}

	percpu_up_write(&c->mark_lock);

	return ret;
}

/* Startup/shutdown: */

static void buckets_free_rcu(struct rcu_head *rcu)
{
	struct bucket_array *buckets =
		container_of(rcu, struct bucket_array, rcu);

	kvpfree(buckets,
		sizeof(struct bucket_array) +
		buckets->nbuckets * sizeof(struct bucket));
}

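/*
 * Resize a device's bucket array: the new arrays and freelists are allocated
 * up front, then swapped in under gc_lock, bucket_lock and mark_lock, so that
 * readers still using the old array via RCU remain safe (hence
 * buckets_free_rcu for the old array).
 */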
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_array *buckets = NULL, *old_buckets = NULL;
	unsigned long *buckets_nouse = NULL;
	unsigned long *buckets_written = NULL;
	alloc_fifo free[RESERVE_NR];
	alloc_fifo free_inc;
	alloc_heap alloc_heap;
	copygc_heap copygc_heap;

	size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / c->opts.btree_node_size);
	/* XXX: these should be tunable */
	size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
	size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
	size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
				 btree_reserve * 2);
	bool resize = ca->buckets[0] != NULL,
	     start_copygc = ca->copygc_thread != NULL;
	int ret = -ENOMEM;
	unsigned i;

	memset(&free, 0, sizeof(free));
	memset(&free_inc, 0, sizeof(free_inc));
	memset(&alloc_heap, 0, sizeof(alloc_heap));
	memset(&copygc_heap, 0, sizeof(copygc_heap));

	if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
				  nbuckets * sizeof(struct bucket),
				  GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
					sizeof(unsigned long),
					GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) *
					  sizeof(unsigned long),
					  GFP_KERNEL|__GFP_ZERO)) ||
	    !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_MOVINGGC],
		       copygc_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
	    !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
	    !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
	    !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
		goto err;

	buckets->first_bucket = ca->mi.first_bucket;
	buckets->nbuckets = nbuckets;

	bch2_copygc_stop(ca);

	if (resize) {
		down_write(&c->gc_lock);
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_buckets = bucket_array(ca);

	if (resize) {
		size_t n = min(buckets->nbuckets, old_buckets->nbuckets);

		memcpy(buckets->b,
		       old_buckets->b,
		       n * sizeof(struct bucket));
		memcpy(buckets_nouse,
		       ca->buckets_nouse,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
		memcpy(buckets_written,
		       ca->buckets_written,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
	}

	rcu_assign_pointer(ca->buckets[0], buckets);
	buckets = old_buckets;

	swap(ca->buckets_nouse, buckets_nouse);
	swap(ca->buckets_written, buckets_written);

	if (resize)
		percpu_up_write(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	for (i = 0; i < RESERVE_NR; i++) {
		fifo_move(&free[i], &ca->free[i]);
		swap(ca->free[i], free[i]);
	}
	fifo_move(&free_inc, &ca->free_inc);
	swap(ca->free_inc, free_inc);
	spin_unlock(&c->freelist_lock);

	/* with gc lock held, alloc_heap can't be in use: */
	swap(ca->alloc_heap, alloc_heap);

	/* and we shut down copygc: */
	swap(ca->copygc_heap, copygc_heap);

	nbuckets = ca->mi.nbuckets;

	if (resize) {
		up_write(&ca->bucket_lock);
		up_write(&c->gc_lock);
	}

	if (start_copygc &&
	    bch2_copygc_start(c, ca))
		bch_err(ca, "error restarting copygc thread");

	ret = 0;
err:
	free_heap(&copygc_heap);
	free_heap(&alloc_heap);
	free_fifo(&free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&free[i]);
	kvpfree(buckets_nouse,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	kvpfree(buckets_written,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	if (buckets)
		call_rcu(&old_buckets->rcu, buckets_free_rcu);

	return ret;
}

void bch2_dev_buckets_free(struct bch_dev *ca)
{
	unsigned i;

	free_heap(&ca->copygc_heap);
	free_heap(&ca->alloc_heap);
	free_fifo(&ca->free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);
	kvpfree(ca->buckets_written,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(ca->buckets_nouse,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
		sizeof(struct bucket_array) +
		ca->mi.nbuckets * sizeof(struct bucket));

	free_percpu(ca->usage[0]);
}

int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
		return -ENOMEM;

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}