bcachefs: Fix a bug when shutting down before allocator started
fs/bcachefs/alloc_background.c (linux-block.git)
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "journal_io.h"
#include "trace.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>

static const char * const bch2_alloc_field_names[] = {
#define x(name, bytes) #name,
	BCH_ALLOC_FIELDS()
#undef x
	NULL
};

static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);

/* Ratelimiting/PD controllers */

static void pd_controllers_update(struct work_struct *work)
{
	struct bch_fs *c = container_of(to_delayed_work(work),
					struct bch_fs,
					pd_controllers_update);
	struct bch_dev *ca;
	unsigned i;

	for_each_member_device(ca, c, i) {
		struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);

		u64 free = bucket_to_sector(ca,
				__dev_buckets_free(ca, stats)) << 9;
		/*
		 * Bytes of internal fragmentation, which can be
		 * reclaimed by copy GC
		 */
		s64 fragmented = (bucket_to_sector(ca,
					stats.buckets[BCH_DATA_USER] +
					stats.buckets[BCH_DATA_CACHED]) -
				  (stats.sectors[BCH_DATA_USER] +
				   stats.sectors[BCH_DATA_CACHED])) << 9;

		fragmented = max(0LL, fragmented);

		bch2_pd_controller_update(&ca->copygc_pd,
					  free, fragmented, -1);
	}

	schedule_delayed_work(&c->pd_controllers_update,
			      c->pd_controllers_update_seconds * HZ);
}

/* Persistent alloc info: */

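/*
 * Alloc keys use a variable-width encoding: only nonzero fields are stored,
 * and the bch_alloc.fields bitmask records which fields are present.
 * get_alloc_field()/put_alloc_field() walk that encoding one field at a
 * time, advancing *p past each field as it is read or written.
 */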
static inline u64 get_alloc_field(const struct bch_alloc *a,
				  const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	switch (bytes) {
	case 1:
		v = *((const u8 *) *p);
		break;
	case 2:
		v = le16_to_cpup(*p);
		break;
	case 4:
		v = le32_to_cpup(*p);
		break;
	case 8:
		v = le64_to_cpup(*p);
		break;
	default:
		BUG();
	}

	*p += bytes;
	return v;
}

static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
				   unsigned field, u64 v)
{
	unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];

	if (!v)
		return;

	a->v.fields |= 1 << field;

	switch (bytes) {
	case 1:
		*((u8 *) *p) = v;
		break;
	case 2:
		*((__le16 *) *p) = cpu_to_le16(v);
		break;
	case 4:
		*((__le32 *) *p) = cpu_to_le32(v);
		break;
	case 8:
		*((__le64 *) *p) = cpu_to_le64(v);
		break;
	default:
		BUG();
	}

	*p += bytes;
}

static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

	if (k.k->p.inode >= c->sb.nr_devices ||
	    !c->devs[k.k->p.inode])
		return "invalid device";

	/* allow for unknown fields */
	if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
		return "incorrect value size";

	return NULL;
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
			struct bkey_s_c k)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
	const void *d = a.v->data;
	unsigned i;

	pr_buf(out, "gen %u", a.v->gen);

	for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
		if (a.v->fields & (1 << i))
			pr_buf(out, " %s %llu",
			       bch2_alloc_field_names[i],
			       get_alloc_field(a.v, &d, i));
}

static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
{
	const void *d = a->data;
	unsigned idx = 0;

	g->_mark.gen = a->gen;
	g->gen_valid = 1;
	g->io_time[READ] = get_alloc_field(a, &d, idx++);
	g->io_time[WRITE] = get_alloc_field(a, &d, idx++);
	g->_mark.data_type = get_alloc_field(a, &d, idx++);
	g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++);
	g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
}

static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
			      struct bucket_mark m)
{
	unsigned idx = 0;
	void *d = a->v.data;

	a->v.fields = 0;
	a->v.gen = m.gen;

	d = a->v.data;
	put_alloc_field(a, &d, idx++, g->io_time[READ]);
	put_alloc_field(a, &d, idx++, g->io_time[WRITE]);
	put_alloc_field(a, &d, idx++, m.data_type);
	put_alloc_field(a, &d, idx++, m.dirty_sectors);
	put_alloc_field(a, &d, idx++, m.cached_sectors);

	set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
}

static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_dev *ca;
	struct bkey_s_c_alloc a;

	if (k.k->type != KEY_TYPE_alloc)
		return;

	a = bkey_s_c_to_alloc(k);
	ca = bch_dev_bkey_exists(c, a.k->p.inode);

	if (a.k->p.offset >= ca->mi.nbuckets)
		return;

	percpu_down_read(&c->mark_lock);
	__alloc_read_key(bucket(ca, a.k->p.offset), a.v);
	percpu_up_read(&c->mark_lock);
}

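/*
 * At startup, populate the in-memory bucket marks from the alloc btree, plus
 * any newer alloc keys in the journal that haven't been replayed yet:
 */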
int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
{
	struct journal_replay *r;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	unsigned i;
	int ret;

	for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
		bch2_alloc_read_key(c, k);
		bch2_btree_iter_cond_resched(&iter);
	}

	ret = bch2_btree_iter_unlock(&iter);
	if (ret)
		return ret;

	list_for_each_entry(r, journal_replay_list, list) {
		struct bkey_i *k, *n;
		struct jset_entry *entry;

		for_each_jset_key(k, n, entry, &r->j)
			if (entry->btree_id == BTREE_ID_ALLOC)
				bch2_alloc_read_key(c, bkey_i_to_s_c(k));
	}

	for_each_member_device(ca, c, i)
		bch2_dev_usage_from_buckets(c, ca);

	mutex_lock(&c->bucket_clock[READ].lock);
	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		bch2_recalc_oldest_io(c, ca, READ);
		up_read(&ca->bucket_lock);
	}
	mutex_unlock(&c->bucket_clock[READ].lock);

	mutex_lock(&c->bucket_clock[WRITE].lock);
	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		bch2_recalc_oldest_io(c, ca, WRITE);
		up_read(&ca->bucket_lock);
	}
	mutex_unlock(&c->bucket_clock[WRITE].lock);

	return 0;
}

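/*
 * Write out the in-memory alloc info for a single bucket: a no-op if the
 * bucket isn't marked dirty; otherwise the dirty bit is cleared (and the
 * bucket recorded in ca->buckets_written) once the update has been inserted:
 */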
static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
				  size_t b, struct btree_iter *iter,
				  u64 *journal_seq, unsigned flags)
{
#if 0
	__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
#else
	/* hack: */
	__BKEY_PADDED(k, 8) alloc_key;
#endif
	struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
	struct bucket *g;
	struct bucket_mark m, new;
	int ret;

	BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);

	a->k.p = POS(ca->dev_idx, b);

	bch2_btree_iter_set_pos(iter, a->k.p);

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	percpu_down_read(&c->mark_lock);
	g = bucket(ca, b);
	m = READ_ONCE(g->mark);

	if (!m.dirty) {
		percpu_up_read(&c->mark_lock);
		return 0;
	}

	__alloc_write_key(a, g, m);
	percpu_up_read(&c->mark_lock);

	bch2_btree_iter_cond_resched(iter);

	ret = bch2_btree_insert_at(c, NULL, journal_seq,
				   BTREE_INSERT_NOCHECK_RW|
				   BTREE_INSERT_NOFAIL|
				   BTREE_INSERT_USE_RESERVE|
				   BTREE_INSERT_USE_ALLOC_RESERVE|
				   flags,
				   BTREE_INSERT_ENTRY(iter, &a->k_i));
	if (ret)
		return ret;

	new = m;
	new.dirty = false;
	atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);

	if (ca->buckets_written)
		set_bit(b, ca->buckets_written);

	return 0;
}

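/*
 * Journal replay of an alloc key: keys for devices or buckets that no longer
 * exist are skipped, as are buckets the allocator has already rewritten -
 * buckets_written is checked with the btree node locked so we don't race
 * with the allocator:
 */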
int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
	struct bch_dev *ca;
	struct btree_iter iter;
	int ret;

	if (k->k.p.inode >= c->sb.nr_devices ||
	    !c->devs[k->k.p.inode])
		return 0;

	ca = bch_dev_bkey_exists(c, k->k.p.inode);

	if (k->k.p.offset >= ca->mi.nbuckets)
		return 0;

	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
			     BTREE_ITER_INTENT);

	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto err;

	/* check buckets_written with btree node locked: */

	ret = test_bit(k->k.p.offset, ca->buckets_written)
		? 0
		: bch2_btree_insert_at(c, NULL, NULL,
				       BTREE_INSERT_NOFAIL|
				       BTREE_INSERT_JOURNAL_REPLAY,
				       BTREE_INSERT_ENTRY(&iter, k));
err:
	bch2_btree_iter_unlock(&iter);
	return ret;
}

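/* Write out alloc info for every dirty bucket on each rw member device: */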
int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
{
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;

	*wrote = false;

	for_each_rw_member(ca, c, i) {
		struct btree_iter iter;
		struct bucket_array *buckets;
		size_t b;

		bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for (b = buckets->first_bucket;
		     b < buckets->nbuckets;
		     b++) {
			if (!buckets->b[b].mark.dirty)
				continue;

			ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL,
						     nowait
						     ? BTREE_INSERT_NOWAIT
						     : 0);
			if (ret)
				break;

			*wrote = true;
		}
		up_read(&ca->bucket_lock);
		bch2_btree_iter_unlock(&iter);

		if (ret) {
			percpu_ref_put(&ca->io_ref);
			break;
		}
	}

	return ret;
}

/* Bucket IO clocks: */

static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
	struct bucket_clock *clock = &c->bucket_clock[rw];
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket *g;
	u16 max_last_io = 0;
	unsigned i;

	lockdep_assert_held(&c->bucket_clock[rw].lock);

	/* Recalculate max_last_io for this device: */
	for_each_bucket(g, buckets)
		max_last_io = max(max_last_io, bucket_last_io(c, g, rw));

	ca->max_last_bucket_io[rw] = max_last_io;

	/* Recalculate global max_last_io: */
	max_last_io = 0;

	for_each_member_device(ca, c, i)
		max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);

	clock->max_last_io = max_last_io;
}

static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
	struct bucket_clock *clock = &c->bucket_clock[rw];
	struct bucket_array *buckets;
	struct bch_dev *ca;
	struct bucket *g;
	unsigned i;

	trace_rescale_prios(c);

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets)
			g->io_time[rw] = clock->hand -
				bucket_last_io(c, g, rw) / 2;

		bch2_recalc_oldest_io(c, ca, rw);

		up_read(&ca->bucket_lock);
	}
}

static inline u64 bucket_clock_freq(u64 capacity)
{
	return max(capacity >> 10, 2028ULL);
}

static void bch2_inc_clock_hand(struct io_timer *timer)
{
	struct bucket_clock *clock = container_of(timer,
					struct bucket_clock, rescale);
	struct bch_fs *c = container_of(clock,
					struct bch_fs, bucket_clock[clock->rw]);
	struct bch_dev *ca;
	u64 capacity;
	unsigned i;

	mutex_lock(&clock->lock);

	/* if clock cannot be advanced more, rescale prio */
	if (clock->max_last_io >= U16_MAX - 2)
		bch2_rescale_bucket_io_times(c, clock->rw);

	BUG_ON(clock->max_last_io >= U16_MAX - 2);

	for_each_member_device(ca, c, i)
		ca->max_last_bucket_io[clock->rw]++;
	clock->max_last_io++;
	clock->hand++;

	mutex_unlock(&clock->lock);

	capacity = READ_ONCE(c->capacity);

	if (!capacity)
		return;

	/*
	 * we only increment when 0.1% of the filesystem capacity has been read
	 * or written to; this determines if it's time
	 *
	 * XXX: we shouldn't really be going off of the capacity of devices in
	 * RW mode (that will be 0 when we're RO, yet we can still service
	 * reads)
	 */
	timer->expire += bucket_clock_freq(capacity);

	bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}

static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
	struct bucket_clock *clock = &c->bucket_clock[rw];

	clock->hand = 1;
	clock->rw = rw;
	clock->rescale.fn = bch2_inc_clock_hand;
	clock->rescale.expire = bucket_clock_freq(c->capacity);
	mutex_init(&clock->lock);
}

/* Background allocator thread: */

/*
 * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
 * (marking them as invalidated on disk), then optionally issues discard
 * commands to the newly free buckets, then puts them on the various freelists.
 */

#define BUCKET_GC_GEN_MAX	96U

/**
 * wait_buckets_available - wait on reclaimable buckets
 *
 * If there aren't enough available buckets to fill up free_inc, wait until
 * there are.
 */
static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned long gc_count = c->gc_count;
	int ret = 0;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop()) {
			ret = 1;
			break;
		}

		if (gc_count != c->gc_count)
			ca->inc_gen_really_needs_gc = 0;

		if ((ssize_t) (dev_buckets_available(c, ca) -
			       ca->inc_gen_really_needs_gc) >=
		    (ssize_t) fifo_free(&ca->free_inc))
			break;

		up_read(&c->gc_lock);
		schedule();
		try_to_freeze();
		down_read(&c->gc_lock);
	}

	__set_current_state(TASK_RUNNING);
	return ret;
}

static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
				       size_t bucket,
				       struct bucket_mark mark)
{
	u8 gc_gen;

	if (!is_available_bucket(mark))
		return false;

	if (ca->buckets_nouse &&
	    test_bit(bucket, ca->buckets_nouse))
		return false;

	gc_gen = bucket_gc_gen(ca, bucket);

	if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
		ca->inc_gen_needs_gc++;

	if (gc_gen >= BUCKET_GC_GEN_MAX)
		ca->inc_gen_really_needs_gc++;

	return gc_gen < BUCKET_GC_GEN_MAX;
}

/*
 * Determines what order we're going to reuse buckets, smallest
 * bucket_sort_key() first.
 *
 * - We take into account the read prio of the bucket, which gives us an
 *   indication of how hot the data is -- we scale the prio so that the prio
 *   farthest from the clock is worth 1/8th of the closest.
 *
 * - The number of sectors of cached data in the bucket, which gives us an
 *   indication of the cost in cache misses this eviction will cause.
 *
 * - If hotness * sectors used compares equal, we pick the bucket with the
 *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
 *   number repeatedly forces us to run mark and sweep gc to avoid generation
 *   number wraparound.
 */

static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
				     size_t b, struct bucket_mark m)
{
	unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
	unsigned max_last_io = ca->max_last_bucket_io[READ];

	/*
	 * Time since last read, scaled to [0, 8) where larger value indicates
	 * more recently read data:
	 */
	unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;

	/* How much we want to keep the data in this bucket: */
	unsigned long data_wantness =
		(hotness + 1) * bucket_sectors_used(m);

	unsigned long needs_journal_commit =
		bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);

	return  (data_wantness << 9) |
		(needs_journal_commit << 8) |
		(bucket_gc_gen(ca, b) / 16);
}

static inline int bucket_alloc_cmp(alloc_heap *h,
				   struct alloc_heap_entry l,
				   struct alloc_heap_entry r)
{
	return (l.key > r.key) - (l.key < r.key) ?:
		(l.nr < r.nr) - (l.nr > r.nr) ?:
		(l.bucket > r.bucket) - (l.bucket < r.bucket);
}

static inline int bucket_idx_cmp(const void *_l, const void *_r)
{
	const struct alloc_heap_entry *l = _l, *r = _r;

	return (l->bucket > r->bucket) - (l->bucket < r->bucket);
}

static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets;
	struct alloc_heap_entry e = { 0 };
	size_t b, i, nr = 0;

	ca->alloc_heap.used = 0;

	mutex_lock(&c->bucket_clock[READ].lock);
	down_read(&ca->bucket_lock);

	buckets = bucket_array(ca);

	bch2_recalc_oldest_io(c, ca, READ);

	/*
	 * Find buckets with lowest read priority, by building a maxheap sorted
	 * by read priority and repeatedly replacing the maximum element until
	 * all buckets have been visited.
	 */
	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
		struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
		unsigned long key = bucket_sort_key(c, ca, b, m);

		if (!bch2_can_invalidate_bucket(ca, b, m))
			continue;

		if (e.nr && e.bucket + e.nr == b && e.key == key) {
			e.nr++;
		} else {
			if (e.nr)
				heap_add_or_replace(&ca->alloc_heap, e,
					-bucket_alloc_cmp, NULL);

			e = (struct alloc_heap_entry) {
				.bucket = b,
				.nr	= 1,
				.key	= key,
			};
		}

		cond_resched();
	}

	if (e.nr)
		heap_add_or_replace(&ca->alloc_heap, e,
				-bucket_alloc_cmp, NULL);

	for (i = 0; i < ca->alloc_heap.used; i++)
		nr += ca->alloc_heap.data[i].nr;

	while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
		nr -= ca->alloc_heap.data[0].nr;
		heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
	}

	up_read(&ca->bucket_lock);
	mutex_unlock(&c->bucket_clock[READ].lock);
}

static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket_mark m;
	size_t b, start;

	if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
	    ca->fifo_last_bucket >= ca->mi.nbuckets)
		ca->fifo_last_bucket = ca->mi.first_bucket;

	start = ca->fifo_last_bucket;

	do {
		ca->fifo_last_bucket++;
		if (ca->fifo_last_bucket == ca->mi.nbuckets)
			ca->fifo_last_bucket = ca->mi.first_bucket;

		b = ca->fifo_last_bucket;
		m = READ_ONCE(buckets->b[b].mark);

		if (bch2_can_invalidate_bucket(ca, b, m)) {
			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };

			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
			if (heap_full(&ca->alloc_heap))
				break;
		}

		cond_resched();
	} while (ca->fifo_last_bucket != start);
}

static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket_mark m;
	size_t checked, i;

	for (checked = 0;
	     checked < ca->mi.nbuckets / 2;
	     checked++) {
		size_t b = bch2_rand_range(ca->mi.nbuckets -
					   ca->mi.first_bucket) +
			ca->mi.first_bucket;

		m = READ_ONCE(buckets->b[b].mark);

		if (bch2_can_invalidate_bucket(ca, b, m)) {
			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };

			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
			if (heap_full(&ca->alloc_heap))
				break;
		}

		cond_resched();
	}

	sort(ca->alloc_heap.data,
	     ca->alloc_heap.used,
	     sizeof(ca->alloc_heap.data[0]),
	     bucket_idx_cmp, NULL);

	/* remove duplicates: */
	for (i = 0; i + 1 < ca->alloc_heap.used; i++)
		if (ca->alloc_heap.data[i].bucket ==
		    ca->alloc_heap.data[i + 1].bucket)
			ca->alloc_heap.data[i].nr = 0;
}

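/*
 * Scan for buckets that can be invalidated according to the device's
 * replacement policy, adding them to ca->alloc_heap; returns the number of
 * buckets found:
 */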
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	size_t i, nr = 0;

	ca->inc_gen_needs_gc = 0;

	switch (ca->mi.replacement) {
	case CACHE_REPLACEMENT_LRU:
		find_reclaimable_buckets_lru(c, ca);
		break;
	case CACHE_REPLACEMENT_FIFO:
		find_reclaimable_buckets_fifo(c, ca);
		break;
	case CACHE_REPLACEMENT_RANDOM:
		find_reclaimable_buckets_random(c, ca);
		break;
	}

	heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);

	for (i = 0; i < ca->alloc_heap.used; i++)
		nr += ca->alloc_heap.data[i].nr;

	return nr;
}

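/* Return the next bucket to invalidate from ca->alloc_heap, or -1 if it's empty: */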
static inline long next_alloc_bucket(struct bch_dev *ca)
{
	struct alloc_heap_entry e, *top = ca->alloc_heap.data;

	while (ca->alloc_heap.used) {
		if (top->nr) {
			size_t b = top->bucket;

			top->bucket++;
			top->nr--;
			return b;
		}

		heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
	}

	return -1;
}

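/*
 * Invalidate a single bucket in memory and push it onto free_inc: *flush_seq
 * is updated with the journal sequence number the bucket's mark refers to,
 * and the return value tells the caller whether the bucket held cached data
 * (and thus whether that journal entry needs flushing):
 */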
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
				       size_t bucket, u64 *flush_seq)
{
	struct bucket_mark m;

	percpu_down_read(&c->mark_lock);
	spin_lock(&c->freelist_lock);

	bch2_invalidate_bucket(c, ca, bucket, &m);

	verify_not_on_freelist(c, ca, bucket);
	BUG_ON(!fifo_push(&ca->free_inc, bucket));

	spin_unlock(&c->freelist_lock);

	bucket_io_clock_reset(c, ca, bucket, READ);
	bucket_io_clock_reset(c, ca, bucket, WRITE);

	percpu_up_read(&c->mark_lock);

	if (m.journal_seq_valid) {
		u64 journal_seq = atomic64_read(&c->journal.seq);
		u64 bucket_seq	= journal_seq;

		bucket_seq &= ~((u64) U16_MAX);
		bucket_seq |= m.journal_seq;

		if (bucket_seq > journal_seq)
			bucket_seq -= 1 << 16;

		*flush_seq = max(*flush_seq, bucket_seq);
	}

	return m.cached_sectors != 0;
}

/*
 * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
 */
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	struct btree_iter iter;
	u64 journal_seq = 0;
	int ret = 0;
	long b;

	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	/* Only use nowait if we've already invalidated at least one bucket: */
	while (!ret &&
	       !fifo_full(&ca->free_inc) &&
	       (b = next_alloc_bucket(ca)) >= 0) {
		bool must_flush =
			bch2_invalidate_one_bucket(c, ca, b, &journal_seq);

		ret = __bch2_alloc_write_key(c, ca, b, &iter,
				must_flush ? &journal_seq : NULL,
				!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
	}

	bch2_btree_iter_unlock(&iter);

	/* If we used NOWAIT, don't return the error: */
	if (!fifo_empty(&ca->free_inc))
		ret = 0;
	if (ret) {
		bch_err(ca, "error invalidating buckets: %i", ret);
		return ret;
	}

	if (journal_seq)
		ret = bch2_journal_flush_seq(&c->journal, journal_seq);
	if (ret) {
		bch_err(ca, "journal error: %i", ret);
		return ret;
	}

	return 0;
}

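/*
 * Move one bucket from free_inc to whichever freelist has room, sleeping
 * until space frees up if they're all full; allocator_blocked_full is set
 * while blocked here so bch2_dev_allocator_quiesce() can tell the freelists
 * have been filled:
 */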
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
	unsigned i;
	int ret = 0;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		spin_lock(&c->freelist_lock);
		for (i = 0; i < RESERVE_NR; i++)
			if (fifo_push(&ca->free[i], bucket)) {
				fifo_pop(&ca->free_inc, bucket);

				closure_wake_up(&c->freelist_wait);
				ca->allocator_blocked_full = false;

				spin_unlock(&c->freelist_lock);
				goto out;
			}

		if (!ca->allocator_blocked_full) {
			ca->allocator_blocked_full = true;
			closure_wake_up(&c->freelist_wait);
		}

		spin_unlock(&c->freelist_lock);

		if ((current->flags & PF_KTHREAD) &&
		    kthread_should_stop()) {
			ret = 1;
			break;
		}

		schedule();
		try_to_freeze();
	}
out:
	__set_current_state(TASK_RUNNING);
	return ret;
}

/*
 * Pulls buckets off free_inc, discards them (if enabled), then adds them to
 * freelists, waiting until there's room if necessary:
 */
static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	while (!fifo_empty(&ca->free_inc)) {
		size_t bucket = fifo_peek(&ca->free_inc);

		if (ca->mi.discard &&
		    bdev_max_discard_sectors(ca->disk_sb.bdev))
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket_to_sector(ca, bucket),
					     ca->mi.bucket_size, GFP_NOIO);

		if (push_invalidated_bucket(c, ca, bucket))
			return 1;
	}

	return 0;
}

/**
 * bch2_allocator_thread - move buckets from free_inc to reserves
 *
 * The free_inc FIFO is populated by find_reclaimable_buckets(), and
 * the reserves are depleted by bucket allocation. When we run out
 * of free_inc, try to invalidate some buckets and write out
 * prios and gens.
 */
static int bch2_allocator_thread(void *arg)
{
	struct bch_dev *ca = arg;
	struct bch_fs *c = ca->fs;
	size_t nr;
	int ret;

	set_freezable();

	while (1) {
		cond_resched();

		pr_debug("discarding %zu invalidated buckets",
			 fifo_used(&ca->free_inc));

		ret = discard_invalidated_buckets(c, ca);
		if (ret)
			goto stop;

		down_read(&c->gc_lock);

		ret = bch2_invalidate_buckets(c, ca);
		if (ret) {
			up_read(&c->gc_lock);
			goto stop;
		}

		if (!fifo_empty(&ca->free_inc)) {
			up_read(&c->gc_lock);
			continue;
		}

		pr_debug("free_inc now empty");

		do {
			/*
			 * Find some buckets that we can invalidate, either
			 * they're completely unused, or only contain clean data
			 * that's been written back to the backing device or
			 * another cache tier
			 */

			pr_debug("scanning for reclaimable buckets");

			nr = find_reclaimable_buckets(c, ca);

			pr_debug("found %zu buckets", nr);

			trace_alloc_batch(ca, nr, ca->alloc_heap.size);

			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
			     ca->inc_gen_really_needs_gc) &&
			    c->gc_thread) {
				atomic_inc(&c->kick_gc);
				wake_up_process(c->gc_thread);
			}

			/*
			 * If we found any buckets, we have to invalidate them
			 * before we scan for more - but if we didn't find very
			 * many we may want to wait on more buckets being
			 * available so we don't spin:
			 */
			if (!nr ||
			    (nr < ALLOC_SCAN_BATCH(ca) &&
			     !fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
				ca->allocator_blocked = true;
				closure_wake_up(&c->freelist_wait);

				ret = wait_buckets_available(c, ca);
				if (ret) {
					up_read(&c->gc_lock);
					goto stop;
				}
			}
		} while (!nr);

		ca->allocator_blocked = false;
		up_read(&c->gc_lock);

		pr_debug("%zu buckets to invalidate", nr);

		/*
		 * alloc_heap is now full of newly-invalidated buckets: next,
		 * write out the new bucket gens:
		 */
	}

stop:
	pr_debug("alloc thread stopping (ret %i)", ret);
	return 0;
}

/* Startup/shutdown (ro/rw): */

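/*
 * Recalculate the filesystem's usable capacity from the rw member devices,
 * reserving space per device for copygc and the btree/rebalance write points,
 * plus the global gc reserve:
 */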
void bch2_recalc_capacity(struct bch_fs *c)
{
	struct bch_dev *ca;
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;
	unsigned i, j;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(ca, c, i) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(ca, c, i) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again when
		 * its reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */
		for (j = 0; j < RESERVE_NONE; j++)
			dev_reserve += ca->free[j].size;

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		ca->copygc_threshold = dev_reserve;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	if (c->capacity) {
		bch2_io_timer_add(&c->io_clock[READ],
				  &c->bucket_clock[READ].rescale);
		bch2_io_timer_add(&c->io_clock[WRITE],
				  &c->bucket_clock[WRITE].rescale);
	} else {
		bch2_io_timer_del(&c->io_clock[READ],
				  &c->bucket_clock[READ].rescale);
		bch2_io_timer_del(&c->io_clock[WRITE],
				  &c->bucket_clock[WRITE].rescale);
	}

	/* Wake up in case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->ptr.dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	BUG_ON(ca->alloc_thread);

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
	bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	while (1) {
		struct open_bucket *ob;

		spin_lock(&c->freelist_lock);
		if (!ca->open_buckets_partial_nr) {
			spin_unlock(&c->freelist_lock);
			break;
		}
		ob = c->open_buckets +
			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
		ob->on_partial_list = false;
		spin_unlock(&c->freelist_lock);

		bch2_open_bucket_put(c, ob);
	}

	bch2_ec_stop_dev(c, ca);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

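/* Wait until the device's allocator thread has filled up its freelists: */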
void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
{
	closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
}

/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
	struct task_struct *p;

	p = rcu_dereference_protected(ca->alloc_thread, 1);
	ca->alloc_thread = NULL;

	/*
	 * We need an rcu barrier between setting ca->alloc_thread = NULL and
	 * the thread shutting down to avoid bch2_wake_allocator() racing:
	 *
	 * XXX: it would be better to have the rcu barrier be asynchronous
	 * instead of blocking us here
	 */
	synchronize_rcu();

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

/* start allocator thread: */
int bch2_dev_allocator_start(struct bch_dev *ca)
{
	struct task_struct *p;

	/*
	 * allocator thread already started?
	 */
	if (ca->alloc_thread)
		return 0;

	p = kthread_create(bch2_allocator_thread, ca,
			   "bch_alloc[%s]", ca->name);
	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);
	rcu_assign_pointer(ca->alloc_thread, p);
	wake_up_process(p);
	return 0;
}

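/*
 * Allocator startup holds btree node writes via BCH_FS_HOLD_BTREE_WRITES:
 * this clears that flag and then writes out every dirty btree node, waiting
 * for nodes that can't be written yet and for pending interior updates to
 * complete:
 */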
static void flush_held_btree_writes(struct bch_fs *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	bool nodes_blocked;
	size_t i;
	struct closure cl;

	closure_init_stack(&cl);

	clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
	pr_debug("flushing dirty btree nodes");
	cond_resched();
	closure_wait(&c->btree_interior_update_wait, &cl);

	nodes_blocked = false;

	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (btree_node_need_write(b)) {
			if (btree_node_may_write(b)) {
				rcu_read_unlock();
				btree_node_lock_type(c, b, SIX_LOCK_read);
				bch2_btree_node_write(c, b, SIX_LOCK_read);
				six_unlock_read(&b->lock);
				goto again;
			} else {
				nodes_blocked = true;
			}
		}
	rcu_read_unlock();

	if (c->btree_roots_dirty)
		bch2_journal_meta(&c->journal);

	if (nodes_blocked) {
		closure_sync(&cl);
		goto again;
	}

	closure_wake_up(&c->btree_interior_update_wait);
	closure_sync(&cl);

	closure_wait_event(&c->btree_interior_update_wait,
			   !bch2_btree_interior_updates_nr_pending(c));
}

static void allocator_start_issue_discards(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned dev_iter;
	size_t bu;

	for_each_rw_member(ca, c, dev_iter)
		while (fifo_pop(&ca->free_inc, bu))
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket_to_sector(ca, bu),
					     ca->mi.bucket_size, GFP_NOIO);
}

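/*
 * Double the size of free_inc when it's full, so the startup path can keep
 * pushing invalidated buckets onto it:
 */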
static int resize_free_inc(struct bch_dev *ca)
{
	alloc_fifo free_inc;

	if (!fifo_full(&ca->free_inc))
		return 0;

	if (!init_fifo(&free_inc,
		       ca->free_inc.size * 2,
		       GFP_KERNEL))
		return -ENOMEM;

	fifo_move(&free_inc, &ca->free_inc);
	swap(free_inc, ca->free_inc);
	free_fifo(&free_inc);
	return 0;
}

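/*
 * Fill the RESERVE_BTREE freelists before the allocator threads are started:
 * first from buckets that are already empty and invalidated on disk, and
 * failing that by invalidating buckets in memory while btree node writes are
 * held, writing the new alloc info out once enough buckets have been found:
 */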
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned dev_iter;
	u64 journal_seq = 0;
	long bu;
	int ret = 0;

	if (test_alloc_startup(c))
		goto not_enough;

	/* Scan for buckets that are already invalidated: */
	for_each_rw_member(ca, c, dev_iter) {
		struct bucket_array *buckets;
		struct bucket_mark m;

		down_read(&ca->bucket_lock);
		percpu_down_read(&c->mark_lock);

		buckets = bucket_array(ca);

		for (bu = buckets->first_bucket;
		     bu < buckets->nbuckets; bu++) {
			m = READ_ONCE(buckets->b[bu].mark);

			if (!buckets->b[bu].gen_valid ||
			    !test_bit(bu, ca->buckets_nouse) ||
			    !is_available_bucket(m) ||
			    m.cached_sectors)
				continue;

			bch2_mark_alloc_bucket(c, ca, bu, true,
					       gc_pos_alloc(c, NULL), 0);

			fifo_push(&ca->free_inc, bu);

			discard_invalidated_buckets(c, ca);

			if (fifo_full(&ca->free[RESERVE_BTREE]))
				break;
		}
		percpu_up_read(&c->mark_lock);
		up_read(&ca->bucket_lock);
	}

	/* did we find enough buckets? */
	for_each_rw_member(ca, c, dev_iter)
		if (!fifo_full(&ca->free[RESERVE_BTREE])) {
			percpu_ref_put(&ca->io_ref);
			goto not_enough;
		}

	return 0;
not_enough:
	pr_debug("not enough empty buckets; scanning for reclaimable buckets");

	/*
	 * We're moving buckets to freelists _before_ they've been marked as
	 * invalidated on disk - we have to so that we can allocate new btree
	 * nodes to mark them as invalidated on disk.
	 *
	 * However, we can't _write_ to any of these buckets yet - they might
	 * have cached data in them, which is live until they're marked as
	 * invalidated on disk:
	 */
	set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);

	while (1) {
		bool wrote = false;

		for_each_rw_member(ca, c, dev_iter) {
			find_reclaimable_buckets(c, ca);

			while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
			       (bu = next_alloc_bucket(ca)) >= 0) {
				ret = resize_free_inc(ca);
				if (ret) {
					percpu_ref_put(&ca->io_ref);
					return ret;
				}

				bch2_invalidate_one_bucket(c, ca, bu,
							   &journal_seq);

				fifo_push(&ca->free[RESERVE_BTREE], bu);
				bucket_set_dirty(ca, bu);
			}
		}

		pr_debug("done scanning for reclaimable buckets");

		/*
		 * XXX: it's possible for this to deadlock waiting on journal
		 * reclaim, since we're holding btree writes. What then?
		 */
		ret = bch2_alloc_write(c, true, &wrote);

		/*
		 * If bch2_alloc_write() did anything, it may have used some
		 * buckets, and we need the RESERVE_BTREE freelist full - so we
		 * need to loop and scan again.
		 * And if it errored, it may have been because there weren't
		 * enough buckets, so just scan and loop again as long as it
		 * made some progress:
		 */
		if (!wrote && ret)
			return ret;
		if (!wrote && !ret)
			break;
	}

	pr_debug("flushing journal");

	ret = bch2_journal_flush(&c->journal);
	if (ret)
		return ret;

	pr_debug("issuing discards");
	allocator_start_issue_discards(c);

	set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);

	/* now flush dirty btree nodes: */
	flush_held_btree_writes(c);

	return 0;
}

int bch2_fs_allocator_start(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;
	bool wrote;
	int ret;

	down_read(&c->gc_lock);
	ret = __bch2_fs_allocator_start(c);
	up_read(&c->gc_lock);

	if (ret)
		return ret;

	for_each_rw_member(ca, c, i) {
		ret = bch2_dev_allocator_start(ca);
		if (ret) {
			percpu_ref_put(&ca->io_ref);
			return ret;
		}
	}

	set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);

	return bch2_alloc_write(c, false, &wrote);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
	bch2_bucket_clock_init(c, READ);
	bch2_bucket_clock_init(c, WRITE);

	c->pd_controllers_update_seconds = 5;
	INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
}