bcachefs: Fix check for if extent update is allocating
[linux-block.git] / fs / bcachefs / alloc_background.c

// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "journal_io.h"
#include "trace.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>

static const char * const bch2_alloc_field_names[] = {
#define x(name, bytes) #name,
	BCH_ALLOC_FIELDS()
#undef x
	NULL
};

static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);

/* Ratelimiting/PD controllers */

static void pd_controllers_update(struct work_struct *work)
{
	struct bch_fs *c = container_of(to_delayed_work(work),
					struct bch_fs,
					pd_controllers_update);
	struct bch_dev *ca;
	unsigned i;

	for_each_member_device(ca, c, i) {
		struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);

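		/* Free space on this device, in bytes: */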
		u64 free = bucket_to_sector(ca,
				__dev_buckets_free(ca, stats)) << 9;
		/*
		 * Bytes of internal fragmentation, which can be
		 * reclaimed by copy GC
		 */
		s64 fragmented = (bucket_to_sector(ca,
					stats.buckets[BCH_DATA_USER] +
					stats.buckets[BCH_DATA_CACHED]) -
				  (stats.sectors[BCH_DATA_USER] +
				   stats.sectors[BCH_DATA_CACHED])) << 9;

		fragmented = max(0LL, fragmented);

		bch2_pd_controller_update(&ca->copygc_pd,
					  free, fragmented, -1);
	}

	schedule_delayed_work(&c->pd_controllers_update,
			      c->pd_controllers_update_seconds * HZ);
}

/* Persistent alloc info: */

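/*
 * On disk, struct bch_alloc stores a bitmap of which optional fields are
 * present (a->fields), followed by the present fields packed back to back
 * as 1/2/4/8 byte little endian integers in BCH_ALLOC_FIELDS() order;
 * get_alloc_field()/put_alloc_field() walk that packed representation.
 */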
static inline u64 get_alloc_field(const struct bch_alloc *a,
				  const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	switch (bytes) {
	case 1:
		v = *((const u8 *) *p);
		break;
	case 2:
		v = le16_to_cpup(*p);
		break;
	case 4:
		v = le32_to_cpup(*p);
		break;
	case 8:
		v = le64_to_cpup(*p);
		break;
	default:
		BUG();
	}

	*p += bytes;
	return v;
}

static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
				   unsigned field, u64 v)
{
	unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];

	if (!v)
		return;

	a->v.fields |= 1 << field;

	switch (bytes) {
	case 1:
		*((u8 *) *p) = v;
		break;
	case 2:
		*((__le16 *) *p) = cpu_to_le16(v);
		break;
	case 4:
		*((__le32 *) *p) = cpu_to_le32(v);
		break;
	case 8:
		*((__le64 *) *p) = cpu_to_le64(v);
		break;
	default:
		BUG();
	}

	*p += bytes;
}

static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

	if (k.k->p.inode >= c->sb.nr_devices ||
	    !c->devs[k.k->p.inode])
		return "invalid device";

	/* allow for unknown fields */
	if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
		return "incorrect value size";

	return NULL;
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
			struct bkey_s_c k)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
	const void *d = a.v->data;
	unsigned i;

	pr_buf(out, "gen %u", a.v->gen);

	for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
		if (a.v->fields & (1 << i))
			pr_buf(out, " %s %llu",
			       bch2_alloc_field_names[i],
			       get_alloc_field(a.v, &d, i));
}

static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
{
	const void *d = a->data;
	unsigned idx = 0;

	g->_mark.gen = a->gen;
	g->gen_valid = 1;
	g->io_time[READ] = get_alloc_field(a, &d, idx++);
	g->io_time[WRITE] = get_alloc_field(a, &d, idx++);
	g->_mark.data_type = get_alloc_field(a, &d, idx++);
	g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++);
	g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
}

static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
			      struct bucket_mark m)
{
	unsigned idx = 0;
	void *d = a->v.data;

	a->v.fields = 0;
	a->v.gen = m.gen;

	d = a->v.data;
	put_alloc_field(a, &d, idx++, g->io_time[READ]);
	put_alloc_field(a, &d, idx++, g->io_time[WRITE]);
	put_alloc_field(a, &d, idx++, m.data_type);
	put_alloc_field(a, &d, idx++, m.dirty_sectors);
	put_alloc_field(a, &d, idx++, m.cached_sectors);

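	/* The value size is however far put_alloc_field() advanced d: */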
	set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
}

static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_dev *ca;
	struct bkey_s_c_alloc a;

	if (k.k->type != KEY_TYPE_alloc)
		return;

	a = bkey_s_c_to_alloc(k);
	ca = bch_dev_bkey_exists(c, a.k->p.inode);

	if (a.k->p.offset >= ca->mi.nbuckets)
		return;

	percpu_down_read(&c->mark_lock);
	__alloc_read_key(bucket(ca, a.k->p.offset), a.v);
	percpu_up_read(&c->mark_lock);
}

int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
{
	struct journal_replay *r;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	unsigned i;
	int ret;

	for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
		bch2_alloc_read_key(c, k);
		bch2_btree_iter_cond_resched(&iter);
	}

	ret = bch2_btree_iter_unlock(&iter);
	if (ret)
		return ret;

	list_for_each_entry(r, journal_replay_list, list) {
		struct bkey_i *k, *n;
		struct jset_entry *entry;

		for_each_jset_key(k, n, entry, &r->j)
			if (entry->btree_id == BTREE_ID_ALLOC)
				bch2_alloc_read_key(c, bkey_i_to_s_c(k));
	}

	mutex_lock(&c->bucket_clock[READ].lock);
	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		bch2_recalc_oldest_io(c, ca, READ);
		up_read(&ca->bucket_lock);
	}
	mutex_unlock(&c->bucket_clock[READ].lock);

	mutex_lock(&c->bucket_clock[WRITE].lock);
	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		bch2_recalc_oldest_io(c, ca, WRITE);
		up_read(&ca->bucket_lock);
	}
	mutex_unlock(&c->bucket_clock[WRITE].lock);

	return 0;
}

static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
				  size_t b, struct btree_iter *iter,
				  u64 *journal_seq, unsigned flags)
{
#if 0
	__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
#else
	/* hack: */
	__BKEY_PADDED(k, 8) alloc_key;
#endif
	struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
	struct bucket *g;
	struct bucket_mark m;
	int ret;

	BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);

	a->k.p = POS(ca->dev_idx, b);

	percpu_down_read(&c->mark_lock);
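	/*
	 * Read the bucket's current mark and clear its dirty flag in a single
	 * cmpxchg; the dirty flag is what bch2_alloc_write() uses to find
	 * buckets whose alloc info still needs writing:
	 */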
	g = bucket(ca, b);
	m = bucket_cmpxchg(g, m, m.dirty = false);

	__alloc_write_key(a, g, m);
	percpu_up_read(&c->mark_lock);

	bch2_btree_iter_cond_resched(iter);

	bch2_btree_iter_set_pos(iter, a->k.p);

	ret = bch2_btree_insert_at(c, NULL, journal_seq,
				   BTREE_INSERT_NOFAIL|
				   BTREE_INSERT_USE_RESERVE|
				   BTREE_INSERT_USE_ALLOC_RESERVE|
				   flags,
				   BTREE_INSERT_ENTRY(iter, &a->k_i));

	if (!ret && ca->buckets_written)
		set_bit(b, ca->buckets_written);

	return ret;
}

int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
	struct bch_dev *ca;
	struct btree_iter iter;
	int ret;

	if (k->k.p.inode >= c->sb.nr_devices ||
	    !c->devs[k->k.p.inode])
		return 0;

	ca = bch_dev_bkey_exists(c, k->k.p.inode);

	if (k->k.p.offset >= ca->mi.nbuckets)
		return 0;

	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
			     BTREE_ITER_INTENT);

	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto err;

	/* check buckets_written with btree node locked: */

	ret = test_bit(k->k.p.offset, ca->buckets_written)
		? 0
		: bch2_btree_insert_at(c, NULL, NULL,
				       BTREE_INSERT_NOFAIL|
				       BTREE_INSERT_JOURNAL_REPLAY,
				       BTREE_INSERT_ENTRY(&iter, k));
err:
	bch2_btree_iter_unlock(&iter);
	return ret;
}

int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
{
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;

	*wrote = false;

	for_each_rw_member(ca, c, i) {
		struct btree_iter iter;
		struct bucket_array *buckets;
		size_t b;

		bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for (b = buckets->first_bucket;
		     b < buckets->nbuckets;
		     b++) {
			if (!buckets->b[b].mark.dirty)
				continue;

			ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL,
						     nowait
						     ? BTREE_INSERT_NOWAIT
						     : 0);
			if (ret)
				break;

			*wrote = true;
		}
		up_read(&ca->bucket_lock);
		bch2_btree_iter_unlock(&iter);

		if (ret) {
			percpu_ref_put(&ca->io_ref);
			break;
		}
	}

	return ret;
}

/* Bucket IO clocks: */

static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
	struct bucket_clock *clock = &c->bucket_clock[rw];
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket *g;
	u16 max_last_io = 0;
	unsigned i;

	lockdep_assert_held(&c->bucket_clock[rw].lock);

	/* Recalculate max_last_io for this device: */
	for_each_bucket(g, buckets)
		max_last_io = max(max_last_io, bucket_last_io(c, g, rw));

	ca->max_last_bucket_io[rw] = max_last_io;

	/* Recalculate global max_last_io: */
	max_last_io = 0;

	for_each_member_device(ca, c, i)
		max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);

	clock->max_last_io = max_last_io;
}

static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
	struct bucket_clock *clock = &c->bucket_clock[rw];
	struct bucket_array *buckets;
	struct bch_dev *ca;
	struct bucket *g;
	unsigned i;

	trace_rescale_prios(c);

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets)
			g->io_time[rw] = clock->hand -
				bucket_last_io(c, g, rw) / 2;

		bch2_recalc_oldest_io(c, ca, rw);

		up_read(&ca->bucket_lock);
	}
}

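/*
 * The bucket IO clock hand advances once per bucket_clock_freq() sectors of
 * IO, i.e. roughly every 0.1% (1/1024th) of the filesystem's capacity:
 */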
static inline u64 bucket_clock_freq(u64 capacity)
{
	return max(capacity >> 10, 2028ULL);
}

static void bch2_inc_clock_hand(struct io_timer *timer)
{
	struct bucket_clock *clock = container_of(timer,
					struct bucket_clock, rescale);
	struct bch_fs *c = container_of(clock,
					struct bch_fs, bucket_clock[clock->rw]);
	struct bch_dev *ca;
	u64 capacity;
	unsigned i;

	mutex_lock(&clock->lock);

	/* if clock cannot be advanced more, rescale prio */
	if (clock->max_last_io >= U16_MAX - 2)
		bch2_rescale_bucket_io_times(c, clock->rw);

	BUG_ON(clock->max_last_io >= U16_MAX - 2);

	for_each_member_device(ca, c, i)
		ca->max_last_bucket_io[clock->rw]++;
	clock->max_last_io++;
	clock->hand++;

	mutex_unlock(&clock->lock);

	capacity = READ_ONCE(c->capacity);

	if (!capacity)
		return;

	/*
	 * We only advance the clock hand after roughly 0.1% of the filesystem
	 * capacity has been read or written to; the timer expiry below
	 * encodes that interval.
	 *
	 * XXX: we shouldn't really be going off of the capacity of devices in
	 * RW mode (that will be 0 when we're RO, yet we can still service
	 * reads)
	 */
	timer->expire += bucket_clock_freq(capacity);

	bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}

static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
	struct bucket_clock *clock = &c->bucket_clock[rw];

	clock->hand = 1;
	clock->rw = rw;
	clock->rescale.fn = bch2_inc_clock_hand;
	clock->rescale.expire = bucket_clock_freq(c->capacity);
	mutex_init(&clock->lock);
}

/* Background allocator thread: */

/*
 * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
 * (marking them as invalidated on disk), then optionally issues discard
 * commands to the newly free buckets, then puts them on the various freelists.
 */

#define BUCKET_GC_GEN_MAX	96U

/**
 * wait_buckets_available - wait on reclaimable buckets
 *
 * If there aren't enough available buckets to fill up free_inc, wait until
 * there are.
 */
static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned long gc_count = c->gc_count;
	int ret = 0;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop()) {
			ret = 1;
			break;
		}

		if (gc_count != c->gc_count)
			ca->inc_gen_really_needs_gc = 0;

		if ((ssize_t) (dev_buckets_available(c, ca) -
			       ca->inc_gen_really_needs_gc) >=
		    (ssize_t) fifo_free(&ca->free_inc))
			break;

		up_read(&c->gc_lock);
		schedule();
		try_to_freeze();
		down_read(&c->gc_lock);
	}

	__set_current_state(TASK_RUNNING);
	return ret;
}

static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
				       size_t bucket,
				       struct bucket_mark mark)
{
	u8 gc_gen;

	if (!is_available_bucket(mark))
		return false;

	if (ca->buckets_nouse &&
	    test_bit(bucket, ca->buckets_nouse))
		return false;

	gc_gen = bucket_gc_gen(ca, bucket);

	if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
		ca->inc_gen_needs_gc++;

	if (gc_gen >= BUCKET_GC_GEN_MAX)
		ca->inc_gen_really_needs_gc++;

	return gc_gen < BUCKET_GC_GEN_MAX;
}

/*
 * Determines what order we're going to reuse buckets, smallest
 * bucket_sort_key() first.
 *
 * - We take into account the read prio of the bucket, which gives us an
 *   indication of how hot the data is -- we scale the prio so that the prio
 *   farthest from the clock is worth 1/8th of the closest.
 *
 * - The number of sectors of cached data in the bucket, which gives us an
 *   indication of the cost in cache misses this eviction will cause.
 *
 * - If hotness * sectors used compares equal, we pick the bucket with the
 *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
 *   number repeatedly forces us to run mark and sweep gc to avoid generation
 *   number wraparound.
 */

static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
				     size_t b, struct bucket_mark m)
{
	unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
	unsigned max_last_io = ca->max_last_bucket_io[READ];

	/*
	 * Time since last read, scaled to [0, 8) where larger value indicates
	 * more recently read data:
	 */
	unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;

	/* How much we want to keep the data in this bucket: */
	unsigned long data_wantness =
		(hotness + 1) * bucket_sectors_used(m);

	unsigned long needs_journal_commit =
		bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);

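	/*
	 * Pack the sort key: wantness in the high bits, then whether the
	 * bucket still needs a journal commit, with a scaled bucket_gc_gen()
	 * as the low order tiebreaker:
	 */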
	return (data_wantness << 9) |
	       (needs_journal_commit << 8) |
	       (bucket_gc_gen(ca, b) / 16);
}

static inline int bucket_alloc_cmp(alloc_heap *h,
				   struct alloc_heap_entry l,
				   struct alloc_heap_entry r)
{
	return (l.key > r.key) - (l.key < r.key) ?:
		(l.nr < r.nr) - (l.nr > r.nr) ?:
		(l.bucket > r.bucket) - (l.bucket < r.bucket);
}

static inline int bucket_idx_cmp(const void *_l, const void *_r)
{
	const struct alloc_heap_entry *l = _l, *r = _r;

	return (l->bucket > r->bucket) - (l->bucket < r->bucket);
}

static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets;
	struct alloc_heap_entry e = { 0 };
	size_t b, i, nr = 0;

	ca->alloc_heap.used = 0;

	mutex_lock(&c->bucket_clock[READ].lock);
	down_read(&ca->bucket_lock);

	buckets = bucket_array(ca);

	bch2_recalc_oldest_io(c, ca, READ);

	/*
	 * Find buckets with lowest read priority, by building a maxheap sorted
	 * by read priority and repeatedly replacing the maximum element until
	 * all buckets have been visited.
	 */
	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
		struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
		unsigned long key = bucket_sort_key(c, ca, b, m);

		if (!bch2_can_invalidate_bucket(ca, b, m))
			continue;

		if (e.nr && e.bucket + e.nr == b && e.key == key) {
			e.nr++;
		} else {
			if (e.nr)
				heap_add_or_replace(&ca->alloc_heap, e,
						    -bucket_alloc_cmp, NULL);

			e = (struct alloc_heap_entry) {
				.bucket = b,
				.nr = 1,
				.key = key,
			};
		}

		cond_resched();
	}

	if (e.nr)
		heap_add_or_replace(&ca->alloc_heap, e,
				    -bucket_alloc_cmp, NULL);

	for (i = 0; i < ca->alloc_heap.used; i++)
		nr += ca->alloc_heap.data[i].nr;

	while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
		nr -= ca->alloc_heap.data[0].nr;
		heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
	}

	up_read(&ca->bucket_lock);
	mutex_unlock(&c->bucket_clock[READ].lock);
}

static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket_mark m;
	size_t b, start;

	if (ca->fifo_last_bucket < ca->mi.first_bucket ||
	    ca->fifo_last_bucket >= ca->mi.nbuckets)
		ca->fifo_last_bucket = ca->mi.first_bucket;

	start = ca->fifo_last_bucket;

	do {
		ca->fifo_last_bucket++;
		if (ca->fifo_last_bucket == ca->mi.nbuckets)
			ca->fifo_last_bucket = ca->mi.first_bucket;

		b = ca->fifo_last_bucket;
		m = READ_ONCE(buckets->b[b].mark);

		if (bch2_can_invalidate_bucket(ca, b, m)) {
			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };

			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
			if (heap_full(&ca->alloc_heap))
				break;
		}

		cond_resched();
	} while (ca->fifo_last_bucket != start);
}

static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket_mark m;
	size_t checked, i;

	for (checked = 0;
	     checked < ca->mi.nbuckets / 2;
	     checked++) {
		size_t b = bch2_rand_range(ca->mi.nbuckets -
					   ca->mi.first_bucket) +
			ca->mi.first_bucket;

		m = READ_ONCE(buckets->b[b].mark);

		if (bch2_can_invalidate_bucket(ca, b, m)) {
			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };

			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
			if (heap_full(&ca->alloc_heap))
				break;
		}

		cond_resched();
	}

	sort(ca->alloc_heap.data,
	     ca->alloc_heap.used,
	     sizeof(ca->alloc_heap.data[0]),
	     bucket_idx_cmp, NULL);

	/* remove duplicates: */
	for (i = 0; i + 1 < ca->alloc_heap.used; i++)
		if (ca->alloc_heap.data[i].bucket ==
		    ca->alloc_heap.data[i + 1].bucket)
			ca->alloc_heap.data[i].nr = 0;
}

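/*
 * Fill ca->alloc_heap with candidate buckets according to the device's
 * replacement policy, and return how many buckets were queued up:
 */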
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	size_t i, nr = 0;

	ca->inc_gen_needs_gc = 0;

	switch (ca->mi.replacement) {
	case CACHE_REPLACEMENT_LRU:
		find_reclaimable_buckets_lru(c, ca);
		break;
	case CACHE_REPLACEMENT_FIFO:
		find_reclaimable_buckets_fifo(c, ca);
		break;
	case CACHE_REPLACEMENT_RANDOM:
		find_reclaimable_buckets_random(c, ca);
		break;
	}

	heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);

	for (i = 0; i < ca->alloc_heap.used; i++)
		nr += ca->alloc_heap.data[i].nr;

	return nr;
}

static inline long next_alloc_bucket(struct bch_dev *ca)
{
	struct alloc_heap_entry e, *top = ca->alloc_heap.data;

	while (ca->alloc_heap.used) {
		if (top->nr) {
			size_t b = top->bucket;

			top->bucket++;
			top->nr--;
			return b;
		}

		heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
	}

	return -1;
}

static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
				       size_t bucket, u64 *flush_seq)
{
	struct bucket_mark m;

	percpu_down_read(&c->mark_lock);
	spin_lock(&c->freelist_lock);

	bch2_invalidate_bucket(c, ca, bucket, &m);

	verify_not_on_freelist(c, ca, bucket);
	BUG_ON(!fifo_push(&ca->free_inc, bucket));

	spin_unlock(&c->freelist_lock);

	bucket_io_clock_reset(c, ca, bucket, READ);
	bucket_io_clock_reset(c, ca, bucket, WRITE);

	percpu_up_read(&c->mark_lock);

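	/*
	 * m.journal_seq holds only the low 16 bits of a journal sequence
	 * number; reconstruct the full sequence number relative to the
	 * current journal seq, so the caller knows how far the journal must
	 * be flushed before this bucket can safely be reused:
	 */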
	if (m.journal_seq_valid) {
		u64 journal_seq = atomic64_read(&c->journal.seq);
		u64 bucket_seq = journal_seq;

		bucket_seq &= ~((u64) U16_MAX);
		bucket_seq |= m.journal_seq;

		if (bucket_seq > journal_seq)
			bucket_seq -= 1 << 16;

		*flush_seq = max(*flush_seq, bucket_seq);
	}

	return m.cached_sectors != 0;
}

/*
 * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
 */
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	struct btree_iter iter;
	u64 journal_seq = 0;
	int ret = 0;
	long b;

	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	/* Only use nowait if we've already invalidated at least one bucket: */
	while (!ret &&
	       !fifo_full(&ca->free_inc) &&
	       (b = next_alloc_bucket(ca)) >= 0) {
		bool must_flush =
			bch2_invalidate_one_bucket(c, ca, b, &journal_seq);

		ret = __bch2_alloc_write_key(c, ca, b, &iter,
				must_flush ? &journal_seq : NULL,
				!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
	}

	bch2_btree_iter_unlock(&iter);

	/* If we used NOWAIT, don't return the error: */
	if (!fifo_empty(&ca->free_inc))
		ret = 0;
	if (ret) {
		bch_err(ca, "error invalidating buckets: %i", ret);
		return ret;
	}

	if (journal_seq)
		ret = bch2_journal_flush_seq(&c->journal, journal_seq);
	if (ret) {
		bch_err(ca, "journal error: %i", ret);
		return ret;
	}

	return 0;
}

static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
	unsigned i;
	int ret = 0;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		spin_lock(&c->freelist_lock);
		for (i = 0; i < RESERVE_NR; i++)
			if (fifo_push(&ca->free[i], bucket)) {
				fifo_pop(&ca->free_inc, bucket);
				closure_wake_up(&c->freelist_wait);
				spin_unlock(&c->freelist_lock);
				goto out;
			}
		spin_unlock(&c->freelist_lock);

		if ((current->flags & PF_KTHREAD) &&
		    kthread_should_stop()) {
			ret = 1;
			break;
		}

		schedule();
		try_to_freeze();
	}
out:
	__set_current_state(TASK_RUNNING);
	return ret;
}

/*
 * Pulls buckets off free_inc, discards them (if enabled), then adds them to
 * freelists, waiting until there's room if necessary:
 */
static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	while (!fifo_empty(&ca->free_inc)) {
		size_t bucket = fifo_peek(&ca->free_inc);

		if (ca->mi.discard &&
		    bdev_max_discard_sectors(ca->disk_sb.bdev))
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket_to_sector(ca, bucket),
					     ca->mi.bucket_size, GFP_NOIO);

		if (push_invalidated_bucket(c, ca, bucket))
			return 1;
	}

	return 0;
}

/**
 * bch2_allocator_thread - move buckets from free_inc to reserves
 *
 * The free_inc FIFO is populated by bch2_invalidate_buckets(), which pulls
 * buckets found by find_reclaimable_buckets() off ca->alloc_heap, and the
 * reserves are depleted by bucket allocation. When we run out of free_inc,
 * try to invalidate some buckets and write out prios and gens.
 */
static int bch2_allocator_thread(void *arg)
{
	struct bch_dev *ca = arg;
	struct bch_fs *c = ca->fs;
	size_t nr;
	int ret;

	set_freezable();

	while (1) {
		cond_resched();

		pr_debug("discarding %zu invalidated buckets",
			 fifo_used(&ca->free_inc));

		ret = discard_invalidated_buckets(c, ca);
		if (ret)
			goto stop;

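		/*
		 * gc_lock is held for read across bucket invalidation;
		 * wait_buckets_available() drops and retakes it while
		 * sleeping:
		 */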
		down_read(&c->gc_lock);

		ret = bch2_invalidate_buckets(c, ca);
		if (ret) {
			up_read(&c->gc_lock);
			goto stop;
		}

		if (!fifo_empty(&ca->free_inc)) {
			up_read(&c->gc_lock);
			continue;
		}

		pr_debug("free_inc now empty");

		do {
			/*
			 * Find some buckets that we can invalidate, either
			 * they're completely unused, or only contain clean data
			 * that's been written back to the backing device or
			 * another cache tier
			 */

			pr_debug("scanning for reclaimable buckets");

			nr = find_reclaimable_buckets(c, ca);

			pr_debug("found %zu buckets", nr);

			trace_alloc_batch(ca, nr, ca->alloc_heap.size);

			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
			     ca->inc_gen_really_needs_gc) &&
			    c->gc_thread) {
				atomic_inc(&c->kick_gc);
				wake_up_process(c->gc_thread);
			}

			/*
			 * If we found any buckets, we have to invalidate them
			 * before we scan for more - but if we didn't find very
			 * many we may want to wait on more buckets being
			 * available so we don't spin:
			 */
			if (!nr ||
			    (nr < ALLOC_SCAN_BATCH(ca) &&
			     !fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
				ca->allocator_blocked = true;
				closure_wake_up(&c->freelist_wait);

				ret = wait_buckets_available(c, ca);
				if (ret) {
					up_read(&c->gc_lock);
					goto stop;
				}
			}
		} while (!nr);

		ca->allocator_blocked = false;
		up_read(&c->gc_lock);

		pr_debug("%zu buckets to invalidate", nr);

		/*
		 * alloc_heap is now full of newly-invalidated buckets: next,
		 * write out the new bucket gens:
		 */
	}

stop:
	pr_debug("alloc thread stopping (ret %i)", ret);
	return 0;
}

/* Startup/shutdown (ro/rw): */

void bch2_recalc_capacity(struct bch_fs *c)
{
	struct bch_dev *ca;
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;
	unsigned i, j;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(ca, c, i) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(ca, c, i) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again when
		 * its reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */
		for (j = 0; j < RESERVE_NONE; j++)
			dev_reserve += ca->free[j].size;

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		ca->copygc_threshold = dev_reserve;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

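	/*
	 * Reserve whichever is larger - the configured gc reserve or the sum
	 * of the per device copygc reserves - but never more than the raw
	 * capacity:
	 */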
	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	if (c->capacity) {
		bch2_io_timer_add(&c->io_clock[READ],
				  &c->bucket_clock[READ].rescale);
		bch2_io_timer_add(&c->io_clock[WRITE],
				  &c->bucket_clock[WRITE].rescale);
	} else {
		bch2_io_timer_del(&c->io_clock[READ],
				  &c->bucket_clock[READ].rescale);
		bch2_io_timer_del(&c->io_clock[WRITE],
				  &c->bucket_clock[WRITE].rescale);
	}

	/* Wake up in case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->ptr.dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	BUG_ON(ca->alloc_thread);

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
	bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	while (1) {
		struct open_bucket *ob;

		spin_lock(&c->freelist_lock);
		if (!ca->open_buckets_partial_nr) {
			spin_unlock(&c->freelist_lock);
			break;
		}
		ob = c->open_buckets +
			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
		ob->on_partial_list = false;
		spin_unlock(&c->freelist_lock);

		bch2_open_bucket_put(c, ob);
	}

	bch2_ec_stop_dev(c, ca);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
	struct task_struct *p;

	p = rcu_dereference_protected(ca->alloc_thread, 1);
	ca->alloc_thread = NULL;

	/*
	 * We need an rcu barrier between setting ca->alloc_thread = NULL and
	 * the thread shutting down to avoid bch2_wake_allocator() racing:
	 *
	 * XXX: it would be better to have the rcu barrier be asynchronous
	 * instead of blocking us here
	 */
	synchronize_rcu();

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

/* start allocator thread: */
int bch2_dev_allocator_start(struct bch_dev *ca)
{
	struct task_struct *p;

	/*
	 * allocator thread already started?
	 */
	if (ca->alloc_thread)
		return 0;

	p = kthread_create(bch2_allocator_thread, ca,
			   "bch_alloc[%s]", ca->name);
	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);
	rcu_assign_pointer(ca->alloc_thread, p);
	wake_up_process(p);
	return 0;
}

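/*
 * Write out btree nodes whose writes were held back while
 * BCH_FS_HOLD_BTREE_WRITES was set, then wait for any outstanding interior
 * btree node updates to complete:
 */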
1275static void flush_held_btree_writes(struct bch_fs *c)
1276{
1277 struct bucket_table *tbl;
1278 struct rhash_head *pos;
1279 struct btree *b;
d0cc3def
KO
1280 bool nodes_blocked;
1281 size_t i;
1282 struct closure cl;
1283
1284 closure_init_stack(&cl);
b29e197a
KO
1285
1286 clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
1287again:
1288 pr_debug("flushing dirty btree nodes");
1289 cond_resched();
d0cc3def 1290 closure_wait(&c->btree_interior_update_wait, &cl);
b29e197a 1291
d0cc3def 1292 nodes_blocked = false;
b29e197a
KO
1293
1294 rcu_read_lock();
1295 for_each_cached_btree(b, c, tbl, i, pos)
d0cc3def 1296 if (btree_node_need_write(b)) {
b29e197a
KO
1297 if (btree_node_may_write(b)) {
1298 rcu_read_unlock();
1299 btree_node_lock_type(c, b, SIX_LOCK_read);
1300 bch2_btree_node_write(c, b, SIX_LOCK_read);
1301 six_unlock_read(&b->lock);
1302 goto again;
1303 } else {
d0cc3def 1304 nodes_blocked = true;
b29e197a
KO
1305 }
1306 }
1307 rcu_read_unlock();
1308
1309 if (c->btree_roots_dirty)
1310 bch2_journal_meta(&c->journal);
1311
d0cc3def
KO
1312 if (nodes_blocked) {
1313 closure_sync(&cl);
b29e197a
KO
1314 goto again;
1315 }
1316
d0cc3def
KO
1317 closure_wake_up(&c->btree_interior_update_wait);
1318 closure_sync(&cl);
1319
1320 closure_wait_event(&c->btree_interior_update_wait,
1321 !bch2_btree_interior_updates_nr_pending(c));
b29e197a
KO
1322}
1323
1c6fdbd8
KO
1324static void allocator_start_issue_discards(struct bch_fs *c)
1325{
1326 struct bch_dev *ca;
1327 unsigned dev_iter;
b29e197a 1328 size_t bu;
1c6fdbd8 1329
b29e197a
KO
1330 for_each_rw_member(ca, c, dev_iter)
1331 while (fifo_pop(&ca->free_inc, bu))
1c6fdbd8
KO
1332 blkdev_issue_discard(ca->disk_sb.bdev,
1333 bucket_to_sector(ca, bu),
1334 ca->mi.bucket_size, GFP_NOIO);
1c6fdbd8
KO
1335}
1336
1337static int __bch2_fs_allocator_start(struct bch_fs *c)
1338{
1339 struct bch_dev *ca;
1c6fdbd8
KO
1340 unsigned dev_iter;
1341 u64 journal_seq = 0;
b29e197a 1342 long bu;
1c6fdbd8
KO
1343 int ret = 0;
1344
d0cc3def 1345 if (test_alloc_startup(c))
b29e197a 1346 goto not_enough;
b29e197a 1347
1c6fdbd8
KO
1348 /* Scan for buckets that are already invalidated: */
1349 for_each_rw_member(ca, c, dev_iter) {
61274e9d 1350 struct bucket_array *buckets;
1c6fdbd8 1351 struct bucket_mark m;
1c6fdbd8 1352
61274e9d 1353 down_read(&ca->bucket_lock);
9166b41d 1354 percpu_down_read(&c->mark_lock);
61274e9d
KO
1355
1356 buckets = bucket_array(ca);
1c6fdbd8 1357
61274e9d
KO
1358 for (bu = buckets->first_bucket;
1359 bu < buckets->nbuckets; bu++) {
1360 m = READ_ONCE(buckets->b[bu].mark);
1c6fdbd8 1361
90541a74 1362 if (!buckets->b[bu].gen_valid ||
8eb7f3ee 1363 !test_bit(bu, ca->buckets_nouse) ||
61274e9d
KO
1364 !is_available_bucket(m) ||
1365 m.cached_sectors)
1c6fdbd8
KO
1366 continue;
1367
1c6fdbd8 1368 bch2_mark_alloc_bucket(c, ca, bu, true,
9ca53b55 1369 gc_pos_alloc(c, NULL), 0);
1c6fdbd8
KO
1370
1371 fifo_push(&ca->free_inc, bu);
1c6fdbd8 1372
61274e9d
KO
1373 discard_invalidated_buckets(c, ca);
1374
1375 if (fifo_full(&ca->free[RESERVE_BTREE]))
1c6fdbd8
KO
1376 break;
1377 }
9166b41d 1378 percpu_up_read(&c->mark_lock);
61274e9d 1379 up_read(&ca->bucket_lock);
1c6fdbd8
KO
1380 }
1381
1382 /* did we find enough buckets? */
1383 for_each_rw_member(ca, c, dev_iter)
61274e9d 1384 if (!fifo_full(&ca->free[RESERVE_BTREE])) {
1c6fdbd8
KO
1385 percpu_ref_put(&ca->io_ref);
1386 goto not_enough;
1387 }
1388
1389 return 0;
1390not_enough:
61274e9d 1391 pr_debug("not enough empty buckets; scanning for reclaimable buckets");
1c6fdbd8 1392
1c6fdbd8
KO
1393 /*
1394 * We're moving buckets to freelists _before_ they've been marked as
1395 * invalidated on disk - we have to so that we can allocate new btree
1396 * nodes to mark them as invalidated on disk.
1397 *
1398 * However, we can't _write_ to any of these buckets yet - they might
1399 * have cached data in them, which is live until they're marked as
1400 * invalidated on disk:
1401 */
d0cc3def 1402 set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
1c6fdbd8 1403
d0cc3def
KO
1404 while (1) {
1405 bool wrote = false;
1c6fdbd8 1406
d0cc3def
KO
1407 for_each_rw_member(ca, c, dev_iter) {
1408 find_reclaimable_buckets(c, ca);
1c6fdbd8 1409
d0cc3def
KO
1410 while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
1411 (bu = next_alloc_bucket(ca)) >= 0) {
1412 bch2_invalidate_one_bucket(c, ca, bu,
1413 &journal_seq);
1414
1415 fifo_push(&ca->free[RESERVE_BTREE], bu);
1416 bucket_set_dirty(ca, bu);
1417 }
1418 }
1419
1420 pr_debug("done scanning for reclaimable buckets");
1421
1422 /*
1423 * XXX: it's possible for this to deadlock waiting on journal reclaim,
1424 * since we're holding btree writes. What then?
1425 */
1426 ret = bch2_alloc_write(c, true, &wrote);
1c6fdbd8 1427
d0cc3def
KO
1428 /*
1429 * If bch2_alloc_write() did anything, it may have used some
1430 * buckets, and we need the RESERVE_BTREE freelist full - so we
1431 * need to loop and scan again.
1432 * And if it errored, it may have been because there weren't
1433 * enough buckets, so just scan and loop again as long as it
1434 * made some progress:
1435 */
1436 if (!wrote && ret)
1437 return ret;
1438 if (!wrote && !ret)
1439 break;
1c6fdbd8
KO
1440 }
1441
d0cc3def
KO
1442 pr_debug("flushing journal");
1443
1444 ret = bch2_journal_flush(&c->journal);
1445 if (ret)
1446 return ret;
1447
1448 pr_debug("issuing discards");
1449 allocator_start_issue_discards(c);
1450
1c6fdbd8
KO
1451 set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
1452
1453 /* now flush dirty btree nodes: */
d0cc3def 1454 flush_held_btree_writes(c);
1c6fdbd8
KO
1455
1456 return 0;
1457}
1458
1459int bch2_fs_allocator_start(struct bch_fs *c)
1460{
1461 struct bch_dev *ca;
1462 unsigned i;
d0cc3def 1463 bool wrote;
1c6fdbd8
KO
1464 int ret;
1465
1466 down_read(&c->gc_lock);
1467 ret = __bch2_fs_allocator_start(c);
1468 up_read(&c->gc_lock);
1469
1470 if (ret)
1471 return ret;
1472
1473 for_each_rw_member(ca, c, i) {
1474 ret = bch2_dev_allocator_start(ca);
1475 if (ret) {
1476 percpu_ref_put(&ca->io_ref);
1477 return ret;
1478 }
1479 }
1480
d0cc3def 1481 return bch2_alloc_write(c, false, &wrote);
1c6fdbd8
KO
1482}
1483
b092dadd 1484void bch2_fs_allocator_background_init(struct bch_fs *c)
1c6fdbd8 1485{
1c6fdbd8
KO
1486 spin_lock_init(&c->freelist_lock);
1487 bch2_bucket_clock_init(c, READ);
1488 bch2_bucket_clock_init(c, WRITE);
1489
1c6fdbd8
KO
1490 c->pd_controllers_update_seconds = 5;
1491 INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
1492}