bcachefs: Pass flags arg to bch2_alloc_write()
fs/bcachefs/ec.c (linux-block.git)
// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io.h"
#include "journal_io.h"
#include "keylist.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

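/*
 * RAID5 parity is the XOR of all the other blocks in the stripe, so a
 * single missing block can be recomputed from the survivors: e.g. with
 * disks = 4 and failed_idx = 2, data[2] = data[0] ^ data[1] ^ data[3].
 * Copy one surviving block into the failed slot, then XOR in the rest,
 * MAX_XOR_BLOCKS source buffers at a time.
 */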
static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);

	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	swap(data[0], data[failed_idx]);
}

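/*
 * Generate parity: nd is the number of data blocks, np the number of
 * parity blocks (at most two: p is plain XOR parity, q the RAID6
 * Reed-Solomon syndrome).
 */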
static void raid_gen(int nd, int np, size_t size, void **v)
{
	if (np >= 1)
		raid5_recov(nd + np, nd, size, v);
	if (np >= 2)
		raid6_call.gen_syndrome(nd + np, size, v);
	BUG_ON(np > 2);
}

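/*
 * Reconstruct nr failed blocks, whose indices are listed in sorted
 * order in ir[]. Failures confined to p/q just mean regenerating
 * parity; a failed data block plus a failed q is handled by rebuilding
 * the data block from p, then regenerating q.
 */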
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			raid5_recov(nd + 1, ir[0], size, v);
		else
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}

#else

#include <raid/raid.h>

#endif

struct ec_bio {
	struct bch_dev *ca;
	struct ec_stripe_buf *buf;
	size_t idx;
	struct bio bio;
};

/* Stripes btree keys: */

const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

	if (k.k->p.inode)
		return "invalid stripe key";

	if (bkey_val_bytes(k.k) < sizeof(*s) ||
	    bkey_val_u64s(k.k) < stripe_val_u64s(s))
		return "incorrect value size";

	return NULL;
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
			 struct bkey_s_c k)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	unsigned i;

	pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
	       s->algorithm,
	       le16_to_cpu(s->sectors),
	       s->nr_blocks - s->nr_redundant,
	       s->nr_redundant,
	       s->csum_type,
	       1U << s->csum_granularity_bits);

	for (i = 0; i < s->nr_blocks; i++)
		pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
		       (u64) s->ptrs[i].offset,
		       stripe_blockcount_get(s, i));
}

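/*
 * Returns the index of the data block whose on-disk range contains
 * @ptr, or -1 if the pointer doesn't point into this stripe.
 */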
static int ptr_matches_stripe(struct bch_fs *c,
			      struct bch_stripe *v,
			      const struct bch_extent_ptr *ptr)
{
	unsigned i;

	for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
		const struct bch_extent_ptr *ptr2 = v->ptrs + i;

		if (ptr->dev == ptr2->dev &&
		    ptr->gen == ptr2->gen &&
		    ptr->offset >= ptr2->offset &&
		    ptr->offset < ptr2->offset + le16_to_cpu(v->sectors))
			return i;
	}

	return -1;
}

static int extent_matches_stripe(struct bch_fs *c,
				 struct bch_stripe *v,
				 struct bkey_s_c k)
{
	struct bkey_s_c_extent e;
	const struct bch_extent_ptr *ptr;
	int idx;

	if (!bkey_extent_is_data(k.k))
		return -1;

	e = bkey_s_c_to_extent(k);

	extent_for_each_ptr(e, ptr) {
		idx = ptr_matches_stripe(c, v, ptr);
		if (idx >= 0)
			return idx;
	}

	return -1;
}

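/*
 * The stripe key's value holds both the block pointers and the inline
 * checksums, so if it would overflow BKEY_VAL_U64s_MAX we repeatedly
 * double the checksum granularity (halving the number of checksums)
 * until the key fits.
 */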
static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i_stripe *s,
			       struct open_buckets *blocks,
			       struct open_buckets *parity,
			       unsigned stripe_size)
{
	struct open_bucket *ob;
	unsigned i, u64s;

	bkey_stripe_init(&s->k_i);
	s->v.sectors = cpu_to_le16(stripe_size);
	s->v.algorithm = 0;
	s->v.nr_blocks = parity->nr + blocks->nr;
	s->v.nr_redundant = parity->nr;
	s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max);
	s->v.csum_type = BCH_CSUM_CRC32C;
	s->v.pad = 0;

	open_bucket_for_each(c, blocks, ob, i)
		s->v.ptrs[i] = ob->ptr;

	open_bucket_for_each(c, parity, ob, i)
		s->v.ptrs[blocks->nr + i] = ob->ptr;

	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);
		s->v.csum_granularity_bits++;
	}

	set_bkey_val_u64s(&s->k, u64s);
}

/* Checksumming: */

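/*
 * Each block of the stripe carries one checksum per csum_granularity
 * chunk, stored inline in the stripe key; the final chunk of a block
 * may be short. A zero bch_crc_bytes[] entry means checksumming is
 * disabled for this stripe.
 */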
static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csums_per_device = stripe_csums_per_device(v);
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
	unsigned i, j;

	if (!csum_bytes)
		return;

	BUG_ON(buf->offset);
	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++) {
		for (j = 0; j < csums_per_device; j++) {
			unsigned offset = j << v->csum_granularity_bits;
			unsigned len = min(csum_granularity, buf->size - offset);

			struct bch_csum csum =
				bch2_checksum(NULL, v->csum_type,
					      null_nonce(),
					      buf->data[i] + (offset << 9),
					      len << 9);

			memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
		}
	}
}

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
	unsigned i;

	if (!csum_bytes)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum csum;

			BUG_ON(offset & (csum_granularity - 1));
			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
			       ((offset + len) & (csum_granularity - 1)));

			csum = bch2_checksum(NULL, v->csum_type,
					     null_nonce(),
					     buf->data[i] + ((offset - buf->offset) << 9),
					     len << 9);

			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
				__bcache_io_error(c,
					"checksum error while doing reconstruct read (%u:%u)",
					i, j);
				clear_bit(i, buf->valid);
				break;
			}

			offset += len;
		}
	}
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = le16_to_cpu(v->sectors) << 9;

	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
{
	return nr - bitmap_weight(buf->valid, nr);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
	return __ec_nr_failed(buf, buf->key.v.nr_blocks);
}

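/*
 * Reconstruct whichever data blocks are marked invalid in buf->valid;
 * fails if more blocks are missing than we have redundancy.
 */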
static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
		__bcache_io_error(c,
			"error doing reconstruct read: unable to read enough blocks");
		return -1;
	}

	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
	return 0;
}

/* IO: */

static void ec_block_endio(struct bio *bio)
{
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);
	closure_put(cl);
}

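/*
 * Read or write one block of the stripe buffer, splitting the transfer
 * into as many bios as the iovec limit requires; each bio takes its
 * own device and closure refs, dropped in ec_block_endio().
 */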
static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			unsigned rw, unsigned idx, struct closure *cl)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

	if (!bch2_dev_get_ioref(ca, rw)) {
		clear_bit(idx, buf->valid);
		return;
	}

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
						       nr_iovecs,
						       rw,
						       GFP_KERNEL,
						       &c->ec_bioset),
				      struct ec_bio, bio);

		ec_bio->ca = ca;
		ec_bio->buf = buf;
		ec_bio->idx = idx;

		ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_iter.bi_size = b;
		ec_bio->bio.bi_end_io = ec_block_endio;
		ec_bio->bio.bi_private = cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset);

		closure_get(cl);
		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

		offset += b;
	}

	percpu_ref_put(&ca->io_ref);
}

/* recovery read path: */
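/*
 * Called when a normal read must fall back to reconstruction: read the
 * blocks of the stripe covering the requested range, validate their
 * checksums, rebuild the missing data, then copy the requested sectors
 * into the original read bio.
 */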
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct ec_stripe_buf *buf;
	struct closure cl;
	struct bkey_s_c k;
	struct bch_stripe *v;
	unsigned stripe_idx;
	unsigned offset, end;
	unsigned i, nr_data, csum_granularity;
	int ret = 0, idx;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.idx ||
	       rbio->pick.idx - 1 >= rbio->pick.ec_nr);

	stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;

	buf = kzalloc(sizeof(*buf), GFP_NOIO);
	if (!buf)
		return -ENOMEM;

	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
				   POS(0, stripe_idx),
				   BTREE_ITER_SLOTS);
	k = bch2_btree_iter_peek_slot(iter);
	if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
		__bcache_io_error(c,
			"error doing reconstruct read: stripe not found");
		kfree(buf);
		return bch2_trans_exit(&trans) ?: -EIO;
	}

	bkey_reassemble(&buf->key.k_i, k);
	bch2_trans_exit(&trans);

	v = &buf->key.v;

	nr_data = v->nr_blocks - v->nr_redundant;

	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
	BUG_ON(idx < 0);

	csum_granularity = 1U << v->csum_granularity_bits;

	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
	end = offset + bio_sectors(&rbio->bio);

	BUG_ON(end > le16_to_cpu(v->sectors));

	buf->offset = round_down(offset, csum_granularity);
	buf->size = min_t(unsigned, le16_to_cpu(v->sectors),
			  round_up(end, csum_granularity)) - buf->offset;

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
		if (!buf->data[i]) {
			ret = -ENOMEM;
			goto err;
		}
	}

	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr)) {
			__bcache_io_error(c,
				"error doing reconstruct read: stale pointer");
			clear_bit(i, buf->valid);
			continue;
		}

		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
	}

	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		__bcache_io_error(c,
			"error doing reconstruct read: unable to read enough blocks");
		ret = -EIO;
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[idx] + ((offset - buf->offset) << 9));
err:
	for (i = 0; i < v->nr_blocks; i++)
		kfree(buf->data[i]);
	kfree(buf);
	return ret;
}

/* stripe bucket accounting: */

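/*
 * Ensure the stripes heap and the in-memory stripe radix tree can hold
 * index @idx: the heap is grown with a copy-and-swap under the heap
 * lock, and while GC is running the second radix tree (c->stripes[1])
 * is grown as well.
 */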
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -ENOMEM;

		spin_lock(&c->ec_stripes_heap_lock);
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
			n.used = h->used;
			swap(*h, n);
		}
		spin_unlock(&c->ec_stripes_heap_lock);

		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
		return -ENOMEM;

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
		return -ENOMEM;

	return 0;
}

static int ec_stripe_mem_alloc(struct bch_fs *c,
			       struct btree_iter *iter)
{
	size_t idx = iter->pos.offset;

	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
		return 0;

	bch2_btree_trans_unlock(iter->trans);

	if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
		return -EINTR;
	return -ENOMEM;
}

static ssize_t stripe_idx_to_delete(struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;

	return h->used && h->data[0].blocks_nonempty == 0
		? h->data[0].idx : -1;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
				      struct ec_stripe_heap_entry l,
				      struct ec_stripe_heap_entry r)
{
	return ((l.blocks_nonempty > r.blocks_nonempty) -
		(l.blocks_nonempty < r.blocks_nonempty));
}

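/*
 * Each stripe records its current position in the heap (heap_idx),
 * kept up to date on every sift, so a stripe's heap entry can be found
 * and updated in place when its block counts change.
 */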
static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes[0], idx);

	BUG_ON(!m->alive);
	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	size_t i;

	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	i = m->heap_idx;
	heap_sift_up(h, i, ec_stripes_heap_cmp,
		     ec_stripes_heap_set_backpointer);
	heap_sift_down(h, i, ec_stripes_heap_cmp,
		       ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

	if (stripe_idx_to_delete(c) >= 0)
		schedule_work(&c->ec_stripe_delete_work);
}

void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	heap_verify_backpointer(c, idx);

	m->alive = false;
	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	m->alive = true;

	heap_verify_backpointer(c, idx);
}

/* stripe deletion */

static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
	return bch2_btree_delete_range(c, BTREE_ID_EC,
				       POS(0, idx),
				       POS(0, idx + 1),
				       NULL);
}

static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);
	ssize_t idx;

	down_read(&c->gc_lock);
	mutex_lock(&c->ec_stripe_create_lock);

	while (1) {
		spin_lock(&c->ec_stripes_heap_lock);
		idx = stripe_idx_to_delete(c);
		spin_unlock(&c->ec_stripes_heap_lock);

		if (idx < 0)
			break;

		ec_stripe_delete(c, idx);
	}

	mutex_unlock(&c->ec_stripe_create_lock);
	up_read(&c->gc_lock);
}

/* stripe creation: */

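/*
 * Allocate a btree slot for a new stripe key by scanning forward from
 * POS_MIN for an empty slot (stripe indices are capped at U32_MAX),
 * then insert the key there.
 */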
static int ec_stripe_bkey_insert(struct bch_fs *c,
				 struct bkey_i_stripe *stripe)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	/* XXX: start pos hint */
	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
			break;

		if (bkey_deleted(k.k))
			goto found_slot;
	}

	ret = -ENOSPC;
	goto out;
found_slot:
	ret = ec_stripe_mem_alloc(c, iter);

	if (ret == -EINTR)
		goto retry;
	if (ret)
		goto out;

	stripe->k.p = iter->pos;

	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i));

	ret = bch2_trans_commit(&trans, NULL, NULL,
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_USE_RESERVE);
out:
	bch2_trans_exit(&trans);

	return ret;
}

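/*
 * Splice a stripe pointer entry into an extent's value immediately
 * after the device pointer it annotates, shifting the rest of the
 * value up by one entry.
 */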
static void extent_stripe_ptr_add(struct bkey_s_extent e,
				  struct ec_stripe_buf *s,
				  struct bch_extent_ptr *ptr,
				  unsigned block)
{
	struct bch_extent_stripe_ptr *dst = (void *) ptr;
	union bch_extent_entry *end = extent_entry_last(e);

	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
	e.k->u64s += sizeof(*dst) / sizeof(u64);

	*dst = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block = block,
		.idx = s->key.k.p.offset,
	};
}

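/*
 * Rewrite the extents that were just written to this stripe: pointers
 * to other devices become cached, and the pointer into the stripe gets
 * a stripe ptr entry referencing the new stripe key.
 */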
static int ec_stripe_update_ptrs(struct bch_fs *c,
				 struct ec_stripe_buf *s,
				 struct bkey *pos)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct bkey_s_extent e;
	struct bch_extent_ptr *ptr;
	BKEY_PADDED(k) tmp;
	int ret = 0, dev, idx;

	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
				   bkey_start_pos(pos),
				   BTREE_ITER_INTENT);

	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
		idx = extent_matches_stripe(c, &s->key.v, k);
		if (idx < 0) {
			bch2_btree_iter_next(iter);
			continue;
		}

		dev = s->key.v.ptrs[idx].dev;

		bkey_reassemble(&tmp.k, k);
		e = bkey_i_to_s_extent(&tmp.k);

		extent_for_each_ptr(e, ptr)
			if (ptr->dev != dev)
				ptr->cached = true;

		ptr = (void *) bch2_extent_has_device(e.c, dev);
		BUG_ON(!ptr);

		extent_stripe_ptr_add(e, s, ptr, idx);

		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k));

		ret = bch2_trans_commit(&trans, NULL, NULL,
					BTREE_INSERT_ATOMIC|
					BTREE_INSERT_NOFAIL|
					BTREE_INSERT_USE_RESERVE);
		if (ret == -EINTR)
			ret = 0;
		if (ret)
			break;
	}

	bch2_trans_exit(&trans);

	return ret;
}

/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bkey_i *k;
	struct bch_stripe *v = &s->stripe.key.v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	struct closure cl;
	int ret;

	BUG_ON(s->h->s == s);

	closure_init_stack(&cl);

	if (s->err) {
		bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (!percpu_ref_tryget(&c->writes))
		goto err;

	BUG_ON(bitmap_weight(s->blocks_allocated,
			     s->blocks.nr) != s->blocks.nr);

	ec_generate_ec(&s->stripe);

	ec_generate_checksums(&s->stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);

	closure_sync(&cl);

	for (i = nr_data; i < v->nr_blocks; i++)
		if (!test_bit(i, s->stripe.valid)) {
			bch_err(c, "error creating stripe: error writing redundancy buckets");
			goto err_put_writes;
		}

	mutex_lock(&c->ec_stripe_create_lock);

	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
	if (ret) {
		bch_err(c, "error creating stripe: error creating stripe key");
		goto err_unlock;
	}

	for_each_keylist_key(&s->keys, k) {
		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
		if (ret)
			break;
	}

err_unlock:
	mutex_unlock(&c->ec_stripe_create_lock);
err_put_writes:
	percpu_ref_put(&c->writes);
err:
	open_bucket_for_each(c, &s->blocks, ob, i) {
		ob->ec = NULL;
		__bch2_open_bucket_put(c, ob);
	}

	bch2_open_buckets_put(c, &s->parity);

	bch2_keylist_free(&s->keys, s->inline_keys);

	mutex_lock(&s->h->lock);
	list_del(&s->list);
	mutex_unlock(&s->h->lock);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
}

static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = h->s;

	list_add(&s->list, &h->stripes);
	h->s = NULL;

	return s;
}

static void ec_stripe_new_put(struct ec_stripe_new *s)
{
	BUG_ON(atomic_read(&s->pin) <= 0);
	if (atomic_dec_and_test(&s->pin))
		ec_stripe_create(s);
}

/* have a full bucket - hand it off to be erasure coded: */
void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	if (ob->sectors_free)
		s->err = -1;

	ec_stripe_new_put(s);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}

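/*
 * Returns the spot in the stripe's in-memory buffer that data written
 * through this write point should also be copied to, so that parity
 * can be computed once the stripe's data buckets are full.
 */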
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct bch_dev *ca;
	unsigned offset;

	if (!ob)
		return NULL;

	ca = bch_dev_bkey_exists(c, ob->ptr.dev);
	offset = ca->mi.bucket_size - ob->sectors_free;

	return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
}

void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
			     struct bpos pos, unsigned sectors)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct ec_stripe_new *ec;

	if (!ob)
		return;

	ec = ob->ec;
	mutex_lock(&ec->lock);

	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
				 ARRAY_SIZE(ec->inline_keys),
				 BKEY_U64s)) {
		BUG();
	}

	bkey_init(&ec->keys.top->k);
	ec->keys.top->k.p = pos;
	bch2_key_resize(&ec->keys.top->k, sectors);
	bch2_keylist_push(&ec->keys);

	mutex_unlock(&ec->lock);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	return (l > r) - (l < r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	struct bch_dev *ca;
	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	struct {
		unsigned nr, size;
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(ca, c, i, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	for (i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)
				best = cur;

			cur.nr = 0;
			cur.size = sizes[i];
		}

		cur.nr++;
	}

	if (cur.nr > best.nr)
		best = cur;

	return best.size;
}

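/*
 * Allocate the in-memory state for a new stripe: takes ownership of
 * the head's data and parity buckets and allocates a buffer for each
 * block of the stripe.
 */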
int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;
	unsigned i;

	BUG_ON(h->parity.nr != h->redundancy);
	BUG_ON(!h->blocks.nr);
	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	mutex_init(&s->lock);
	atomic_set(&s->pin, 1);
	s->c = c;
	s->h = h;
	s->blocks = h->blocks;
	s->parity = h->parity;

	memset(&h->blocks, 0, sizeof(h->blocks));
	memset(&h->parity, 0, sizeof(h->parity));

	bch2_keylist_init(&s->keys, s->inline_keys);

	s->stripe.offset = 0;
	s->stripe.size = h->blocksize;
	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));

	ec_stripe_key_init(c, &s->stripe.key,
			   &s->blocks, &s->parity,
			   h->blocksize);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
		if (!s->stripe.data[i])
			goto err;
	}

	h->s = s;

	return 0;
err:
	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
	return -ENOMEM;
}

static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy)
{
	struct ec_stripe_head *h;
	struct bch_dev *ca;
	unsigned i;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	mutex_lock(&h->lock);
	INIT_LIST_HEAD(&h->stripes);

	h->target = target;
	h->algo = algo;
	h->redundancy = redundancy;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_USER, target);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(i, h->devs.d);

	h->blocksize = pick_blocksize(c, &h->devs);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();
	list_add(&h->list, &c->ec_new_stripe_list);
	return h;
}

void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = NULL;

	if (h->s &&
	    bitmap_weight(h->s->blocks_allocated,
			  h->s->blocks.nr) == h->s->blocks.nr)
		s = ec_stripe_set_pending(h);

	mutex_unlock(&h->lock);

	if (s)
		ec_stripe_new_put(s);
}

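/*
 * Look up the stripe head for (target, algo, redundancy), allocating
 * it if it doesn't exist yet; returns with h->lock held (or NULL if
 * redundancy is zero or allocation fails).
 */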
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy)
{
	struct ec_stripe_head *h;

	if (!redundancy)
		return NULL;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list)
		if (h->target == target &&
		    h->algo == algo &&
		    h->redundancy == redundancy) {
			mutex_lock(&h->lock);
			goto found;
		}

	h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
found:
	mutex_unlock(&c->ec_new_stripe_lock);
	return h;
}

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
		struct ec_stripe_new *s = NULL;

		mutex_lock(&h->lock);
		bch2_open_buckets_stop_dev(c, ca,
					   &h->blocks,
					   BCH_DATA_USER);
		bch2_open_buckets_stop_dev(c, ca,
					   &h->parity,
					   BCH_DATA_USER);

		if (!h->s)
			goto unlock;

		open_bucket_for_each(c, &h->s->blocks, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		open_bucket_for_each(c, &h->s->parity, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		goto unlock;
found:
		h->s->err = -1;
		s = ec_stripe_set_pending(h);
unlock:
		mutex_unlock(&h->lock);

		if (s)
			ec_stripe_new_put(s);
	}
	mutex_unlock(&c->ec_new_stripe_lock);
}

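/*
 * Write back a dirty in-memory stripe: reread the stripe key at @idx
 * and refresh its per-block sector counts from the in-memory state.
 */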
static int __bch2_stripe_write_key(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct stripe *m,
				   size_t idx,
				   struct bkey_i_stripe *new_key,
				   unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	unsigned i;
	int ret;

	bch2_btree_iter_set_pos(iter, POS(0, idx));

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (k.k->type != KEY_TYPE_stripe)
		return -EIO;

	bkey_reassemble(&new_key->k_i, k);

	spin_lock(&c->ec_stripes_heap_lock);

	for (i = 0; i < new_key->v.nr_blocks; i++)
		stripe_blockcount_set(&new_key->v, i,
				      m->block_sectors[i]);
	m->dirty = false;

	spin_unlock(&c->ec_stripes_heap_lock);

	bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i));

	return bch2_trans_commit(trans, NULL, NULL,
				 BTREE_INSERT_NOFAIL|flags);
}

int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct genradix_iter giter;
	struct bkey_i_stripe *new_key;
	struct stripe *m;
	int ret = 0;

	new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
	BUG_ON(!new_key);

	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	genradix_for_each(&c->stripes[0], giter, m) {
		if (!m->dirty)
			continue;

		ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos,
					      new_key, flags);
		if (ret)
			break;

		*wrote = true;
	}

	bch2_trans_exit(&trans);

	kfree(new_key);

	return ret;
}

static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
{
	bch2_mark_key(c, k, true, 0, NULL, 0, 0);
}

int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
{
	struct journal_replay *r;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	ret = bch2_fs_ec_start(c);
	if (ret)
		return ret;

	bch2_trans_init(&trans, c);

	for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k) {
		bch2_stripe_read_key(c, k);
		bch2_trans_cond_resched(&trans);
	}

	ret = bch2_trans_exit(&trans);
	if (ret)
		return ret;

	list_for_each_entry(r, journal_replay_list, list) {
		struct bkey_i *k, *n;
		struct jset_entry *entry;

		for_each_jset_key(k, n, entry, &r->j)
			if (entry->btree_id == BTREE_ID_EC)
				bch2_stripe_read_key(c, bkey_i_to_s_c(k));
	}

	return 0;
}

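/*
 * Size the stripes heap and radix tree from the highest stripe index
 * present in the btree, found by looking up the last key before
 * POS(0, U64_MAX).
 */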
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	size_t i, idx = 0;
	int ret = 0;

	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

	k = bch2_btree_iter_prev(iter);
	if (!IS_ERR_OR_NULL(k.k))
		idx = k.k->p.offset + 1;
	ret = bch2_trans_exit(&trans);
	if (ret)
		return ret;

	if (!gc &&
	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
		       GFP_KERNEL))
		return -ENOMEM;
#if 0
	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
#else
	for (i = 0; i < idx; i++)
		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
			return -ENOMEM;
#endif
	return 0;
}

int bch2_fs_ec_start(struct bch_fs *c)
{
	return bch2_ec_mem_alloc(c, false);
}

void bch2_fs_ec_exit(struct bch_fs *c)
{
	struct ec_stripe_head *h;

	while (1) {
		mutex_lock(&c->ec_new_stripe_lock);
		h = list_first_entry_or_null(&c->ec_new_stripe_list,
					     struct ec_stripe_head, list);
		if (h)
			list_del(&h->list);
		mutex_unlock(&c->ec_new_stripe_lock);
		if (!h)
			break;

		BUG_ON(h->s);
		BUG_ON(!list_empty(&h->stripes));
		kfree(h);
	}

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes[0]);
	bioset_exit(&c->ec_bioset);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);

	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}