Commit | Line | Data |
---|---|---|
cd575ddf KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* erasure coding */ | |
4 | ||
5 | #include "bcachefs.h" | |
6 | #include "alloc_foreground.h" | |
35189e09 | 7 | #include "bkey_on_stack.h" |
cd575ddf KO |
8 | #include "bset.h" |
9 | #include "btree_gc.h" | |
10 | #include "btree_update.h" | |
11 | #include "buckets.h" | |
12 | #include "disk_groups.h" | |
13 | #include "ec.h" | |
14 | #include "error.h" | |
15 | #include "io.h" | |
16 | #include "keylist.h" | |
d0734356 | 17 | #include "recovery.h" |
cd575ddf KO |
18 | #include "super-io.h" |
19 | #include "util.h" | |
20 | ||
de5bb710 KO |
21 | #include <linux/sort.h> |
22 | ||
23 | #ifdef __KERNEL__ | |
24 | ||
cd575ddf KO |
25 | #include <linux/raid/pq.h> |
26 | #include <linux/raid/xor.h> | |
de5bb710 KO |
27 | |
28 | static void raid5_recov(unsigned disks, unsigned failed_idx, | |
29 | size_t size, void **data) | |
30 | { | |
31 | unsigned i = 2, nr; | |
32 | ||
33 | BUG_ON(failed_idx >= disks); | |
34 | ||
35 | swap(data[0], data[failed_idx]); | |
36 | memcpy(data[0], data[1], size); | |
37 | ||
38 | while (i < disks) { | |
39 | nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); | |
40 | xor_blocks(nr, size, data[0], data + i); | |
41 | i += nr; | |
42 | } | |
43 | ||
44 | swap(data[0], data[failed_idx]); | |
45 | } | |
46 | ||
47 | static void raid_gen(int nd, int np, size_t size, void **v) | |
48 | { | |
49 | if (np >= 1) | |
50 | raid5_recov(nd + np, nd, size, v); | |
51 | if (np >= 2) | |
52 | raid6_call.gen_syndrome(nd + np, size, v); | |
53 | BUG_ON(np > 2); | |
54 | } | |
55 | ||
56 | static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) | |
57 | { | |
58 | switch (nr) { | |
59 | case 0: | |
60 | break; | |
61 | case 1: | |
62 | if (ir[0] < nd + 1) | |
63 | raid5_recov(nd + 1, ir[0], size, v); | |
64 | else | |
65 | raid6_call.gen_syndrome(nd + np, size, v); | |
66 | break; | |
67 | case 2: | |
68 | if (ir[1] < nd) { | |
69 | /* data+data failure. */ | |
70 | raid6_2data_recov(nd + np, size, ir[0], ir[1], v); | |
71 | } else if (ir[0] < nd) { | |
72 | /* data + p/q failure */ | |
73 | ||
74 | if (ir[1] == nd) /* data + p failure */ | |
75 | raid6_datap_recov(nd + np, size, ir[0], v); | |
76 | else { /* data + q failure */ | |
77 | raid5_recov(nd + 1, ir[0], size, v); | |
78 | raid6_call.gen_syndrome(nd + np, size, v); | |
79 | } | |
80 | } else { | |
81 | raid_gen(nd, np, size, v); | |
82 | } | |
83 | break; | |
84 | default: | |
85 | BUG(); | |
86 | } | |
87 | } | |
88 | ||
89 | #else | |
90 | ||
91 | #include <raid/raid.h> | |
92 | ||
93 | #endif | |
cd575ddf KO |
94 | |
95 | struct ec_bio { | |
96 | struct bch_dev *ca; | |
97 | struct ec_stripe_buf *buf; | |
98 | size_t idx; | |
99 | struct bio bio; | |
100 | }; | |
101 | ||
102 | /* Stripes btree keys: */ | |
103 | ||
26609b61 | 104 | const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) |
cd575ddf | 105 | { |
26609b61 KO |
106 | const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; |
107 | ||
cd575ddf KO |
108 | if (k.k->p.inode) |
109 | return "invalid stripe key"; | |
110 | ||
26609b61 KO |
111 | if (bkey_val_bytes(k.k) < sizeof(*s)) |
112 | return "incorrect value size"; | |
cd575ddf | 113 | |
76640280 KO |
114 | if (bkey_val_bytes(k.k) < sizeof(*s) || |
115 | bkey_val_u64s(k.k) < stripe_val_u64s(s)) | |
26609b61 | 116 | return "incorrect value size"; |
cd575ddf | 117 | |
3811aa6d | 118 | return bch2_bkey_ptrs_invalid(c, k); |
cd575ddf KO |
119 | } |
120 | ||
26609b61 | 121 | void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, |
cd575ddf KO |
122 | struct bkey_s_c k) |
123 | { | |
26609b61 KO |
124 | const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; |
125 | unsigned i; | |
126 | ||
127 | pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", | |
128 | s->algorithm, | |
129 | le16_to_cpu(s->sectors), | |
130 | s->nr_blocks - s->nr_redundant, | |
131 | s->nr_redundant, | |
132 | s->csum_type, | |
133 | 1U << s->csum_granularity_bits); | |
134 | ||
135 | for (i = 0; i < s->nr_blocks; i++) | |
61c8d7c8 KO |
136 | pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, |
137 | (u64) s->ptrs[i].offset, | |
138 | stripe_blockcount_get(s, i)); | |
cd575ddf KO |
139 | } |
140 | ||
141 | static int ptr_matches_stripe(struct bch_fs *c, | |
142 | struct bch_stripe *v, | |
143 | const struct bch_extent_ptr *ptr) | |
144 | { | |
145 | unsigned i; | |
146 | ||
147 | for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { | |
148 | const struct bch_extent_ptr *ptr2 = v->ptrs + i; | |
149 | ||
150 | if (ptr->dev == ptr2->dev && | |
151 | ptr->gen == ptr2->gen && | |
152 | ptr->offset >= ptr2->offset && | |
153 | ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) | |
154 | return i; | |
155 | } | |
156 | ||
157 | return -1; | |
158 | } | |
159 | ||
160 | static int extent_matches_stripe(struct bch_fs *c, | |
161 | struct bch_stripe *v, | |
162 | struct bkey_s_c k) | |
163 | { | |
cd575ddf | 164 | |
99aaf570 KO |
165 | switch (k.k->type) { |
166 | case KEY_TYPE_extent: { | |
167 | struct bkey_s_c_extent e = bkey_s_c_to_extent(k); | |
168 | const struct bch_extent_ptr *ptr; | |
169 | int idx; | |
cd575ddf | 170 | |
99aaf570 KO |
171 | extent_for_each_ptr(e, ptr) { |
172 | idx = ptr_matches_stripe(c, v, ptr); | |
173 | if (idx >= 0) | |
174 | return idx; | |
175 | } | |
176 | break; | |
177 | } | |
cd575ddf KO |
178 | } |
179 | ||
180 | return -1; | |
181 | } | |
182 | ||
42c7d748 KO |
183 | static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) |
184 | { | |
99aaf570 KO |
185 | switch (k.k->type) { |
186 | case KEY_TYPE_extent: { | |
187 | struct bkey_s_c_extent e = bkey_s_c_to_extent(k); | |
188 | const union bch_extent_entry *entry; | |
42c7d748 | 189 | |
99aaf570 KO |
190 | extent_for_each_entry(e, entry) |
191 | if (extent_entry_type(entry) == | |
192 | BCH_EXTENT_ENTRY_stripe_ptr && | |
193 | entry->stripe_ptr.idx == idx) | |
194 | return true; | |
42c7d748 | 195 | |
99aaf570 KO |
196 | break; |
197 | } | |
198 | } | |
42c7d748 KO |
199 | |
200 | return false; | |
201 | } | |
202 | ||
cd575ddf KO |
203 | static void ec_stripe_key_init(struct bch_fs *c, |
204 | struct bkey_i_stripe *s, | |
205 | struct open_buckets *blocks, | |
206 | struct open_buckets *parity, | |
207 | unsigned stripe_size) | |
208 | { | |
209 | struct open_bucket *ob; | |
210 | unsigned i, u64s; | |
211 | ||
212 | bkey_stripe_init(&s->k_i); | |
213 | s->v.sectors = cpu_to_le16(stripe_size); | |
214 | s->v.algorithm = 0; | |
215 | s->v.nr_blocks = parity->nr + blocks->nr; | |
216 | s->v.nr_redundant = parity->nr; | |
217 | s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); | |
218 | s->v.csum_type = BCH_CSUM_CRC32C; | |
219 | s->v.pad = 0; | |
220 | ||
221 | open_bucket_for_each(c, blocks, ob, i) | |
222 | s->v.ptrs[i] = ob->ptr; | |
223 | ||
224 | open_bucket_for_each(c, parity, ob, i) | |
225 | s->v.ptrs[blocks->nr + i] = ob->ptr; | |
226 | ||
227 | while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { | |
228 | BUG_ON(1 << s->v.csum_granularity_bits >= | |
229 | le16_to_cpu(s->v.sectors) || | |
230 | s->v.csum_granularity_bits == U8_MAX); | |
231 | s->v.csum_granularity_bits++; | |
232 | } | |
233 | ||
234 | set_bkey_val_u64s(&s->k, u64s); | |
235 | } | |
236 | ||
237 | /* Checksumming: */ | |
238 | ||
239 | static void ec_generate_checksums(struct ec_stripe_buf *buf) | |
240 | { | |
241 | struct bch_stripe *v = &buf->key.v; | |
242 | unsigned csum_granularity = 1 << v->csum_granularity_bits; | |
243 | unsigned csums_per_device = stripe_csums_per_device(v); | |
244 | unsigned csum_bytes = bch_crc_bytes[v->csum_type]; | |
245 | unsigned i, j; | |
246 | ||
247 | if (!csum_bytes) | |
248 | return; | |
249 | ||
250 | BUG_ON(buf->offset); | |
251 | BUG_ON(buf->size != le16_to_cpu(v->sectors)); | |
252 | ||
253 | for (i = 0; i < v->nr_blocks; i++) { | |
254 | for (j = 0; j < csums_per_device; j++) { | |
255 | unsigned offset = j << v->csum_granularity_bits; | |
256 | unsigned len = min(csum_granularity, buf->size - offset); | |
257 | ||
258 | struct bch_csum csum = | |
259 | bch2_checksum(NULL, v->csum_type, | |
260 | null_nonce(), | |
261 | buf->data[i] + (offset << 9), | |
262 | len << 9); | |
263 | ||
264 | memcpy(stripe_csum(v, i, j), &csum, csum_bytes); | |
265 | } | |
266 | } | |
267 | } | |
268 | ||
269 | static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) | |
270 | { | |
271 | struct bch_stripe *v = &buf->key.v; | |
272 | unsigned csum_granularity = 1 << v->csum_granularity_bits; | |
273 | unsigned csum_bytes = bch_crc_bytes[v->csum_type]; | |
274 | unsigned i; | |
275 | ||
276 | if (!csum_bytes) | |
277 | return; | |
278 | ||
279 | for (i = 0; i < v->nr_blocks; i++) { | |
280 | unsigned offset = buf->offset; | |
281 | unsigned end = buf->offset + buf->size; | |
282 | ||
283 | if (!test_bit(i, buf->valid)) | |
284 | continue; | |
285 | ||
286 | while (offset < end) { | |
287 | unsigned j = offset >> v->csum_granularity_bits; | |
288 | unsigned len = min(csum_granularity, end - offset); | |
289 | struct bch_csum csum; | |
290 | ||
291 | BUG_ON(offset & (csum_granularity - 1)); | |
292 | BUG_ON(offset + len != le16_to_cpu(v->sectors) && | |
293 | ((offset + len) & (csum_granularity - 1))); | |
294 | ||
295 | csum = bch2_checksum(NULL, v->csum_type, | |
296 | null_nonce(), | |
297 | buf->data[i] + ((offset - buf->offset) << 9), | |
298 | len << 9); | |
299 | ||
300 | if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { | |
301 | __bcache_io_error(c, | |
302 | "checksum error while doing reconstruct read (%u:%u)", | |
303 | i, j); | |
304 | clear_bit(i, buf->valid); | |
305 | break; | |
306 | } | |
307 | ||
308 | offset += len; | |
309 | } | |
310 | } | |
311 | } | |
312 | ||
313 | /* Erasure coding: */ | |
314 | ||
cd575ddf KO |
315 | static void ec_generate_ec(struct ec_stripe_buf *buf) |
316 | { | |
317 | struct bch_stripe *v = &buf->key.v; | |
318 | unsigned nr_data = v->nr_blocks - v->nr_redundant; | |
319 | unsigned bytes = le16_to_cpu(v->sectors) << 9; | |
320 | ||
de5bb710 | 321 | raid_gen(nr_data, v->nr_redundant, bytes, buf->data); |
cd575ddf KO |
322 | } |
323 | ||
324 | static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) | |
325 | { | |
326 | return nr - bitmap_weight(buf->valid, nr); | |
327 | } | |
328 | ||
329 | static unsigned ec_nr_failed(struct ec_stripe_buf *buf) | |
330 | { | |
331 | return __ec_nr_failed(buf, buf->key.v.nr_blocks); | |
332 | } | |
333 | ||
334 | static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) | |
335 | { | |
336 | struct bch_stripe *v = &buf->key.v; | |
337 | unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; | |
338 | unsigned nr_data = v->nr_blocks - v->nr_redundant; | |
339 | unsigned bytes = buf->size << 9; | |
340 | ||
341 | if (ec_nr_failed(buf) > v->nr_redundant) { | |
342 | __bcache_io_error(c, | |
343 | "error doing reconstruct read: unable to read enough blocks"); | |
344 | return -1; | |
345 | } | |
346 | ||
347 | for (i = 0; i < nr_data; i++) | |
348 | if (!test_bit(i, buf->valid)) | |
349 | failed[nr_failed++] = i; | |
350 | ||
de5bb710 | 351 | raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); |
cd575ddf KO |
352 | return 0; |
353 | } | |
354 | ||
355 | /* IO: */ | |
356 | ||
357 | static void ec_block_endio(struct bio *bio) | |
358 | { | |
359 | struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); | |
360 | struct bch_dev *ca = ec_bio->ca; | |
361 | struct closure *cl = bio->bi_private; | |
362 | ||
363 | if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) | |
364 | clear_bit(ec_bio->idx, ec_bio->buf->valid); | |
365 | ||
366 | bio_put(&ec_bio->bio); | |
367 | percpu_ref_put(&ca->io_ref); | |
368 | closure_put(cl); | |
369 | } | |
370 | ||
371 | static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, | |
372 | unsigned rw, unsigned idx, struct closure *cl) | |
373 | { | |
374 | struct bch_stripe *v = &buf->key.v; | |
375 | unsigned offset = 0, bytes = buf->size << 9; | |
376 | struct bch_extent_ptr *ptr = &v->ptrs[idx]; | |
377 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); | |
378 | ||
379 | if (!bch2_dev_get_ioref(ca, rw)) { | |
380 | clear_bit(idx, buf->valid); | |
381 | return; | |
382 | } | |
383 | ||
384 | while (offset < bytes) { | |
385 | unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, | |
386 | DIV_ROUND_UP(bytes, PAGE_SIZE)); | |
387 | unsigned b = min_t(size_t, bytes - offset, | |
388 | nr_iovecs << PAGE_SHIFT); | |
389 | struct ec_bio *ec_bio; | |
390 | ||
391 | ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, | |
392 | nr_iovecs, | |
393 | rw, | |
394 | GFP_KERNEL, | |
395 | &c->ec_bioset), | |
396 | struct ec_bio, bio); | |
397 | ||
398 | ec_bio->ca = ca; | |
399 | ec_bio->buf = buf; | |
400 | ec_bio->idx = idx; | |
401 | ||
402 | ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); | |
cd575ddf KO |
403 | ec_bio->bio.bi_end_io = ec_block_endio; |
404 | ec_bio->bio.bi_private = cl; | |
405 | ||
885678f6 | 406 | bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); |
cd575ddf KO |
407 | |
408 | closure_get(cl); | |
409 | percpu_ref_get(&ca->io_ref); | |
410 | ||
411 | submit_bio(&ec_bio->bio); | |
412 | ||
413 | offset += b; | |
414 | } | |
415 | ||
416 | percpu_ref_put(&ca->io_ref); | |
417 | } | |
418 | ||
419 | /* recovery read path: */ | |
420 | int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) | |
421 | { | |
424eb881 KO |
422 | struct btree_trans trans; |
423 | struct btree_iter *iter; | |
cd575ddf KO |
424 | struct ec_stripe_buf *buf; |
425 | struct closure cl; | |
426 | struct bkey_s_c k; | |
427 | struct bch_stripe *v; | |
428 | unsigned stripe_idx; | |
429 | unsigned offset, end; | |
430 | unsigned i, nr_data, csum_granularity; | |
431 | int ret = 0, idx; | |
432 | ||
433 | closure_init_stack(&cl); | |
434 | ||
37954a27 | 435 | BUG_ON(!rbio->pick.has_ec); |
cd575ddf | 436 | |
37954a27 | 437 | stripe_idx = rbio->pick.ec.idx; |
cd575ddf KO |
438 | |
439 | buf = kzalloc(sizeof(*buf), GFP_NOIO); | |
440 | if (!buf) | |
441 | return -ENOMEM; | |
442 | ||
20bceecb | 443 | bch2_trans_init(&trans, c, 0, 0); |
424eb881 KO |
444 | |
445 | iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, | |
446 | POS(0, stripe_idx), | |
447 | BTREE_ITER_SLOTS); | |
448 | k = bch2_btree_iter_peek_slot(iter); | |
0f238367 | 449 | if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { |
cd575ddf KO |
450 | __bcache_io_error(c, |
451 | "error doing reconstruct read: stripe not found"); | |
452 | kfree(buf); | |
424eb881 | 453 | return bch2_trans_exit(&trans) ?: -EIO; |
cd575ddf KO |
454 | } |
455 | ||
456 | bkey_reassemble(&buf->key.k_i, k); | |
424eb881 | 457 | bch2_trans_exit(&trans); |
cd575ddf KO |
458 | |
459 | v = &buf->key.v; | |
460 | ||
461 | nr_data = v->nr_blocks - v->nr_redundant; | |
462 | ||
463 | idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); | |
464 | BUG_ON(idx < 0); | |
465 | ||
466 | csum_granularity = 1U << v->csum_granularity_bits; | |
467 | ||
468 | offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; | |
469 | end = offset + bio_sectors(&rbio->bio); | |
470 | ||
471 | BUG_ON(end > le16_to_cpu(v->sectors)); | |
472 | ||
473 | buf->offset = round_down(offset, csum_granularity); | |
474 | buf->size = min_t(unsigned, le16_to_cpu(v->sectors), | |
475 | round_up(end, csum_granularity)) - buf->offset; | |
476 | ||
477 | for (i = 0; i < v->nr_blocks; i++) { | |
478 | buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); | |
479 | if (!buf->data[i]) { | |
480 | ret = -ENOMEM; | |
481 | goto err; | |
482 | } | |
483 | } | |
484 | ||
485 | memset(buf->valid, 0xFF, sizeof(buf->valid)); | |
486 | ||
487 | for (i = 0; i < v->nr_blocks; i++) { | |
488 | struct bch_extent_ptr *ptr = v->ptrs + i; | |
489 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); | |
490 | ||
491 | if (ptr_stale(ca, ptr)) { | |
492 | __bcache_io_error(c, | |
493 | "error doing reconstruct read: stale pointer"); | |
494 | clear_bit(i, buf->valid); | |
495 | continue; | |
496 | } | |
497 | ||
498 | ec_block_io(c, buf, REQ_OP_READ, i, &cl); | |
499 | } | |
500 | ||
501 | closure_sync(&cl); | |
502 | ||
503 | if (ec_nr_failed(buf) > v->nr_redundant) { | |
504 | __bcache_io_error(c, | |
505 | "error doing reconstruct read: unable to read enough blocks"); | |
506 | ret = -EIO; | |
507 | goto err; | |
508 | } | |
509 | ||
510 | ec_validate_checksums(c, buf); | |
511 | ||
512 | ret = ec_do_recov(c, buf); | |
513 | if (ret) | |
514 | goto err; | |
515 | ||
516 | memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, | |
517 | buf->data[idx] + ((offset - buf->offset) << 9)); | |
518 | err: | |
519 | for (i = 0; i < v->nr_blocks; i++) | |
520 | kfree(buf->data[i]); | |
521 | kfree(buf); | |
522 | return ret; | |
523 | } | |
524 | ||
dfe9bfb3 | 525 | /* stripe bucket accounting: */ |
cd575ddf KO |
526 | |
527 | static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) | |
528 | { | |
529 | ec_stripes_heap n, *h = &c->ec_stripes_heap; | |
530 | ||
531 | if (idx >= h->size) { | |
532 | if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) | |
533 | return -ENOMEM; | |
534 | ||
535 | spin_lock(&c->ec_stripes_heap_lock); | |
536 | if (n.size > h->size) { | |
537 | memcpy(n.data, h->data, h->used * sizeof(h->data[0])); | |
538 | n.used = h->used; | |
539 | swap(*h, n); | |
540 | } | |
541 | spin_unlock(&c->ec_stripes_heap_lock); | |
542 | ||
543 | free_heap(&n); | |
544 | } | |
545 | ||
dfe9bfb3 KO |
546 | if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) |
547 | return -ENOMEM; | |
548 | ||
549 | if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && | |
550 | !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) | |
cd575ddf KO |
551 | return -ENOMEM; |
552 | ||
553 | return 0; | |
554 | } | |
555 | ||
556 | static int ec_stripe_mem_alloc(struct bch_fs *c, | |
557 | struct btree_iter *iter) | |
558 | { | |
559 | size_t idx = iter->pos.offset; | |
932aa837 | 560 | int ret = 0; |
cd575ddf KO |
561 | |
562 | if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) | |
932aa837 | 563 | return ret; |
cd575ddf | 564 | |
58fbf808 | 565 | bch2_trans_unlock(iter->trans); |
932aa837 | 566 | ret = -EINTR; |
cd575ddf KO |
567 | |
568 | if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) | |
932aa837 KO |
569 | return ret; |
570 | ||
cd575ddf KO |
571 | return -ENOMEM; |
572 | } | |
573 | ||
574 | static ssize_t stripe_idx_to_delete(struct bch_fs *c) | |
575 | { | |
576 | ec_stripes_heap *h = &c->ec_stripes_heap; | |
577 | ||
f516c872 KO |
578 | return h->used && h->data[0].blocks_nonempty == 0 |
579 | ? h->data[0].idx : -1; | |
cd575ddf KO |
580 | } |
581 | ||
582 | static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, | |
583 | struct ec_stripe_heap_entry l, | |
584 | struct ec_stripe_heap_entry r) | |
585 | { | |
586 | return ((l.blocks_nonempty > r.blocks_nonempty) - | |
587 | (l.blocks_nonempty < r.blocks_nonempty)); | |
588 | } | |
589 | ||
590 | static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, | |
591 | size_t i) | |
592 | { | |
593 | struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); | |
594 | ||
dfe9bfb3 | 595 | genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; |
cd575ddf KO |
596 | } |
597 | ||
598 | static void heap_verify_backpointer(struct bch_fs *c, size_t idx) | |
599 | { | |
600 | ec_stripes_heap *h = &c->ec_stripes_heap; | |
dfe9bfb3 | 601 | struct stripe *m = genradix_ptr(&c->stripes[0], idx); |
cd575ddf KO |
602 | |
603 | BUG_ON(!m->alive); | |
604 | BUG_ON(m->heap_idx >= h->used); | |
605 | BUG_ON(h->data[m->heap_idx].idx != idx); | |
606 | } | |
607 | ||
cd575ddf | 608 | void bch2_stripes_heap_update(struct bch_fs *c, |
dfe9bfb3 | 609 | struct stripe *m, size_t idx) |
cd575ddf KO |
610 | { |
611 | ec_stripes_heap *h = &c->ec_stripes_heap; | |
cd575ddf KO |
612 | size_t i; |
613 | ||
6e738539 KO |
614 | if (m->alive) { |
615 | heap_verify_backpointer(c, idx); | |
cd575ddf | 616 | |
6e738539 | 617 | h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; |
cd575ddf | 618 | |
6e738539 KO |
619 | i = m->heap_idx; |
620 | heap_sift_up(h, i, ec_stripes_heap_cmp, | |
621 | ec_stripes_heap_set_backpointer); | |
622 | heap_sift_down(h, i, ec_stripes_heap_cmp, | |
623 | ec_stripes_heap_set_backpointer); | |
cd575ddf | 624 | |
6e738539 KO |
625 | heap_verify_backpointer(c, idx); |
626 | } else { | |
627 | bch2_stripes_heap_insert(c, m, idx); | |
628 | } | |
cd575ddf | 629 | |
97fd13ad KO |
630 | if (stripe_idx_to_delete(c) >= 0 && |
631 | !percpu_ref_is_dying(&c->writes)) | |
cd575ddf KO |
632 | schedule_work(&c->ec_stripe_delete_work); |
633 | } | |
634 | ||
635 | void bch2_stripes_heap_del(struct bch_fs *c, | |
dfe9bfb3 | 636 | struct stripe *m, size_t idx) |
cd575ddf | 637 | { |
cd575ddf KO |
638 | heap_verify_backpointer(c, idx); |
639 | ||
640 | m->alive = false; | |
641 | heap_del(&c->ec_stripes_heap, m->heap_idx, | |
642 | ec_stripes_heap_cmp, | |
643 | ec_stripes_heap_set_backpointer); | |
cd575ddf KO |
644 | } |
645 | ||
646 | void bch2_stripes_heap_insert(struct bch_fs *c, | |
dfe9bfb3 | 647 | struct stripe *m, size_t idx) |
cd575ddf | 648 | { |
cd575ddf KO |
649 | BUG_ON(heap_full(&c->ec_stripes_heap)); |
650 | ||
651 | heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { | |
652 | .idx = idx, | |
61c8d7c8 | 653 | .blocks_nonempty = m->blocks_nonempty, |
cd575ddf KO |
654 | }), |
655 | ec_stripes_heap_cmp, | |
656 | ec_stripes_heap_set_backpointer); | |
657 | m->alive = true; | |
658 | ||
659 | heap_verify_backpointer(c, idx); | |
cd575ddf KO |
660 | } |
661 | ||
dfe9bfb3 KO |
662 | /* stripe deletion */ |
663 | ||
0564b167 | 664 | static int ec_stripe_delete(struct bch_fs *c, size_t idx) |
cd575ddf | 665 | { |
0564b167 KO |
666 | return bch2_btree_delete_range(c, BTREE_ID_EC, |
667 | POS(0, idx), | |
668 | POS(0, idx + 1), | |
669 | NULL); | |
cd575ddf KO |
670 | } |
671 | ||
672 | static void ec_stripe_delete_work(struct work_struct *work) | |
673 | { | |
674 | struct bch_fs *c = | |
675 | container_of(work, struct bch_fs, ec_stripe_delete_work); | |
676 | ssize_t idx; | |
677 | ||
678 | down_read(&c->gc_lock); | |
dfe9bfb3 | 679 | mutex_lock(&c->ec_stripe_create_lock); |
cd575ddf KO |
680 | |
681 | while (1) { | |
682 | spin_lock(&c->ec_stripes_heap_lock); | |
683 | idx = stripe_idx_to_delete(c); | |
684 | spin_unlock(&c->ec_stripes_heap_lock); | |
685 | ||
686 | if (idx < 0) | |
687 | break; | |
688 | ||
97fd13ad KO |
689 | if (ec_stripe_delete(c, idx)) |
690 | break; | |
cd575ddf KO |
691 | } |
692 | ||
dfe9bfb3 | 693 | mutex_unlock(&c->ec_stripe_create_lock); |
cd575ddf KO |
694 | up_read(&c->gc_lock); |
695 | } | |
696 | ||
dfe9bfb3 KO |
697 | /* stripe creation: */ |
698 | ||
cd575ddf KO |
699 | static int ec_stripe_bkey_insert(struct bch_fs *c, |
700 | struct bkey_i_stripe *stripe) | |
701 | { | |
0564b167 KO |
702 | struct btree_trans trans; |
703 | struct btree_iter *iter; | |
cd575ddf | 704 | struct bkey_s_c k; |
4e1510c3 | 705 | struct bpos start_pos = POS(0, c->ec_stripe_hint); |
cd575ddf KO |
706 | int ret; |
707 | ||
20bceecb | 708 | bch2_trans_init(&trans, c, 0, 0); |
cd575ddf | 709 | retry: |
0564b167 KO |
710 | bch2_trans_begin(&trans); |
711 | ||
4e1510c3 | 712 | for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, |
94f651e2 | 713 | BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { |
4e1510c3 KO |
714 | if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { |
715 | if (start_pos.offset) { | |
716 | start_pos = POS_MIN; | |
717 | bch2_btree_iter_set_pos(iter, start_pos); | |
718 | continue; | |
719 | } | |
720 | ||
721 | ret = -ENOSPC; | |
0564b167 | 722 | break; |
4e1510c3 | 723 | } |
cd575ddf KO |
724 | |
725 | if (bkey_deleted(k.k)) | |
726 | goto found_slot; | |
727 | } | |
728 | ||
932aa837 | 729 | goto err; |
cd575ddf | 730 | found_slot: |
4e1510c3 KO |
731 | start_pos = iter->pos; |
732 | ||
0564b167 | 733 | ret = ec_stripe_mem_alloc(c, iter); |
cd575ddf | 734 | if (ret) |
932aa837 | 735 | goto err; |
cd575ddf | 736 | |
0564b167 | 737 | stripe->k.p = iter->pos; |
cd575ddf | 738 | |
2d594dfb | 739 | bch2_trans_update(&trans, iter, &stripe->k_i, 0); |
0564b167 KO |
740 | |
741 | ret = bch2_trans_commit(&trans, NULL, NULL, | |
932aa837 KO |
742 | BTREE_INSERT_NOFAIL); |
743 | err: | |
163e885a KO |
744 | bch2_trans_iter_put(&trans, iter); |
745 | ||
932aa837 KO |
746 | if (ret == -EINTR) |
747 | goto retry; | |
4e1510c3 KO |
748 | |
749 | c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; | |
0564b167 | 750 | bch2_trans_exit(&trans); |
cd575ddf | 751 | |
cd575ddf KO |
752 | return ret; |
753 | } | |
754 | ||
cd575ddf KO |
755 | static void extent_stripe_ptr_add(struct bkey_s_extent e, |
756 | struct ec_stripe_buf *s, | |
757 | struct bch_extent_ptr *ptr, | |
758 | unsigned block) | |
759 | { | |
760 | struct bch_extent_stripe_ptr *dst = (void *) ptr; | |
761 | union bch_extent_entry *end = extent_entry_last(e); | |
762 | ||
763 | memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); | |
764 | e.k->u64s += sizeof(*dst) / sizeof(u64); | |
765 | ||
766 | *dst = (struct bch_extent_stripe_ptr) { | |
767 | .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, | |
768 | .block = block, | |
769 | .idx = s->key.k.p.offset, | |
770 | }; | |
771 | } | |
772 | ||
773 | static int ec_stripe_update_ptrs(struct bch_fs *c, | |
774 | struct ec_stripe_buf *s, | |
775 | struct bkey *pos) | |
776 | { | |
0564b167 KO |
777 | struct btree_trans trans; |
778 | struct btree_iter *iter; | |
cd575ddf KO |
779 | struct bkey_s_c k; |
780 | struct bkey_s_extent e; | |
35189e09 | 781 | struct bkey_on_stack sk; |
cd575ddf KO |
782 | int ret = 0, dev, idx; |
783 | ||
35189e09 | 784 | bkey_on_stack_init(&sk); |
20bceecb | 785 | bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); |
cd575ddf | 786 | |
0564b167 KO |
787 | iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, |
788 | bkey_start_pos(pos), | |
789 | BTREE_ITER_INTENT); | |
790 | ||
791 | while ((k = bch2_btree_iter_peek(iter)).k && | |
0f238367 | 792 | !(ret = bkey_err(k)) && |
cd575ddf | 793 | bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { |
aef90ce0 KO |
794 | struct bch_extent_ptr *ptr, *ec_ptr = NULL; |
795 | ||
42c7d748 KO |
796 | if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { |
797 | bch2_btree_iter_next(iter); | |
798 | continue; | |
799 | } | |
800 | ||
cd575ddf KO |
801 | idx = extent_matches_stripe(c, &s->key.v, k); |
802 | if (idx < 0) { | |
0564b167 | 803 | bch2_btree_iter_next(iter); |
cd575ddf KO |
804 | continue; |
805 | } | |
806 | ||
807 | dev = s->key.v.ptrs[idx].dev; | |
808 | ||
5934a0ca | 809 | bkey_on_stack_reassemble(&sk, c, k); |
35189e09 | 810 | e = bkey_i_to_s_extent(sk.k); |
cd575ddf | 811 | |
aef90ce0 KO |
812 | extent_for_each_ptr(e, ptr) { |
813 | if (ptr->dev == dev) | |
814 | ec_ptr = ptr; | |
815 | else | |
cd575ddf | 816 | ptr->cached = true; |
aef90ce0 | 817 | } |
cd575ddf | 818 | |
aef90ce0 | 819 | extent_stripe_ptr_add(e, s, ec_ptr, idx); |
cd575ddf | 820 | |
e5e6aaa7 | 821 | bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); |
2d594dfb | 822 | bch2_trans_update(&trans, iter, sk.k, 0); |
0564b167 KO |
823 | |
824 | ret = bch2_trans_commit(&trans, NULL, NULL, | |
0564b167 KO |
825 | BTREE_INSERT_NOFAIL| |
826 | BTREE_INSERT_USE_RESERVE); | |
cd575ddf KO |
827 | if (ret == -EINTR) |
828 | ret = 0; | |
829 | if (ret) | |
830 | break; | |
831 | } | |
832 | ||
0564b167 | 833 | bch2_trans_exit(&trans); |
35189e09 | 834 | bkey_on_stack_exit(&sk, c); |
0564b167 KO |
835 | |
836 | return ret; | |
cd575ddf KO |
837 | } |
838 | ||
839 | /* | |
840 | * data buckets of new stripe all written: create the stripe | |
841 | */ | |
842 | static void ec_stripe_create(struct ec_stripe_new *s) | |
843 | { | |
cd575ddf KO |
844 | struct bch_fs *c = s->c; |
845 | struct open_bucket *ob; | |
846 | struct bkey_i *k; | |
847 | struct bch_stripe *v = &s->stripe.key.v; | |
848 | unsigned i, nr_data = v->nr_blocks - v->nr_redundant; | |
849 | struct closure cl; | |
850 | int ret; | |
851 | ||
852 | BUG_ON(s->h->s == s); | |
853 | ||
854 | closure_init_stack(&cl); | |
855 | ||
856 | if (s->err) { | |
857 | bch_err(c, "error creating stripe: error writing data buckets"); | |
858 | goto err; | |
859 | } | |
860 | ||
861 | if (!percpu_ref_tryget(&c->writes)) | |
862 | goto err; | |
863 | ||
864 | BUG_ON(bitmap_weight(s->blocks_allocated, | |
865 | s->blocks.nr) != s->blocks.nr); | |
866 | ||
867 | ec_generate_ec(&s->stripe); | |
868 | ||
869 | ec_generate_checksums(&s->stripe); | |
870 | ||
871 | /* write p/q: */ | |
872 | for (i = nr_data; i < v->nr_blocks; i++) | |
873 | ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); | |
874 | ||
875 | closure_sync(&cl); | |
876 | ||
877 | for (i = nr_data; i < v->nr_blocks; i++) | |
878 | if (!test_bit(i, s->stripe.valid)) { | |
879 | bch_err(c, "error creating stripe: error writing redundancy buckets"); | |
880 | goto err_put_writes; | |
881 | } | |
882 | ||
dfe9bfb3 KO |
883 | mutex_lock(&c->ec_stripe_create_lock); |
884 | ||
cd575ddf KO |
885 | ret = ec_stripe_bkey_insert(c, &s->stripe.key); |
886 | if (ret) { | |
887 | bch_err(c, "error creating stripe: error creating stripe key"); | |
dfe9bfb3 | 888 | goto err_unlock; |
cd575ddf KO |
889 | } |
890 | ||
891 | for_each_keylist_key(&s->keys, k) { | |
892 | ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); | |
893 | if (ret) | |
894 | break; | |
895 | } | |
896 | ||
dfe9bfb3 KO |
897 | err_unlock: |
898 | mutex_unlock(&c->ec_stripe_create_lock); | |
cd575ddf KO |
899 | err_put_writes: |
900 | percpu_ref_put(&c->writes); | |
901 | err: | |
902 | open_bucket_for_each(c, &s->blocks, ob, i) { | |
903 | ob->ec = NULL; | |
904 | __bch2_open_bucket_put(c, ob); | |
905 | } | |
906 | ||
907 | bch2_open_buckets_put(c, &s->parity); | |
908 | ||
909 | bch2_keylist_free(&s->keys, s->inline_keys); | |
910 | ||
911 | mutex_lock(&s->h->lock); | |
912 | list_del(&s->list); | |
913 | mutex_unlock(&s->h->lock); | |
914 | ||
915 | for (i = 0; i < s->stripe.key.v.nr_blocks; i++) | |
916 | kvpfree(s->stripe.data[i], s->stripe.size << 9); | |
917 | kfree(s); | |
918 | } | |
919 | ||
920 | static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) | |
921 | { | |
922 | struct ec_stripe_new *s = h->s; | |
923 | ||
924 | list_add(&s->list, &h->stripes); | |
925 | h->s = NULL; | |
926 | ||
927 | return s; | |
928 | } | |
929 | ||
930 | static void ec_stripe_new_put(struct ec_stripe_new *s) | |
931 | { | |
932 | BUG_ON(atomic_read(&s->pin) <= 0); | |
933 | if (atomic_dec_and_test(&s->pin)) | |
934 | ec_stripe_create(s); | |
935 | } | |
936 | ||
937 | /* have a full bucket - hand it off to be erasure coded: */ | |
938 | void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) | |
939 | { | |
940 | struct ec_stripe_new *s = ob->ec; | |
941 | ||
942 | if (ob->sectors_free) | |
943 | s->err = -1; | |
944 | ||
945 | ec_stripe_new_put(s); | |
946 | } | |
947 | ||
948 | void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) | |
949 | { | |
950 | struct ec_stripe_new *s = ob->ec; | |
951 | ||
952 | s->err = -EIO; | |
953 | } | |
954 | ||
955 | void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) | |
956 | { | |
957 | struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); | |
958 | struct bch_dev *ca; | |
959 | unsigned offset; | |
960 | ||
961 | if (!ob) | |
962 | return NULL; | |
963 | ||
964 | ca = bch_dev_bkey_exists(c, ob->ptr.dev); | |
965 | offset = ca->mi.bucket_size - ob->sectors_free; | |
966 | ||
967 | return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); | |
968 | } | |
969 | ||
970 | void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, | |
971 | struct bpos pos, unsigned sectors) | |
972 | { | |
973 | struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); | |
974 | struct ec_stripe_new *ec; | |
975 | ||
976 | if (!ob) | |
977 | return; | |
978 | ||
979 | ec = ob->ec; | |
980 | mutex_lock(&ec->lock); | |
981 | ||
982 | if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, | |
983 | ARRAY_SIZE(ec->inline_keys), | |
984 | BKEY_U64s)) { | |
985 | BUG(); | |
986 | } | |
987 | ||
988 | bkey_init(&ec->keys.top->k); | |
989 | ec->keys.top->k.p = pos; | |
990 | bch2_key_resize(&ec->keys.top->k, sectors); | |
991 | bch2_keylist_push(&ec->keys); | |
992 | ||
993 | mutex_unlock(&ec->lock); | |
994 | } | |
995 | ||
/* sort() comparison callback: ascending order of unsigned. */
static int unsigned_cmp(const void *_l, const void *_r)
{
	const unsigned *l = _l, *r = _r;

	return cmp_int(*l, *r);
}
1003 | ||
/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	struct bch_dev *ca;
	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	unsigned best_size = 0, best_nr = 0;
	unsigned run_size = 0, run_nr = 0;

	for_each_member_device_rcu(ca, c, i, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	/* sizes[] is now sorted, so equal sizes form contiguous runs: */
	for (i = 0; i < nr; i++) {
		if (sizes[i] != run_size) {
			if (run_nr > best_nr) {
				best_size	= run_size;
				best_nr		= run_nr;
			}

			run_nr		= 0;
			run_size	= sizes[i];
		}

		run_nr++;
	}

	if (run_nr > best_nr) {
		best_size	= run_size;
		best_nr		= run_nr;
	}

	return best_size;
}
1036 | ||
/*
 * Allocate a new in-flight stripe for @h, transferring ownership of the
 * head's open buckets (data + parity) to it and allocating per-block
 * data buffers.
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 * Caller must hold h->lock.
 */
int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;
	unsigned i;

	BUG_ON(h->parity.nr != h->redundancy);
	BUG_ON(!h->blocks.nr);
	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	mutex_init(&s->lock);
	/* initial pin, dropped when the last EC bucket write completes: */
	atomic_set(&s->pin, 1);
	s->c = c;
	s->h = h;
	s->blocks = h->blocks;
	s->parity = h->parity;

	/* the stripe now owns the open buckets: */
	memset(&h->blocks, 0, sizeof(h->blocks));
	memset(&h->parity, 0, sizeof(h->parity));

	bch2_keylist_init(&s->keys, s->inline_keys);

	s->stripe.offset = 0;
	s->stripe.size = h->blocksize;
	/* all blocks start out valid; bits are cleared on I/O errors: */
	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));

	ec_stripe_key_init(c, &s->stripe.key,
			   &s->blocks, &s->parity,
			   h->blocksize);

	/* one buffer per stripe block, h->blocksize sectors each: */
	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
		if (!s->stripe.data[i])
			goto err;
	}

	h->s = s;

	return 0;
err:
	/*
	 * Unallocated slots are NULL from kzalloc();
	 * NOTE(review): assumes kvpfree(NULL, ...) is a no-op - confirm.
	 */
	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
	return -ENOMEM;
}
1086 | ||
/*
 * Allocate a new stripe head for (@target, @algo, @redundancy):
 * compute the set of usable devices and the common bucket size, and add
 * the head to the filesystem's list.
 *
 * Returns the new head with h->lock held (matching the locking contract
 * of bch2_ec_stripe_head_get()), or NULL on allocation failure.
 * Caller (bch2_ec_stripe_head_get()) holds c->ec_new_stripe_lock,
 * protecting the list_add() below.
 */
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy)
{
	struct ec_stripe_head *h;
	struct bch_dev *ca;
	unsigned i;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	mutex_lock(&h->lock);
	INIT_LIST_HEAD(&h->stripes);

	h->target = target;
	h->algo = algo;
	h->redundancy = redundancy;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_USER, target);

	/* drop devices that can't contribute durable data: */
	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(i, h->devs.d);

	/* stripe blocks must all be the same size - use the most common: */
	h->blocksize = pick_blocksize(c, &h->devs);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();
	list_add(&h->list, &c->ec_new_stripe_list);
	return h;
}
1124 | ||
1125 | void bch2_ec_stripe_head_put(struct ec_stripe_head *h) | |
1126 | { | |
1127 | struct ec_stripe_new *s = NULL; | |
1128 | ||
1129 | if (h->s && | |
1130 | bitmap_weight(h->s->blocks_allocated, | |
1131 | h->s->blocks.nr) == h->s->blocks.nr) | |
1132 | s = ec_stripe_set_pending(h); | |
1133 | ||
1134 | mutex_unlock(&h->lock); | |
1135 | ||
1136 | if (s) | |
1137 | ec_stripe_new_put(s); | |
1138 | } | |
1139 | ||
1140 | struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, | |
1141 | unsigned target, | |
1142 | unsigned algo, | |
1143 | unsigned redundancy) | |
1144 | { | |
1145 | struct ec_stripe_head *h; | |
1146 | ||
1147 | if (!redundancy) | |
1148 | return NULL; | |
1149 | ||
1150 | mutex_lock(&c->ec_new_stripe_lock); | |
1151 | list_for_each_entry(h, &c->ec_new_stripe_list, list) | |
1152 | if (h->target == target && | |
1153 | h->algo == algo && | |
1154 | h->redundancy == redundancy) { | |
1155 | mutex_lock(&h->lock); | |
1156 | goto found; | |
1157 | } | |
1158 | ||
1159 | h = ec_new_stripe_head_alloc(c, target, algo, redundancy); | |
1160 | found: | |
1161 | mutex_unlock(&c->ec_new_stripe_lock); | |
1162 | return h; | |
1163 | } | |
1164 | ||
/*
 * Device @ca is going away: strip it out of every stripe head, and
 * error out any in-flight stripe that already allocated one of its
 * buckets on @ca.
 */
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
		struct ec_stripe_new *s = NULL;

		mutex_lock(&h->lock);

		/* release the head's not-yet-used open buckets on @ca: */
		bch2_open_buckets_stop_dev(c, ca, &h->blocks);
		bch2_open_buckets_stop_dev(c, ca, &h->parity);

		if (!h->s)
			goto unlock;

		/* does the stripe being built use a bucket on @ca? */
		open_bucket_for_each(c, &h->s->blocks, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		open_bucket_for_each(c, &h->s->parity, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		goto unlock;
found:
		/* can't complete this stripe without @ca - abort it: */
		h->s->err = -1;
		s = ec_stripe_set_pending(h);
unlock:
		mutex_unlock(&h->lock);

		/* drop the pin outside h->lock: */
		if (s)
			ec_stripe_new_put(s);
	}
	mutex_unlock(&c->ec_new_stripe_lock);
}
1200 | ||
/*
 * Flush one dirty in-memory stripe (@m, at index @idx) back to the
 * stripes btree: re-read the existing key, refresh its per-block sector
 * counts from @m, and queue the result as a transaction update.
 *
 * @new_key is caller-provided scratch space big enough for a
 * maximum-size stripe key.
 *
 * Returns 0 on success, a btree iterator error, or -EIO if the on-disk
 * key is missing/not a stripe.
 */
static int __bch2_stripe_write_key(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct stripe *m,
				   size_t idx,
				   struct bkey_i_stripe *new_key)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	unsigned i;
	int ret;

	bch2_btree_iter_set_pos(iter, POS(0, idx));

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	/* the stripe key must already exist on disk: */
	if (k.k->type != KEY_TYPE_stripe)
		return -EIO;

	bkey_reassemble(&new_key->k_i, k);

	/* copy sector counts and clear dirty under the heap lock: */
	spin_lock(&c->ec_stripes_heap_lock);

	for (i = 0; i < new_key->v.nr_blocks; i++)
		stripe_blockcount_set(&new_key->v, i,
				      m->block_sectors[i]);
	m->dirty = false;

	spin_unlock(&c->ec_stripes_heap_lock);

	bch2_trans_update(trans, iter, &new_key->k_i, 0);
	return 0;
}
1236 | ||
a0e0bda1 | 1237 | int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) |
61c8d7c8 | 1238 | { |
0564b167 KO |
1239 | struct btree_trans trans; |
1240 | struct btree_iter *iter; | |
61c8d7c8 KO |
1241 | struct genradix_iter giter; |
1242 | struct bkey_i_stripe *new_key; | |
1243 | struct stripe *m; | |
1244 | int ret = 0; | |
1245 | ||
1246 | new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); | |
1247 | BUG_ON(!new_key); | |
1248 | ||
20bceecb | 1249 | bch2_trans_init(&trans, c, 0, 0); |
0564b167 KO |
1250 | |
1251 | iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, | |
1252 | BTREE_ITER_SLOTS|BTREE_ITER_INTENT); | |
61c8d7c8 KO |
1253 | |
1254 | genradix_for_each(&c->stripes[0], giter, m) { | |
1255 | if (!m->dirty) | |
1256 | continue; | |
1257 | ||
163e885a KO |
1258 | ret = __bch2_trans_do(&trans, NULL, NULL, |
1259 | BTREE_INSERT_NOFAIL|flags, | |
1260 | __bch2_stripe_write_key(&trans, iter, m, | |
1261 | giter.pos, new_key)); | |
b1fd23df | 1262 | |
61c8d7c8 KO |
1263 | if (ret) |
1264 | break; | |
1265 | ||
1266 | *wrote = true; | |
1267 | } | |
1268 | ||
0564b167 | 1269 | bch2_trans_exit(&trans); |
61c8d7c8 KO |
1270 | |
1271 | kfree(new_key); | |
1272 | ||
1273 | return ret; | |
1274 | } | |
1275 | ||
b2930396 KO |
1276 | static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, |
1277 | unsigned level, struct bkey_s_c k) | |
61c8d7c8 | 1278 | { |
b2930396 | 1279 | int ret = 0; |
e222d206 | 1280 | |
b2930396 KO |
1281 | if (k.k->type == KEY_TYPE_stripe) |
1282 | ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: | |
1283 | bch2_mark_key(c, k, 0, 0, NULL, 0, | |
1284 | BTREE_TRIGGER_ALLOC_READ| | |
1285 | BTREE_TRIGGER_NOATOMIC); | |
9ec211b0 | 1286 | |
b2930396 KO |
1287 | return ret; |
1288 | } | |
61c8d7c8 | 1289 | |
b2930396 KO |
1290 | int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) |
1291 | { | |
1292 | int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, | |
1293 | NULL, bch2_stripes_read_fn); | |
1294 | if (ret) | |
94f651e2 | 1295 | bch_err(c, "error reading stripes: %i", ret); |
61c8d7c8 | 1296 | |
b2930396 | 1297 | return ret; |
61c8d7c8 KO |
1298 | } |
1299 | ||
/*
 * Size the in-memory stripe structures (the stripes radix tree, and for
 * the non-gc copy also the stripes heap) to cover every index currently
 * present in the stripes btree.
 *
 * @gc selects which of the two stripes radix trees to populate.
 * Returns 0 on success or a negative error code.
 */
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	size_t i, idx = 0;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

	/* find one past the highest existing stripe index: */
	k = bch2_btree_iter_prev(iter);
	if (!IS_ERR_OR_NULL(k.k))
		idx = k.k->p.offset + 1;
	ret = bch2_trans_exit(&trans);
	if (ret)
		return ret;

	/* no stripes yet - nothing to size: */
	if (!idx)
		return 0;

	/* the heap is only used for non-gc stripe bookkeeping: */
	if (!gc &&
	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
		       GFP_KERNEL))
		return -ENOMEM;
#if 0
	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
#else
	/* preallocate radix tree entries for indices [0, idx): */
	for (i = 0; i < idx; i++)
		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
			return -ENOMEM;
#endif
	return 0;
}
1335 | ||
1336 | void bch2_fs_ec_exit(struct bch_fs *c) | |
1337 | { | |
1338 | struct ec_stripe_head *h; | |
1339 | ||
1340 | while (1) { | |
1341 | mutex_lock(&c->ec_new_stripe_lock); | |
1342 | h = list_first_entry_or_null(&c->ec_new_stripe_list, | |
1343 | struct ec_stripe_head, list); | |
1344 | if (h) | |
1345 | list_del(&h->list); | |
1346 | mutex_unlock(&c->ec_new_stripe_lock); | |
1347 | if (!h) | |
1348 | break; | |
1349 | ||
1350 | BUG_ON(h->s); | |
1351 | BUG_ON(!list_empty(&h->stripes)); | |
1352 | kfree(h); | |
1353 | } | |
1354 | ||
1355 | free_heap(&c->ec_stripes_heap); | |
dfe9bfb3 | 1356 | genradix_free(&c->stripes[0]); |
cd575ddf KO |
1357 | bioset_exit(&c->ec_bioset); |
1358 | } | |
1359 | ||
1360 | int bch2_fs_ec_init(struct bch_fs *c) | |
1361 | { | |
1362 | INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); | |
1363 | ||
1364 | return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), | |
1365 | BIOSET_NEED_BVECS); | |
1366 | } |