Commit | Line | Data |
---|---|---|
cd575ddf KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* erasure coding */ | |
4 | ||
5 | #include "bcachefs.h" | |
6 | #include "alloc_foreground.h" | |
7 | #include "bset.h" | |
8 | #include "btree_gc.h" | |
9 | #include "btree_update.h" | |
10 | #include "buckets.h" | |
11 | #include "disk_groups.h" | |
12 | #include "ec.h" | |
13 | #include "error.h" | |
14 | #include "io.h" | |
15 | #include "keylist.h" | |
d0734356 | 16 | #include "recovery.h" |
cd575ddf KO |
17 | #include "super-io.h" |
18 | #include "util.h" | |
19 | ||
de5bb710 KO |
20 | #include <linux/sort.h> |
21 | ||
22 | #ifdef __KERNEL__ | |
23 | ||
cd575ddf KO |
24 | #include <linux/raid/pq.h> |
25 | #include <linux/raid/xor.h> | |
de5bb710 KO |
26 | |
27 | static void raid5_recov(unsigned disks, unsigned failed_idx, | |
28 | size_t size, void **data) | |
29 | { | |
30 | unsigned i = 2, nr; | |
31 | ||
32 | BUG_ON(failed_idx >= disks); | |
33 | ||
34 | swap(data[0], data[failed_idx]); | |
35 | memcpy(data[0], data[1], size); | |
36 | ||
37 | while (i < disks) { | |
38 | nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); | |
39 | xor_blocks(nr, size, data[0], data + i); | |
40 | i += nr; | |
41 | } | |
42 | ||
43 | swap(data[0], data[failed_idx]); | |
44 | } | |
45 | ||
46 | static void raid_gen(int nd, int np, size_t size, void **v) | |
47 | { | |
48 | if (np >= 1) | |
49 | raid5_recov(nd + np, nd, size, v); | |
50 | if (np >= 2) | |
51 | raid6_call.gen_syndrome(nd + np, size, v); | |
52 | BUG_ON(np > 2); | |
53 | } | |
54 | ||
/*
 * Recover up to two failed blocks.
 *
 * @nr:	number of failed blocks
 * @ir:	indices of the failed blocks, in increasing order
 * @nd:	number of data blocks
 * @np:	number of parity blocks
 *
 * Blocks 0..nd-1 are data, block nd is p, block nd+1 is q.
 */
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			/* single data or p failure: recover by xor */
			raid5_recov(nd + 1, ir[0], size, v);
		else
			/* q failure: regenerate the syndrome from data */
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			/* only parity failed: regenerate it from data */
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}
87 | ||
88 | #else | |
89 | ||
90 | #include <raid/raid.h> | |
91 | ||
92 | #endif | |
cd575ddf KO |
93 | |
94 | struct ec_bio { | |
95 | struct bch_dev *ca; | |
96 | struct ec_stripe_buf *buf; | |
97 | size_t idx; | |
98 | struct bio bio; | |
99 | }; | |
100 | ||
101 | /* Stripes btree keys: */ | |
102 | ||
26609b61 | 103 | const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) |
cd575ddf | 104 | { |
26609b61 KO |
105 | const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; |
106 | ||
cd575ddf KO |
107 | if (k.k->p.inode) |
108 | return "invalid stripe key"; | |
109 | ||
26609b61 KO |
110 | if (bkey_val_bytes(k.k) < sizeof(*s)) |
111 | return "incorrect value size"; | |
cd575ddf | 112 | |
76640280 KO |
113 | if (bkey_val_bytes(k.k) < sizeof(*s) || |
114 | bkey_val_u64s(k.k) < stripe_val_u64s(s)) | |
26609b61 | 115 | return "incorrect value size"; |
cd575ddf | 116 | |
3811aa6d | 117 | return bch2_bkey_ptrs_invalid(c, k); |
cd575ddf KO |
118 | } |
119 | ||
26609b61 | 120 | void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, |
cd575ddf KO |
121 | struct bkey_s_c k) |
122 | { | |
26609b61 KO |
123 | const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; |
124 | unsigned i; | |
125 | ||
126 | pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", | |
127 | s->algorithm, | |
128 | le16_to_cpu(s->sectors), | |
129 | s->nr_blocks - s->nr_redundant, | |
130 | s->nr_redundant, | |
131 | s->csum_type, | |
132 | 1U << s->csum_granularity_bits); | |
133 | ||
134 | for (i = 0; i < s->nr_blocks; i++) | |
61c8d7c8 KO |
135 | pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, |
136 | (u64) s->ptrs[i].offset, | |
137 | stripe_blockcount_get(s, i)); | |
3811aa6d KO |
138 | |
139 | bch2_bkey_ptrs_to_text(out, c, k); | |
cd575ddf KO |
140 | } |
141 | ||
142 | static int ptr_matches_stripe(struct bch_fs *c, | |
143 | struct bch_stripe *v, | |
144 | const struct bch_extent_ptr *ptr) | |
145 | { | |
146 | unsigned i; | |
147 | ||
148 | for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { | |
149 | const struct bch_extent_ptr *ptr2 = v->ptrs + i; | |
150 | ||
151 | if (ptr->dev == ptr2->dev && | |
152 | ptr->gen == ptr2->gen && | |
153 | ptr->offset >= ptr2->offset && | |
154 | ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) | |
155 | return i; | |
156 | } | |
157 | ||
158 | return -1; | |
159 | } | |
160 | ||
161 | static int extent_matches_stripe(struct bch_fs *c, | |
162 | struct bch_stripe *v, | |
163 | struct bkey_s_c k) | |
164 | { | |
165 | struct bkey_s_c_extent e; | |
166 | const struct bch_extent_ptr *ptr; | |
167 | int idx; | |
168 | ||
169 | if (!bkey_extent_is_data(k.k)) | |
170 | return -1; | |
171 | ||
172 | e = bkey_s_c_to_extent(k); | |
173 | ||
174 | extent_for_each_ptr(e, ptr) { | |
175 | idx = ptr_matches_stripe(c, v, ptr); | |
176 | if (idx >= 0) | |
177 | return idx; | |
178 | } | |
179 | ||
180 | return -1; | |
181 | } | |
182 | ||
42c7d748 KO |
183 | static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) |
184 | { | |
185 | struct bkey_s_c_extent e; | |
186 | const union bch_extent_entry *entry; | |
187 | ||
188 | if (!bkey_extent_is_data(k.k)) | |
189 | return false; | |
190 | ||
191 | e = bkey_s_c_to_extent(k); | |
192 | ||
193 | extent_for_each_entry(e, entry) | |
194 | if (extent_entry_type(entry) == | |
195 | BCH_EXTENT_ENTRY_stripe_ptr && | |
196 | entry->stripe_ptr.idx == idx) | |
197 | return true; | |
198 | ||
199 | return false; | |
200 | } | |
201 | ||
cd575ddf KO |
/*
 * Initialize the bkey for a new stripe from freshly allocated data and
 * parity buckets.
 *
 * The checksum granularity starts at the filesystem's
 * encoded_extent_max and is doubled until the per-block checksums fit
 * within a bkey value (BKEY_VAL_U64s_MAX).
 */
static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i_stripe *s,
			       struct open_buckets *blocks,
			       struct open_buckets *parity,
			       unsigned stripe_size)
{
	struct open_bucket *ob;
	unsigned i, u64s;

	bkey_stripe_init(&s->k_i);
	s->v.sectors = cpu_to_le16(stripe_size);
	s->v.algorithm = 0;
	s->v.nr_blocks = parity->nr + blocks->nr;
	s->v.nr_redundant = parity->nr;
	s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max);
	s->v.csum_type = BCH_CSUM_CRC32C;
	s->v.pad = 0;

	/* data block pointers first, then parity: */
	open_bucket_for_each(c, blocks, ob, i)
		s->v.ptrs[i] = ob->ptr;

	open_bucket_for_each(c, parity, ob, i)
		s->v.ptrs[blocks->nr + i] = ob->ptr;

	/*
	 * Each doubling of the granularity halves the number of stored
	 * checksums; the BUG_ON fires if we'd exceed one checksum per
	 * stripe (or overflow the u8 shift count) without fitting.
	 */
	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);
		s->v.csum_granularity_bits++;
	}

	/* u64s here is the last value computed by the loop condition: */
	set_bkey_val_u64s(&s->k, u64s);
}
235 | ||
236 | /* Checksumming: */ | |
237 | ||
238 | static void ec_generate_checksums(struct ec_stripe_buf *buf) | |
239 | { | |
240 | struct bch_stripe *v = &buf->key.v; | |
241 | unsigned csum_granularity = 1 << v->csum_granularity_bits; | |
242 | unsigned csums_per_device = stripe_csums_per_device(v); | |
243 | unsigned csum_bytes = bch_crc_bytes[v->csum_type]; | |
244 | unsigned i, j; | |
245 | ||
246 | if (!csum_bytes) | |
247 | return; | |
248 | ||
249 | BUG_ON(buf->offset); | |
250 | BUG_ON(buf->size != le16_to_cpu(v->sectors)); | |
251 | ||
252 | for (i = 0; i < v->nr_blocks; i++) { | |
253 | for (j = 0; j < csums_per_device; j++) { | |
254 | unsigned offset = j << v->csum_granularity_bits; | |
255 | unsigned len = min(csum_granularity, buf->size - offset); | |
256 | ||
257 | struct bch_csum csum = | |
258 | bch2_checksum(NULL, v->csum_type, | |
259 | null_nonce(), | |
260 | buf->data[i] + (offset << 9), | |
261 | len << 9); | |
262 | ||
263 | memcpy(stripe_csum(v, i, j), &csum, csum_bytes); | |
264 | } | |
265 | } | |
266 | } | |
267 | ||
/*
 * Verify checksums for the blocks we were able to read, over the range
 * [buf->offset, buf->offset + buf->size). A block that fails
 * verification has its bit cleared in buf->valid, so the recovery path
 * treats it as failed.
 */
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
	unsigned i;

	if (!csum_bytes)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		/* skip blocks that already failed to read: */
		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum csum;

			/*
			 * The buffer must cover whole checksum granules,
			 * except possibly the last one at end of stripe:
			 */
			BUG_ON(offset & (csum_granularity - 1));
			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
			       ((offset + len) & (csum_granularity - 1)));

			csum = bch2_checksum(NULL, v->csum_type,
					     null_nonce(),
					     buf->data[i] + ((offset - buf->offset) << 9),
					     len << 9);

			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
				__bcache_io_error(c,
					"checksum error while doing reconstruct read (%u:%u)",
					i, j);
				/* one bad granule fails the whole block: */
				clear_bit(i, buf->valid);
				break;
			}

			offset += len;
		}
	}
}
311 | ||
312 | /* Erasure coding: */ | |
313 | ||
cd575ddf KO |
314 | static void ec_generate_ec(struct ec_stripe_buf *buf) |
315 | { | |
316 | struct bch_stripe *v = &buf->key.v; | |
317 | unsigned nr_data = v->nr_blocks - v->nr_redundant; | |
318 | unsigned bytes = le16_to_cpu(v->sectors) << 9; | |
319 | ||
de5bb710 | 320 | raid_gen(nr_data, v->nr_redundant, bytes, buf->data); |
cd575ddf KO |
321 | } |
322 | ||
323 | static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) | |
324 | { | |
325 | return nr - bitmap_weight(buf->valid, nr); | |
326 | } | |
327 | ||
/* Number of unreadable blocks in the whole stripe: */
static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
	return __ec_nr_failed(buf, buf->key.v.nr_blocks);
}
332 | ||
333 | static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) | |
334 | { | |
335 | struct bch_stripe *v = &buf->key.v; | |
336 | unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; | |
337 | unsigned nr_data = v->nr_blocks - v->nr_redundant; | |
338 | unsigned bytes = buf->size << 9; | |
339 | ||
340 | if (ec_nr_failed(buf) > v->nr_redundant) { | |
341 | __bcache_io_error(c, | |
342 | "error doing reconstruct read: unable to read enough blocks"); | |
343 | return -1; | |
344 | } | |
345 | ||
346 | for (i = 0; i < nr_data; i++) | |
347 | if (!test_bit(i, buf->valid)) | |
348 | failed[nr_failed++] = i; | |
349 | ||
de5bb710 | 350 | raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); |
cd575ddf KO |
351 | return 0; |
352 | } | |
353 | ||
354 | /* IO: */ | |
355 | ||
/*
 * Per-bio completion: on IO error, mark this stripe block failed.
 * Drops the bio, device ioref and closure refs taken in ec_block_io().
 */
static void ec_block_endio(struct bio *bio)
{
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);
	closure_put(cl);
}
369 | ||
370 | static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, | |
371 | unsigned rw, unsigned idx, struct closure *cl) | |
372 | { | |
373 | struct bch_stripe *v = &buf->key.v; | |
374 | unsigned offset = 0, bytes = buf->size << 9; | |
375 | struct bch_extent_ptr *ptr = &v->ptrs[idx]; | |
376 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); | |
377 | ||
378 | if (!bch2_dev_get_ioref(ca, rw)) { | |
379 | clear_bit(idx, buf->valid); | |
380 | return; | |
381 | } | |
382 | ||
383 | while (offset < bytes) { | |
384 | unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, | |
385 | DIV_ROUND_UP(bytes, PAGE_SIZE)); | |
386 | unsigned b = min_t(size_t, bytes - offset, | |
387 | nr_iovecs << PAGE_SHIFT); | |
388 | struct ec_bio *ec_bio; | |
389 | ||
390 | ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, | |
391 | nr_iovecs, | |
392 | rw, | |
393 | GFP_KERNEL, | |
394 | &c->ec_bioset), | |
395 | struct ec_bio, bio); | |
396 | ||
397 | ec_bio->ca = ca; | |
398 | ec_bio->buf = buf; | |
399 | ec_bio->idx = idx; | |
400 | ||
401 | ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); | |
cd575ddf KO |
402 | ec_bio->bio.bi_end_io = ec_block_endio; |
403 | ec_bio->bio.bi_private = cl; | |
404 | ||
885678f6 | 405 | bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); |
cd575ddf KO |
406 | |
407 | closure_get(cl); | |
408 | percpu_ref_get(&ca->io_ref); | |
409 | ||
410 | submit_bio(&ec_bio->bio); | |
411 | ||
412 | offset += b; | |
413 | } | |
414 | ||
415 | percpu_ref_put(&ca->io_ref); | |
416 | } | |
417 | ||
/*
 * Recovery read path: the normal read of this extent failed, so look
 * up the stripe it belongs to, read all the stripe's blocks, verify
 * checksums, reconstruct the missing data and copy the requested range
 * into @rbio.
 *
 * Returns 0 on success, -ENOMEM/-EIO on failure.
 */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct ec_stripe_buf *buf;
	struct closure cl;
	struct bkey_s_c k;
	struct bch_stripe *v;
	unsigned stripe_idx;
	unsigned offset, end;
	unsigned i, nr_data, csum_granularity;
	int ret = 0, idx;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.idx ||
	       rbio->pick.idx - 1 >= rbio->pick.ec_nr);

	stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;

	buf = kzalloc(sizeof(*buf), GFP_NOIO);
	if (!buf)
		return -ENOMEM;

	/* look up the stripe key this extent points at: */
	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
				   POS(0, stripe_idx),
				   BTREE_ITER_SLOTS);
	k = bch2_btree_iter_peek_slot(iter);
	if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
		__bcache_io_error(c,
			"error doing reconstruct read: stripe not found");
		kfree(buf);
		return bch2_trans_exit(&trans) ?: -EIO;
	}

	/* copy the key out so we can drop btree locks: */
	bkey_reassemble(&buf->key.k_i, k);
	bch2_trans_exit(&trans);

	v = &buf->key.v;

	nr_data = v->nr_blocks - v->nr_redundant;

	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
	BUG_ON(idx < 0);

	csum_granularity = 1U << v->csum_granularity_bits;

	/* position of the read within the stripe, in sectors: */
	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
	end = offset + bio_sectors(&rbio->bio);

	BUG_ON(end > le16_to_cpu(v->sectors));

	/*
	 * Round out to checksum granule boundaries so the data we read
	 * can be checksum-verified:
	 */
	buf->offset = round_down(offset, csum_granularity);
	buf->size = min_t(unsigned, le16_to_cpu(v->sectors),
			  round_up(end, csum_granularity)) - buf->offset;

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
		if (!buf->data[i]) {
			ret = -ENOMEM;
			goto err;
		}
	}

	/* assume all blocks readable until proven otherwise: */
	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr)) {
			__bcache_io_error(c,
				"error doing reconstruct read: stale pointer");
			clear_bit(i, buf->valid);
			continue;
		}

		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
	}

	/* wait for all the reads issued above to complete: */
	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		__bcache_io_error(c,
			"error doing reconstruct read: unable to read enough blocks");
		ret = -EIO;
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	/* copy the reconstructed range into the original read bio: */
	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[idx] + ((offset - buf->offset) << 9));
err:
	for (i = 0; i < v->nr_blocks; i++)
		kfree(buf->data[i]);
	kfree(buf);
	return ret;
}
524 | ||
dfe9bfb3 | 525 | /* stripe bucket accounting: */ |
cd575ddf KO |
526 | |
/*
 * Ensure in-memory state exists for stripe @idx: grow the stripes heap
 * if needed and allocate genradix entries (for the GC copy too, if GC
 * is running).
 */
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		/* allocate the larger heap before taking the lock: */
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -ENOMEM;

		spin_lock(&c->ec_stripes_heap_lock);
		/* recheck under the lock in case we raced with another grow: */
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
			n.used = h->used;
			swap(*h, n);
		}
		spin_unlock(&c->ec_stripes_heap_lock);

		/* frees the old heap - or the new one, if we lost the race: */
		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
		return -ENOMEM;

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
		return -ENOMEM;

	return 0;
}
555 | ||
556 | static int ec_stripe_mem_alloc(struct bch_fs *c, | |
557 | struct btree_iter *iter) | |
558 | { | |
559 | size_t idx = iter->pos.offset; | |
932aa837 | 560 | int ret = 0; |
cd575ddf KO |
561 | |
562 | if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) | |
932aa837 | 563 | return ret; |
cd575ddf | 564 | |
58fbf808 | 565 | bch2_trans_unlock(iter->trans); |
932aa837 | 566 | ret = -EINTR; |
cd575ddf KO |
567 | |
568 | if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) | |
932aa837 KO |
569 | return ret; |
570 | ||
cd575ddf KO |
571 | return -ENOMEM; |
572 | } | |
573 | ||
574 | static ssize_t stripe_idx_to_delete(struct bch_fs *c) | |
575 | { | |
576 | ec_stripes_heap *h = &c->ec_stripes_heap; | |
577 | ||
578 | return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1; | |
579 | } | |
580 | ||
581 | static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, | |
582 | struct ec_stripe_heap_entry l, | |
583 | struct ec_stripe_heap_entry r) | |
584 | { | |
585 | return ((l.blocks_nonempty > r.blocks_nonempty) - | |
586 | (l.blocks_nonempty < r.blocks_nonempty)); | |
587 | } | |
588 | ||
/*
 * Keep each stripe's heap_idx in sync with its position in the heap;
 * invoked by the heap primitives whenever an entry moves.
 */
static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
}
596 | ||
/* Assert that stripe @idx and its heap entry point at each other. */
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes[0], idx);

	BUG_ON(!m->alive);
	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}
606 | ||
/*
 * Re-sort stripe @idx in the heap after its blocks_nonempty count
 * changed, inserting it if it isn't in the heap yet; then kick off
 * deletion work if some stripe has become empty.
 *
 * NOTE(review): appears to rely on the caller serializing access to
 * the heap (ec_stripes_heap_lock) — confirm against callers.
 */
void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	size_t i;

	if (m->alive) {
		heap_verify_backpointer(c, idx);

		h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

		/* the entry may need to move in either direction: */
		i = m->heap_idx;
		heap_sift_up(h, i, ec_stripes_heap_cmp,
			     ec_stripes_heap_set_backpointer);
		heap_sift_down(h, i, ec_stripes_heap_cmp,
			       ec_stripes_heap_set_backpointer);

		heap_verify_backpointer(c, idx);
	} else {
		bch2_stripes_heap_insert(c, m, idx);
	}

	/* don't queue deletion work while the fs is shutting down: */
	if (stripe_idx_to_delete(c) >= 0 &&
	    !percpu_ref_is_dying(&c->writes))
		schedule_work(&c->ec_stripe_delete_work);
}
633 | ||
/* Remove stripe @idx from the heap and mark it dead. */
void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	heap_verify_backpointer(c, idx);

	m->alive = false;
	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
}
644 | ||
/*
 * Add stripe @idx to the heap and mark it alive. The heap is grown
 * ahead of time (__ec_stripe_mem_alloc()), hence the BUG_ON rather
 * than an error return when full.
 */
void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	m->alive = true;

	heap_verify_backpointer(c, idx);
}
660 | ||
dfe9bfb3 KO |
661 | /* stripe deletion */ |
662 | ||
0564b167 | 663 | static int ec_stripe_delete(struct bch_fs *c, size_t idx) |
cd575ddf | 664 | { |
0564b167 KO |
665 | return bch2_btree_delete_range(c, BTREE_ID_EC, |
666 | POS(0, idx), | |
667 | POS(0, idx + 1), | |
668 | NULL); | |
cd575ddf KO |
669 | } |
670 | ||
671 | static void ec_stripe_delete_work(struct work_struct *work) | |
672 | { | |
673 | struct bch_fs *c = | |
674 | container_of(work, struct bch_fs, ec_stripe_delete_work); | |
675 | ssize_t idx; | |
676 | ||
677 | down_read(&c->gc_lock); | |
dfe9bfb3 | 678 | mutex_lock(&c->ec_stripe_create_lock); |
cd575ddf KO |
679 | |
680 | while (1) { | |
681 | spin_lock(&c->ec_stripes_heap_lock); | |
682 | idx = stripe_idx_to_delete(c); | |
683 | spin_unlock(&c->ec_stripes_heap_lock); | |
684 | ||
685 | if (idx < 0) | |
686 | break; | |
687 | ||
97fd13ad KO |
688 | if (ec_stripe_delete(c, idx)) |
689 | break; | |
cd575ddf KO |
690 | } |
691 | ||
dfe9bfb3 | 692 | mutex_unlock(&c->ec_stripe_create_lock); |
cd575ddf KO |
693 | up_read(&c->gc_lock); |
694 | } | |
695 | ||
dfe9bfb3 KO |
696 | /* stripe creation: */ |
697 | ||
cd575ddf KO |
/*
 * Insert a new stripe key into the first empty slot (index < U32_MAX)
 * of the stripes btree, retrying on transaction restart (-EINTR).
 */
static int ec_stripe_bkey_insert(struct bch_fs *c,
				 struct bkey_i_stripe *stripe)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	/* XXX: start pos hint */
	for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN,
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
			break;

		if (bkey_deleted(k.k))
			goto found_slot;
	}

	/* no deleted slot found (or iteration error, already in ret): */
	if (!ret)
		ret = -ENOSPC;
	goto err;
found_slot:
	/* make sure in-memory stripe state exists before committing: */
	ret = ec_stripe_mem_alloc(c, iter);
	if (ret)
		goto err;

	stripe->k.p = iter->pos;

	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i));

	ret = bch2_trans_commit(&trans, NULL, NULL,
				BTREE_INSERT_ATOMIC|
				BTREE_INSERT_NOFAIL);
err:
	/* -EINTR: transaction was restarted, search for a slot again: */
	if (ret == -EINTR)
		goto retry;
	bch2_trans_exit(&trans);

	return ret;
}
742 | ||
cd575ddf KO |
/*
 * Insert a stripe ptr entry into extent @e at @ptr's position: the
 * entries from @ptr to the end of the key are shifted up by one
 * stripe_ptr's worth of u64s, and the new entry is written into the
 * gap (so it precedes the device pointer it annotates).
 *
 * Caller must guarantee the key has room for the extra u64s
 * (BKEY_PADDED in ec_stripe_update_ptrs()).
 */
static void extent_stripe_ptr_add(struct bkey_s_extent e,
				  struct ec_stripe_buf *s,
				  struct bch_extent_ptr *ptr,
				  unsigned block)
{
	struct bch_extent_stripe_ptr *dst = (void *) ptr;
	union bch_extent_entry *end = extent_entry_last(e);

	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
	e.k->u64s += sizeof(*dst) / sizeof(u64);

	*dst = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block = block,
		.idx = s->key.k.p.offset,
	};
}
760 | ||
/*
 * Rewrite the extents that were written into the new stripe's buckets
 * so they reference the stripe: for each extent overlapping @pos,
 * pointers to other devices are marked cached and a stripe ptr entry
 * is added for the pointer that landed in the stripe.
 */
static int ec_stripe_update_ptrs(struct bch_fs *c,
				 struct ec_stripe_buf *s,
				 struct bkey *pos)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct bkey_s_extent e;
	struct bch_extent_ptr *ptr;
	BKEY_PADDED(k) tmp;
	int ret = 0, dev, idx;

	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
				   bkey_start_pos(pos),
				   BTREE_ITER_INTENT);

	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
		/* already updated (e.g. on a previous pass after -EINTR): */
		if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
			bch2_btree_iter_next(iter);
			continue;
		}

		idx = extent_matches_stripe(c, &s->key.v, k);
		if (idx < 0) {
			bch2_btree_iter_next(iter);
			continue;
		}

		bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));

		dev = s->key.v.ptrs[idx].dev;

		bkey_reassemble(&tmp.k, k);
		e = bkey_i_to_s_extent(&tmp.k);

		/* only the pointer in the stripe stays dirty: */
		extent_for_each_ptr(e, ptr)
			if (ptr->dev != dev)
				ptr->cached = true;

		ptr = (void *) bch2_extent_has_device(e.c, dev);
		BUG_ON(!ptr);

		extent_stripe_ptr_add(e, s, ptr, idx);

		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k));

		ret = bch2_trans_commit(&trans, NULL, NULL,
					BTREE_INSERT_ATOMIC|
					BTREE_INSERT_NOFAIL|
					BTREE_INSERT_USE_RESERVE);
		/* on transaction restart, retry from the current position: */
		if (ret == -EINTR)
			ret = 0;
		if (ret)
			break;
	}

	bch2_trans_exit(&trans);

	return ret;
}
825 | ||
/*
 * data buckets of new stripe all written: create the stripe
 *
 * Generates parity and checksums, writes out the p/q blocks, inserts
 * the stripe key, then rewrites the extents that went into the data
 * buckets to point at the stripe. All paths fall through to the
 * cleanup at the end (open buckets, keylist, data buffers, @s itself).
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bkey_i *k;
	struct bch_stripe *v = &s->stripe.key.v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	struct closure cl;
	int ret;

	/* must have been removed from the head first: */
	BUG_ON(s->h->s == s);

	closure_init_stack(&cl);

	if (s->err) {
		bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (!percpu_ref_tryget(&c->writes))
		goto err;

	BUG_ON(bitmap_weight(s->blocks_allocated,
			     s->blocks.nr) != s->blocks.nr);

	ec_generate_ec(&s->stripe);

	ec_generate_checksums(&s->stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);

	closure_sync(&cl);

	/* any parity write failure fails the whole stripe: */
	for (i = nr_data; i < v->nr_blocks; i++)
		if (!test_bit(i, s->stripe.valid)) {
			bch_err(c, "error creating stripe: error writing redundancy buckets");
			goto err_put_writes;
		}

	mutex_lock(&c->ec_stripe_create_lock);

	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
	if (ret) {
		bch_err(c, "error creating stripe: error creating stripe key");
		goto err_unlock;
	}

	/* point the extents we wrote at the new stripe: */
	for_each_keylist_key(&s->keys, k) {
		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
		if (ret)
			break;
	}

err_unlock:
	mutex_unlock(&c->ec_stripe_create_lock);
err_put_writes:
	percpu_ref_put(&c->writes);
err:
	open_bucket_for_each(c, &s->blocks, ob, i) {
		ob->ec = NULL;
		__bch2_open_bucket_put(c, ob);
	}

	bch2_open_buckets_put(c, &s->parity);

	bch2_keylist_free(&s->keys, s->inline_keys);

	mutex_lock(&s->h->lock);
	list_del(&s->list);
	mutex_unlock(&s->h->lock);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
}
906 | ||
907 | static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) | |
908 | { | |
909 | struct ec_stripe_new *s = h->s; | |
910 | ||
911 | list_add(&s->list, &h->stripes); | |
912 | h->s = NULL; | |
913 | ||
914 | return s; | |
915 | } | |
916 | ||
/* Drop a ref on the new stripe; the final ref triggers creation. */
static void ec_stripe_new_put(struct ec_stripe_new *s)
{
	BUG_ON(atomic_read(&s->pin) <= 0);
	if (atomic_dec_and_test(&s->pin))
		ec_stripe_create(s);
}
923 | ||
/* have a full bucket - hand it off to be erasure coded: */
void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	/* the bucket wasn't completely filled - fail the stripe: */
	if (ob->sectors_free)
		s->err = -1;

	ec_stripe_new_put(s);
}
934 | ||
/* Mark the stripe this bucket belongs to as failed: */
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}
941 | ||
942 | void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) | |
943 | { | |
944 | struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); | |
945 | struct bch_dev *ca; | |
946 | unsigned offset; | |
947 | ||
948 | if (!ob) | |
949 | return NULL; | |
950 | ||
951 | ca = bch_dev_bkey_exists(c, ob->ptr.dev); | |
952 | offset = ca->mi.bucket_size - ob->sectors_free; | |
953 | ||
954 | return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); | |
955 | } | |
956 | ||
/*
 * Record that @sectors ending at @pos were written to @wp's stripe, so
 * the extent can later be rewritten with a stripe pointer (see
 * ec_stripe_update_ptrs()). No-op if the write point has no EC bucket.
 */
void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
			     struct bpos pos, unsigned sectors)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct ec_stripe_new *ec;

	if (!ob)
		return;

	ec = ob->ec;
	mutex_lock(&ec->lock);

	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
				 ARRAY_SIZE(ec->inline_keys),
				 BKEY_U64s)) {
		/* allocation failure here is not handled - see BUG(): */
		BUG();
	}

	bkey_init(&ec->keys.top->k);
	ec->keys.top->k.p = pos;
	bch2_key_resize(&ec->keys.top->k, sectors);
	bch2_keylist_push(&ec->keys);

	mutex_unlock(&ec->lock);
}
982 | ||
/* qsort-style comparator for unsigned values (used by sort()): */
static int unsigned_cmp(const void *_l, const void *_r)
{
	const unsigned *l = _l, *r = _r;

	return cmp_int(*l, *r);
}
990 | ||
991 | /* pick most common bucket size: */ | |
992 | static unsigned pick_blocksize(struct bch_fs *c, | |
993 | struct bch_devs_mask *devs) | |
994 | { | |
995 | struct bch_dev *ca; | |
996 | unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; | |
997 | struct { | |
998 | unsigned nr, size; | |
999 | } cur = { 0, 0 }, best = { 0, 0 }; | |
1000 | ||
1001 | for_each_member_device_rcu(ca, c, i, devs) | |
1002 | sizes[nr++] = ca->mi.bucket_size; | |
1003 | ||
1004 | sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); | |
1005 | ||
1006 | for (i = 0; i < nr; i++) { | |
1007 | if (sizes[i] != cur.size) { | |
1008 | if (cur.nr > best.nr) | |
1009 | best = cur; | |
1010 | ||
1011 | cur.nr = 0; | |
1012 | cur.size = sizes[i]; | |
1013 | } | |
1014 | ||
1015 | cur.nr++; | |
1016 | } | |
1017 | ||
1018 | if (cur.nr > best.nr) | |
1019 | best = cur; | |
1020 | ||
1021 | return best.size; | |
1022 | } | |
1023 | ||
/*
 * Allocate a new in-flight stripe for stripe head @h and install it as
 * h->s. Ownership of the head's already-allocated data and parity open
 * buckets is transferred to the new stripe, and a data buffer is
 * allocated for every stripe block.
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 * Caller must hold h->lock (asserted below).
 */
int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;
	unsigned i;

	BUG_ON(h->parity.nr != h->redundancy);
	BUG_ON(!h->blocks.nr);
	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	mutex_init(&s->lock);
	/* initial reference, dropped when the stripe is complete */
	atomic_set(&s->pin, 1);
	s->c = c;
	s->h = h;
	s->blocks = h->blocks;
	s->parity = h->parity;

	/* open buckets now belong to the stripe, not the head: */
	memset(&h->blocks, 0, sizeof(h->blocks));
	memset(&h->parity, 0, sizeof(h->parity));

	bch2_keylist_init(&s->keys, s->inline_keys);

	s->stripe.offset = 0;
	s->stripe.size = h->blocksize;
	/* all blocks start out valid (0xFF = all bits set in the bitmap) */
	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));

	ec_stripe_key_init(c, &s->stripe.key,
			   &s->blocks, &s->parity,
			   h->blocksize);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
		/* buffer size in bytes: stripe.size is in 512-byte sectors */
		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
		if (!s->stripe.data[i])
			goto err;
	}

	h->s = s;

	return 0;
err:
	/* kvpfree() of the NULL entries is harmless */
	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
	return -ENOMEM;
}
1073 | ||
/*
 * Allocate a new stripe head for (target, algo, redundancy) and add it
 * to c->ec_new_stripe_list.
 *
 * Only rw devices in the target with nonzero durability are eligible;
 * the stripe blocksize is the most common bucket size among them, and
 * nr_active_devs counts only the devices with that bucket size.
 *
 * Returns NULL on allocation failure; on success the head is returned
 * with h->lock held.
 */
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy)
{
	struct ec_stripe_head *h;
	struct bch_dev *ca;
	unsigned i;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	mutex_lock(&h->lock);	/* returned locked, caller unlocks via _put() */
	INIT_LIST_HEAD(&h->stripes);

	h->target = target;
	h->algo = algo;
	h->redundancy = redundancy;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_USER, target);

	/* devices that can't store durable data are useless for ec: */
	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(i, h->devs.d);

	h->blocksize = pick_blocksize(c, &h->devs);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();
	list_add(&h->list, &c->ec_new_stripe_list);
	return h;
}
1111 | ||
/*
 * Release a stripe head acquired with bch2_ec_stripe_head_get():
 * if the head's in-flight stripe has had every block allocated, move
 * it to pending and drop its reference (after releasing h->lock, since
 * the final put may do real work).
 */
void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = NULL;

	if (h->s &&
	    bitmap_weight(h->s->blocks_allocated,
			  h->s->blocks.nr) == h->s->blocks.nr)
		s = ec_stripe_set_pending(h);

	mutex_unlock(&h->lock);

	if (s)
		ec_stripe_new_put(s);
}
1126 | ||
/*
 * Find or create the stripe head matching (target, algo, redundancy).
 *
 * Returns the head with h->lock held (the alloc path returns it already
 * locked), or NULL if @redundancy is zero or allocation fails. Release
 * with bch2_ec_stripe_head_put().
 */
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy)
{
	struct ec_stripe_head *h;

	if (!redundancy)
		return NULL;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list)
		if (h->target == target &&
		    h->algo == algo &&
		    h->redundancy == redundancy) {
			mutex_lock(&h->lock);
			goto found;
		}

	h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
found:
	mutex_unlock(&c->ec_new_stripe_lock);
	return h;
}
1151 | ||
/*
 * Stop using device @ca for erasure coding: for every stripe head, drop
 * @ca's buckets from the head's unallocated block/parity lists, and if
 * the head's in-flight stripe has a block on @ca, error the stripe out
 * and move it to pending.
 */
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
		struct ec_stripe_new *s = NULL;

		mutex_lock(&h->lock);
		bch2_open_buckets_stop_dev(c, ca,
					   &h->blocks,
					   BCH_DATA_USER);
		bch2_open_buckets_stop_dev(c, ca,
					   &h->parity,
					   BCH_DATA_USER);

		if (!h->s)
			goto unlock;

		/* does the in-flight stripe use a bucket on this device? */
		open_bucket_for_each(c, &h->s->blocks, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		open_bucket_for_each(c, &h->s->parity, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		goto unlock;
found:
		h->s->err = -1;
		s = ec_stripe_set_pending(h);
unlock:
		mutex_unlock(&h->lock);

		/* drop the ref outside h->lock: */
		if (s)
			ec_stripe_new_put(s);
	}
	mutex_unlock(&c->ec_new_stripe_lock);
}
1191 | ||
/*
 * Rewrite the stripe key at index @idx with the in-memory per-block
 * sector counts from @m, clearing m->dirty, and commit the update.
 *
 * @new_key is caller-provided scratch space large enough for any
 * stripe key. Returns 0 on success, -EIO if the key at @idx is not a
 * stripe, or an error from the btree iterator / transaction commit.
 */
static int __bch2_stripe_write_key(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct stripe *m,
				   size_t idx,
				   struct bkey_i_stripe *new_key,
				   unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	unsigned i;
	int ret;

	bch2_btree_iter_set_pos(iter, POS(0, idx));

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (k.k->type != KEY_TYPE_stripe)
		return -EIO;

	bkey_reassemble(&new_key->k_i, k);

	/* block sector counts are updated under ec_stripes_heap_lock: */
	spin_lock(&c->ec_stripes_heap_lock);

	for (i = 0; i < new_key->v.nr_blocks; i++)
		stripe_blockcount_set(&new_key->v, i,
				      m->block_sectors[i]);
	m->dirty = false;

	spin_unlock(&c->ec_stripes_heap_lock);

	bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i));

	return bch2_trans_commit(trans, NULL, NULL,
				 BTREE_INSERT_NOFAIL|flags);
}
1230 | ||
a0e0bda1 | 1231 | int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) |
61c8d7c8 | 1232 | { |
0564b167 KO |
1233 | struct btree_trans trans; |
1234 | struct btree_iter *iter; | |
61c8d7c8 KO |
1235 | struct genradix_iter giter; |
1236 | struct bkey_i_stripe *new_key; | |
1237 | struct stripe *m; | |
1238 | int ret = 0; | |
1239 | ||
1240 | new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); | |
1241 | BUG_ON(!new_key); | |
1242 | ||
20bceecb | 1243 | bch2_trans_init(&trans, c, 0, 0); |
0564b167 KO |
1244 | |
1245 | iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, | |
1246 | BTREE_ITER_SLOTS|BTREE_ITER_INTENT); | |
61c8d7c8 KO |
1247 | |
1248 | genradix_for_each(&c->stripes[0], giter, m) { | |
1249 | if (!m->dirty) | |
1250 | continue; | |
1251 | ||
0564b167 | 1252 | ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, |
a0e0bda1 | 1253 | new_key, flags); |
61c8d7c8 KO |
1254 | if (ret) |
1255 | break; | |
1256 | ||
1257 | *wrote = true; | |
1258 | } | |
1259 | ||
0564b167 | 1260 | bch2_trans_exit(&trans); |
61c8d7c8 KO |
1261 | |
1262 | kfree(new_key); | |
1263 | ||
1264 | return ret; | |
1265 | } | |
1266 | ||
/*
 * At recovery time, reconstruct in-memory stripe state: allocate the
 * stripe bookkeeping structures, mark every stripe key in the EC btree,
 * then also mark EC keys found in @journal_keys (keys not yet replayed
 * into the btree, presumably - the journal replay path provides them).
 *
 * Returns 0 on success or a negative error code.
 */
int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
	struct journal_key *i;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	ret = bch2_fs_ec_start(c);
	if (ret)
		return ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret)
		bch2_mark_key(c, k, 0, NULL, 0,
			      BCH_BUCKET_MARK_ALLOC_READ|
			      BCH_BUCKET_MARK_NOATOMIC);

	/* ret from the btree iteration takes precedence over exit: */
	ret = bch2_trans_exit(&trans) ?: ret;
	if (ret) {
		bch_err(c, "error reading stripes: %i", ret);
		return ret;
	}

	for_each_journal_key(*journal_keys, i)
		if (i->btree_id == BTREE_ID_EC)
			bch2_mark_key(c, bkey_i_to_s_c(i->k),
				      0, NULL, 0,
				      BCH_BUCKET_MARK_ALLOC_READ|
				      BCH_BUCKET_MARK_NOATOMIC);

	return 0;
}
1301 | ||
/*
 * Size the in-memory stripe structures to cover every stripe index in
 * the EC btree: find the highest used index by walking backwards from
 * the end, then allocate the stripes heap (skipped when @gc, which has
 * its own copy only of the radix tree) and preallocate radix tree
 * entries up to that index.
 *
 * Returns 0 on success, -ENOMEM or an iterator error otherwise.
 */
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	size_t i, idx = 0;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

	/* last existing stripe key determines how much to preallocate: */
	k = bch2_btree_iter_prev(iter);
	if (!IS_ERR_OR_NULL(k.k))
		idx = k.k->p.offset + 1;
	ret = bch2_trans_exit(&trans);
	if (ret)
		return ret;

	if (!gc &&
	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
		       GFP_KERNEL))
		return -ENOMEM;
#if 0
	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
#else
	/* per-entry allocation, pending a genradix_prealloc() (see #if 0): */
	for (i = 0; i < idx; i++)
		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
			return -ENOMEM;
#endif
	return 0;
}
1334 | ||
/* Allocate the non-gc stripe bookkeeping (heap + radix tree) at startup. */
int bch2_fs_ec_start(struct bch_fs *c)
{
	return bch2_ec_mem_alloc(c, false);
}
1339 | ||
/*
 * Tear down erasure coding state at filesystem shutdown: free every
 * stripe head (all of which must be idle - no in-flight stripe, no
 * pending stripes), then the stripes heap, the in-memory stripe radix
 * tree, and the ec bioset.
 */
void bch2_fs_ec_exit(struct bch_fs *c)
{
	struct ec_stripe_head *h;

	/* pop heads one at a time so the lock isn't held across kfree(): */
	while (1) {
		mutex_lock(&c->ec_new_stripe_lock);
		h = list_first_entry_or_null(&c->ec_new_stripe_list,
					     struct ec_stripe_head, list);
		if (h)
			list_del(&h->list);
		mutex_unlock(&c->ec_new_stripe_lock);
		if (!h)
			break;

		BUG_ON(h->s);
		BUG_ON(!list_empty(&h->stripes));
		kfree(h);
	}

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes[0]);
	bioset_exit(&c->ec_bioset);
}
1363 | ||
/*
 * One-time erasure coding init: set up the stripe-delete work item and
 * the bioset used for stripe reads/writes (struct ec_bio embeds its bio).
 *
 * Returns 0 on success or the error from bioset_init().
 */
int bch2_fs_ec_init(struct bch_fs *c)
{
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);

	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}