Commit | Line | Data |
---|---|---|
cd575ddf KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* erasure coding */ | |
4 | ||
5 | #include "bcachefs.h" | |
6 | #include "alloc_foreground.h" | |
7 | #include "bset.h" | |
8 | #include "btree_gc.h" | |
9 | #include "btree_update.h" | |
10 | #include "buckets.h" | |
11 | #include "disk_groups.h" | |
12 | #include "ec.h" | |
13 | #include "error.h" | |
14 | #include "io.h" | |
61c8d7c8 | 15 | #include "journal_io.h" |
cd575ddf KO |
16 | #include "keylist.h" |
17 | #include "super-io.h" | |
18 | #include "util.h" | |
19 | ||
de5bb710 KO |
20 | #include <linux/sort.h> |
21 | ||
22 | #ifdef __KERNEL__ | |
23 | ||
cd575ddf KO |
24 | #include <linux/raid/pq.h> |
25 | #include <linux/raid/xor.h> | |
de5bb710 KO |
26 | |
/*
 * Reconstruct the block at @failed_idx as the xor of all the other blocks
 * (RAID5 parity math: P = d0 ^ d1 ^ ... ^ dn).
 *
 * @disks:	total number of blocks in @data, including the failed one
 * @failed_idx:	index of the block to reconstruct
 * @size:	size of each block, in bytes
 * @data:	array of @disks pointers to block buffers
 */
static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);

	/*
	 * Move the failed block to slot 0 so it can serve as the xor
	 * destination, then seed it with the first surviving block:
	 */
	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	/* xor in the remaining blocks, MAX_XOR_BLOCKS sources at a time: */
	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	/* restore the original block order: */
	swap(data[0], data[failed_idx]);
}
45 | ||
/*
 * Generate the @np parity blocks (stored after the @nd data blocks in @v):
 * p via xor (raid5_recov of the p slot), q via the raid6 syndrome.
 * Only np <= 2 is supported.
 */
static void raid_gen(int nd, int np, size_t size, void **v)
{
	if (np >= 1)
		/* computing p is the same xor as recovering the p slot: */
		raid5_recov(nd + np, nd, size, v);
	if (np >= 2)
		raid6_call.gen_syndrome(nd + np, size, v);
	BUG_ON(np > 2);
}
54 | ||
/*
 * Reconstruct up to two failed blocks.
 *
 * @nr:	number of failed blocks (0-2)
 * @ir:	indices of the failed blocks, in ascending order
 * @nd:	number of data blocks
 * @np:	number of parity blocks
 */
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			/* single data or p failure: xor reconstruct */
			raid5_recov(nd + 1, ir[0], size, v);
		else
			/* q failure: regenerate the syndrome */
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			/* p + q failure: regenerate both from the data */
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}
87 | ||
88 | #else | |
89 | ||
90 | #include <raid/raid.h> | |
91 | ||
92 | #endif | |
cd575ddf KO |
93 | |
/*
 * Per-bio state for stripe block I/O; allocated from c->ec_bioset in
 * ec_block_io() and released in ec_block_endio().
 */
struct ec_bio {
	struct bch_dev *ca;		/* device being read from/written to */
	struct ec_stripe_buf *buf;	/* stripe buffer this bio services */
	size_t idx;			/* block index within the stripe */
	struct bio bio;			/* embedded bio, allocated via bioset */
};
100 | ||
101 | /* Stripes btree keys: */ | |
102 | ||
26609b61 | 103 | const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) |
cd575ddf | 104 | { |
26609b61 KO |
105 | const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; |
106 | ||
cd575ddf KO |
107 | if (k.k->p.inode) |
108 | return "invalid stripe key"; | |
109 | ||
26609b61 KO |
110 | if (bkey_val_bytes(k.k) < sizeof(*s)) |
111 | return "incorrect value size"; | |
cd575ddf | 112 | |
76640280 KO |
113 | if (bkey_val_bytes(k.k) < sizeof(*s) || |
114 | bkey_val_u64s(k.k) < stripe_val_u64s(s)) | |
26609b61 | 115 | return "incorrect value size"; |
cd575ddf | 116 | |
26609b61 | 117 | return NULL; |
cd575ddf KO |
118 | } |
119 | ||
26609b61 | 120 | void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, |
cd575ddf KO |
121 | struct bkey_s_c k) |
122 | { | |
26609b61 KO |
123 | const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; |
124 | unsigned i; | |
125 | ||
126 | pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", | |
127 | s->algorithm, | |
128 | le16_to_cpu(s->sectors), | |
129 | s->nr_blocks - s->nr_redundant, | |
130 | s->nr_redundant, | |
131 | s->csum_type, | |
132 | 1U << s->csum_granularity_bits); | |
133 | ||
134 | for (i = 0; i < s->nr_blocks; i++) | |
61c8d7c8 KO |
135 | pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, |
136 | (u64) s->ptrs[i].offset, | |
137 | stripe_blockcount_get(s, i)); | |
cd575ddf KO |
138 | } |
139 | ||
140 | static int ptr_matches_stripe(struct bch_fs *c, | |
141 | struct bch_stripe *v, | |
142 | const struct bch_extent_ptr *ptr) | |
143 | { | |
144 | unsigned i; | |
145 | ||
146 | for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { | |
147 | const struct bch_extent_ptr *ptr2 = v->ptrs + i; | |
148 | ||
149 | if (ptr->dev == ptr2->dev && | |
150 | ptr->gen == ptr2->gen && | |
151 | ptr->offset >= ptr2->offset && | |
152 | ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) | |
153 | return i; | |
154 | } | |
155 | ||
156 | return -1; | |
157 | } | |
158 | ||
159 | static int extent_matches_stripe(struct bch_fs *c, | |
160 | struct bch_stripe *v, | |
161 | struct bkey_s_c k) | |
162 | { | |
163 | struct bkey_s_c_extent e; | |
164 | const struct bch_extent_ptr *ptr; | |
165 | int idx; | |
166 | ||
167 | if (!bkey_extent_is_data(k.k)) | |
168 | return -1; | |
169 | ||
170 | e = bkey_s_c_to_extent(k); | |
171 | ||
172 | extent_for_each_ptr(e, ptr) { | |
173 | idx = ptr_matches_stripe(c, v, ptr); | |
174 | if (idx >= 0) | |
175 | return idx; | |
176 | } | |
177 | ||
178 | return -1; | |
179 | } | |
180 | ||
/*
 * Initialize the key for a new stripe: geometry (block/parity counts,
 * sector count, checksum parameters) plus the pointers to the buckets the
 * blocks were allocated in.
 *
 * @blocks:	open buckets holding the data blocks
 * @parity:	open buckets holding the parity blocks
 * @stripe_size: size of each block, in sectors
 */
static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i_stripe *s,
			       struct open_buckets *blocks,
			       struct open_buckets *parity,
			       unsigned stripe_size)
{
	struct open_bucket *ob;
	unsigned i, u64s;

	bkey_stripe_init(&s->k_i);
	s->v.sectors = cpu_to_le16(stripe_size);
	s->v.algorithm = 0;
	s->v.nr_blocks = parity->nr + blocks->nr;
	s->v.nr_redundant = parity->nr;
	s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max);
	s->v.csum_type = BCH_CSUM_CRC32C;
	s->v.pad = 0;

	/* data block pointers first, then parity: */
	open_bucket_for_each(c, blocks, ob, i)
		s->v.ptrs[i] = ob->ptr;

	open_bucket_for_each(c, parity, ob, i)
		s->v.ptrs[blocks->nr + i] = ob->ptr;

	/*
	 * The checksums have to fit in the key value: coarsen the checksum
	 * granularity (halving the number of checksums per block each step)
	 * until they do. The BUG_ON fires if we still don't fit with one
	 * checksum per block, or if the granularity field would overflow.
	 */
	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);
		s->v.csum_granularity_bits++;
	}

	set_bkey_val_u64s(&s->k, u64s);
}
214 | ||
215 | /* Checksumming: */ | |
216 | ||
/*
 * Compute and store checksums for every block of a fully-populated stripe
 * buffer. Each block is checksummed in csum_granularity-sized chunks so
 * partial reads can be verified later without the whole block.
 */
static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csums_per_device = stripe_csums_per_device(v);
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
	unsigned i, j;

	/* checksumming may be disabled for this stripe: */
	if (!csum_bytes)
		return;

	/* only valid on a complete stripe buffer: */
	BUG_ON(buf->offset);
	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++) {
		for (j = 0; j < csums_per_device; j++) {
			unsigned offset = j << v->csum_granularity_bits;
			/* the last chunk of a block may be short: */
			unsigned len = min(csum_granularity, buf->size - offset);

			struct bch_csum csum =
				bch2_checksum(NULL, v->csum_type,
					      null_nonce(),
					      buf->data[i] + (offset << 9),
					      len << 9);

			memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
		}
	}
}
246 | ||
/*
 * Verify the checksums covering the portion of each valid block held in
 * @buf; a block that fails verification has its valid bit cleared so it
 * will be reconstructed from the others instead.
 */
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
	unsigned i;

	if (!csum_bytes)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		/* skip blocks that already failed to read: */
		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum csum;

			/* buffer must cover whole checksummed chunks: */
			BUG_ON(offset & (csum_granularity - 1));
			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
			       ((offset + len) & (csum_granularity - 1)));

			csum = bch2_checksum(NULL, v->csum_type,
					     null_nonce(),
					     buf->data[i] + ((offset - buf->offset) << 9),
					     len << 9);

			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
				__bcache_io_error(c,
					"checksum error while doing reconstruct read (%u:%u)",
					i, j);
				clear_bit(i, buf->valid);
				break;
			}

			offset += len;
		}
	}
}
290 | ||
291 | /* Erasure coding: */ | |
292 | ||
cd575ddf KO |
293 | static void ec_generate_ec(struct ec_stripe_buf *buf) |
294 | { | |
295 | struct bch_stripe *v = &buf->key.v; | |
296 | unsigned nr_data = v->nr_blocks - v->nr_redundant; | |
297 | unsigned bytes = le16_to_cpu(v->sectors) << 9; | |
298 | ||
de5bb710 | 299 | raid_gen(nr_data, v->nr_redundant, bytes, buf->data); |
cd575ddf KO |
300 | } |
301 | ||
302 | static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) | |
303 | { | |
304 | return nr - bitmap_weight(buf->valid, nr); | |
305 | } | |
306 | ||
307 | static unsigned ec_nr_failed(struct ec_stripe_buf *buf) | |
308 | { | |
309 | return __ec_nr_failed(buf, buf->key.v.nr_blocks); | |
310 | } | |
311 | ||
/*
 * Reconstruct the failed blocks of @buf in place.
 *
 * Returns 0 on success, -1 if more blocks failed than the stripe's
 * redundancy can recover.
 */
static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
		__bcache_io_error(c,
			"error doing reconstruct read: unable to read enough blocks");
		return -1;
	}

	/* only failed *data* blocks are passed to raid_rec here: */
	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
	return 0;
}
332 | ||
333 | /* IO: */ | |
334 | ||
335 | static void ec_block_endio(struct bio *bio) | |
336 | { | |
337 | struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); | |
338 | struct bch_dev *ca = ec_bio->ca; | |
339 | struct closure *cl = bio->bi_private; | |
340 | ||
341 | if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) | |
342 | clear_bit(ec_bio->idx, ec_bio->buf->valid); | |
343 | ||
344 | bio_put(&ec_bio->bio); | |
345 | percpu_ref_put(&ca->io_ref); | |
346 | closure_put(cl); | |
347 | } | |
348 | ||
/*
 * Submit I/O for one block of a stripe: buf->size sectors starting at
 * buf->offset within block @idx, split into as many bios as the bvec
 * limit requires. Completion is tracked via @cl; on error the block's bit
 * in buf->valid is cleared (see ec_block_endio()).
 */
static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			unsigned rw, unsigned idx, struct closure *cl)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

	/* device unavailable for this rw mode: mark the block failed */
	if (!bch2_dev_get_ioref(ca, rw)) {
		clear_bit(idx, buf->valid);
		return;
	}

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
						       nr_iovecs,
						       rw,
						       GFP_KERNEL,
						       &c->ec_bioset),
				      struct ec_bio, bio);

		ec_bio->ca = ca;
		ec_bio->buf = buf;
		ec_bio->idx = idx;

		ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_iter.bi_size = b;
		ec_bio->bio.bi_end_io = ec_block_endio;
		ec_bio->bio.bi_private = cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset);

		/* both references are dropped in ec_block_endio(): */
		closure_get(cl);
		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

		offset += b;
	}

	/* drop the ref taken by bch2_dev_get_ioref(): */
	percpu_ref_put(&ca->io_ref);
}
397 | ||
/* recovery read path: */

/*
 * Service a read whose direct path failed by reading the rest of the
 * stripe the extent belongs to, verifying checksums, reconstructing the
 * failed block(s), and copying the result into the read bio.
 *
 * Returns 0 on success, -EIO or -ENOMEM on failure.
 */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct ec_stripe_buf *buf;
	struct closure cl;
	struct bkey_s_c k;
	struct bch_stripe *v;
	unsigned stripe_idx;
	unsigned offset, end;
	unsigned i, nr_data, csum_granularity;
	int ret = 0, idx;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.idx ||
	       rbio->pick.idx - 1 >= rbio->pick.ec_nr);

	stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;

	buf = kzalloc(sizeof(*buf), GFP_NOIO);
	if (!buf)
		return -ENOMEM;

	/* look up and copy out the stripe key: */
	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
				   POS(0, stripe_idx),
				   BTREE_ITER_SLOTS);
	k = bch2_btree_iter_peek_slot(iter);
	if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
		__bcache_io_error(c,
			"error doing reconstruct read: stripe not found");
		kfree(buf);
		return bch2_trans_exit(&trans) ?: -EIO;
	}

	bkey_reassemble(&buf->key.k_i, k);
	bch2_trans_exit(&trans);

	v = &buf->key.v;

	nr_data = v->nr_blocks - v->nr_redundant;

	/* which stripe block does the extent we're reading live in? */
	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
	BUG_ON(idx < 0);

	csum_granularity = 1U << v->csum_granularity_bits;

	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
	end = offset + bio_sectors(&rbio->bio);

	BUG_ON(end > le16_to_cpu(v->sectors));

	/* widen the region to checksum granularity so it can be verified: */
	buf->offset = round_down(offset, csum_granularity);
	buf->size = min_t(unsigned, le16_to_cpu(v->sectors),
			  round_up(end, csum_granularity)) - buf->offset;

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
		if (!buf->data[i]) {
			ret = -ENOMEM;
			goto err;
		}
	}

	/* assume all blocks valid; reads/checksums clear bits on failure: */
	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr)) {
			__bcache_io_error(c,
					  "error doing reconstruct read: stale pointer");
			clear_bit(i, buf->valid);
			continue;
		}

		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
	}

	/* wait for all the block reads to complete: */
	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		__bcache_io_error(c,
			"error doing reconstruct read: unable to read enough blocks");
		ret = -EIO;
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	/* copy the reconstructed data into the original read bio: */
	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[idx] + ((offset - buf->offset) << 9));
err:
	for (i = 0; i < v->nr_blocks; i++)
		kfree(buf->data[i]);
	kfree(buf);
	return ret;
}
504 | ||
dfe9bfb3 | 505 | /* stripe bucket accounting: */ |
cd575ddf KO |
506 | |
/*
 * Ensure in-memory state exists for stripe @idx: grow the stripes heap if
 * needed and allocate genradix entries (for both copies if GC is running).
 */
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		/* allocate the bigger heap outside the lock, then swap in: */
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -ENOMEM;

		spin_lock(&c->ec_stripes_heap_lock);
		/* someone else may have grown it while we allocated: */
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
			n.used = h->used;
			swap(*h, n);
		}
		spin_unlock(&c->ec_stripes_heap_lock);

		/* frees whichever heap lost the swap: */
		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
		return -ENOMEM;

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
		return -ENOMEM;

	return 0;
}
535 | ||
/*
 * __ec_stripe_mem_alloc() for btree transaction context: first try a
 * non-blocking allocation; if that fails, drop btree locks and retry with
 * GFP_KERNEL.
 *
 * Returns 0 if the non-blocking allocation succeeded, -EINTR if the
 * blocking allocation succeeded after unlocking (caller must restart the
 * transaction), -ENOMEM on allocation failure.
 */
static int ec_stripe_mem_alloc(struct bch_fs *c,
			       struct btree_iter *iter)
{
	size_t idx = iter->pos.offset;

	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
		return 0;

	bch2_btree_trans_unlock(iter->trans);

	if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
		return -EINTR;
	return -ENOMEM;
}
550 | ||
551 | static ssize_t stripe_idx_to_delete(struct bch_fs *c) | |
552 | { | |
553 | ec_stripes_heap *h = &c->ec_stripes_heap; | |
554 | ||
555 | return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1; | |
556 | } | |
557 | ||
558 | static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, | |
559 | struct ec_stripe_heap_entry l, | |
560 | struct ec_stripe_heap_entry r) | |
561 | { | |
562 | return ((l.blocks_nonempty > r.blocks_nonempty) - | |
563 | (l.blocks_nonempty < r.blocks_nonempty)); | |
564 | } | |
565 | ||
566 | static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, | |
567 | size_t i) | |
568 | { | |
569 | struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); | |
570 | ||
dfe9bfb3 | 571 | genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; |
cd575ddf KO |
572 | } |
573 | ||
/* Assert that stripe @idx's heap backpointer is consistent: */
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes[0], idx);

	BUG_ON(!m->alive);
	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}
583 | ||
/*
 * Re-position stripe @idx in the heap after its blocks_nonempty count
 * changed; kicks off deletion work if a stripe has become empty.
 */
void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	size_t i;

	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	/* the entry may need to move in either direction: */
	i = m->heap_idx;
	heap_sift_up(h, i, ec_stripes_heap_cmp,
		     ec_stripes_heap_set_backpointer);
	heap_sift_down(h, i, ec_stripes_heap_cmp,
		       ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

	/* an empty stripe may now be at the top of the heap: */
	if (stripe_idx_to_delete(c) >= 0)
		schedule_work(&c->ec_stripe_delete_work);
}
605 | ||
/* Remove stripe @idx from the heap and mark it dead: */
void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	heap_verify_backpointer(c, idx);

	m->alive = false;
	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
}
616 | ||
/* Add stripe @idx to the heap and mark it live: */
void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	m->alive = true;

	heap_verify_backpointer(c, idx);
}
632 | ||
dfe9bfb3 KO |
633 | /* stripe deletion */ |
634 | ||
0564b167 | 635 | static int ec_stripe_delete(struct bch_fs *c, size_t idx) |
cd575ddf | 636 | { |
0564b167 KO |
637 | return bch2_btree_delete_range(c, BTREE_ID_EC, |
638 | POS(0, idx), | |
639 | POS(0, idx + 1), | |
640 | NULL); | |
cd575ddf KO |
641 | } |
642 | ||
/*
 * Work item: repeatedly delete empty stripes (blocks_nonempty == 0) from
 * the top of the heap until none remain.
 */
static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);
	ssize_t idx;

	down_read(&c->gc_lock);
	mutex_lock(&c->ec_stripe_create_lock);

	while (1) {
		/* only the heap peek needs the spinlock: */
		spin_lock(&c->ec_stripes_heap_lock);
		idx = stripe_idx_to_delete(c);
		spin_unlock(&c->ec_stripes_heap_lock);

		if (idx < 0)
			break;

		ec_stripe_delete(c, idx);
	}

	mutex_unlock(&c->ec_stripe_create_lock);
	up_read(&c->gc_lock);
}
666 | ||
dfe9bfb3 KO |
667 | /* stripe creation: */ |
668 | ||
cd575ddf KO |
669 | static int ec_stripe_bkey_insert(struct bch_fs *c, |
670 | struct bkey_i_stripe *stripe) | |
671 | { | |
0564b167 KO |
672 | struct btree_trans trans; |
673 | struct btree_iter *iter; | |
cd575ddf KO |
674 | struct bkey_s_c k; |
675 | int ret; | |
676 | ||
0564b167 | 677 | bch2_trans_init(&trans, c); |
cd575ddf | 678 | retry: |
0564b167 KO |
679 | bch2_trans_begin(&trans); |
680 | ||
681 | /* XXX: start pos hint */ | |
682 | iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, | |
683 | BTREE_ITER_SLOTS|BTREE_ITER_INTENT); | |
684 | ||
685 | for_each_btree_key_continue(iter, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { | |
686 | if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) | |
687 | break; | |
cd575ddf KO |
688 | |
689 | if (bkey_deleted(k.k)) | |
690 | goto found_slot; | |
691 | } | |
692 | ||
0564b167 KO |
693 | ret = -ENOSPC; |
694 | goto out; | |
cd575ddf | 695 | found_slot: |
0564b167 | 696 | ret = ec_stripe_mem_alloc(c, iter); |
cd575ddf KO |
697 | |
698 | if (ret == -EINTR) | |
699 | goto retry; | |
700 | if (ret) | |
701 | return ret; | |
702 | ||
0564b167 | 703 | stripe->k.p = iter->pos; |
cd575ddf | 704 | |
0564b167 KO |
705 | bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); |
706 | ||
707 | ret = bch2_trans_commit(&trans, NULL, NULL, | |
708 | BTREE_INSERT_NOFAIL| | |
709 | BTREE_INSERT_USE_RESERVE); | |
710 | out: | |
711 | bch2_trans_exit(&trans); | |
cd575ddf | 712 | |
cd575ddf KO |
713 | return ret; |
714 | } | |
715 | ||
cd575ddf KO |
/*
 * Insert a stripe pointer entry (referencing block @block of stripe @s)
 * into extent @e immediately before @ptr, shifting @ptr and everything
 * after it up by one entry.
 */
static void extent_stripe_ptr_add(struct bkey_s_extent e,
				  struct ec_stripe_buf *s,
				  struct bch_extent_ptr *ptr,
				  unsigned block)
{
	struct bch_extent_stripe_ptr *dst = (void *) ptr;
	union bch_extent_entry *end = extent_entry_last(e);

	/* make room at @dst and grow the key by one entry's worth of u64s: */
	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
	e.k->u64s += sizeof(*dst) / sizeof(u64);

	*dst = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block = block,
		.idx = s->key.k.p.offset,
	};
}
733 | ||
/*
 * For each extent overlapping @pos that points into the new stripe,
 * rewrite it to carry a stripe pointer, marking its pointers to other
 * devices as cached.
 */
static int ec_stripe_update_ptrs(struct bch_fs *c,
				 struct ec_stripe_buf *s,
				 struct bkey *pos)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct bkey_s_extent e;
	struct bch_extent_ptr *ptr;
	BKEY_PADDED(k) tmp;
	int ret = 0, dev, idx;

	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
				   bkey_start_pos(pos),
				   BTREE_ITER_INTENT);

	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
		idx = extent_matches_stripe(c, &s->key.v, k);
		if (idx < 0) {
			/* extent doesn't point into the stripe; skip it */
			bch2_btree_iter_next(iter);
			continue;
		}

		dev = s->key.v.ptrs[idx].dev;

		bkey_reassemble(&tmp.k, k);
		e = bkey_i_to_s_extent(&tmp.k);

		/* pointers to other devices become cached copies: */
		extent_for_each_ptr(e, ptr)
			if (ptr->dev != dev)
				ptr->cached = true;

		ptr = (void *) bch2_extent_has_device(e.c, dev);
		BUG_ON(!ptr);

		extent_stripe_ptr_add(e, s, ptr, idx);

		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k));

		ret = bch2_trans_commit(&trans, NULL, NULL,
					BTREE_INSERT_ATOMIC|
					BTREE_INSERT_NOFAIL|
					BTREE_INSERT_USE_RESERVE);
		/* -EINTR just means retry from the current position: */
		if (ret == -EINTR)
			ret = 0;
		if (ret)
			break;
	}

	bch2_trans_exit(&trans);

	return ret;
}
791 | ||
/*
 * data buckets of new stripe all written: create the stripe
 *
 * Generates parity and checksums, writes out the p/q blocks, inserts the
 * stripe key, then rewrites the extents recorded in s->keys to point at
 * the stripe. Consumes and frees @s on all paths.
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bkey_i *k;
	struct bch_stripe *v = &s->stripe.key.v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	struct closure cl;
	int ret;

	/* must already be detached from its head (ec_stripe_set_pending): */
	BUG_ON(s->h->s == s);

	closure_init_stack(&cl);

	if (s->err) {
		bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (!percpu_ref_tryget(&c->writes))
		goto err;

	BUG_ON(bitmap_weight(s->blocks_allocated,
			     s->blocks.nr) != s->blocks.nr);

	ec_generate_ec(&s->stripe);

	ec_generate_checksums(&s->stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);

	closure_sync(&cl);

	for (i = nr_data; i < v->nr_blocks; i++)
		if (!test_bit(i, s->stripe.valid)) {
			bch_err(c, "error creating stripe: error writing redundancy buckets");
			goto err_put_writes;
		}

	mutex_lock(&c->ec_stripe_create_lock);

	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
	if (ret) {
		bch_err(c, "error creating stripe: error creating stripe key");
		goto err_unlock;
	}

	/* point the extents we wrote at the new stripe: */
	for_each_keylist_key(&s->keys, k) {
		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
		if (ret)
			break;
	}

err_unlock:
	mutex_unlock(&c->ec_stripe_create_lock);
err_put_writes:
	percpu_ref_put(&c->writes);
err:
	/* release the open buckets and all of @s's resources: */
	open_bucket_for_each(c, &s->blocks, ob, i) {
		ob->ec = NULL;
		__bch2_open_bucket_put(c, ob);
	}

	bch2_open_buckets_put(c, &s->parity);

	bch2_keylist_free(&s->keys, s->inline_keys);

	mutex_lock(&s->h->lock);
	list_del(&s->list);
	mutex_unlock(&s->h->lock);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
}
872 | ||
873 | static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) | |
874 | { | |
875 | struct ec_stripe_new *s = h->s; | |
876 | ||
877 | list_add(&s->list, &h->stripes); | |
878 | h->s = NULL; | |
879 | ||
880 | return s; | |
881 | } | |
882 | ||
883 | static void ec_stripe_new_put(struct ec_stripe_new *s) | |
884 | { | |
885 | BUG_ON(atomic_read(&s->pin) <= 0); | |
886 | if (atomic_dec_and_test(&s->pin)) | |
887 | ec_stripe_create(s); | |
888 | } | |
889 | ||
/* have a full bucket - hand it off to be erasure coded: */
void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	/* bucket not completely filled: fail the stripe */
	if (ob->sectors_free)
		s->err = -1;

	ec_stripe_new_put(s);
}
900 | ||
/* Mark the stripe this bucket belongs to as failed: */
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}
907 | ||
/*
 * Return a pointer into the stripe buffer corresponding to the write
 * point's current position within its erasure-coded bucket, or NULL if
 * the write point has no erasure-coded bucket.
 */
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct bch_dev *ca;
	unsigned offset;

	if (!ob)
		return NULL;

	ca = bch_dev_bkey_exists(c, ob->ptr.dev);
	/* how far into the bucket we've written so far: */
	offset = ca->mi.bucket_size - ob->sectors_free;

	return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
}
922 | ||
/*
 * Record that @sectors sectors ending at @pos were written into the
 * stripe being built, so ec_stripe_create() can later rewrite those
 * extents to point at the stripe.
 */
void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
			     struct bpos pos, unsigned sectors)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct ec_stripe_new *ec;

	if (!ob)
		return;

	ec = ob->ec;
	mutex_lock(&ec->lock);

	/* keylist reallocation is not expected to fail here: */
	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
				 ARRAY_SIZE(ec->inline_keys),
				 BKEY_U64s)) {
		BUG();
	}

	bkey_init(&ec->keys.top->k);
	ec->keys.top->k.p = pos;
	bch2_key_resize(&ec->keys.top->k, sectors);
	bch2_keylist_push(&ec->keys);

	mutex_unlock(&ec->lock);
}
948 | ||
/* sort() comparator for unsigned ints, ascending: */
static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	if (l < r)
		return -1;

	return l > r ? 1 : 0;
}
956 | ||
/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	struct bch_dev *ca;
	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	/* run-length state while scanning the sorted size array: */
	struct {
		unsigned nr, size;
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(ca, c, i, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	/* find the longest run of equal sizes: */
	for (i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)
				best = cur;

			cur.nr = 0;
			cur.size = sizes[i];
		}

		cur.nr++;
	}

	/* account for the final run: */
	if (cur.nr > best.nr)
		best = cur;

	return best.size;
}
989 | ||
/*
 * Allocate and initialize a new in-flight stripe for head @h, taking
 * ownership of the head's allocated data and parity buckets.
 * Returns 0 on success, -ENOMEM if buffers couldn't be allocated.
 */
int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;
	unsigned i;

	BUG_ON(h->parity.nr != h->redundancy);
	BUG_ON(!h->blocks.nr);
	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	mutex_init(&s->lock);
	/* initial pin, dropped via ec_stripe_new_put(): */
	atomic_set(&s->pin, 1);
	s->c = c;
	s->h = h;
	s->blocks = h->blocks;
	s->parity = h->parity;

	/* the open buckets now belong to the stripe, not the head: */
	memset(&h->blocks, 0, sizeof(h->blocks));
	memset(&h->parity, 0, sizeof(h->parity));

	bch2_keylist_init(&s->keys, s->inline_keys);

	s->stripe.offset = 0;
	s->stripe.size = h->blocksize;
	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));

	ec_stripe_key_init(c, &s->stripe.key,
			   &s->blocks, &s->parity,
			   h->blocksize);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
		if (!s->stripe.data[i])
			goto err;
	}

	h->s = s;

	return 0;
err:
	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
	return -ENOMEM;
}
1039 | ||
/*
 * Allocate a new stripe head for the (target, algo, redundancy) tuple and
 * add it to c->ec_new_stripe_list.
 *
 * NOTE: returns with h->lock held -- the caller releases it via
 * bch2_ec_stripe_head_put().
 *
 * NOTE(review): list_add() below runs without ec_new_stripe_lock; this
 * looks safe only because the sole visible caller
 * (bch2_ec_stripe_head_get()) holds that lock -- confirm before adding
 * other callers.
 */
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy)
{
	struct ec_stripe_head *h;
	struct bch_dev *ca;
	unsigned i;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	mutex_lock(&h->lock);	/* returned locked, see above */
	INIT_LIST_HEAD(&h->stripes);

	h->target = target;
	h->algo = algo;
	h->redundancy = redundancy;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_USER, target);

	/* drop devices with no durability -- they can't hold stripe blocks: */
	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(i, h->devs.d);

	/* all blocks of a stripe are the same size; pick the most common: */
	h->blocksize = pick_blocksize(c, &h->devs);

	/* only devices whose bucket size matches can participate: */
	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();
	list_add(&h->list, &c->ec_new_stripe_list);
	return h;
}
1077 | ||
1078 | void bch2_ec_stripe_head_put(struct ec_stripe_head *h) | |
1079 | { | |
1080 | struct ec_stripe_new *s = NULL; | |
1081 | ||
1082 | if (h->s && | |
1083 | bitmap_weight(h->s->blocks_allocated, | |
1084 | h->s->blocks.nr) == h->s->blocks.nr) | |
1085 | s = ec_stripe_set_pending(h); | |
1086 | ||
1087 | mutex_unlock(&h->lock); | |
1088 | ||
1089 | if (s) | |
1090 | ec_stripe_new_put(s); | |
1091 | } | |
1092 | ||
1093 | struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, | |
1094 | unsigned target, | |
1095 | unsigned algo, | |
1096 | unsigned redundancy) | |
1097 | { | |
1098 | struct ec_stripe_head *h; | |
1099 | ||
1100 | if (!redundancy) | |
1101 | return NULL; | |
1102 | ||
1103 | mutex_lock(&c->ec_new_stripe_lock); | |
1104 | list_for_each_entry(h, &c->ec_new_stripe_list, list) | |
1105 | if (h->target == target && | |
1106 | h->algo == algo && | |
1107 | h->redundancy == redundancy) { | |
1108 | mutex_lock(&h->lock); | |
1109 | goto found; | |
1110 | } | |
1111 | ||
1112 | h = ec_new_stripe_head_alloc(c, target, algo, redundancy); | |
1113 | found: | |
1114 | mutex_unlock(&c->ec_new_stripe_lock); | |
1115 | return h; | |
1116 | } | |
1117 | ||
/*
 * Stop using device @ca for erasure coding: strip its open buckets from
 * every new-stripe head, and abort any in-progress stripe that already
 * has a block allocated on it.
 */
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
		struct ec_stripe_new *s = NULL;

		mutex_lock(&h->lock);
		/* drop this device's buckets still staged on the head: */
		bch2_open_buckets_stop_dev(c, ca,
					   &h->blocks,
					   BCH_DATA_USER);
		bch2_open_buckets_stop_dev(c, ca,
					   &h->parity,
					   BCH_DATA_USER);

		if (!h->s)
			goto unlock;

		/* does the in-progress stripe have a block on this device? */
		open_bucket_for_each(c, &h->s->blocks, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		open_bucket_for_each(c, &h->s->parity, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		goto unlock;
found:
		/* mark the stripe errored and hand it off for teardown: */
		h->s->err = -1;
		s = ec_stripe_set_pending(h);
unlock:
		mutex_unlock(&h->lock);

		/* drop the stripe's reference outside h->lock: */
		if (s)
			ec_stripe_new_put(s);
	}
	mutex_unlock(&c->ec_new_stripe_lock);
}
1157 | ||
0564b167 | 1158 | static int __bch2_stripe_write_key(struct btree_trans *trans, |
61c8d7c8 KO |
1159 | struct btree_iter *iter, |
1160 | struct stripe *m, | |
1161 | size_t idx, | |
1162 | struct bkey_i_stripe *new_key, | |
1163 | unsigned flags) | |
1164 | { | |
0564b167 | 1165 | struct bch_fs *c = trans->c; |
61c8d7c8 KO |
1166 | struct bkey_s_c k; |
1167 | unsigned i; | |
1168 | int ret; | |
1169 | ||
1170 | bch2_btree_iter_set_pos(iter, POS(0, idx)); | |
1171 | ||
1172 | k = bch2_btree_iter_peek_slot(iter); | |
0f238367 | 1173 | ret = bkey_err(k); |
61c8d7c8 KO |
1174 | if (ret) |
1175 | return ret; | |
1176 | ||
1177 | if (k.k->type != KEY_TYPE_stripe) | |
1178 | return -EIO; | |
1179 | ||
1180 | bkey_reassemble(&new_key->k_i, k); | |
1181 | ||
1182 | spin_lock(&c->ec_stripes_heap_lock); | |
1183 | ||
1184 | for (i = 0; i < new_key->v.nr_blocks; i++) | |
1185 | stripe_blockcount_set(&new_key->v, i, | |
1186 | m->block_sectors[i]); | |
1187 | m->dirty = false; | |
1188 | ||
1189 | spin_unlock(&c->ec_stripes_heap_lock); | |
1190 | ||
0564b167 KO |
1191 | bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i)); |
1192 | ||
1193 | return bch2_trans_commit(trans, NULL, NULL, | |
1194 | BTREE_INSERT_NOFAIL|flags); | |
61c8d7c8 KO |
1195 | } |
1196 | ||
a0e0bda1 | 1197 | int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) |
61c8d7c8 | 1198 | { |
0564b167 KO |
1199 | struct btree_trans trans; |
1200 | struct btree_iter *iter; | |
61c8d7c8 KO |
1201 | struct genradix_iter giter; |
1202 | struct bkey_i_stripe *new_key; | |
1203 | struct stripe *m; | |
1204 | int ret = 0; | |
1205 | ||
1206 | new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); | |
1207 | BUG_ON(!new_key); | |
1208 | ||
0564b167 KO |
1209 | bch2_trans_init(&trans, c); |
1210 | ||
1211 | iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, | |
1212 | BTREE_ITER_SLOTS|BTREE_ITER_INTENT); | |
61c8d7c8 KO |
1213 | |
1214 | genradix_for_each(&c->stripes[0], giter, m) { | |
1215 | if (!m->dirty) | |
1216 | continue; | |
1217 | ||
0564b167 | 1218 | ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, |
a0e0bda1 | 1219 | new_key, flags); |
61c8d7c8 KO |
1220 | if (ret) |
1221 | break; | |
1222 | ||
1223 | *wrote = true; | |
1224 | } | |
1225 | ||
0564b167 | 1226 | bch2_trans_exit(&trans); |
61c8d7c8 KO |
1227 | |
1228 | kfree(new_key); | |
1229 | ||
1230 | return ret; | |
1231 | } | |
1232 | ||
/* Mark a single stripe key, populating the in-memory stripe state. */
static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
{
	bch2_mark_key(c, k, true, 0, NULL, 0, 0);
}
1237 | ||
/*
 * At startup: walk the EC btree, then any EC keys still sitting in the
 * journal awaiting replay, marking each stripe key to rebuild in-memory
 * stripe state. Returns 0 on success or a negative error.
 */
int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
{
	struct journal_replay *r;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	/* size the stripes heap/radix tree before marking any keys: */
	ret = bch2_fs_ec_start(c);
	if (ret)
		return ret;

	bch2_trans_init(&trans, c);

	for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k) {
		bch2_stripe_read_key(c, k);
		bch2_trans_cond_resched(&trans);
	}

	ret = bch2_trans_exit(&trans);
	if (ret)
		return ret;

	/* EC keys in the journal but not yet replayed into the btree: */
	list_for_each_entry(r, journal_replay_list, list) {
		struct bkey_i *k, *n;
		struct jset_entry *entry;

		for_each_jset_key(k, n, entry, &r->j)
			if (entry->btree_id == BTREE_ID_EC)
				bch2_stripe_read_key(c, bkey_i_to_s_c(k));
	}

	return 0;
}
1272 | ||
/*
 * Preallocate in-memory stripe structures, sized by one past the highest
 * stripe index present in the EC btree.
 *
 * @gc: selects which copy of the stripes radix tree to fill
 * (c->stripes[gc]); when true, the stripes heap is not (re)allocated.
 *
 * Returns 0 on success, -ENOMEM or a btree iteration error on failure.
 */
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	size_t i, idx = 0;
	int ret = 0;

	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

	/* last key in the EC btree tells us how many stripes to size for: */
	k = bch2_btree_iter_prev(iter);
	if (!IS_ERR_OR_NULL(k.k))
		idx = k.k->p.offset + 1;
	ret = bch2_trans_exit(&trans);
	if (ret)
		return ret;

	if (!gc &&
	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
		       GFP_KERNEL))
		return -ENOMEM;
#if 0
	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
#else
	/*
	 * NOTE(review): bulk prealloc path disabled above -- presumably
	 * genradix_prealloc() wasn't available; entries are allocated one
	 * at a time instead.
	 */
	for (i = 0; i < idx; i++)
		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
			return -ENOMEM;
#endif
	return 0;
}
1305 | ||
dfe9bfb3 KO |
/* Size the non-GC stripe structures (heap + stripes[0]) at startup: */
int bch2_fs_ec_start(struct bch_fs *c)
{
	return bch2_ec_mem_alloc(c, false);
}
1310 | ||
cd575ddf KO |
1311 | void bch2_fs_ec_exit(struct bch_fs *c) |
1312 | { | |
1313 | struct ec_stripe_head *h; | |
1314 | ||
1315 | while (1) { | |
1316 | mutex_lock(&c->ec_new_stripe_lock); | |
1317 | h = list_first_entry_or_null(&c->ec_new_stripe_list, | |
1318 | struct ec_stripe_head, list); | |
1319 | if (h) | |
1320 | list_del(&h->list); | |
1321 | mutex_unlock(&c->ec_new_stripe_lock); | |
1322 | if (!h) | |
1323 | break; | |
1324 | ||
1325 | BUG_ON(h->s); | |
1326 | BUG_ON(!list_empty(&h->stripes)); | |
1327 | kfree(h); | |
1328 | } | |
1329 | ||
1330 | free_heap(&c->ec_stripes_heap); | |
dfe9bfb3 | 1331 | genradix_free(&c->stripes[0]); |
cd575ddf KO |
1332 | bioset_exit(&c->ec_bioset); |
1333 | } | |
1334 | ||
/* One-time EC initialization when the filesystem object is allocated. */
int bch2_fs_ec_init(struct bch_fs *c)
{
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);

	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}