raid5-ppl: support disk hot add/remove with PPL
drivers/md/raid5-ppl.c
/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 *          0 | --- | --- | --- | +----+
 *          8 | -W- | -W- | --- | | pp |   data_sector = 8
 *         16 | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 *         24 | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of modified data and pp_size is the size of partial parity for
 * this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0), they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity, the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently from others. They are
 * grouped in child_logs array in struct ppl_conf, which is assigned to
 * r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
 * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
 * can be appended to the last entry if it meets the conditions for a valid
 * entry described above, otherwise a new entry is added. Checksums of entries
 * are calculated incrementally as stripes containing partial parity are being
 * added. ppl_submit_iounit() calculates the checksum of the header and submits
 * a bio containing the header page and partial parity pages (sh->ppl_page) for
 * all stripes of the io_unit. When the PPL write completes, the stripes
 * associated with the io_unit are released and raid5d starts writing their data
 * and parity. When all stripes are written, the io_unit is freed and the next
 * can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or size of PPL is reached). Another io_unit
 * can't be submitted until the previous has completed (PPL and stripe
 * data+parity is written). The log->io_list tracks all io_units of a log
 * (for a single member disk). New io_units are added to the end of the list
 * and the first io_unit is submitted, if it is not submitted already.
 * The current io_unit accepting new stripes is always at the end of the list.
 */
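
/*
 * Illustrative sketch, not used by the driver: with the layout above, the
 * partial parity for entry k starts right after the 4KB header plus the
 * pp_size of all preceding entries. Assuming an already read and validated
 * ppl_header, an offline tool could locate it with a hypothetical helper
 * like the one below, which mirrors how ppl_recover() further down advances
 * through the log by pp_size >> 9 sectors per entry:
 *
 *	static sector_t pp_sector_of_entry(sector_t ppl_sector,
 *					   struct ppl_header *hdr, int k)
 *	{
 *		sector_t s = ppl_sector + (PPL_HEADER_SIZE >> 9);
 *		int i;
 *
 *		for (i = 0; i < k; i++)
 *			s += le32_to_cpu(hdr->entries[i].pp_size) >> 9;
 *		return s;
 *	}
 */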

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;
};

struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */
	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
					 * always at the end of io_list */
	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */
	struct list_head no_mem_stripes;/* stripes to retry if failed to
					 * allocate io_unit */
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* total size of partial parity */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the iounit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of stripe data chunks that are not changed
	 * during the write request. Depending on available data
	 * (read-modify-write vs. reconstruct-write case) we calculate it
	 * differently.
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/* rmw: xor old data and parity from updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
				xor_srcs[count++] = dev->page;
		}
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				xor_srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0)
			  + sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}
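
/*
 * Worked example of why both branches above compute the same value: for a
 * stripe with data disks d0, d1, d2 and parity p = d0 ^ d1 ^ d2, assume d0
 * and d1 are being rewritten. The partial parity is the XOR of the data not
 * being modified, i.e. d2. The rmw branch gets it as p ^ d0 ^ d1 using the
 * old values, which cancels out to d2; the rcw branch reads d2 directly.
 */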

static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;

	io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;

	memset(io, 0, sizeof(*io));
	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);

	io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}
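
	/*
	 * Worked example of the three append conditions above, using the
	 * numbers from the comment at the top of this file (4 disk array,
	 * 16k chunk, so chunk_sectors == 32 and STRIPE_SECTORS == 8): after
	 * logging the stripe at sector 8 the last entry has data_sector == 8
	 * and data_size == 2 * 4k. For the next stripe at sector 16 written
	 * to the same two data disks: 8 + 8 == 16, 16 >> 5 == 8 >> 5 == 0,
	 * and (16 - 8) * 2 == (2 * 4k) >> 9 == 16, so all three conditions
	 * hold and the stripe is appended to the existing entry instead of
	 * starting a new one.
	 */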

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}

int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&sh->log_list, &log->no_mem_stripes);
		spin_unlock_irq(&log->io_list_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

static void ppl_log_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (bio->bi_error)
		md_error(ppl_conf->mddev, log->rdev);

	mempool_free(io->header_page, ppl_conf->meta_pool);

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bdevname(bio->bi_bdev, b));

	submit_bio(bio);
}

static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	bio->bi_private = io;

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		ppl_log_endio(bio);
		return;
	}

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));

		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	bio->bi_end_io = ppl_log_endio;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       ppl_conf->bs);
			bio->bi_opf = prev->bi_opf;
			bio->bi_bdev = prev->bi_bdev;
			bio->bi_iter.bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}
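
/*
 * Note on the chaining above: when the inline bio runs out of bvecs, the
 * overflow pages continue at bio_end_sector(prev), so the PPL region is
 * still written as one contiguous range, just split across chained bios.
 * Only the first bio has an end_io callback; bio_chain() makes sure
 * ppl_log_endio() runs once, after the last bio in the chain completes.
 */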

static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	spin_lock_irqsave(&log->io_list_lock, flags);

	list_del(&io->log_sibling);
	mempool_free(io, log->ppl_conf->io_pool);

	if (!list_empty(&log->no_mem_stripes)) {
		struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
							  struct stripe_head,
							  log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}

	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes))
		ppl_io_unit_finished(io);
}

static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	async_tx_quiesce(&tx);
}

/*
 * PPL recovery strategy: xor partial parity and data from all modified data
 * disks within a stripe and write the result as the new stripe parity. If all
 * stripe data disks are modified (full stripe write), no partial parity is
 * available, so just xor the data disks.
 *
 * Recovery of a PPL entry shall occur only if all modified data disks are
 * available and reading from all of them succeeds.
 *
 * A PPL entry applies to a stripe, partial parity size for an entry is at most
 * the size of the chunk. Examples of possible cases for a single entry:
 *
 * case 0: single data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size
 *
 * case 1: more than one data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size / modified_data_disks
 *
 * case 2: write to all data disks (also full stripe write):
 *   data0    data1    data2                parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ |           | (no change)        |
 * | -data- | -data- | -data- | --------> | xor all data       |
 * | ------ | ------ | ------ | --------> | (no change)        |
 * | ------ | ------ | ------ |           | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = 0
 *
 * The following cases are possible only in other implementations. The recovery
 * code can handle them, but they are not generated at runtime because they can
 * be reduced to cases 0, 1 and 2:
 *
 * case 3:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
 * | -data- | -data- | -data- | | -- | -> | xor all data       |
 * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 *
 * case 4:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 */
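
/*
 * Numeric sketch for case 1 above (illustrative values only): with 4k blocks,
 * two modified data disks and two modified stripe_heads, data_size is
 * 2 * 2 * 4k = 16k and pp_size is 2 * 4k = 8k. ppl_recover_entry() then
 * derives data_disks = data_size / pp_size = 2 and strip_sectors =
 * pp_size >> 9 = 16, which bound the region whose parity is rewritten.
 */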
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end are 4k aligned, use a 4k block */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev) {
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			if (!sync_page_io(log->rdev,
					ppl_sector - log->rdev->data_offset + i,
					block_size, page2, REQ_OP_READ, 0,
					false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
				0, &disk, &sh);
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				page1, REQ_OP_WRITE, 0, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);
	return ret;
}

static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* read partial parity for this entry and calculate its checksum */
		while (pp_size) {
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					s, page, REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery if necessary */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}

static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
			  false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}

static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page;
	struct ppl_header *pplhdr;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);

	/* read PPL header */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		md_error(mddev, rdev);
		ret = -EIO;
		goto out;
	}
	pplhdr = page_address(page);

	/* check header validity */
	crc_stored = le32_to_cpu(pplhdr->checksum);
	pplhdr->checksum = 0;
	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

	if (crc_stored != crc) {
		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
			 __func__, crc_stored, crc);
		ppl_conf->mismatch_count++;
		goto out;
	}

	signature = le32_to_cpu(pplhdr->signature);

	if (mddev->external) {
		/*
		 * For external metadata the header signature is set and
		 * validated in userspace.
		 */
		ppl_conf->signature = signature;
	} else if (ppl_conf->signature != signature) {
		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
			 __func__, signature, ppl_conf->signature);
		ppl_conf->mismatch_count++;
		goto out;
	}

	/* attempt to recover from log if we are starting a dirty array */
	if (!mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr);
out:
	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the same
		 * on all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	mempool_destroy(ppl_conf->meta_pool);
	if (ppl_conf->bs)
		bioset_free(ppl_conf->bs);
	mempool_destroy(ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store
	 * the header and (at the very least) partial parity
	 * for one stripe. Round it down to ensure the data
	 * space is cleanly divisible by stripe size.
	 */
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}
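
/*
 * Worked example of the first overlap check above (illustrative numbers
 * only): with data_offset == 2048 and ppl.sector == 1024, a PPL of up to
 * 1024 sectors is accepted; ppl.sector + ppl_size_new == 2048 does not
 * trigger the error because the comparison with data_offset is strictly
 * greater-than.
 */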

int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int i;
	/* initialized to false: no member disk may set it in the loop below */
	bool need_cache_flush = false;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
	if (!ppl_conf->io_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
	if (!ppl_conf->bs) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
	if (!ppl_conf->meta_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);
		INIT_LIST_HEAD(&log->no_mem_stripes);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			struct request_queue *q;

			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			q = bdev_get_queue(rdev->bdev);
			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
				need_cache_flush = true;
		}
	}

	if (need_cache_flush)
		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
			mdname(mddev));

	/* load and possibly recover the logs from the member disks */
	ret = ppl_load(ppl_conf);

	if (ret) {
		goto err;
	} else if (!mddev->pers &&
		   mddev->recovery_cp == 0 && !mddev->degraded &&
		   ppl_conf->recovered_entries > 0 &&
		   ppl_conf->mismatch_count == 0) {
		/*
		 * If we are starting a dirty array and the recovery succeeds
		 * without any issues, set the array as clean.
		 */
		mddev->recovery_cp = MaxSector;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	}

	conf->log_private = ppl_conf;

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}

int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int ret = 0;
	char b[BDEVNAME_SIZE];

	if (!rdev)
		return -EINVAL;

	pr_debug("%s: disk: %d operation: %s dev: %s\n",
		 __func__, rdev->raid_disk, add ? "add" : "remove",
		 bdevname(rdev->bdev, b));

	if (rdev->raid_disk < 0)
		return 0;

	if (rdev->raid_disk >= ppl_conf->count)
		return -ENODEV;

	log = &ppl_conf->child_logs[rdev->raid_disk];

	mutex_lock(&log->io_mutex);
	if (add) {
		ret = ppl_validate_rdev(rdev);
		if (!ret) {
			log->rdev = rdev;
			ret = ppl_write_empty_header(log);
		}
	} else {
		log->rdev = NULL;
	}
	mutex_unlock(&log->io_mutex);

	return ret;
}
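
/*
 * Note: per the commit subject ("support disk hot add/remove with PPL"),
 * ppl_modify_log() is the hook the raid5 hot add/remove paths use to attach
 * a PPL log to a newly added member disk (writing a fresh empty header) or
 * to detach the log of a removed one. Taking io_mutex ensures the switch
 * does not race with in-flight PPL writes to that disk.
 */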