md/r5cache: r5cache recovery: part 1
[linux-2.6-block.git] / drivers / md / raid5-cache.c
f6bed0ef
SL
1/*
2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
b4c625c6 3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
f6bed0ef
SL
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 */
15#include <linux/kernel.h>
16#include <linux/wait.h>
17#include <linux/blkdev.h>
18#include <linux/slab.h>
19#include <linux/raid/md_p.h>
5cb2fbd6 20#include <linux/crc32c.h>
f6bed0ef
SL
21#include <linux/random.h>
22#include "md.h"
23#include "raid5.h"
1e6d690b 24#include "bitmap.h"
f6bed0ef
SL
25
26/*
27 * metadata/data are stored on disk in 4k-sized units (blocks) regardless of
28 * the underlying hardware sector size. This only works with PAGE_SIZE == 4096.
29 */
30#define BLOCK_SECTORS (8)
31
0576b1c6 32/*
a39f7afd
SL
33 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
34 *
35 * In write-through mode, reclaim runs each time log->max_free_space of log
36 * space has been used. This keeps the recovery scan from taking too long.
0576b1c6
SL
37 */
38#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
39#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
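/*
 * Worked example (illustrative numbers): for a 64GB journal device, 1/4 of
 * the device is 16GB, so max_free_space is capped at the 10GB
 * RECLAIM_MAX_FREE_SPACE limit (10 * 1024 * 1024 * 2 = 20971520 sectors);
 * for a 16GB device, 1/4 is 4GB, which is below the cap and is used as-is.
 */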
40
a39f7afd
SL
41/* wake up reclaim thread periodically */
42#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
44/* start a flush when this many full stripes are cached */
44#define R5C_FULL_STRIPE_FLUSH_BATCH 256
45/* reclaim stripes in groups */
46#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
47
c38d29b3
CH
48/*
49 * We only need 2 bios per I/O unit to make progress, but ensure we
50 * have a few more available to not get too tight.
51 */
52#define R5L_POOL_SIZE 4
53
2ded3703
SL
54/*
55 * r5c journal modes of the array: write-back or write-through.
56 * write-through mode behaves identically to the existing log-only
57 * implementation.
58 */
59enum r5c_journal_mode {
60 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
61 R5C_JOURNAL_MODE_WRITE_BACK = 1,
62};
63
2c7da14b
SL
64static char *r5c_journal_mode_str[] = {"write-through",
65 "write-back"};
2ded3703
SL
66/*
67 * raid5 cache state machine
68 *
69 * With the RAID cache, each stripe works in two phases:
70 * - caching phase
71 * - writing-out phase
72 *
73 * These two phases are controlled by bit STRIPE_R5C_CACHING:
74 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
75 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
76 *
77 * When there is no journal, or the journal is in write-through mode,
78 * the stripe is always in writing-out phase.
79 *
80 * For write-back journal, the stripe is sent to caching phase on write
81 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
82 * the write-out phase by clearing STRIPE_R5C_CACHING.
83 *
84 * Stripes in caching phase do not write the raid disks. Instead, all
85 * writes are committed from the log device. Therefore, a stripe in
86 * caching phase handles writes as:
87 * - write to log device
88 * - return IO
89 *
90 * Stripes in writing-out phase handle writes as:
91 * - calculate parity
92 * - write pending data and parity to journal
93 * - write data and parity to raid disks
94 * - return IO for pending writes
95 */
96
f6bed0ef
SL
97struct r5l_log {
98 struct md_rdev *rdev;
99
100 u32 uuid_checksum;
101
102 sector_t device_size; /* log device size, round to
103 * BLOCK_SECTORS */
0576b1c6
SL
104 sector_t max_free_space; /* reclaim run if free space is at
105 * this size */
f6bed0ef
SL
106
107 sector_t last_checkpoint; /* log tail. where recovery scan
108 * starts from */
109 u64 last_cp_seq; /* log tail sequence */
110
111 sector_t log_start; /* log head. where new data appends */
112 u64 seq; /* log head sequence */
113
17036461
CH
114 sector_t next_checkpoint;
115 u64 next_cp_seq;
116
f6bed0ef
SL
117 struct mutex io_mutex;
118 struct r5l_io_unit *current_io; /* current io_unit accepting new data */
119
120 spinlock_t io_list_lock;
121 struct list_head running_ios; /* io_units which are still running,
122 * and have not yet been completely
123 * written to the log */
124 struct list_head io_end_ios; /* io_units which have been completely
125 * written to the log but not yet written
126 * to the RAID */
a8c34f91
SL
127 struct list_head flushing_ios; /* io_units which are waiting for log
128 * cache flush */
04732f74 129 struct list_head finished_ios; /* io_units which settle down in log disk */
a8c34f91 130 struct bio flush_bio;
f6bed0ef 131
5036c390
CH
132 struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
133
f6bed0ef 134 struct kmem_cache *io_kc;
5036c390 135 mempool_t *io_pool;
c38d29b3 136 struct bio_set *bs;
e8deb638 137 mempool_t *meta_pool;
f6bed0ef 138
0576b1c6
SL
139 struct md_thread *reclaim_thread;
140 unsigned long reclaim_target; /* amount of space that needs to be
141 * reclaimed. if it's 0, reclaim spaces
142 * used by io_units which are in
143 * IO_UNIT_STRIPE_END state (i.e. reclaim
144 * doesn't wait for a specific io_unit
145 * to switch to IO_UNIT_STRIPE_END
146 * state) */
0fd22b45 147 wait_queue_head_t iounit_wait;
0576b1c6 148
f6bed0ef
SL
149 struct list_head no_space_stripes; /* pending stripes, log has no space */
150 spinlock_t no_space_stripes_lock;
56fef7c6
CH
151
152 bool need_cache_flush;
2ded3703
SL
153
154 /* for r5c_cache */
155 enum r5c_journal_mode r5c_journal_mode;
a39f7afd
SL
156
157 /* all stripes in r5cache, in the order of seq at sh->log_start */
158 struct list_head stripe_in_journal_list;
159
160 spinlock_t stripe_in_journal_lock;
161 atomic_t stripe_in_journal_count;
f6bed0ef
SL
162};
163
164/*
165 * an IO range starts at a meta data block and ends at the next meta data
166 * block. The io_unit's meta data block tracks the data/parity that follows it.
167 * The io_unit is written to the log disk with a normal write; as we always
168 * flush the log disk first and only then start moving data to the raid disks,
169 * there is no requirement to write the io_unit with FLUSH/FUA
170 */
171struct r5l_io_unit {
172 struct r5l_log *log;
173
174 struct page *meta_page; /* store meta block */
175 int meta_offset; /* current offset in meta_page */
176
f6bed0ef
SL
177 struct bio *current_bio;/* current_bio accepting new data */
178
179 atomic_t pending_stripe;/* how many stripes not flushed to raid */
180 u64 seq; /* seq number of the metablock */
181 sector_t log_start; /* where the io_unit starts */
182 sector_t log_end; /* where the io_unit ends */
183 struct list_head log_sibling; /* log->running_ios */
184 struct list_head stripe_list; /* stripes added to the io_unit */
185
186 int state;
6143e2ce 187 bool need_split_bio;
f6bed0ef
SL
188};
189
190/* r5l_io_unit state */
191enum r5l_io_unit_state {
192 IO_UNIT_RUNNING = 0, /* accepting new IO */
193 IO_UNIT_IO_START = 1, /* io_unit bio started writing to the log,
194 * don't accept new bio */
195 IO_UNIT_IO_END = 2, /* io_unit bio finished writing to the log */
196 IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
f6bed0ef
SL
197};
198
2ded3703
SL
199bool r5c_is_writeback(struct r5l_log *log)
200{
201 return (log != NULL &&
202 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
203}
204
f6bed0ef
SL
205static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
206{
207 start += inc;
208 if (start >= log->device_size)
209 start = start - log->device_size;
210 return start;
211}
212
213static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
214 sector_t end)
215{
216 if (end >= start)
217 return end - start;
218 else
219 return end + log->device_size - start;
220}
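/*
 * Worked example (illustrative numbers): with device_size == 8192 sectors,
 * last_checkpoint == 8000 and log_start == 100, the log has wrapped, so
 * r5l_ring_distance() returns 100 + 8192 - 8000 = 292 sectors in use.
 */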
221
222static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
223{
224 sector_t used_size;
225
226 used_size = r5l_ring_distance(log, log->last_checkpoint,
227 log->log_start);
228
229 return log->device_size > used_size + size;
230}
231
f6bed0ef
SL
232static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
233 enum r5l_io_unit_state state)
234{
f6bed0ef
SL
235 if (WARN_ON(io->state >= state))
236 return;
237 io->state = state;
f6bed0ef
SL
238}
239
1e6d690b
SL
240static void
241r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
242 struct bio_list *return_bi)
243{
244 struct bio *wbi, *wbi2;
245
246 wbi = dev->written;
247 dev->written = NULL;
248 while (wbi && wbi->bi_iter.bi_sector <
249 dev->sector + STRIPE_SECTORS) {
250 wbi2 = r5_next_bio(wbi, dev->sector);
251 if (!raid5_dec_bi_active_stripes(wbi)) {
252 md_write_end(conf->mddev);
253 bio_list_add(return_bi, wbi);
254 }
255 wbi = wbi2;
256 }
257}
258
259void r5c_handle_cached_data_endio(struct r5conf *conf,
260 struct stripe_head *sh, int disks, struct bio_list *return_bi)
261{
262 int i;
263
264 for (i = sh->disks; i--; ) {
265 if (sh->dev[i].written) {
266 set_bit(R5_UPTODATE, &sh->dev[i].flags);
267 r5c_return_dev_pending_writes(conf, &sh->dev[i],
268 return_bi);
269 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
270 STRIPE_SECTORS,
271 !test_bit(STRIPE_DEGRADED, &sh->state),
272 0);
273 }
274 }
275}
276
a39f7afd
SL
277/* Check whether we should flush some stripes to free up stripe cache */
278void r5c_check_stripe_cache_usage(struct r5conf *conf)
279{
280 int total_cached;
281
282 if (!r5c_is_writeback(conf->log))
283 return;
284
285 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
286 atomic_read(&conf->r5c_cached_full_stripes);
287
288 /*
289 * The following condition is true for either of the following:
290 * - stripe cache pressure high:
291 * total_cached > 3/4 min_nr_stripes ||
292 * empty_inactive_list_nr > 0
293 * - stripe cache pressure moderate:
294 * total_cached > 1/2 min_nr_stripes
295 */
296 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
297 atomic_read(&conf->empty_inactive_list_nr) > 0)
298 r5l_wake_reclaim(conf->log, 0);
299}
300
301/*
302 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
303 * stripes in the cache
304 */
305void r5c_check_cached_full_stripe(struct r5conf *conf)
306{
307 if (!r5c_is_writeback(conf->log))
308 return;
309
310 /*
311 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
312 * or a full stripe (chunk size / 4k stripes).
313 */
314 if (atomic_read(&conf->r5c_cached_full_stripes) >=
315 min(R5C_FULL_STRIPE_FLUSH_BATCH,
316 conf->chunk_sectors >> STRIPE_SHIFT))
317 r5l_wake_reclaim(conf->log, 0);
318}
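/*
 * Worked example (illustrative numbers, assuming 4KB pages): with a 512KB
 * chunk, chunk_sectors >> STRIPE_SHIFT is 1024 >> 3 = 128 stripes per full
 * chunk, so reclaim is woken once min(R5C_FULL_STRIPE_FLUSH_BATCH, 128) ==
 * 128 full stripes are cached.
 */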
319
320/*
321 * Total log space (in sectors) needed to flush all data in cache
322 *
323 * Currently, writing-out phase automatically includes all pending writes
324 * to the same sector. So the reclaim of each stripe takes up to
325 * (conf->raid_disks + 1) pages of log space.
326 *
327 * To totally avoid deadlock due to log space, the code reserves
328 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
329 * necessary in most cases.
330 *
331 * To improve this, we will need writing-out phase to be able to NOT include
332 * pending writes, which will reduce the requirement to
333 * (conf->max_degraded + 1) pages per stripe in cache.
334 */
335static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
336{
337 struct r5l_log *log = conf->log;
338
339 if (!r5c_is_writeback(log))
340 return 0;
341
342 return BLOCK_SECTORS * (conf->raid_disks + 1) *
343 atomic_read(&log->stripe_in_journal_count);
344}
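/*
 * Worked example (illustrative numbers): a 6-disk array (raid_disks == 6)
 * with 100 stripes in the journal reserves 8 * (6 + 1) * 100 = 5600 sectors
 * (about 2.7MiB) of log space for a worst-case flush.
 */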
345
346/*
347 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
348 *
349 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
350 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
351 * device is less than 2x of reclaim_required_space.
352 */
353static inline void r5c_update_log_state(struct r5l_log *log)
354{
355 struct r5conf *conf = log->rdev->mddev->private;
356 sector_t free_space;
357 sector_t reclaim_space;
358
359 if (!r5c_is_writeback(log))
360 return;
361
362 free_space = r5l_ring_distance(log, log->log_start,
363 log->last_checkpoint);
364 reclaim_space = r5c_log_required_to_flush_cache(conf);
365 if (free_space < 2 * reclaim_space)
366 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
367 else
368 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
369 if (free_space < 3 * reclaim_space)
370 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
371 else
372 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
373}
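/*
 * Worked example (illustrative numbers): if the required reclaim space is
 * 5600 sectors, R5C_LOG_CRITICAL is set once free log space drops below
 * 2 * 5600 = 11200 sectors, and R5C_LOG_TIGHT below 3 * 5600 = 16800 sectors.
 */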
374
2ded3703
SL
375/*
376 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
377 * This function should only be called in write-back mode.
378 */
a39f7afd 379void r5c_make_stripe_write_out(struct stripe_head *sh)
2ded3703
SL
380{
381 struct r5conf *conf = sh->raid_conf;
382 struct r5l_log *log = conf->log;
383
384 BUG_ON(!r5c_is_writeback(log));
385
386 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
387 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1e6d690b
SL
388
389 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
390 atomic_inc(&conf->preread_active_stripes);
391
392 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
393 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
394 atomic_dec(&conf->r5c_cached_partial_stripes);
395 }
396
397 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
398 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
399 atomic_dec(&conf->r5c_cached_full_stripes);
400 }
401}
402
403static void r5c_handle_data_cached(struct stripe_head *sh)
404{
405 int i;
406
407 for (i = sh->disks; i--; )
408 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
409 set_bit(R5_InJournal, &sh->dev[i].flags);
410 clear_bit(R5_LOCKED, &sh->dev[i].flags);
411 }
412 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
413}
414
415/*
416 * this journal write must contain full parity,
417 * it may also contain some data pages
418 */
419static void r5c_handle_parity_cached(struct stripe_head *sh)
420{
421 int i;
422
423 for (i = sh->disks; i--; )
424 if (test_bit(R5_InJournal, &sh->dev[i].flags))
425 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2ded3703
SL
426}
427
428/*
429 * Setting proper flags after writing (or flushing) data and/or parity to the
430 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
431 */
432static void r5c_finish_cache_stripe(struct stripe_head *sh)
433{
434 struct r5l_log *log = sh->raid_conf->log;
435
436 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
437 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
438 /*
439 * Set R5_InJournal for parity dev[pd_idx]. This means
439 * all data AND parity are in the journal. For RAID 6, it is
441 * NOT necessary to set the flag for dev[qd_idx], as the
442 * two parities are written out together.
443 */
444 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
1e6d690b
SL
445 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
446 r5c_handle_data_cached(sh);
447 } else {
448 r5c_handle_parity_cached(sh);
449 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
450 }
2ded3703
SL
451}
452
d8858f43
CH
453static void r5l_io_run_stripes(struct r5l_io_unit *io)
454{
455 struct stripe_head *sh, *next;
456
457 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
458 list_del_init(&sh->log_list);
2ded3703
SL
459
460 r5c_finish_cache_stripe(sh);
461
d8858f43
CH
462 set_bit(STRIPE_HANDLE, &sh->state);
463 raid5_release_stripe(sh);
464 }
465}
466
56fef7c6
CH
467static void r5l_log_run_stripes(struct r5l_log *log)
468{
469 struct r5l_io_unit *io, *next;
470
471 assert_spin_locked(&log->io_list_lock);
472
473 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
474 /* don't change list order */
475 if (io->state < IO_UNIT_IO_END)
476 break;
477
478 list_move_tail(&io->log_sibling, &log->finished_ios);
479 r5l_io_run_stripes(io);
480 }
481}
482
3848c0bc
CH
483static void r5l_move_to_end_ios(struct r5l_log *log)
484{
485 struct r5l_io_unit *io, *next;
486
487 assert_spin_locked(&log->io_list_lock);
488
489 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
490 /* don't change list order */
491 if (io->state < IO_UNIT_IO_END)
492 break;
493 list_move_tail(&io->log_sibling, &log->io_end_ios);
494 }
495}
496
f6bed0ef
SL
497static void r5l_log_endio(struct bio *bio)
498{
499 struct r5l_io_unit *io = bio->bi_private;
500 struct r5l_log *log = io->log;
509ffec7 501 unsigned long flags;
f6bed0ef 502
6e74a9cf
SL
503 if (bio->bi_error)
504 md_error(log->rdev->mddev, log->rdev);
505
f6bed0ef 506 bio_put(bio);
e8deb638 507 mempool_free(io->meta_page, log->meta_pool);
f6bed0ef 508
509ffec7
CH
509 spin_lock_irqsave(&log->io_list_lock, flags);
510 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
56fef7c6 511 if (log->need_cache_flush)
3848c0bc 512 r5l_move_to_end_ios(log);
56fef7c6
CH
513 else
514 r5l_log_run_stripes(log);
509ffec7
CH
515 spin_unlock_irqrestore(&log->io_list_lock, flags);
516
56fef7c6
CH
517 if (log->need_cache_flush)
518 md_wakeup_thread(log->rdev->mddev->thread);
f6bed0ef
SL
519}
520
521static void r5l_submit_current_io(struct r5l_log *log)
522{
523 struct r5l_io_unit *io = log->current_io;
524 struct r5l_meta_block *block;
509ffec7 525 unsigned long flags;
f6bed0ef
SL
526 u32 crc;
527
528 if (!io)
529 return;
530
531 block = page_address(io->meta_page);
532 block->meta_size = cpu_to_le32(io->meta_offset);
5cb2fbd6 533 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
f6bed0ef
SL
534 block->checksum = cpu_to_le32(crc);
535
536 log->current_io = NULL;
509ffec7
CH
537 spin_lock_irqsave(&log->io_list_lock, flags);
538 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
539 spin_unlock_irqrestore(&log->io_list_lock, flags);
f6bed0ef 540
4e49ea4a 541 submit_bio(io->current_bio);
f6bed0ef
SL
542}
543
6143e2ce 544static struct bio *r5l_bio_alloc(struct r5l_log *log)
b349feb3 545{
c38d29b3 546 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
b349feb3 547
796a5cf0 548 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
b349feb3 549 bio->bi_bdev = log->rdev->bdev;
1e932a37 550 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
b349feb3 551
b349feb3
CH
552 return bio;
553}
554
c1b99198
CH
555static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
556{
557 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
558
a39f7afd 559 r5c_update_log_state(log);
c1b99198
CH
560 /*
561 * If we filled up the log device start from the beginning again,
562 * which will require a new bio.
563 *
564 * Note: for this to work properly the log size needs to be a multiple
565 * of BLOCK_SECTORS.
566 */
567 if (log->log_start == 0)
6143e2ce 568 io->need_split_bio = true;
c1b99198
CH
569
570 io->log_end = log->log_start;
571}
572
f6bed0ef
SL
573static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
574{
575 struct r5l_io_unit *io;
576 struct r5l_meta_block *block;
f6bed0ef 577
5036c390
CH
578 io = mempool_alloc(log->io_pool, GFP_ATOMIC);
579 if (!io)
580 return NULL;
581 memset(io, 0, sizeof(*io));
582
51039cd0 583 io->log = log;
51039cd0
CH
584 INIT_LIST_HEAD(&io->log_sibling);
585 INIT_LIST_HEAD(&io->stripe_list);
586 io->state = IO_UNIT_RUNNING;
f6bed0ef 587
e8deb638 588 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
f6bed0ef 589 block = page_address(io->meta_page);
e8deb638 590 clear_page(block);
f6bed0ef
SL
591 block->magic = cpu_to_le32(R5LOG_MAGIC);
592 block->version = R5LOG_VERSION;
593 block->seq = cpu_to_le64(log->seq);
594 block->position = cpu_to_le64(log->log_start);
595
596 io->log_start = log->log_start;
597 io->meta_offset = sizeof(struct r5l_meta_block);
2b8ef16e 598 io->seq = log->seq++;
f6bed0ef 599
6143e2ce
CH
600 io->current_bio = r5l_bio_alloc(log);
601 io->current_bio->bi_end_io = r5l_log_endio;
602 io->current_bio->bi_private = io;
b349feb3 603 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
f6bed0ef 604
c1b99198 605 r5_reserve_log_entry(log, io);
f6bed0ef
SL
606
607 spin_lock_irq(&log->io_list_lock);
608 list_add_tail(&io->log_sibling, &log->running_ios);
609 spin_unlock_irq(&log->io_list_lock);
610
611 return io;
612}
613
614static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
615{
22581f58
CH
616 if (log->current_io &&
617 log->current_io->meta_offset + payload_size > PAGE_SIZE)
f6bed0ef 618 r5l_submit_current_io(log);
f6bed0ef 619
5036c390 620 if (!log->current_io) {
22581f58 621 log->current_io = r5l_new_meta(log);
5036c390
CH
622 if (!log->current_io)
623 return -ENOMEM;
624 }
625
f6bed0ef
SL
626 return 0;
627}
628
629static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
630 sector_t location,
631 u32 checksum1, u32 checksum2,
632 bool checksum2_valid)
633{
634 struct r5l_io_unit *io = log->current_io;
635 struct r5l_payload_data_parity *payload;
636
637 payload = page_address(io->meta_page) + io->meta_offset;
638 payload->header.type = cpu_to_le16(type);
639 payload->header.flags = cpu_to_le16(0);
640 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
641 (PAGE_SHIFT - 9));
642 payload->location = cpu_to_le64(location);
643 payload->checksum[0] = cpu_to_le32(checksum1);
644 if (checksum2_valid)
645 payload->checksum[1] = cpu_to_le32(checksum2);
646
647 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
648 sizeof(__le32) * (1 + !!checksum2_valid);
649}
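/*
 * Worked example (illustrative): a data payload advances io->meta_offset by
 * sizeof(struct r5l_payload_data_parity) + 4 bytes (one checksum) and records
 * payload->size == 8 sectors (one page); a RAID6 parity payload with both P
 * and Q checksums advances it by sizeof(...) + 8 bytes and records 16 sectors.
 */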
650
651static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
652{
653 struct r5l_io_unit *io = log->current_io;
654
6143e2ce
CH
655 if (io->need_split_bio) {
656 struct bio *prev = io->current_bio;
b349feb3 657
6143e2ce
CH
658 io->current_bio = r5l_bio_alloc(log);
659 bio_chain(io->current_bio, prev);
660
4e49ea4a 661 submit_bio(prev);
f6bed0ef 662 }
f6bed0ef 663
6143e2ce
CH
664 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
665 BUG();
666
c1b99198 667 r5_reserve_log_entry(log, io);
f6bed0ef
SL
668}
669
5036c390 670static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
f6bed0ef
SL
671 int data_pages, int parity_pages)
672{
673 int i;
674 int meta_size;
5036c390 675 int ret;
f6bed0ef
SL
676 struct r5l_io_unit *io;
677
678 meta_size =
679 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
680 * data_pages) +
681 sizeof(struct r5l_payload_data_parity) +
682 sizeof(__le32) * parity_pages;
683
5036c390
CH
684 ret = r5l_get_meta(log, meta_size);
685 if (ret)
686 return ret;
687
f6bed0ef
SL
688 io = log->current_io;
689
690 for (i = 0; i < sh->disks; i++) {
1e6d690b
SL
691 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
692 test_bit(R5_InJournal, &sh->dev[i].flags))
f6bed0ef
SL
693 continue;
694 if (i == sh->pd_idx || i == sh->qd_idx)
695 continue;
696 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
697 raid5_compute_blocknr(sh, i, 0),
698 sh->dev[i].log_checksum, 0, false);
699 r5l_append_payload_page(log, sh->dev[i].page);
700 }
701
2ded3703 702 if (parity_pages == 2) {
f6bed0ef
SL
703 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
704 sh->sector, sh->dev[sh->pd_idx].log_checksum,
705 sh->dev[sh->qd_idx].log_checksum, true);
706 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
707 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
2ded3703 708 } else if (parity_pages == 1) {
f6bed0ef
SL
709 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
710 sh->sector, sh->dev[sh->pd_idx].log_checksum,
711 0, false);
712 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
2ded3703
SL
713 } else /* Just writing data, not parity, in caching phase */
714 BUG_ON(parity_pages != 0);
f6bed0ef
SL
715
716 list_add_tail(&sh->log_list, &io->stripe_list);
717 atomic_inc(&io->pending_stripe);
718 sh->log_io = io;
5036c390 719
a39f7afd
SL
720 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
721 return 0;
722
723 if (sh->log_start == MaxSector) {
724 BUG_ON(!list_empty(&sh->r5c));
725 sh->log_start = io->log_start;
726 spin_lock_irq(&log->stripe_in_journal_lock);
727 list_add_tail(&sh->r5c,
728 &log->stripe_in_journal_list);
729 spin_unlock_irq(&log->stripe_in_journal_lock);
730 atomic_inc(&log->stripe_in_journal_count);
731 }
5036c390 732 return 0;
f6bed0ef
SL
733}
734
a39f7afd
SL
735/* add stripe to no_space_stripes, and then wake up reclaim */
736static inline void r5l_add_no_space_stripe(struct r5l_log *log,
737 struct stripe_head *sh)
738{
739 spin_lock(&log->no_space_stripes_lock);
740 list_add_tail(&sh->log_list, &log->no_space_stripes);
741 spin_unlock(&log->no_space_stripes_lock);
742}
743
f6bed0ef
SL
744/*
745 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
746 * data from log to raid disks), so we shouldn't wait for reclaim here
747 */
748int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
749{
a39f7afd 750 struct r5conf *conf = sh->raid_conf;
f6bed0ef
SL
751 int write_disks = 0;
752 int data_pages, parity_pages;
f6bed0ef
SL
753 int reserve;
754 int i;
5036c390 755 int ret = 0;
a39f7afd 756 bool wake_reclaim = false;
f6bed0ef
SL
757
758 if (!log)
759 return -EAGAIN;
760 /* Don't support stripe batch */
761 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
762 test_bit(STRIPE_SYNCING, &sh->state)) {
763 /* the stripe is written to log, we start writing it to raid */
764 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
765 return -EAGAIN;
766 }
767
2ded3703
SL
768 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
769
f6bed0ef
SL
770 for (i = 0; i < sh->disks; i++) {
771 void *addr;
772
1e6d690b
SL
773 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
774 test_bit(R5_InJournal, &sh->dev[i].flags))
f6bed0ef 775 continue;
1e6d690b 776
f6bed0ef
SL
777 write_disks++;
778 /* checksum is already calculated in last run */
779 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
780 continue;
781 addr = kmap_atomic(sh->dev[i].page);
5cb2fbd6
SL
782 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
783 addr, PAGE_SIZE);
f6bed0ef
SL
784 kunmap_atomic(addr);
785 }
786 parity_pages = 1 + !!(sh->qd_idx >= 0);
787 data_pages = write_disks - parity_pages;
788
f6bed0ef 789 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
253f9fd4
SL
790 /*
791 * The stripe must enter the state machine again to finish the write, so
792 * don't delay.
793 */
794 clear_bit(STRIPE_DELAYED, &sh->state);
f6bed0ef
SL
795 atomic_inc(&sh->count);
796
797 mutex_lock(&log->io_mutex);
798 /* meta + data */
799 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
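	/*
	 * Worked example (illustrative numbers, assuming 4KB pages): with 4
	 * data pages and 1 parity page to log (write_disks == 5), the
	 * reservation is (1 + 5) << 3 == 48 sectors: one meta block plus
	 * one block per page.
	 */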
f6bed0ef 800
a39f7afd
SL
801 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
802 if (!r5l_has_free_space(log, reserve)) {
803 r5l_add_no_space_stripe(log, sh);
804 wake_reclaim = true;
805 } else {
806 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
807 if (ret) {
808 spin_lock_irq(&log->io_list_lock);
809 list_add_tail(&sh->log_list,
810 &log->no_mem_stripes);
811 spin_unlock_irq(&log->io_list_lock);
812 }
813 }
814 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */
815 /*
816 * log space critical, do not process stripes that are
817 * not in cache yet (sh->log_start == MaxSector).
818 */
819 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
820 sh->log_start == MaxSector) {
821 r5l_add_no_space_stripe(log, sh);
822 wake_reclaim = true;
823 reserve = 0;
824 } else if (!r5l_has_free_space(log, reserve)) {
825 if (sh->log_start == log->last_checkpoint)
826 BUG();
827 else
828 r5l_add_no_space_stripe(log, sh);
829 } else {
830 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
831 if (ret) {
832 spin_lock_irq(&log->io_list_lock);
833 list_add_tail(&sh->log_list,
834 &log->no_mem_stripes);
835 spin_unlock_irq(&log->io_list_lock);
836 }
5036c390 837 }
f6bed0ef 838 }
f6bed0ef 839
5036c390 840 mutex_unlock(&log->io_mutex);
a39f7afd
SL
841 if (wake_reclaim)
842 r5l_wake_reclaim(log, reserve);
f6bed0ef
SL
843 return 0;
844}
845
846void r5l_write_stripe_run(struct r5l_log *log)
847{
848 if (!log)
849 return;
850 mutex_lock(&log->io_mutex);
851 r5l_submit_current_io(log);
852 mutex_unlock(&log->io_mutex);
853}
854
828cbe98
SL
855int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
856{
857 if (!log)
858 return -ENODEV;
859 /*
860 * we flush log disk cache first, then write stripe data to raid disks.
861 * So if the bio is finished, the log disk cache is flushed already. The
862 * recovery code guarantees we can recover the bio data from the log disk,
863 * so we don't need to flush again
864 */
865 if (bio->bi_iter.bi_size == 0) {
866 bio_endio(bio);
867 return 0;
868 }
1eff9d32 869 bio->bi_opf &= ~REQ_PREFLUSH;
828cbe98
SL
870 return -EAGAIN;
871}
872
f6bed0ef
SL
873/* This will run after log space is reclaimed */
874static void r5l_run_no_space_stripes(struct r5l_log *log)
875{
876 struct stripe_head *sh;
877
878 spin_lock(&log->no_space_stripes_lock);
879 while (!list_empty(&log->no_space_stripes)) {
880 sh = list_first_entry(&log->no_space_stripes,
881 struct stripe_head, log_list);
882 list_del_init(&sh->log_list);
883 set_bit(STRIPE_HANDLE, &sh->state);
884 raid5_release_stripe(sh);
885 }
886 spin_unlock(&log->no_space_stripes_lock);
887}
888
a39f7afd
SL
889/*
890 * calculate new last_checkpoint
891 * for write-through mode, returns log->next_checkpoint
892 * for write-back, returns the log_start of the first sh in stripe_in_journal_list
893 */
894static sector_t r5c_calculate_new_cp(struct r5conf *conf)
895{
896 struct stripe_head *sh;
897 struct r5l_log *log = conf->log;
898 sector_t new_cp;
899 unsigned long flags;
900
901 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
902 return log->next_checkpoint;
903
904 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
905 if (list_empty(&conf->log->stripe_in_journal_list)) {
906 /* all stripes flushed */
907 spin_unlock(&log->stripe_in_journal_lock);
908 return log->next_checkpoint;
909 }
910 sh = list_first_entry(&conf->log->stripe_in_journal_list,
911 struct stripe_head, r5c);
912 new_cp = sh->log_start;
913 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
914 return new_cp;
915}
916
17036461
CH
917static sector_t r5l_reclaimable_space(struct r5l_log *log)
918{
a39f7afd
SL
919 struct r5conf *conf = log->rdev->mddev->private;
920
17036461 921 return r5l_ring_distance(log, log->last_checkpoint,
a39f7afd 922 r5c_calculate_new_cp(conf));
17036461
CH
923}
924
5036c390
CH
925static void r5l_run_no_mem_stripe(struct r5l_log *log)
926{
927 struct stripe_head *sh;
928
929 assert_spin_locked(&log->io_list_lock);
930
931 if (!list_empty(&log->no_mem_stripes)) {
932 sh = list_first_entry(&log->no_mem_stripes,
933 struct stripe_head, log_list);
934 list_del_init(&sh->log_list);
935 set_bit(STRIPE_HANDLE, &sh->state);
936 raid5_release_stripe(sh);
937 }
938}
939
04732f74 940static bool r5l_complete_finished_ios(struct r5l_log *log)
17036461
CH
941{
942 struct r5l_io_unit *io, *next;
943 bool found = false;
944
945 assert_spin_locked(&log->io_list_lock);
946
04732f74 947 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
17036461
CH
948 /* don't change list order */
949 if (io->state < IO_UNIT_STRIPE_END)
950 break;
951
952 log->next_checkpoint = io->log_start;
953 log->next_cp_seq = io->seq;
954
955 list_del(&io->log_sibling);
5036c390
CH
956 mempool_free(io, log->io_pool);
957 r5l_run_no_mem_stripe(log);
17036461
CH
958
959 found = true;
960 }
961
962 return found;
963}
964
509ffec7
CH
965static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
966{
967 struct r5l_log *log = io->log;
a39f7afd 968 struct r5conf *conf = log->rdev->mddev->private;
509ffec7
CH
969 unsigned long flags;
970
971 spin_lock_irqsave(&log->io_list_lock, flags);
972 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
17036461 973
04732f74 974 if (!r5l_complete_finished_ios(log)) {
85f2f9a4
SL
975 spin_unlock_irqrestore(&log->io_list_lock, flags);
976 return;
977 }
509ffec7 978
a39f7afd
SL
979 if (r5l_reclaimable_space(log) > log->max_free_space ||
980 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
509ffec7
CH
981 r5l_wake_reclaim(log, 0);
982
509ffec7
CH
983 spin_unlock_irqrestore(&log->io_list_lock, flags);
984 wake_up(&log->iounit_wait);
985}
986
0576b1c6
SL
987void r5l_stripe_write_finished(struct stripe_head *sh)
988{
989 struct r5l_io_unit *io;
990
0576b1c6 991 io = sh->log_io;
0576b1c6
SL
992 sh->log_io = NULL;
993
509ffec7
CH
994 if (io && atomic_dec_and_test(&io->pending_stripe))
995 __r5l_stripe_write_finished(io);
0576b1c6
SL
996}
997
a8c34f91
SL
998static void r5l_log_flush_endio(struct bio *bio)
999{
1000 struct r5l_log *log = container_of(bio, struct r5l_log,
1001 flush_bio);
1002 unsigned long flags;
1003 struct r5l_io_unit *io;
a8c34f91 1004
6e74a9cf
SL
1005 if (bio->bi_error)
1006 md_error(log->rdev->mddev, log->rdev);
1007
a8c34f91 1008 spin_lock_irqsave(&log->io_list_lock, flags);
d8858f43
CH
1009 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1010 r5l_io_run_stripes(io);
04732f74 1011 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
a8c34f91
SL
1012 spin_unlock_irqrestore(&log->io_list_lock, flags);
1013}
1014
0576b1c6
SL
1015/*
1016 * Start dispatching IO to the raid disks.
1017 * The log consists of io_units, each headed by a meta block. One situation we
1018 * want to avoid: a broken meta block in the middle of the log keeps recovery
1019 * from finding the meta blocks at the head of the log. If an operation needs
1020 * the meta block at the head to be persistent in the log, we must make sure
1021 * the meta blocks before it are persistent too. A case is: stripe data/parity
1022 * is in the log and we start writing the stripe to the raid disks; the
1023 * data/parity must be persistent in the log before we write to the raid
1024 * disks.
1025 *
1026 * The solution is to strictly maintain io_unit list order: we only write the
1027 * stripes of an io_unit to raid when it is the first io_unit whose data/parity is in the log.
1028 */
1029void r5l_flush_stripe_to_raid(struct r5l_log *log)
1030{
a8c34f91 1031 bool do_flush;
56fef7c6
CH
1032
1033 if (!log || !log->need_cache_flush)
0576b1c6 1034 return;
0576b1c6
SL
1035
1036 spin_lock_irq(&log->io_list_lock);
a8c34f91
SL
1037 /* flush bio is running */
1038 if (!list_empty(&log->flushing_ios)) {
1039 spin_unlock_irq(&log->io_list_lock);
1040 return;
0576b1c6 1041 }
a8c34f91
SL
1042 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1043 do_flush = !list_empty(&log->flushing_ios);
0576b1c6 1044 spin_unlock_irq(&log->io_list_lock);
a8c34f91
SL
1045
1046 if (!do_flush)
1047 return;
1048 bio_reset(&log->flush_bio);
1049 log->flush_bio.bi_bdev = log->rdev->bdev;
1050 log->flush_bio.bi_end_io = r5l_log_flush_endio;
796a5cf0 1051 bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
4e49ea4a 1052 submit_bio(&log->flush_bio);
0576b1c6
SL
1053}
1054
0576b1c6 1055static void r5l_write_super(struct r5l_log *log, sector_t cp);
4b482044
SL
1056static void r5l_write_super_and_discard_space(struct r5l_log *log,
1057 sector_t end)
1058{
1059 struct block_device *bdev = log->rdev->bdev;
1060 struct mddev *mddev;
1061
1062 r5l_write_super(log, end);
1063
1064 if (!blk_queue_discard(bdev_get_queue(bdev)))
1065 return;
1066
1067 mddev = log->rdev->mddev;
1068 /*
8e018c21
SL
1069 * Discard could zero data, so before discard we must make sure
1070 * superblock is updated to new log tail. Updating superblock (either
1071 * directly call md_update_sb() or depend on md thread) must hold
1072 * reconfig mutex. On the other hand, raid5_quiesce is called with
1073 * reconfig_mutex held. The first step of raid5_quiesce() is waiting
1074 * for all IO to finish, hence waiting for the reclaim thread, while the
1075 * reclaim thread is calling this function and waiting for the reconfig
1076 * mutex. So there is a deadlock. We work around this issue with a trylock.
1077 * FIXME: we could miss discard if we can't take reconfig mutex
4b482044 1078 */
8e018c21
SL
1079 set_mask_bits(&mddev->flags, 0,
1080 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
1081 if (!mddev_trylock(mddev))
1082 return;
1083 md_update_sb(mddev, 1);
1084 mddev_unlock(mddev);
4b482044 1085
6e74a9cf 1086 /* discard IO error really doesn't matter, ignore it */
4b482044
SL
1087 if (log->last_checkpoint < end) {
1088 blkdev_issue_discard(bdev,
1089 log->last_checkpoint + log->rdev->data_offset,
1090 end - log->last_checkpoint, GFP_NOIO, 0);
1091 } else {
1092 blkdev_issue_discard(bdev,
1093 log->last_checkpoint + log->rdev->data_offset,
1094 log->device_size - log->last_checkpoint,
1095 GFP_NOIO, 0);
1096 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1097 GFP_NOIO, 0);
1098 }
1099}
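/*
 * Worked example (illustrative numbers): with device_size == 8192 sectors,
 * last_checkpoint == 8000 and end == 100, the reclaimed range wraps, so two
 * discards are issued: sectors 8000..8191 and 0..99 of the log area, each
 * offset by rdev->data_offset.
 */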
1100
a39f7afd
SL
1101/*
1102 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1103 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1104 *
1105 * must hold conf->device_lock
1106 */
1107static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1108{
1109 BUG_ON(list_empty(&sh->lru));
1110 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1111 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1112
1113 /*
1114 * The stripe is not ON_RELEASE_LIST, so it is safe to call
1115 * raid5_release_stripe() while holding conf->device_lock
1116 */
1117 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1118 assert_spin_locked(&conf->device_lock);
1119
1120 list_del_init(&sh->lru);
1121 atomic_inc(&sh->count);
1122
1123 set_bit(STRIPE_HANDLE, &sh->state);
1124 atomic_inc(&conf->active_stripes);
1125 r5c_make_stripe_write_out(sh);
1126
1127 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1128 atomic_inc(&conf->preread_active_stripes);
1129 raid5_release_stripe(sh);
1130}
1131
1132/*
1133 * if num == 0, flush all full stripes
1134 * if num > 0, flush all full stripes. If fewer than num full stripes are
1135 * flushed, flush some partial stripes until num stripes in total are
1136 * flushed or there are no more cached stripes.
1137 */
1138void r5c_flush_cache(struct r5conf *conf, int num)
1139{
1140 int count;
1141 struct stripe_head *sh, *next;
1142
1143 assert_spin_locked(&conf->device_lock);
1144 if (!conf->log)
1145 return;
1146
1147 count = 0;
1148 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1149 r5c_flush_stripe(conf, sh);
1150 count++;
1151 }
1152
1153 if (count >= num)
1154 return;
1155 list_for_each_entry_safe(sh, next,
1156 &conf->r5c_partial_stripe_list, lru) {
1157 r5c_flush_stripe(conf, sh);
1158 if (++count >= num)
1159 break;
1160 }
1161}
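/*
 * Worked example (illustrative): with num == 3 and two full stripes cached,
 * both full stripes are flushed first; since count (2) is still below num,
 * one partial stripe is then flushed to reach three stripes in total.
 */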
1162
1163static void r5c_do_reclaim(struct r5conf *conf)
1164{
1165 struct r5l_log *log = conf->log;
1166 struct stripe_head *sh;
1167 int count = 0;
1168 unsigned long flags;
1169 int total_cached;
1170 int stripes_to_flush;
1171
1172 if (!r5c_is_writeback(log))
1173 return;
1174
1175 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1176 atomic_read(&conf->r5c_cached_full_stripes);
1177
1178 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1179 atomic_read(&conf->empty_inactive_list_nr) > 0)
1180 /*
1181 * if stripe cache pressure is high, flush all full stripes and
1182 * some partial stripes
1183 */
1184 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1185 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1186 atomic_read(&conf->r5c_cached_full_stripes) >
1187 R5C_FULL_STRIPE_FLUSH_BATCH)
1188 /*
1189 * if stripe cache pressure is moderate, or if there are many full
1190 * stripes, flush all full stripes
1191 */
1192 stripes_to_flush = 0;
1193 else
1194 /* no need to flush */
1195 stripes_to_flush = -1;
1196
1197 if (stripes_to_flush >= 0) {
1198 spin_lock_irqsave(&conf->device_lock, flags);
1199 r5c_flush_cache(conf, stripes_to_flush);
1200 spin_unlock_irqrestore(&conf->device_lock, flags);
1201 }
1202
1203 /* if log space is tight, flush stripes on stripe_in_journal_list */
1204 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1205 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1206 spin_lock(&conf->device_lock);
1207 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1208 /*
1209 * stripes on stripe_in_journal_list could be in any
1210 * state of the stripe_cache state machine. In this
1211 * case, we only want to flush stripe on
1212 * r5c_cached_full/partial_stripes. The following
1213 * condition makes sure the stripe is on one of the
1214 * two lists.
1215 */
1216 if (!list_empty(&sh->lru) &&
1217 !test_bit(STRIPE_HANDLE, &sh->state) &&
1218 atomic_read(&sh->count) == 0) {
1219 r5c_flush_stripe(conf, sh);
1220 }
1221 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1222 break;
1223 }
1224 spin_unlock(&conf->device_lock);
1225 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1226 }
1227 md_wakeup_thread(conf->mddev->thread);
1228}
1229
0576b1c6
SL
1230static void r5l_do_reclaim(struct r5l_log *log)
1231{
a39f7afd 1232 struct r5conf *conf = log->rdev->mddev->private;
0576b1c6 1233 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
17036461
CH
1234 sector_t reclaimable;
1235 sector_t next_checkpoint;
a39f7afd 1236 bool write_super;
0576b1c6
SL
1237
1238 spin_lock_irq(&log->io_list_lock);
a39f7afd
SL
1239 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1240 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
0576b1c6
SL
1241 /*
1242 * move proper io_units to the reclaim list. We should not change the order;
1243 * reclaimable/unreclaimable io_units can be mixed in the list, and we
1244 * shouldn't reuse the space of an unreclaimable io_unit
1245 */
1246 while (1) {
17036461
CH
1247 reclaimable = r5l_reclaimable_space(log);
1248 if (reclaimable >= reclaim_target ||
0576b1c6
SL
1249 (list_empty(&log->running_ios) &&
1250 list_empty(&log->io_end_ios) &&
a8c34f91 1251 list_empty(&log->flushing_ios) &&
04732f74 1252 list_empty(&log->finished_ios)))
0576b1c6
SL
1253 break;
1254
17036461
CH
1255 md_wakeup_thread(log->rdev->mddev->thread);
1256 wait_event_lock_irq(log->iounit_wait,
1257 r5l_reclaimable_space(log) > reclaimable,
1258 log->io_list_lock);
0576b1c6 1259 }
17036461 1260
a39f7afd 1261 next_checkpoint = r5c_calculate_new_cp(conf);
0576b1c6
SL
1262 spin_unlock_irq(&log->io_list_lock);
1263
17036461 1264 BUG_ON(reclaimable < 0);
a39f7afd
SL
1265
1266 if (reclaimable == 0 || !write_super)
0576b1c6
SL
1267 return;
1268
0576b1c6
SL
1269 /*
1270 * write_super will flush cache of each raid disk. We must write super
1271 * here, because the log area might be reused soon and we don't want to
1272 * confuse recovery
1273 */
4b482044 1274 r5l_write_super_and_discard_space(log, next_checkpoint);
0576b1c6
SL
1275
1276 mutex_lock(&log->io_mutex);
17036461 1277 log->last_checkpoint = next_checkpoint;
a39f7afd 1278 r5c_update_log_state(log);
0576b1c6 1279 mutex_unlock(&log->io_mutex);
0576b1c6 1280
17036461 1281 r5l_run_no_space_stripes(log);
0576b1c6
SL
1282}
1283
1284static void r5l_reclaim_thread(struct md_thread *thread)
1285{
1286 struct mddev *mddev = thread->mddev;
1287 struct r5conf *conf = mddev->private;
1288 struct r5l_log *log = conf->log;
1289
1290 if (!log)
1291 return;
a39f7afd 1292 r5c_do_reclaim(conf);
0576b1c6
SL
1293 r5l_do_reclaim(log);
1294}
1295
a39f7afd 1296void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
f6bed0ef 1297{
0576b1c6
SL
1298 unsigned long target;
1299 unsigned long new = (unsigned long)space; /* overflow in theory */
1300
a39f7afd
SL
1301 if (!log)
1302 return;
0576b1c6
SL
1303 do {
1304 target = log->reclaim_target;
1305 if (new < target)
1306 return;
1307 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1308 md_wakeup_thread(log->reclaim_thread);
f6bed0ef
SL
1309}
1310
e6c033f7
SL
1311void r5l_quiesce(struct r5l_log *log, int state)
1312{
4b482044 1313 struct mddev *mddev;
e6c033f7
SL
1314 if (!log || state == 2)
1315 return;
1316 if (state == 0) {
16a43f6a
SL
1317 /*
1318 * This is a special case for hotadd. In suspend, the array has
1319 * no journal. In resume, journal is initialized as well as the
1320 * reclaim thread.
1321 */
1322 if (log->reclaim_thread)
1323 return;
e6c033f7
SL
1324 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1325 log->rdev->mddev, "reclaim");
a39f7afd 1326 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
e6c033f7 1327 } else if (state == 1) {
4b482044
SL
1328 /* make sure r5l_write_super_and_discard_space exits */
1329 mddev = log->rdev->mddev;
1330 wake_up(&mddev->sb_wait);
a39f7afd 1331 r5l_wake_reclaim(log, MaxSector);
e6c033f7
SL
1332 md_unregister_thread(&log->reclaim_thread);
1333 r5l_do_reclaim(log);
1334 }
1335}
1336
6e74a9cf
SL
1337bool r5l_log_disk_error(struct r5conf *conf)
1338{
f6b6ec5c
SL
1339 struct r5l_log *log;
1340 bool ret;
7dde2ad3 1341 /* don't allow write if journal disk is missing */
f6b6ec5c
SL
1342 rcu_read_lock();
1343 log = rcu_dereference(conf->log);
1344
1345 if (!log)
1346 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1347 else
1348 ret = test_bit(Faulty, &log->rdev->flags);
1349 rcu_read_unlock();
1350 return ret;
6e74a9cf
SL
1351}
1352
355810d1
SL
1353struct r5l_recovery_ctx {
1354 struct page *meta_page; /* current meta */
1355 sector_t meta_total_blocks; /* total size of current meta and data */
1356 sector_t pos; /* recovery position */
1357 u64 seq; /* recovery position seq */
b4c625c6
SL
1358 int data_parity_stripes; /* number of data_parity stripes */
1359 int data_only_stripes; /* number of data_only stripes */
1360 struct list_head cached_list;
355810d1
SL
1361};
1362
9ed988f5
SL
1363static int r5l_recovery_read_meta_block(struct r5l_log *log,
1364 struct r5l_recovery_ctx *ctx)
355810d1
SL
1365{
1366 struct page *page = ctx->meta_page;
1367 struct r5l_meta_block *mb;
1368 u32 crc, stored_crc;
1369
796a5cf0
MC
1370 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1371 false))
355810d1
SL
1372 return -EIO;
1373
1374 mb = page_address(page);
1375 stored_crc = le32_to_cpu(mb->checksum);
1376 mb->checksum = 0;
1377
1378 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1379 le64_to_cpu(mb->seq) != ctx->seq ||
1380 mb->version != R5LOG_VERSION ||
1381 le64_to_cpu(mb->position) != ctx->pos)
1382 return -EINVAL;
1383
5cb2fbd6 1384 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
355810d1
SL
1385 if (stored_crc != crc)
1386 return -EINVAL;
1387
1388 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1389 return -EINVAL;
1390
1391 ctx->meta_total_blocks = BLOCK_SECTORS;
1392
1393 return 0;
1394}
1395
1396static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
1397 struct r5l_recovery_ctx *ctx,
1398 sector_t stripe_sect,
3fd880af 1399 int *offset)
355810d1
SL
1400{
1401 struct r5conf *conf = log->rdev->mddev->private;
1402 struct stripe_head *sh;
1403 struct r5l_payload_data_parity *payload;
1404 int disk_index;
1405
1406 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
1407 while (1) {
3fd880af
J
1408 sector_t log_offset = r5l_ring_add(log, ctx->pos,
1409 ctx->meta_total_blocks);
355810d1
SL
1410 payload = page_address(ctx->meta_page) + *offset;
1411
1412 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1413 raid5_compute_sector(conf,
1414 le64_to_cpu(payload->location), 0,
1415 &disk_index, sh);
1416
3fd880af 1417 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
796a5cf0
MC
1418 sh->dev[disk_index].page, REQ_OP_READ, 0,
1419 false);
355810d1
SL
1420 sh->dev[disk_index].log_checksum =
1421 le32_to_cpu(payload->checksum[0]);
1422 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
355810d1
SL
1423 } else {
1424 disk_index = sh->pd_idx;
3fd880af 1425 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
796a5cf0
MC
1426 sh->dev[disk_index].page, REQ_OP_READ, 0,
1427 false);
355810d1
SL
1428 sh->dev[disk_index].log_checksum =
1429 le32_to_cpu(payload->checksum[0]);
1430 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1431
1432 if (sh->qd_idx >= 0) {
1433 disk_index = sh->qd_idx;
1434 sync_page_io(log->rdev,
3fd880af 1435 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
355810d1 1436 PAGE_SIZE, sh->dev[disk_index].page,
796a5cf0 1437 REQ_OP_READ, 0, false);
355810d1
SL
1438 sh->dev[disk_index].log_checksum =
1439 le32_to_cpu(payload->checksum[1]);
1440 set_bit(R5_Wantwrite,
1441 &sh->dev[disk_index].flags);
1442 }
355810d1
SL
1443 }
1444
3fd880af 1445 ctx->meta_total_blocks += le32_to_cpu(payload->size);
355810d1
SL
1446 *offset += sizeof(struct r5l_payload_data_parity) +
1447 sizeof(__le32) *
1448 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1449 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
1450 break;
1451 }
1452
1453 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1454 void *addr;
1455 u32 checksum;
1456
1457 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1458 continue;
1459 addr = kmap_atomic(sh->dev[disk_index].page);
5cb2fbd6 1460 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
355810d1
SL
1461 kunmap_atomic(addr);
1462 if (checksum != sh->dev[disk_index].log_checksum)
1463 goto error;
1464 }
1465
1466 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1467 struct md_rdev *rdev, *rrdev;
1468
1469 if (!test_and_clear_bit(R5_Wantwrite,
1470 &sh->dev[disk_index].flags))
1471 continue;
1472
1473 /* in case device is broken */
354b445b 1474 rcu_read_lock();
355810d1 1475 rdev = rcu_dereference(conf->disks[disk_index].rdev);
354b445b
SL
1476 if (rdev) {
1477 atomic_inc(&rdev->nr_pending);
1478 rcu_read_unlock();
355810d1 1479 sync_page_io(rdev, stripe_sect, PAGE_SIZE,
796a5cf0
MC
1480 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1481 false);
354b445b
SL
1482 rdev_dec_pending(rdev, rdev->mddev);
1483 rcu_read_lock();
1484 }
355810d1 1485 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
354b445b
SL
1486 if (rrdev) {
1487 atomic_inc(&rrdev->nr_pending);
1488 rcu_read_unlock();
355810d1 1489 sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
796a5cf0
MC
1490 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1491 false);
354b445b
SL
1492 rdev_dec_pending(rrdev, rrdev->mddev);
1493 rcu_read_lock();
1494 }
1495 rcu_read_unlock();
355810d1
SL
1496 }
1497 raid5_release_stripe(sh);
1498 return 0;
1499
1500error:
1501 for (disk_index = 0; disk_index < sh->disks; disk_index++)
1502 sh->dev[disk_index].flags = 0;
1503 raid5_release_stripe(sh);
1504 return -EINVAL;
1505}
1506
1507static int r5l_recovery_flush_one_meta(struct r5l_log *log,
1508 struct r5l_recovery_ctx *ctx)
1509{
1510 struct r5conf *conf = log->rdev->mddev->private;
1511 struct r5l_payload_data_parity *payload;
1512 struct r5l_meta_block *mb;
1513 int offset;
355810d1
SL
1514 sector_t stripe_sector;
1515
1516 mb = page_address(ctx->meta_page);
1517 offset = sizeof(struct r5l_meta_block);
355810d1
SL
1518
1519 while (offset < le32_to_cpu(mb->meta_size)) {
1520 int dd;
1521
1522 payload = (void *)mb + offset;
1523 stripe_sector = raid5_compute_sector(conf,
1524 le64_to_cpu(payload->location), 0, &dd, NULL);
1525 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
3fd880af 1526 &offset))
355810d1
SL
1527 return -EINVAL;
1528 }
1529 return 0;
1530}
1531
1532/* copy data/parity from log to raid disks */
1533static void r5l_recovery_flush_log(struct r5l_log *log,
1534 struct r5l_recovery_ctx *ctx)
1535{
1536 while (1) {
9ed988f5 1537 if (r5l_recovery_read_meta_block(log, ctx))
355810d1
SL
1538 return;
1539 if (r5l_recovery_flush_one_meta(log, ctx))
1540 return;
1541 ctx->seq++;
1542 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1543 }
1544}
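/*
 * Worked example (illustrative): recovery starts with ctx->pos at the log
 * tail (last_checkpoint) and ctx->seq == last_cp_seq; each valid meta block
 * advances pos by meta_total_blocks (the meta block plus the data/parity
 * blocks it describes) and seq by 1, until a read error or an invalid /
 * mismatching meta block ends the scan.
 */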
1545
9ed988f5
SL
1546static void
1547r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1548 struct page *page,
1549 sector_t pos, u64 seq)
355810d1 1550{
355810d1
SL
1551 struct r5l_meta_block *mb;
1552 u32 crc;
1553
355810d1 1554 mb = page_address(page);
9ed988f5 1555 clear_page(mb);
355810d1
SL
1556 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1557 mb->version = R5LOG_VERSION;
1558 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1559 mb->seq = cpu_to_le64(seq);
1560 mb->position = cpu_to_le64(pos);
5cb2fbd6 1561 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
355810d1 1562 mb->checksum = cpu_to_le32(crc);
9ed988f5 1563}
355810d1 1564
9ed988f5
SL
1565static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1566 u64 seq)
1567{
1568 struct page *page;
1569
1570 page = alloc_page(GFP_KERNEL);
1571 if (!page)
1572 return -ENOMEM;
1573 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
796a5cf0
MC
1574 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1575 WRITE_FUA, false)) {
355810d1
SL
1576 __free_page(page);
1577 return -EIO;
1578 }
1579 __free_page(page);
1580 return 0;
1581}
1582
b4c625c6
SL
1583/*
1584 * r5l_recovery_load_data and r5l_recovery_load_parity use the flag R5_Wantwrite
1585 * to mark valid (potentially not flushed) data in the journal.
1586 *
1587 * We have already verified the checksums in r5l_recovery_verify_data_checksum_for_mb,
1588 * so there should not be any mismatch here.
1589 */
1590static void r5l_recovery_load_data(struct r5l_log *log,
1591 struct stripe_head *sh,
1592 struct r5l_recovery_ctx *ctx,
1593 struct r5l_payload_data_parity *payload,
1594 sector_t log_offset)
1595{
1596 struct mddev *mddev = log->rdev->mddev;
1597 struct r5conf *conf = mddev->private;
1598 int dd_idx;
1599
1600 raid5_compute_sector(conf,
1601 le64_to_cpu(payload->location), 0,
1602 &dd_idx, sh);
1603 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1604 sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
1605 sh->dev[dd_idx].log_checksum =
1606 le32_to_cpu(payload->checksum[0]);
1607 ctx->meta_total_blocks += BLOCK_SECTORS;
1608
1609 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1610 set_bit(STRIPE_R5C_CACHING, &sh->state);
1611}
1612
1613static void r5l_recovery_load_parity(struct r5l_log *log,
1614 struct stripe_head *sh,
1615 struct r5l_recovery_ctx *ctx,
1616 struct r5l_payload_data_parity *payload,
1617 sector_t log_offset)
1618{
1619 struct mddev *mddev = log->rdev->mddev;
1620 struct r5conf *conf = mddev->private;
1621
1622 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1623 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1624 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
1625 sh->dev[sh->pd_idx].log_checksum =
1626 le32_to_cpu(payload->checksum[0]);
1627 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1628
1629 if (sh->qd_idx >= 0) {
1630 sync_page_io(log->rdev,
1631 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1632 PAGE_SIZE, sh->dev[sh->qd_idx].page,
1633 REQ_OP_READ, 0, false);
1634 sh->dev[sh->qd_idx].log_checksum =
1635 le32_to_cpu(payload->checksum[1]);
1636 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1637 }
1638 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1639}
1640
1641static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1642{
1643 int i;
1644
1645 sh->state = 0;
1646 sh->log_start = MaxSector;
1647 for (i = sh->disks; i--; )
1648 sh->dev[i].flags = 0;
1649}
1650
1651static void
1652r5l_recovery_replay_one_stripe(struct r5conf *conf,
1653 struct stripe_head *sh,
1654 struct r5l_recovery_ctx *ctx)
1655{
1656 struct md_rdev *rdev, *rrdev;
1657 int disk_index;
1658 int data_count = 0;
1659
1660 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1661 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1662 continue;
1663 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1664 continue;
1665 data_count++;
1666 }
1667
1668 /*
1669 * stripes that only have parity must have been flushed
1670 * before the crash that we are now recovering from, so
1671 * there is nothing more to recover.
1672 */
1673 if (data_count == 0)
1674 goto out;
1675
1676 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1677 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1678 continue;
1679
1680 /* in case device is broken */
1681 rcu_read_lock();
1682 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1683 if (rdev) {
1684 atomic_inc(&rdev->nr_pending);
1685 rcu_read_unlock();
1686 sync_page_io(rdev, sh->sector, PAGE_SIZE,
1687 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1688 false);
1689 rdev_dec_pending(rdev, rdev->mddev);
1690 rcu_read_lock();
1691 }
1692 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1693 if (rrdev) {
1694 atomic_inc(&rrdev->nr_pending);
1695 rcu_read_unlock();
1696 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1697 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1698 false);
1699 rdev_dec_pending(rrdev, rrdev->mddev);
1700 rcu_read_lock();
1701 }
1702 rcu_read_unlock();
1703 }
1704 ctx->data_parity_stripes++;
1705out:
1706 r5l_recovery_reset_stripe(sh);
1707}
1708
1709static struct stripe_head *
1710r5c_recovery_alloc_stripe(struct r5conf *conf,
1711 struct list_head *recovery_list,
1712 sector_t stripe_sect,
1713 sector_t log_start)
1714{
1715 struct stripe_head *sh;
1716
1717 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1718 if (!sh)
1719 return NULL; /* no more stripe available */
1720
1721 r5l_recovery_reset_stripe(sh);
1722 sh->log_start = log_start;
1723
1724 return sh;
1725}
1726
1727static struct stripe_head *
1728r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1729{
1730 struct stripe_head *sh;
1731
1732 list_for_each_entry(sh, list, lru)
1733 if (sh->sector == sect)
1734 return sh;
1735 return NULL;
1736}
1737
1738static void
1739r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1740 struct r5l_recovery_ctx *ctx)
1741{
1742 struct stripe_head *sh, *next;
1743
1744 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1745 r5l_recovery_reset_stripe(sh);
1746 list_del_init(&sh->lru);
1747 raid5_release_stripe(sh);
1748 }
1749}
1750
1751static void
1752r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1753 struct r5l_recovery_ctx *ctx)
1754{
1755 struct stripe_head *sh, *next;
1756
1757 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1758 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1759 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1760 list_del_init(&sh->lru);
1761 raid5_release_stripe(sh);
1762 }
1763}
1764
1765/* if the checksum matches, return 0; otherwise return -EINVAL */
1766static int
1767r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
1768 sector_t log_offset, __le32 log_checksum)
1769{
1770 void *addr;
1771 u32 checksum;
1772
1773 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1774 page, REQ_OP_READ, 0, false);
1775 addr = kmap_atomic(page);
1776 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1777 kunmap_atomic(addr);
1778 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1779}
1780
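/*
 * A minimal sketch of the per-page checksum rule applied above: every 4 KB
 * block in the log is covered by a crc32c seeded with log->uuid_checksum and
 * stored as a little-endian value in the payload. The standalone helper below
 * is illustrative only; its name and the plain crc32c() prototype are
 * assumptions, not part of this driver.
 *
 *	#include <stdbool.h>
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	uint32_t crc32c(uint32_t seed, const void *buf, size_t len);
 *
 *	static bool log_page_csum_ok(uint32_t uuid_csum, const void *page4k,
 *				     uint32_t stored_csum_cpu_order)
 *	{
 *		return crc32c(uuid_csum, page4k, 4096) == stored_csum_cpu_order;
 *	}
 */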
1781/*
1782 * before loading data into the stripe cache, we need to verify the checksum of
1783 * every data page; if any page has a mismatch, we drop all data in this meta block
1784 */
1785static int
1786r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1787 struct r5l_recovery_ctx *ctx)
1788{
1789 struct mddev *mddev = log->rdev->mddev;
1790 struct r5conf *conf = mddev->private;
1791 struct r5l_meta_block *mb = page_address(ctx->meta_page);
1792 sector_t mb_offset = sizeof(struct r5l_meta_block);
1793 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1794 struct page *page;
1795 struct r5l_payload_data_parity *payload;
1796
1797 page = alloc_page(GFP_KERNEL);
1798 if (!page)
1799 return -ENOMEM;
1800
1801 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1802 payload = (void *)mb + mb_offset;
1803
1804 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1805 if (r5l_recovery_verify_data_checksum(
1806 log, page, log_offset,
1807 payload->checksum[0]) < 0)
1808 goto mismatch;
1809 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
1810 if (r5l_recovery_verify_data_checksum(
1811 log, page, log_offset,
1812 payload->checksum[0]) < 0)
1813 goto mismatch;
1814 if (conf->max_degraded == 2 && /* q for RAID 6 */
1815 r5l_recovery_verify_data_checksum(
1816 log, page,
1817 r5l_ring_add(log, log_offset,
1818 BLOCK_SECTORS),
1819 payload->checksum[1]) < 0)
1820 goto mismatch;
1821 } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
1822 goto mismatch;
1823
1824 log_offset = r5l_ring_add(log, log_offset,
1825 le32_to_cpu(payload->size));
1826
1827 mb_offset += sizeof(struct r5l_payload_data_parity) +
1828 sizeof(__le32) *
1829 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1830 }
1831
1832 put_page(page);
1833 return 0;
1834
1835mismatch:
1836 put_page(page);
1837 return -EINVAL;
1838}
1839
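/*
 * Payload-walk arithmetic used above and repeated in
 * r5c_recovery_analyze_meta_block() (a sketch, assuming 4 KB pages and the
 * 16-byte r5l_payload_data_parity header declared in raid/md_p.h):
 *
 *	pages      = le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9);
 *	mb_offset += sizeof(struct r5l_payload_data_parity) +
 *		     sizeof(__le32) * pages;		// one checksum per 4 KB block
 *	log_offset = r5l_ring_add(log, log_offset,
 *				  le32_to_cpu(payload->size));	// size is in sectors
 *
 * Example: one dirty data page has payload->size == 8 sectors, so its entry
 * takes 16 + 4 = 20 bytes of the meta block and 8 sectors of log space; a
 * RAID6 parity payload (P + Q) takes 16 + 8 = 24 bytes and 16 sectors.
 */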
1840/*
1841 * Analyze all data/parity pages in one meta block
1842 * Returns:
1843 * 0 for success
1844 * -EINVAL for an unknown payload type
1845 * -EAGAIN for a checksum mismatch in a data page
1846 * -ENOMEM when we run out of memory (alloc_page() failed or no stripes left)
1847 */
1848static int
1849r5c_recovery_analyze_meta_block(struct r5l_log *log,
1850 struct r5l_recovery_ctx *ctx,
1851 struct list_head *cached_stripe_list)
1852{
1853 struct mddev *mddev = log->rdev->mddev;
1854 struct r5conf *conf = mddev->private;
1855 struct r5l_meta_block *mb;
1856 struct r5l_payload_data_parity *payload;
1857 int mb_offset;
1858 sector_t log_offset;
1859 sector_t stripe_sect;
1860 struct stripe_head *sh;
1861 int ret;
1862
1863 /*
1864 * on a checksum mismatch in the data blocks, we drop all data in this mb,
1865 * but we still read the next mb for other data with the FLUSH flag, as an
1866 * io_unit could finish out of order.
1867 */
1868 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
1869 if (ret == -EINVAL)
1870 return -EAGAIN;
1871 else if (ret)
1872 return ret; /* -ENOMEM because alloc_page() failed */
1873
1874 mb = page_address(ctx->meta_page);
1875 mb_offset = sizeof(struct r5l_meta_block);
1876 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1877
1878 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1879 int dd;
1880
1881 payload = (void *)mb + mb_offset;
1882 stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
1883 raid5_compute_sector(
1884 conf, le64_to_cpu(payload->location), 0, &dd,
1885 NULL)
1886 : le64_to_cpu(payload->location);
1887
1888 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1889 stripe_sect);
1890
1891 if (!sh) {
1892 sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
1893 stripe_sect, ctx->pos);
1894 /*
1895 * cannot get a stripe from raid5_get_active_stripe();
1896 * try replaying some cached stripes to free them up
1897 */
1898 if (!sh) {
1899 r5c_recovery_replay_stripes(
1900 cached_stripe_list, ctx);
1901 sh = r5c_recovery_alloc_stripe(
1902 conf, cached_stripe_list,
1903 stripe_sect, ctx->pos);
1904 }
1905 if (!sh) {
1906 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
1907 mdname(mddev),
1908 conf->min_nr_stripes * 2);
1909 raid5_set_cache_size(mddev,
1910 conf->min_nr_stripes * 2);
1911 sh = r5c_recovery_alloc_stripe(
1912 conf, cached_stripe_list, stripe_sect,
1913 ctx->pos);
1914 }
1915 if (!sh) {
1916 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1917 mdname(mddev));
1918 return -ENOMEM;
1919 }
1920 list_add_tail(&sh->lru, cached_stripe_list);
1921 }
1922
1923 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1924 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1925 r5l_recovery_replay_one_stripe(conf, sh, ctx);
1926 r5l_recovery_reset_stripe(sh);
1927 sh->log_start = ctx->pos;
1928 list_move_tail(&sh->lru, cached_stripe_list);
1929 }
1930 r5l_recovery_load_data(log, sh, ctx, payload,
1931 log_offset);
1932 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
1933 r5l_recovery_load_parity(log, sh, ctx, payload,
1934 log_offset);
1935 else
1936 return -EINVAL;
1937
1938 log_offset = r5l_ring_add(log, log_offset,
1939 le32_to_cpu(payload->size));
1940
1941 mb_offset += sizeof(struct r5l_payload_data_parity) +
1942 sizeof(__le32) *
1943 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1944 }
1945
1946 return 0;
1947}
1948
1949/*
1950 * Load the stripe into cache. The stripe will be written out later by
1951 * the stripe cache state machine.
1952 */
1953static void r5c_recovery_load_one_stripe(struct r5l_log *log,
1954 struct stripe_head *sh)
1955{
1956 struct r5conf *conf = sh->raid_conf;
1957 struct r5dev *dev;
1958 int i;
1959
1960 for (i = sh->disks; i--; ) {
1961 dev = sh->dev + i;
1962 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
1963 set_bit(R5_InJournal, &dev->flags);
1964 set_bit(R5_UPTODATE, &dev->flags);
1965 }
1966 }
1967 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
1968 atomic_inc(&conf->r5c_cached_partial_stripes);
1969 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
1970}
1971
1972/*
1973 * Scan through the log for all to-be-flushed data
1974 *
1975 * For stripes with both data and parity, namely Data-Parity stripes
1976 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
1977 *
1978 * For stripes with only data, namely Data-Only stripes
1979 * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
1980 *
1981 * For a stripe, if we see data after parity, we should discard all previous
1982 * data and parity for this stripe, as that data has already been flushed to
1983 * the array.
1984 *
1985 * At the end of the scan, we return the new journal_tail, which points to the
1986 * first data-only stripe on the journal device, or to the next invalid meta block.
1987 */
1988static int r5c_recovery_flush_log(struct r5l_log *log,
1989 struct r5l_recovery_ctx *ctx)
1990{
1991 struct stripe_head *sh, *next;
1992 int ret = 0;
1993
1994 /* scan through the log */
1995 while (1) {
1996 if (r5l_recovery_read_meta_block(log, ctx))
1997 break;
1998
1999 ret = r5c_recovery_analyze_meta_block(log, ctx,
2000 &ctx->cached_list);
2001 /*
2002 * -EAGAIN means a checksum mismatch in a data block; in this case we
2003 * still try to scan the next meta block
2004 */
2005 if (ret && ret != -EAGAIN)
2006 break; /* ret == -EINVAL or -ENOMEM */
2007 ctx->seq++;
2008 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
2009 }
2010
2011 if (ret == -ENOMEM) {
2012 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
2013 return ret;
2014 }
2015
2016 /* replay data-parity stripes */
2017 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
2018
2019 /* load data-only stripes to stripe cache */
2020 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2021 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2022 r5c_recovery_load_one_stripe(log, sh);
2023 list_del_init(&sh->lru);
2024 raid5_release_stripe(sh);
2025 ctx->data_only_stripes++;
2026 }
2027
2028 return 0;
2029}
2030
2031/*
2032 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
2033 * log will start there, but we can't let the superblock keep pointing at the
2034 * last valid meta block. The log might look like:
2035 * | meta 1| meta 2| meta 3|
2036 * meta 1 is valid, meta 2 is invalid and meta 3 could still be valid. If the
2037 * superblock points to meta 1 and we write a new valid meta 2n, then after
2038 * another crash the next recovery starts from meta 1 again. Since meta 2n is
2039 * now valid, that recovery will also treat meta 3 as valid, which is wrong.
2040 * The solution is to create a new meta block in place of meta 2 with
2041 * seq == meta 1's seq + 10 and let the superblock point to it. That recovery
2042 * will then reject meta 3, because its seq doesn't match.
2043 */
2044
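/*
 * A worked example of the seq bump described above (a sketch; the acceptance
 * test lives in r5l_recovery_read_meta_block() earlier in this file): a meta
 * block read at ctx->pos is only accepted when its magic, version and crc are
 * valid and
 *
 *	le64_to_cpu(mb->position) == ctx->pos &&
 *	le64_to_cpu(mb->seq)      == ctx->seq
 *
 * Say meta 1 has seq S, so the old chain had meta 2 == S + 1 and
 * meta 3 == S + 2. If the block rewritten at meta 2's position kept seq S + 1,
 * a second recovery would accept it, expect S + 2 at meta 3's position, and
 * wrongly accept the stale meta 3. Because the rewrite bumps the seq by 10,
 * the second recovery instead expects a value around S + 12 there, so the
 * stale meta 3 (seq S + 2) fails the seq check.
 */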
2045/*
2046 * Before recovery, the log looks like the following
2047 *
2048 * ---------------------------------------------
2049 * | valid log | invalid log |
2050 * ---------------------------------------------
2051 * ^
2052 * |- log->last_checkpoint
2053 * |- log->last_cp_seq
2054 *
2055 * Now we scan through the log until we see an invalid entry
2056 *
2057 * ---------------------------------------------
2058 * | valid log | invalid log |
2059 * ---------------------------------------------
2060 * ^ ^
2061 * |- log->last_checkpoint |- ctx->pos
2062 * |- log->last_cp_seq |- ctx->seq
2063 *
2064 * From this point, we need to increase the seq number by 10 to avoid
2065 * confusing the next recovery.
2066 *
2067 * ---------------------------------------------
2068 * | valid log | invalid log |
2069 * ---------------------------------------------
2070 * ^ ^
2071 * |- log->last_checkpoint |- ctx->pos+1
2072 * |- log->last_cp_seq |- ctx->seq+11
2073 *
2074 * However, it is not safe to start the state machine yet, because the data
2075 * of data-only stripes is not yet secured in the RAID (their parity has not
2076 * been written). To keep it safe, we rewrite these stripes from seq+11.
2077 *
2078 * -----------------------------------------------------------------
2079 * | valid log | data only stripes | invalid log |
2080 * -----------------------------------------------------------------
2081 * ^ ^
2082 * |- log->last_checkpoint |- ctx->pos+n
2083 * |- log->last_cp_seq |- ctx->seq+10+n
2084 *
2085 * If a failure happens again during this process, the recovery can safely
2086 * start again from log->last_checkpoint.
2087 *
2088 * Once the data-only stripes are rewritten to the journal, we move the log_tail
2089 *
2090 * -----------------------------------------------------------------
2091 * | old log | data only stripes | invalid log |
2092 * -----------------------------------------------------------------
2093 * ^ ^
2094 * |- log->last_checkpoint |- ctx->pos+n
2095 * |- log->last_cp_seq |- ctx->seq+10+n
2096 *
2097 * Then we can safely start the state machine. If a failure happens from this
2098 * point on, the recovery will start from the new log->last_checkpoint.
2099 */
2100static int
2101r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2102 struct r5l_recovery_ctx *ctx)
2103{
2104 struct stripe_head *sh;
2105 struct mddev *mddev = log->rdev->mddev;
2106 struct page *page;
2107
2108 page = alloc_page(GFP_KERNEL);
2109 if (!page) {
2110 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2111 mdname(mddev));
2112 return -ENOMEM;
2113 }
2114
2115 ctx->seq += 10;
2116 list_for_each_entry(sh, &ctx->cached_list, lru) {
2117 struct r5l_meta_block *mb;
2118 int i;
2119 int offset;
2120 sector_t write_pos;
2121
2122 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2123 r5l_recovery_create_empty_meta_block(log, page,
2124 ctx->pos, ctx->seq);
2125 mb = page_address(page);
2126 offset = le32_to_cpu(mb->meta_size);
2127 write_pos = ctx->pos + BLOCK_SECTORS;
2128
2129 for (i = sh->disks; i--; ) {
2130 struct r5dev *dev = &sh->dev[i];
2131 struct r5l_payload_data_parity *payload;
2132 void *addr;
2133
2134 if (test_bit(R5_InJournal, &dev->flags)) {
2135 payload = (void *)mb + offset;
2136 payload->header.type = cpu_to_le16(
2137 R5LOG_PAYLOAD_DATA);
2138 payload->size = cpu_to_le32(BLOCK_SECTORS);
2139 payload->location = cpu_to_le64(
2140 raid5_compute_blocknr(sh, i, 0));
2141 addr = kmap_atomic(dev->page);
2142 payload->checksum[0] = cpu_to_le32(
2143 crc32c_le(log->uuid_checksum, addr,
2144 PAGE_SIZE));
2145 kunmap_atomic(addr);
2146 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2147 dev->page, REQ_OP_WRITE, 0, false);
2148 write_pos = r5l_ring_add(log, write_pos,
2149 BLOCK_SECTORS);
2150 offset += sizeof(__le32) +
2151 sizeof(struct r5l_payload_data_parity);
2152
2153 }
2154 }
2155 mb->meta_size = cpu_to_le32(offset);
2156 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, mb, PAGE_SIZE));
2157 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2158 REQ_OP_WRITE, WRITE_FUA, false);
2159 sh->log_start = ctx->pos;
2160 ctx->pos = write_pos;
2161 ctx->seq += 1;
2162 }
2163 __free_page(page);
2164 return 0;
2165}
2166
f6bed0ef
SL
2167static int r5l_recovery_log(struct r5l_log *log)
2168{
355810d1
SL
2169 struct r5l_recovery_ctx ctx;
2170
2171 ctx.pos = log->last_checkpoint;
2172 ctx.seq = log->last_cp_seq;
2173 ctx.meta_page = alloc_page(GFP_KERNEL);
b4c625c6
SL
2174 ctx.data_only_stripes = 0;
2175 ctx.data_parity_stripes = 0;
2176 INIT_LIST_HEAD(&ctx.cached_list);
2177
355810d1
SL
2178 if (!ctx.meta_page)
2179 return -ENOMEM;
2180
2181 r5l_recovery_flush_log(log, &ctx);
2182 __free_page(ctx.meta_page);
2183
2184 /*
2185 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
2186 * log will start there, but we can't let the superblock keep pointing at the
2187 * last valid meta block. The log might look like:
2188 * | meta 1| meta 2| meta 3|
2189 * meta 1 is valid, meta 2 is invalid and meta 3 could still be valid. If the
2190 * superblock points to meta 1 and we write a new valid meta 2n, then after
2191 * another crash the next recovery starts from meta 1 again. Since meta 2n is
2192 * now valid, that recovery will also treat meta 3 as valid, which is wrong.
2193 * The solution is to create a new meta block in place of meta 2 with
2194 * seq == meta 1's seq + 10 and let the superblock point to it. That recovery
2195 * will then reject meta 3, because its seq doesn't match.
2196 */
9a8b27fa 2197 if (ctx.seq > log->last_cp_seq) {
355810d1
SL
2198 int ret;
2199
2200 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
2201 if (ret)
2202 return ret;
2203 log->seq = ctx.seq + 11;
2204 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
2205 r5l_write_super(log, ctx.pos);
28cd88e2
ZL
2206 log->last_checkpoint = ctx.pos;
2207 log->next_checkpoint = ctx.pos;
355810d1
SL
2208 } else {
2209 log->log_start = ctx.pos;
2210 log->seq = ctx.seq;
2211 }
b4c625c6
SL
2212
2213 /*
2214 * This is to suppress "function defined but not used" warning.
2215 * It will be removed when the two functions are used (next patch).
2216 */
2217 if (!log) {
2218 r5c_recovery_flush_log(log, &ctx);
2219 r5c_recovery_rewrite_data_only_stripes(log, &ctx);
2220 }
2221
f6bed0ef
SL
2222 return 0;
2223}
2224
2225static void r5l_write_super(struct r5l_log *log, sector_t cp)
2226{
2227 struct mddev *mddev = log->rdev->mddev;
2228
2229 log->rdev->journal_tail = cp;
2230 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2231}
2232
2c7da14b
SL
2233static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2234{
2235 struct r5conf *conf = mddev->private;
2236 int ret;
2237
2238 if (!conf->log)
2239 return 0;
2240
2241 switch (conf->log->r5c_journal_mode) {
2242 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2243 ret = snprintf(
2244 page, PAGE_SIZE, "[%s] %s\n",
2245 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2246 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2247 break;
2248 case R5C_JOURNAL_MODE_WRITE_BACK:
2249 ret = snprintf(
2250 page, PAGE_SIZE, "%s [%s]\n",
2251 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2252 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2253 break;
2254 default:
2255 ret = 0;
2256 }
2257 return ret;
2258}
2259
2260static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2261 const char *page, size_t length)
2262{
2263 struct r5conf *conf = mddev->private;
2264 struct r5l_log *log = conf->log;
2265 int val = -1, i;
2266 int len = length;
2267
2268 if (!log)
2269 return -ENODEV;
2270
2271 if (len && page[len - 1] == '\n')
2272 len -= 1;
2273 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2274 if (strlen(r5c_journal_mode_str[i]) == len &&
2275 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2276 val = i;
2277 break;
2278 }
2279 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2280 val > R5C_JOURNAL_MODE_WRITE_BACK)
2281 return -EINVAL;
2282
2283 mddev_suspend(mddev);
2284 conf->log->r5c_journal_mode = val;
2285 mddev_resume(mddev);
2286
2287 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2288 mdname(mddev), val, r5c_journal_mode_str[val]);
2289 return length;
2290}
2291
2292struct md_sysfs_entry
2293r5c_journal_mode = __ATTR(journal_mode, 0644,
2294 r5c_journal_mode_show, r5c_journal_mode_store);
2295
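/*
 * Usage sketch for the attribute above: the store handler accepts exactly
 * "write-through" or "write-back" (an optional trailing newline is stripped),
 * and the show handler brackets the active mode. Assuming the usual md sysfs
 * layout, the file is expected to appear as /sys/block/<md-dev>/md/journal_mode;
 * the path and the userspace helper below are illustrative only.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int set_journal_mode(const char *sysfs_path, const char *mode)
 *	{
 *		int fd = open(sysfs_path, O_WRONLY);
 *		ssize_t n;
 *
 *		if (fd < 0)
 *			return -1;
 *		n = write(fd, mode, strlen(mode));
 *		close(fd);
 *		return n < 0 ? -1 : 0;
 *	}
 */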
2ded3703
SL
2296/*
2297 * Try to handle a write operation in the caching phase. This function should
2298 * only be called in write-back mode.
2299 *
2300 * If all outstanding writes can be handled in the caching phase, return 0.
2301 * If the writes require the write-out phase, call r5c_make_stripe_write_out()
2302 * and return -EAGAIN.
2303 */
2304int r5c_try_caching_write(struct r5conf *conf,
2305 struct stripe_head *sh,
2306 struct stripe_head_state *s,
2307 int disks)
2308{
2309 struct r5l_log *log = conf->log;
1e6d690b
SL
2310 int i;
2311 struct r5dev *dev;
2312 int to_cache = 0;
2ded3703
SL
2313
2314 BUG_ON(!r5c_is_writeback(log));
2315
1e6d690b
SL
2316 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2317 /*
2318 * There are two different scenarios here:
2319 * 1. The stripe has some data cached, and it is sent to
2320 * write-out phase for reclaim
2321 * 2. The stripe is clean, and this is the first write
2322 *
2323 * For 1, return -EAGAIN, so we continue with
2324 * handle_stripe_dirtying().
2325 *
2326 * For 2, set STRIPE_R5C_CACHING and continue with caching
2327 * write.
2328 */
2329
2330 /* case 1: anything injournal or anything in written */
2331 if (s->injournal > 0 || s->written > 0)
2332 return -EAGAIN;
2333 /* case 2 */
2334 set_bit(STRIPE_R5C_CACHING, &sh->state);
2335 }
2336
2337 for (i = disks; i--; ) {
2338 dev = &sh->dev[i];
2339 /* if non-overwrite, use writing-out phase */
2340 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2341 !test_bit(R5_InJournal, &dev->flags)) {
2342 r5c_make_stripe_write_out(sh);
2343 return -EAGAIN;
2344 }
2345 }
2346
2347 for (i = disks; i--; ) {
2348 dev = &sh->dev[i];
2349 if (dev->towrite) {
2350 set_bit(R5_Wantwrite, &dev->flags);
2351 set_bit(R5_Wantdrain, &dev->flags);
2352 set_bit(R5_LOCKED, &dev->flags);
2353 to_cache++;
2354 }
2355 }
2356
2357 if (to_cache) {
2358 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2359 /*
2360 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
2361 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
2362 * r5c_handle_data_cached()
2363 */
2364 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2365 }
2366
2367 return 0;
2368}
2369
2370/*
2371 * free extra pages (orig_page) we allocated for prexor
2372 */
2373void r5c_release_extra_page(struct stripe_head *sh)
2374{
2375 int i;
2376
2377 for (i = sh->disks; i--; )
2378 if (sh->dev[i].page != sh->dev[i].orig_page) {
2379 struct page *p = sh->dev[i].orig_page;
2380
2381 sh->dev[i].orig_page = sh->dev[i].page;
2382 put_page(p);
2383 }
2ded3703
SL
2384}
2385
2386/*
2387 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
2388 * stripe is committed to RAID disks.
2389 */
2390void r5c_finish_stripe_write_out(struct r5conf *conf,
2391 struct stripe_head *sh,
2392 struct stripe_head_state *s)
2393{
1e6d690b
SL
2394 int i;
2395 int do_wakeup = 0;
2396
2ded3703
SL
2397 if (!conf->log ||
2398 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2399 return;
2400
2401 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2402 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2403
2404 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2405 return;
1e6d690b
SL
2406
2407 for (i = sh->disks; i--; ) {
2408 clear_bit(R5_InJournal, &sh->dev[i].flags);
2409 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2410 do_wakeup = 1;
2411 }
2412
2413 /*
2414 * analyse_stripe() runs before r5c_finish_stripe_write_out();
2415 * we updated R5_InJournal above, so also update s->injournal.
2416 */
2417 s->injournal = 0;
2418
2419 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2420 if (atomic_dec_and_test(&conf->pending_full_writes))
2421 md_wakeup_thread(conf->mddev->thread);
2422
2423 if (do_wakeup)
2424 wake_up(&conf->wait_for_overlap);
a39f7afd
SL
2425
2426 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2427 return;
2428
2429 spin_lock_irq(&conf->log->stripe_in_journal_lock);
2430 list_del_init(&sh->r5c);
2431 spin_unlock_irq(&conf->log->stripe_in_journal_lock);
2432 sh->log_start = MaxSector;
2433 atomic_dec(&conf->log->stripe_in_journal_count);
1e6d690b
SL
2434}
2435
2436int
2437r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
2438 struct stripe_head_state *s)
2439{
a39f7afd 2440 struct r5conf *conf = sh->raid_conf;
1e6d690b
SL
2441 int pages = 0;
2442 int reserve;
2443 int i;
2444 int ret = 0;
2445
2446 BUG_ON(!log);
2447
2448 for (i = 0; i < sh->disks; i++) {
2449 void *addr;
2450
2451 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2452 continue;
2453 addr = kmap_atomic(sh->dev[i].page);
2454 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2455 addr, PAGE_SIZE);
2456 kunmap_atomic(addr);
2457 pages++;
2458 }
2459 WARN_ON(pages == 0);
2460
2461 /*
2462 * The stripe must enter the state machine again to call endio, so
2463 * don't delay it.
2464 */
2465 clear_bit(STRIPE_DELAYED, &sh->state);
2466 atomic_inc(&sh->count);
2467
2468 mutex_lock(&log->io_mutex);
2469 /* meta + data */
2470 reserve = (1 + pages) << (PAGE_SHIFT - 9);
1e6d690b 2471
a39f7afd
SL
2472 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2473 sh->log_start == MaxSector)
2474 r5l_add_no_space_stripe(log, sh);
2475 else if (!r5l_has_free_space(log, reserve)) {
2476 if (sh->log_start == log->last_checkpoint)
2477 BUG();
2478 else
2479 r5l_add_no_space_stripe(log, sh);
1e6d690b
SL
2480 } else {
2481 ret = r5l_log_stripe(log, sh, pages, 0);
2482 if (ret) {
2483 spin_lock_irq(&log->io_list_lock);
2484 list_add_tail(&sh->log_list, &log->no_mem_stripes);
2485 spin_unlock_irq(&log->io_list_lock);
2486 }
2487 }
2488
2489 mutex_unlock(&log->io_mutex);
2490 return 0;
2ded3703
SL
2491}
2492
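/*
 * Space-reservation arithmetic used above, assuming 4 KB pages
 * (PAGE_SHIFT == 12): one page for the meta block plus one per dirty data
 * page, expressed in 512-byte sectors.
 *
 *	reserve = (1 + pages) << (PAGE_SHIFT - 9);
 *
 * Example: pages == 3 dirty data pages gives reserve = 4 << 3 = 32 sectors
 * (16 KB), which is the amount r5l_has_free_space() must report as available
 * before r5l_log_stripe() is attempted.
 */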
f6bed0ef
SL
2493static int r5l_load_log(struct r5l_log *log)
2494{
2495 struct md_rdev *rdev = log->rdev;
2496 struct page *page;
2497 struct r5l_meta_block *mb;
2498 sector_t cp = log->rdev->journal_tail;
2499 u32 stored_crc, expected_crc;
2500 bool create_super = false;
2501 int ret;
2502
2503 /* Make sure it's valid */
2504 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
2505 cp = 0;
2506 page = alloc_page(GFP_KERNEL);
2507 if (!page)
2508 return -ENOMEM;
2509
796a5cf0 2510 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
f6bed0ef
SL
2511 ret = -EIO;
2512 goto ioerr;
2513 }
2514 mb = page_address(page);
2515
2516 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
2517 mb->version != R5LOG_VERSION) {
2518 create_super = true;
2519 goto create;
2520 }
2521 stored_crc = le32_to_cpu(mb->checksum);
2522 mb->checksum = 0;
5cb2fbd6 2523 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
f6bed0ef
SL
2524 if (stored_crc != expected_crc) {
2525 create_super = true;
2526 goto create;
2527 }
2528 if (le64_to_cpu(mb->position) != cp) {
2529 create_super = true;
2530 goto create;
2531 }
2532create:
2533 if (create_super) {
2534 log->last_cp_seq = prandom_u32();
2535 cp = 0;
56056c2e 2536 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
f6bed0ef
SL
2537 /*
2538 * Make sure the superblock points to the correct address. The log might
2539 * have data very soon. If the superblock doesn't have the correct log tail
2540 * address, recovery can't find the log.
2541 */
2542 r5l_write_super(log, cp);
2543 } else
2544 log->last_cp_seq = le64_to_cpu(mb->seq);
2545
2546 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
0576b1c6
SL
2547 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
2548 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
2549 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
f6bed0ef 2550 log->last_checkpoint = cp;
28cd88e2 2551 log->next_checkpoint = cp;
a39f7afd
SL
2552 mutex_lock(&log->io_mutex);
2553 r5c_update_log_state(log);
2554 mutex_unlock(&log->io_mutex);
f6bed0ef
SL
2555
2556 __free_page(page);
2557
2558 return r5l_recovery_log(log);
2559ioerr:
2560 __free_page(page);
2561 return ret;
2562}
2563
2564int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
2565{
c888a8f9 2566 struct request_queue *q = bdev_get_queue(rdev->bdev);
f6bed0ef
SL
2567 struct r5l_log *log;
2568
2569 if (PAGE_SIZE != 4096)
2570 return -EINVAL;
c757ec95
SL
2571
2572 /*
2573 * The PAGE_SIZE must be big enough to hold one r5l_meta_block and
2574 * raid_disks r5l_payload_data_parity entries.
2575 *
2576 * The write journal and cache do not work for very big arrays
2577 * (raid_disks > 203).
2578 */
2579 if (sizeof(struct r5l_meta_block) +
2580 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
2581 conf->raid_disks) > PAGE_SIZE) {
2582 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
2583 mdname(conf->mddev), conf->raid_disks);
2584 return -EINVAL;
2585 }
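/*
 * Where the 203 above comes from (a sketch, assuming the on-disk layouts in
 * raid/md_p.h: a 32-byte r5l_meta_block header, and per member disk a
 * 16-byte r5l_payload_data_parity plus one 4-byte checksum):
 *
 *	max_disks = (4096 - 32) / (16 + 4) = 203
 *
 * so a single 4 KB meta block can always hold one payload entry per disk.
 */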
2586
f6bed0ef
SL
2587 log = kzalloc(sizeof(*log), GFP_KERNEL);
2588 if (!log)
2589 return -ENOMEM;
2590 log->rdev = rdev;
2591
c888a8f9 2592 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
56fef7c6 2593
5cb2fbd6
SL
2594 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
2595 sizeof(rdev->mddev->uuid));
f6bed0ef
SL
2596
2597 mutex_init(&log->io_mutex);
2598
2599 spin_lock_init(&log->io_list_lock);
2600 INIT_LIST_HEAD(&log->running_ios);
0576b1c6 2601 INIT_LIST_HEAD(&log->io_end_ios);
a8c34f91 2602 INIT_LIST_HEAD(&log->flushing_ios);
04732f74 2603 INIT_LIST_HEAD(&log->finished_ios);
a8c34f91 2604 bio_init(&log->flush_bio);
f6bed0ef
SL
2605
2606 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
2607 if (!log->io_kc)
2608 goto io_kc;
2609
5036c390
CH
2610 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
2611 if (!log->io_pool)
2612 goto io_pool;
2613
c38d29b3
CH
2614 log->bs = bioset_create(R5L_POOL_SIZE, 0);
2615 if (!log->bs)
2616 goto io_bs;
2617
e8deb638
CH
2618 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
2619 if (!log->meta_pool)
2620 goto out_mempool;
2621
0576b1c6
SL
2622 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
2623 log->rdev->mddev, "reclaim");
2624 if (!log->reclaim_thread)
2625 goto reclaim_thread;
a39f7afd
SL
2626 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
2627
0fd22b45 2628 init_waitqueue_head(&log->iounit_wait);
0576b1c6 2629
5036c390
CH
2630 INIT_LIST_HEAD(&log->no_mem_stripes);
2631
f6bed0ef
SL
2632 INIT_LIST_HEAD(&log->no_space_stripes);
2633 spin_lock_init(&log->no_space_stripes_lock);
2634
2ded3703 2635 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
a39f7afd
SL
2636 INIT_LIST_HEAD(&log->stripe_in_journal_list);
2637 spin_lock_init(&log->stripe_in_journal_lock);
2638 atomic_set(&log->stripe_in_journal_count, 0);
2ded3703 2639
f6bed0ef
SL
2640 if (r5l_load_log(log))
2641 goto error;
2642
f6b6ec5c 2643 rcu_assign_pointer(conf->log, log);
a62ab49e 2644 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
f6bed0ef 2645 return 0;
e8deb638 2646
f6bed0ef 2647error:
0576b1c6
SL
2648 md_unregister_thread(&log->reclaim_thread);
2649reclaim_thread:
e8deb638
CH
2650 mempool_destroy(log->meta_pool);
2651out_mempool:
c38d29b3
CH
2652 bioset_free(log->bs);
2653io_bs:
5036c390
CH
2654 mempool_destroy(log->io_pool);
2655io_pool:
f6bed0ef
SL
2656 kmem_cache_destroy(log->io_kc);
2657io_kc:
2658 kfree(log);
2659 return -EINVAL;
2660}
2661
2662void r5l_exit_log(struct r5l_log *log)
2663{
0576b1c6 2664 md_unregister_thread(&log->reclaim_thread);
e8deb638 2665 mempool_destroy(log->meta_pool);
c38d29b3 2666 bioset_free(log->bs);
5036c390 2667 mempool_destroy(log->io_pool);
f6bed0ef
SL
2668 kmem_cache_destroy(log->io_kc);
2669 kfree(log);
2670}