raid5-cache: free I/O units earlier
[linux-2.6-block.git] / drivers / md / raid5-cache.c
1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23
24 /*
25  * metadata/data are stored on disk in 4k units (a block), regardless of the
26  * underlying hardware sector size. This only works with PAGE_SIZE == 4096
27  */
28 #define BLOCK_SECTORS (8)
29
30 /*
31  * reclaim runs once reclaimable space exceeds 1/4 of the disk size or 10G,
32  * whichever is smaller. This keeps recovery from scanning a very long log
33  */
34 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
36
37 struct r5l_log {
38         struct md_rdev *rdev;
39
40         u32 uuid_checksum;
41
42         sector_t device_size;           /* log device size, rounded down to
43                                          * BLOCK_SECTORS */
44         sector_t max_free_space;        /* reclaim runs once reclaimable space
45                                          * exceeds this size */
46
47         sector_t last_checkpoint;       /* log tail. where recovery scan
48                                          * starts from */
49         u64 last_cp_seq;                /* log tail sequence */
50
51         sector_t log_start;             /* log head. where new data appends */
52         u64 seq;                        /* log head sequence */
53
54         sector_t next_checkpoint;
55         u64 next_cp_seq;
56
57         struct mutex io_mutex;
58         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
59
60         spinlock_t io_list_lock;
61         struct list_head running_ios;   /* io_units which are still running,
62                                          * and have not yet been completely
63                                          * written to the log */
64         struct list_head io_end_ios;    /* io_units which have been completely
65                                          * written to the log but not yet written
66                                          * to the RAID */
67         struct list_head flushing_ios;  /* io_units which are waiting for log
68                                          * cache flush */
69         struct list_head flushed_ios;   /* io_units which have settled down in the log disk */
70         struct bio flush_bio;
71
72         struct kmem_cache *io_kc;
73
74         struct md_thread *reclaim_thread;
75         unsigned long reclaim_target;   /* amount of space that needs to be
76                                          * reclaimed.  if it's 0, reclaim the
77                                          * space used by io_units which are in
78                                          * IO_UNIT_STRIPE_END state (i.e. reclaim
79                                          * doesn't wait for a specific io_unit
80                                          * to switch to IO_UNIT_STRIPE_END
81                                          * state) */
82         wait_queue_head_t iounit_wait;
83
84         struct list_head no_space_stripes; /* pending stripes, log has no space */
85         spinlock_t no_space_stripes_lock;
86 };
87
88 /*
89  * an IO range starts at a meta data block and ends at the next meta data
90  * block. The io unit's meta data block tracks the data/parity that follows it.
91  * An io unit is written to the log disk with normal writes; since we always
92  * flush the log disk first and only then start moving data to the raid disks,
93  * there is no requirement to write the io unit with FLUSH/FUA
94  */
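/*
 * Illustrative layout of the log (one 4k block per cell; a minimal sketch,
 * assuming a single stripe per io_unit):
 *
 *   | meta 0 | data | data | parity | meta 1 | data | parity | ...
 *   ^ io_unit 0 starts here          ^ io_unit 1 starts here
 *
 * Each meta block describes the data/parity blocks that follow it, up to the
 * next meta block.
 */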
95 struct r5l_io_unit {
96         struct r5l_log *log;
97
98         struct page *meta_page; /* store meta block */
99         int meta_offset;        /* current offset in meta_page */
100
101         struct bio_list bios;
102         atomic_t pending_io;    /* pending bios not written to log yet */
103         struct bio *current_bio;/* current_bio accepting new data */
104
105         atomic_t pending_stripe;/* how many stripes not flushed to raid */
106         u64 seq;                /* seq number of the metablock */
107         sector_t log_start;     /* where the io_unit starts */
108         sector_t log_end;       /* where the io_unit ends */
109         struct list_head log_sibling; /* log->running_ios */
110         struct list_head stripe_list; /* stripes added to the io_unit */
111
112         int state;
113 };
114
115 /* r5l_io_unit state */
116 enum r5l_io_unit_state {
117         IO_UNIT_RUNNING = 0,    /* accepting new IO */
118         IO_UNIT_IO_START = 1,   /* io_unit bios have started writing to the
119                                  * log; no longer accepting new bios */
120         IO_UNIT_IO_END = 2,     /* io_unit bios have finished writing to the log */
121         IO_UNIT_STRIPE_END = 3, /* stripe data has finished writing to raid */
122 };
123
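/*
 * The log device is used as a circular buffer of BLOCK_SECTORS-sized blocks.
 * The two helpers below do the wrap-around arithmetic; e.g. with
 * device_size == 1000, r5l_ring_add(log, 992, 16) == 8 and
 * r5l_ring_distance(log, 992, 8) == 16.
 */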
124 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
125 {
126         start += inc;
127         if (start >= log->device_size)
128                 start = start - log->device_size;
129         return start;
130 }
131
132 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
133                                   sector_t end)
134 {
135         if (end >= start)
136                 return end - start;
137         else
138                 return end + log->device_size - start;
139 }
140
141 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
142 {
143         sector_t used_size;
144
145         used_size = r5l_ring_distance(log, log->last_checkpoint,
146                                         log->log_start);
147
148         return log->device_size > used_size + size;
149 }
150
151 static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
152 {
153         struct r5l_io_unit *io;
154         /* We can't handle memory allocation failure so far */
155         gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
156
157         io = kmem_cache_zalloc(log->io_kc, gfp);
158         io->log = log;
159         io->meta_page = alloc_page(gfp | __GFP_ZERO);
160
161         bio_list_init(&io->bios);
162         INIT_LIST_HEAD(&io->log_sibling);
163         INIT_LIST_HEAD(&io->stripe_list);
164         io->state = IO_UNIT_RUNNING;
165         return io;
166 }
167
168 static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
169 {
170         __free_page(io->meta_page);
171         kmem_cache_free(log->io_kc, io);
172 }
173
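/*
 * Move io_units that have reached at least @state from the head of @from to
 * the tail of @to, stopping at the first one that has not, so that list order
 * is preserved.
 */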
174 static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
175                                   enum r5l_io_unit_state state)
176 {
177         struct r5l_io_unit *io;
178
179         while (!list_empty(from)) {
180                 io = list_first_entry(from, struct r5l_io_unit, log_sibling);
181                 /* don't change list order */
182                 if (io->state >= state)
183                         list_move_tail(&io->log_sibling, to);
184                 else
185                         break;
186         }
187 }
188
189 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
190                                     enum r5l_io_unit_state state)
191 {
192         if (WARN_ON(io->state >= state))
193                 return;
194         io->state = state;
195 }
196
197 /* XXX: totally ignores I/O errors */
198 static void r5l_log_endio(struct bio *bio)
199 {
200         struct r5l_io_unit *io = bio->bi_private;
201         struct r5l_log *log = io->log;
202         unsigned long flags;
203
204         bio_put(bio);
205
206         if (!atomic_dec_and_test(&io->pending_io))
207                 return;
208
209         spin_lock_irqsave(&log->io_list_lock, flags);
210         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
211         r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
212                         IO_UNIT_IO_END);
213         spin_unlock_irqrestore(&log->io_list_lock, flags);
214
215         md_wakeup_thread(log->rdev->mddev->thread);
216 }
217
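/*
 * Finalize the current io_unit: record the meta block size, checksum the meta
 * page, mark the io_unit IO_UNIT_IO_START and submit all of its bios to the
 * log device.
 */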
218 static void r5l_submit_current_io(struct r5l_log *log)
219 {
220         struct r5l_io_unit *io = log->current_io;
221         struct r5l_meta_block *block;
222         struct bio *bio;
223         unsigned long flags;
224         u32 crc;
225
226         if (!io)
227                 return;
228
229         block = page_address(io->meta_page);
230         block->meta_size = cpu_to_le32(io->meta_offset);
231         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
232         block->checksum = cpu_to_le32(crc);
233
234         log->current_io = NULL;
235         spin_lock_irqsave(&log->io_list_lock, flags);
236         __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
237         spin_unlock_irqrestore(&log->io_list_lock, flags);
238
239         while ((bio = bio_list_pop(&io->bios))) {
240                 /* all IO must start from rdev->data_offset */
241                 bio->bi_iter.bi_sector += log->rdev->data_offset;
242                 submit_bio(WRITE, bio);
243         }
244 }
245
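/*
 * Start a new io_unit: initialize its meta block (magic, version, seq,
 * position) and its first bio, whose leading page is the meta block itself.
 */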
246 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
247 {
248         struct r5l_io_unit *io;
249         struct r5l_meta_block *block;
250         struct bio *bio;
251
252         io = r5l_alloc_io_unit(log);
253
254         block = page_address(io->meta_page);
255         block->magic = cpu_to_le32(R5LOG_MAGIC);
256         block->version = R5LOG_VERSION;
257         block->seq = cpu_to_le64(log->seq);
258         block->position = cpu_to_le64(log->log_start);
259
260         io->log_start = log->log_start;
261         io->meta_offset = sizeof(struct r5l_meta_block);
262         io->seq = log->seq;
263
264         bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
265         io->current_bio = bio;
266         bio->bi_rw = WRITE;
267         bio->bi_bdev = log->rdev->bdev;
268         bio->bi_iter.bi_sector = log->log_start;
269         bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
270         bio->bi_end_io = r5l_log_endio;
271         bio->bi_private = io;
272
273         bio_list_add(&io->bios, bio);
274         atomic_inc(&io->pending_io);
275
276         log->seq++;
277         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
278         io->log_end = log->log_start;
279         /* current bio hit disk end */
280         if (log->log_start == 0)
281                 io->current_bio = NULL;
282
283         spin_lock_irq(&log->io_list_lock);
284         list_add_tail(&io->log_sibling, &log->running_ios);
285         spin_unlock_irq(&log->io_list_lock);
286
287         return io;
288 }
289
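/*
 * Make sure log->current_io has room for @payload_size bytes of meta data;
 * if not, submit the current io_unit and start a new one.
 */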
290 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
291 {
292         struct r5l_io_unit *io;
293
294         io = log->current_io;
295         if (io && io->meta_offset + payload_size > PAGE_SIZE)
296                 r5l_submit_current_io(log);
297         io = log->current_io;
298         if (io)
299                 return 0;
300
301         log->current_io = r5l_new_meta(log);
302         return 0;
303 }
304
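/*
 * Append the payload descriptor for one data block, or for one or two parity
 * blocks, to the current meta block; payload->size is recorded in sectors.
 */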
305 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
306                                     sector_t location,
307                                     u32 checksum1, u32 checksum2,
308                                     bool checksum2_valid)
309 {
310         struct r5l_io_unit *io = log->current_io;
311         struct r5l_payload_data_parity *payload;
312
313         payload = page_address(io->meta_page) + io->meta_offset;
314         payload->header.type = cpu_to_le16(type);
315         payload->header.flags = cpu_to_le16(0);
316         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
317                                     (PAGE_SHIFT - 9));
318         payload->location = cpu_to_le64(location);
319         payload->checksum[0] = cpu_to_le32(checksum1);
320         if (checksum2_valid)
321                 payload->checksum[1] = cpu_to_le32(checksum2);
322
323         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
324                 sizeof(__le32) * (1 + !!checksum2_valid);
325 }
326
327 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
328 {
329         struct r5l_io_unit *io = log->current_io;
330
331 alloc_bio:
332         if (!io->current_bio) {
333                 struct bio *bio;
334
335                 bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
336                 bio->bi_rw = WRITE;
337                 bio->bi_bdev = log->rdev->bdev;
338                 bio->bi_iter.bi_sector = log->log_start;
339                 bio->bi_end_io = r5l_log_endio;
340                 bio->bi_private = io;
341                 bio_list_add(&io->bios, bio);
342                 atomic_inc(&io->pending_io);
343                 io->current_bio = bio;
344         }
345         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
346                 io->current_bio = NULL;
347                 goto alloc_bio;
348         }
349         log->log_start = r5l_ring_add(log, log->log_start,
350                                       BLOCK_SECTORS);
351         /* current bio hit disk end */
352         if (log->log_start == 0)
353                 io->current_bio = NULL;
354
355         io->log_end = log->log_start;
356 }
357
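/*
 * Append one stripe to the current io_unit: a payload descriptor plus a page
 * for each dirty data block, followed by the parity page(s), then account the
 * stripe on the io_unit's stripe_list.
 */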
358 static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
359                            int data_pages, int parity_pages)
360 {
361         int i;
362         int meta_size;
363         struct r5l_io_unit *io;
364
365         meta_size =
366                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
367                  * data_pages) +
368                 sizeof(struct r5l_payload_data_parity) +
369                 sizeof(__le32) * parity_pages;
370
371         r5l_get_meta(log, meta_size);
372         io = log->current_io;
373
374         for (i = 0; i < sh->disks; i++) {
375                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
376                         continue;
377                 if (i == sh->pd_idx || i == sh->qd_idx)
378                         continue;
379                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
380                                         raid5_compute_blocknr(sh, i, 0),
381                                         sh->dev[i].log_checksum, 0, false);
382                 r5l_append_payload_page(log, sh->dev[i].page);
383         }
384
385         if (sh->qd_idx >= 0) {
386                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
387                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
388                                         sh->dev[sh->qd_idx].log_checksum, true);
389                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
390                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
391         } else {
392                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
393                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
394                                         0, false);
395                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
396         }
397
398         list_add_tail(&sh->log_list, &io->stripe_list);
399         atomic_inc(&io->pending_stripe);
400         sh->log_io = io;
401 }
402
403 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
404 /*
405  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
406  * data from log to raid disks), so we shouldn't wait for reclaim here
407  */
408 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
409 {
410         int write_disks = 0;
411         int data_pages, parity_pages;
412         int meta_size;
413         int reserve;
414         int i;
415
416         if (!log)
417                 return -EAGAIN;
418         /* Don't support stripe batch */
419         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
420             test_bit(STRIPE_SYNCING, &sh->state)) {
421                 /* the stripe has been written to the log; start writing it to raid */
422                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
423                 return -EAGAIN;
424         }
425
426         for (i = 0; i < sh->disks; i++) {
427                 void *addr;
428
429                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
430                         continue;
431                 write_disks++;
432                 /* the checksum was already calculated in the last run */
433                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
434                         continue;
435                 addr = kmap_atomic(sh->dev[i].page);
436                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
437                                                     addr, PAGE_SIZE);
438                 kunmap_atomic(addr);
439         }
440         parity_pages = 1 + !!(sh->qd_idx >= 0);
441         data_pages = write_disks - parity_pages;
442
443         meta_size =
444                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
445                  * data_pages) +
446                 sizeof(struct r5l_payload_data_parity) +
447                 sizeof(__le32) * parity_pages;
448         /* Doesn't work with a very big raid array */
449         if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
450                 return -EINVAL;
451
452         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
453         /*
454          * The stripe must enter the state machine again to finish the write, so
455          * don't delay.
456          */
457         clear_bit(STRIPE_DELAYED, &sh->state);
458         atomic_inc(&sh->count);
459
460         mutex_lock(&log->io_mutex);
461         /* meta + data */
462         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
463         if (r5l_has_free_space(log, reserve))
464                 r5l_log_stripe(log, sh, data_pages, parity_pages);
465         else {
466                 spin_lock(&log->no_space_stripes_lock);
467                 list_add_tail(&sh->log_list, &log->no_space_stripes);
468                 spin_unlock(&log->no_space_stripes_lock);
469
470                 r5l_wake_reclaim(log, reserve);
471         }
472         mutex_unlock(&log->io_mutex);
473
474         return 0;
475 }
476
477 void r5l_write_stripe_run(struct r5l_log *log)
478 {
479         if (!log)
480                 return;
481         mutex_lock(&log->io_mutex);
482         r5l_submit_current_io(log);
483         mutex_unlock(&log->io_mutex);
484 }
485
486 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
487 {
488         if (!log)
489                 return -ENODEV;
490         /*
491          * we flush log disk cache first, then write stripe data to raid disks.
492          * So if the bio is finished, the log disk cache is already flushed. The
493          * recovery code guarantees we can recover the bio from the log disk, so
494          * we don't need to flush again
495          */
496         if (bio->bi_iter.bi_size == 0) {
497                 bio_endio(bio);
498                 return 0;
499         }
500         bio->bi_rw &= ~REQ_FLUSH;
501         return -EAGAIN;
502 }
503
504 /* This will run after log space is reclaimed */
505 static void r5l_run_no_space_stripes(struct r5l_log *log)
506 {
507         struct stripe_head *sh;
508
509         spin_lock(&log->no_space_stripes_lock);
510         while (!list_empty(&log->no_space_stripes)) {
511                 sh = list_first_entry(&log->no_space_stripes,
512                                       struct stripe_head, log_list);
513                 list_del_init(&sh->log_list);
514                 set_bit(STRIPE_HANDLE, &sh->state);
515                 raid5_release_stripe(sh);
516         }
517         spin_unlock(&log->no_space_stripes_lock);
518 }
519
520 static sector_t r5l_reclaimable_space(struct r5l_log *log)
521 {
522         return r5l_ring_distance(log, log->last_checkpoint,
523                                  log->next_checkpoint);
524 }
525
526 static bool r5l_complete_flushed_ios(struct r5l_log *log)
527 {
528         struct r5l_io_unit *io, *next;
529         bool found = false;
530
531         assert_spin_locked(&log->io_list_lock);
532
533         list_for_each_entry_safe(io, next, &log->flushed_ios, log_sibling) {
534                 /* don't change list order */
535                 if (io->state < IO_UNIT_STRIPE_END)
536                         break;
537
538                 log->next_checkpoint = io->log_start;
539                 log->next_cp_seq = io->seq;
540
541                 list_del(&io->log_sibling);
542                 r5l_free_io_unit(log, io);
543
544                 found = true;
545         }
546
547         return found;
548 }
549
550 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
551 {
552         struct r5l_log *log = io->log;
553         unsigned long flags;
554
555         spin_lock_irqsave(&log->io_list_lock, flags);
556         __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
557
558         if (!r5l_complete_flushed_ios(log)) {
559                 spin_unlock_irqrestore(&log->io_list_lock, flags);
560                 return;
561         }
562
563         if (r5l_reclaimable_space(log) > log->max_free_space)
564                 r5l_wake_reclaim(log, 0);
565
566         spin_unlock_irqrestore(&log->io_list_lock, flags);
567         wake_up(&log->iounit_wait);
568 }
569
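/*
 * Called when a stripe has been written to the raid disks: drop its reference
 * on the io_unit and, once the last stripe finishes, advance the io_unit to
 * IO_UNIT_STRIPE_END so its log space becomes reclaimable.
 */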
570 void r5l_stripe_write_finished(struct stripe_head *sh)
571 {
572         struct r5l_io_unit *io;
573
574         io = sh->log_io;
575         sh->log_io = NULL;
576
577         if (io && atomic_dec_and_test(&io->pending_stripe))
578                 __r5l_stripe_write_finished(io);
579 }
580
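/*
 * The log device cache flush has completed: everything on flushing_ios is now
 * stable on the log media, so the attached stripes can be handed back to
 * raid5d for writing to the raid disks.
 */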
581 static void r5l_log_flush_endio(struct bio *bio)
582 {
583         struct r5l_log *log = container_of(bio, struct r5l_log,
584                 flush_bio);
585         unsigned long flags;
586         struct r5l_io_unit *io;
587         struct stripe_head *sh;
588
589         spin_lock_irqsave(&log->io_list_lock, flags);
590         list_for_each_entry(io, &log->flushing_ios, log_sibling) {
591                 while (!list_empty(&io->stripe_list)) {
592                         sh = list_first_entry(&io->stripe_list,
593                                 struct stripe_head, log_list);
594                         list_del_init(&sh->log_list);
595                         set_bit(STRIPE_HANDLE, &sh->state);
596                         raid5_release_stripe(sh);
597                 }
598         }
599         list_splice_tail_init(&log->flushing_ios, &log->flushed_ios);
600         spin_unlock_irqrestore(&log->io_list_lock, flags);
601 }
602
603 /*
604  * Start dispatching IO to raid.
605  * The log consists of io_units, each headed by a meta block. One situation we
606  * want to avoid: a broken meta block in the middle of the log keeps recovery
607  * from finding any meta block after it. So if an operation needs the meta block
608  * at the head of the log to be persistent, every meta block before it must be
609  * persistent in the log too. A case is:
610  *
611  * stripe data/parity is in the log and we start writing the stripe to the raid
612  * disks. The data/parity must be persistent in the log before that write starts.
613  *
614  * The solution is to strictly maintain io_unit list order: an io_unit's stripes
615  * go to the raid disks only once it and all earlier io_units are in the log.
616  */
617 void r5l_flush_stripe_to_raid(struct r5l_log *log)
618 {
619         bool do_flush;
620         if (!log)
621                 return;
622
623         spin_lock_irq(&log->io_list_lock);
624         /* flush bio is running */
625         if (!list_empty(&log->flushing_ios)) {
626                 spin_unlock_irq(&log->io_list_lock);
627                 return;
628         }
629         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
630         do_flush = !list_empty(&log->flushing_ios);
631         spin_unlock_irq(&log->io_list_lock);
632
633         if (!do_flush)
634                 return;
635         bio_reset(&log->flush_bio);
636         log->flush_bio.bi_bdev = log->rdev->bdev;
637         log->flush_bio.bi_end_io = r5l_log_flush_endio;
638         submit_bio(WRITE_FLUSH, &log->flush_bio);
639 }
640
641 static void r5l_write_super(struct r5l_log *log, sector_t cp);
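/*
 * Wait until at least reclaim_target sectors (or everything currently in
 * flight) become reclaimable, then write the superblock and advance
 * last_checkpoint so that log space can be reused.
 */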
642 static void r5l_do_reclaim(struct r5l_log *log)
643 {
644         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
645         sector_t reclaimable;
646         sector_t next_checkpoint;
647         u64 next_cp_seq;
648
649         spin_lock_irq(&log->io_list_lock);
650         /*
651          * move proper io_units to the reclaim list. We should not change the order;
652          * reclaimable and unreclaimable io_units can be mixed in the list, and we
653          * shouldn't reuse the space of an unreclaimable io_unit
654          */
655         while (1) {
656                 reclaimable = r5l_reclaimable_space(log);
657                 if (reclaimable >= reclaim_target ||
658                     (list_empty(&log->running_ios) &&
659                      list_empty(&log->io_end_ios) &&
660                      list_empty(&log->flushing_ios) &&
661                      list_empty(&log->flushed_ios)))
662                         break;
663
664                 md_wakeup_thread(log->rdev->mddev->thread);
665                 wait_event_lock_irq(log->iounit_wait,
666                                     r5l_reclaimable_space(log) > reclaimable,
667                                     log->io_list_lock);
668         }
669
670         next_checkpoint = log->next_checkpoint;
671         next_cp_seq = log->next_cp_seq;
672         spin_unlock_irq(&log->io_list_lock);
673
674         BUG_ON(reclaimable < 0);
675         if (reclaimable == 0)
676                 return;
677
678         /*
679          * write_super will flush the cache of each raid disk. We must write the
680          * super here, because the log area might be reused soon and we don't want
681          * to confuse recovery
682          */
683         r5l_write_super(log, next_checkpoint);
684
685         mutex_lock(&log->io_mutex);
686         log->last_checkpoint = next_checkpoint;
687         log->last_cp_seq = next_cp_seq;
688         mutex_unlock(&log->io_mutex);
689
690         r5l_run_no_space_stripes(log);
691 }
692
693 static void r5l_reclaim_thread(struct md_thread *thread)
694 {
695         struct mddev *mddev = thread->mddev;
696         struct r5conf *conf = mddev->private;
697         struct r5l_log *log = conf->log;
698
699         if (!log)
700                 return;
701         r5l_do_reclaim(log);
702 }
703
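/*
 * Raise reclaim_target to @space (monotonically, via cmpxchg) and wake the
 * reclaim thread; space == 0 means "reclaim whatever is already reclaimable".
 */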
704 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
705 {
706         unsigned long target;
707         unsigned long new = (unsigned long)space; /* overflow in theory */
708
709         do {
710                 target = log->reclaim_target;
711                 if (new < target)
712                         return;
713         } while (cmpxchg(&log->reclaim_target, target, new) != target);
714         md_wakeup_thread(log->reclaim_thread);
715 }
716
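/*
 * Quiesce hook: state 0 restarts the reclaim thread, state 1 drains and
 * reclaims everything (all stripes are finished at that point), state 2 is
 * ignored.
 */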
717 void r5l_quiesce(struct r5l_log *log, int state)
718 {
719         if (!log || state == 2)
720                 return;
721         if (state == 0) {
722                 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
723                                         log->rdev->mddev, "reclaim");
724         } else if (state == 1) {
725                 /*
726                  * at this point all stripes are finished, so every io_unit is at
727                  * least in IO_UNIT_STRIPE_END state
728                  */
729                 r5l_wake_reclaim(log, -1L);
730                 md_unregister_thread(&log->reclaim_thread);
731                 r5l_do_reclaim(log);
732         }
733 }
734
735 struct r5l_recovery_ctx {
736         struct page *meta_page;         /* current meta */
737         sector_t meta_total_blocks;     /* total size of current meta and data */
738         sector_t pos;                   /* recovery position */
739         u64 seq;                        /* recovery position seq */
740 };
741
742 static int r5l_read_meta_block(struct r5l_log *log,
743                                struct r5l_recovery_ctx *ctx)
744 {
745         struct page *page = ctx->meta_page;
746         struct r5l_meta_block *mb;
747         u32 crc, stored_crc;
748
749         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
750                 return -EIO;
751
752         mb = page_address(page);
753         stored_crc = le32_to_cpu(mb->checksum);
754         mb->checksum = 0;
755
756         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
757             le64_to_cpu(mb->seq) != ctx->seq ||
758             mb->version != R5LOG_VERSION ||
759             le64_to_cpu(mb->position) != ctx->pos)
760                 return -EINVAL;
761
762         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
763         if (stored_crc != crc)
764                 return -EINVAL;
765
766         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
767                 return -EINVAL;
768
769         ctx->meta_total_blocks = BLOCK_SECTORS;
770
771         return 0;
772 }
773
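/*
 * Replay a single stripe: read its data and parity pages back from the log,
 * verify the stored checksums, then write the pages to the raid disks (and to
 * any replacement devices).
 */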
774 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
775                                          struct r5l_recovery_ctx *ctx,
776                                          sector_t stripe_sect,
777                                          int *offset, sector_t *log_offset)
778 {
779         struct r5conf *conf = log->rdev->mddev->private;
780         struct stripe_head *sh;
781         struct r5l_payload_data_parity *payload;
782         int disk_index;
783
784         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
785         while (1) {
786                 payload = page_address(ctx->meta_page) + *offset;
787
788                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
789                         raid5_compute_sector(conf,
790                                              le64_to_cpu(payload->location), 0,
791                                              &disk_index, sh);
792
793                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
794                                      sh->dev[disk_index].page, READ, false);
795                         sh->dev[disk_index].log_checksum =
796                                 le32_to_cpu(payload->checksum[0]);
797                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
798                         ctx->meta_total_blocks += BLOCK_SECTORS;
799                 } else {
800                         disk_index = sh->pd_idx;
801                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
802                                      sh->dev[disk_index].page, READ, false);
803                         sh->dev[disk_index].log_checksum =
804                                 le32_to_cpu(payload->checksum[0]);
805                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
806
807                         if (sh->qd_idx >= 0) {
808                                 disk_index = sh->qd_idx;
809                                 sync_page_io(log->rdev,
810                                              r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
811                                              PAGE_SIZE, sh->dev[disk_index].page,
812                                              READ, false);
813                                 sh->dev[disk_index].log_checksum =
814                                         le32_to_cpu(payload->checksum[1]);
815                                 set_bit(R5_Wantwrite,
816                                         &sh->dev[disk_index].flags);
817                         }
818                         ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
819                 }
820
821                 *log_offset = r5l_ring_add(log, *log_offset,
822                                            le32_to_cpu(payload->size));
823                 *offset += sizeof(struct r5l_payload_data_parity) +
824                         sizeof(__le32) *
825                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
826                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
827                         break;
828         }
829
830         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
831                 void *addr;
832                 u32 checksum;
833
834                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
835                         continue;
836                 addr = kmap_atomic(sh->dev[disk_index].page);
837                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
838                 kunmap_atomic(addr);
839                 if (checksum != sh->dev[disk_index].log_checksum)
840                         goto error;
841         }
842
843         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
844                 struct md_rdev *rdev, *rrdev;
845
846                 if (!test_and_clear_bit(R5_Wantwrite,
847                                         &sh->dev[disk_index].flags))
848                         continue;
849
850                 /* in case device is broken */
851                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
852                 if (rdev)
853                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
854                                      sh->dev[disk_index].page, WRITE, false);
855                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
856                 if (rrdev)
857                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
858                                      sh->dev[disk_index].page, WRITE, false);
859         }
860         raid5_release_stripe(sh);
861         return 0;
862
863 error:
864         for (disk_index = 0; disk_index < sh->disks; disk_index++)
865                 sh->dev[disk_index].flags = 0;
866         raid5_release_stripe(sh);
867         return -EINVAL;
868 }
869
870 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
871                                        struct r5l_recovery_ctx *ctx)
872 {
873         struct r5conf *conf = log->rdev->mddev->private;
874         struct r5l_payload_data_parity *payload;
875         struct r5l_meta_block *mb;
876         int offset;
877         sector_t log_offset;
878         sector_t stripe_sector;
879
880         mb = page_address(ctx->meta_page);
881         offset = sizeof(struct r5l_meta_block);
882         log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
883
884         while (offset < le32_to_cpu(mb->meta_size)) {
885                 int dd;
886
887                 payload = (void *)mb + offset;
888                 stripe_sector = raid5_compute_sector(conf,
889                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
890                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
891                                                   &offset, &log_offset))
892                         return -EINVAL;
893         }
894         return 0;
895 }
896
897 /* copy data/parity from log to raid disks */
898 static void r5l_recovery_flush_log(struct r5l_log *log,
899                                    struct r5l_recovery_ctx *ctx)
900 {
901         while (1) {
902                 if (r5l_read_meta_block(log, ctx))
903                         return;
904                 if (r5l_recovery_flush_one_meta(log, ctx))
905                         return;
906                 ctx->seq++;
907                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
908         }
909 }
910
911 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
912                                           u64 seq)
913 {
914         struct page *page;
915         struct r5l_meta_block *mb;
916         u32 crc;
917
918         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
919         if (!page)
920                 return -ENOMEM;
921         mb = page_address(page);
922         mb->magic = cpu_to_le32(R5LOG_MAGIC);
923         mb->version = R5LOG_VERSION;
924         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
925         mb->seq = cpu_to_le64(seq);
926         mb->position = cpu_to_le64(pos);
927         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
928         mb->checksum = cpu_to_le32(crc);
929
930         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
931                 __free_page(page);
932                 return -EIO;
933         }
934         __free_page(page);
935         return 0;
936 }
937
938 static int r5l_recovery_log(struct r5l_log *log)
939 {
940         struct r5l_recovery_ctx ctx;
941
942         ctx.pos = log->last_checkpoint;
943         ctx.seq = log->last_cp_seq;
944         ctx.meta_page = alloc_page(GFP_KERNEL);
945         if (!ctx.meta_page)
946                 return -ENOMEM;
947
948         r5l_recovery_flush_log(log, &ctx);
949         __free_page(ctx.meta_page);
950
951         /*
952          * we did a recovery. Now ctx.pos points to an invalid meta block. The new
953          * log will start here, but we can't let the superblock point to the last
954          * valid meta block. The log might look like:
955          * | meta 1| meta 2| meta 3|
956          * meta 1 is valid, meta 2 is invalid and meta 3 could be valid. If the
957          * superblock points to meta 1 and we write a new valid meta 2n, then when
958          * a crash happens again, the new recovery will start from meta 1. Since
959          * meta 2n is valid now, recovery will think meta 3 is valid too, which is
960          * wrong. The solution is to create a new meta block in meta 2's place with
961          * seq == meta 1's seq + 10 and let the superblock point to it. The same
962          * recovery will then not treat meta 3 as valid, because its seq doesn't match
963          */
964         if (ctx.seq > log->last_cp_seq + 1) {
965                 int ret;
966
967                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
968                 if (ret)
969                         return ret;
970                 log->seq = ctx.seq + 11;
971                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
972                 r5l_write_super(log, ctx.pos);
973         } else {
974                 log->log_start = ctx.pos;
975                 log->seq = ctx.seq;
976         }
977         return 0;
978 }
979
980 static void r5l_write_super(struct r5l_log *log, sector_t cp)
981 {
982         struct mddev *mddev = log->rdev->mddev;
983
984         log->rdev->journal_tail = cp;
985         set_bit(MD_CHANGE_DEVS, &mddev->flags);
986 }
987
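/*
 * Read the meta block that the superblock's journal_tail points to. If it is
 * missing or invalid, start a fresh log; otherwise pick up the checkpoint and
 * run log recovery.
 */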
988 static int r5l_load_log(struct r5l_log *log)
989 {
990         struct md_rdev *rdev = log->rdev;
991         struct page *page;
992         struct r5l_meta_block *mb;
993         sector_t cp = log->rdev->journal_tail;
994         u32 stored_crc, expected_crc;
995         bool create_super = false;
996         int ret;
997
998         /* Make sure it's valid */
999         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1000                 cp = 0;
1001         page = alloc_page(GFP_KERNEL);
1002         if (!page)
1003                 return -ENOMEM;
1004
1005         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1006                 ret = -EIO;
1007                 goto ioerr;
1008         }
1009         mb = page_address(page);
1010
1011         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1012             mb->version != R5LOG_VERSION) {
1013                 create_super = true;
1014                 goto create;
1015         }
1016         stored_crc = le32_to_cpu(mb->checksum);
1017         mb->checksum = 0;
1018         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1019         if (stored_crc != expected_crc) {
1020                 create_super = true;
1021                 goto create;
1022         }
1023         if (le64_to_cpu(mb->position) != cp) {
1024                 create_super = true;
1025                 goto create;
1026         }
1027 create:
1028         if (create_super) {
1029                 log->last_cp_seq = prandom_u32();
1030                 cp = 0;
1031                 /*
1032                  * Make sure the super points to the correct address. The log might
1033                  * have data very soon. If the super doesn't have the correct log tail
1034                  * address, recovery can't find the log
1035                  */
1036                 r5l_write_super(log, cp);
1037         } else
1038                 log->last_cp_seq = le64_to_cpu(mb->seq);
1039
1040         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1041         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1042         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1043                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1044         log->last_checkpoint = cp;
1045
1046         __free_page(page);
1047
1048         return r5l_recovery_log(log);
1049 ioerr:
1050         __free_page(page);
1051         return ret;
1052 }
1053
1054 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1055 {
1056         struct r5l_log *log;
1057
1058         if (PAGE_SIZE != 4096)
1059                 return -EINVAL;
1060         log = kzalloc(sizeof(*log), GFP_KERNEL);
1061         if (!log)
1062                 return -ENOMEM;
1063         log->rdev = rdev;
1064
1065         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1066                                        sizeof(rdev->mddev->uuid));
1067
1068         mutex_init(&log->io_mutex);
1069
1070         spin_lock_init(&log->io_list_lock);
1071         INIT_LIST_HEAD(&log->running_ios);
1072         INIT_LIST_HEAD(&log->io_end_ios);
1073         INIT_LIST_HEAD(&log->flushing_ios);
1074         INIT_LIST_HEAD(&log->flushed_ios);
1075         bio_init(&log->flush_bio);
1076
1077         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1078         if (!log->io_kc)
1079                 goto io_kc;
1080
1081         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1082                                                  log->rdev->mddev, "reclaim");
1083         if (!log->reclaim_thread)
1084                 goto reclaim_thread;
1085         init_waitqueue_head(&log->iounit_wait);
1086
1087         INIT_LIST_HEAD(&log->no_space_stripes);
1088         spin_lock_init(&log->no_space_stripes_lock);
1089
1090         if (r5l_load_log(log))
1091                 goto error;
1092
1093         conf->log = log;
1094         return 0;
1095 error:
1096         md_unregister_thread(&log->reclaim_thread);
1097 reclaim_thread:
1098         kmem_cache_destroy(log->io_kc);
1099 io_kc:
1100         kfree(log);
1101         return -EINVAL;
1102 }
1103
1104 void r5l_exit_log(struct r5l_log *log)
1105 {
1106         md_unregister_thread(&log->reclaim_thread);
1107         kmem_cache_destroy(log->io_kc);
1108         kfree(log);
1109 }