dm writecache: have ssd writeback wait if the kcopyd workqueue is busy
[linux-block.git] / drivers / md / dm-writecache.c
CommitLineData
48debafe
MP
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2018 Red Hat. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/device-mapper.h>
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/vmalloc.h>
12#include <linux/kthread.h>
13#include <linux/dm-io.h>
14#include <linux/dm-kcopyd.h>
15#include <linux/dax.h>
16#include <linux/pfn_t.h>
17#include <linux/libnvdimm.h>
18
19#define DM_MSG_PREFIX "writecache"
20
/*
 * Built-in tunables.
 * NOTE(review): the code that applies most of these defaults is outside this
 * section; only the MAX_AGE_* uses below are visible here.
 */
#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		0
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
/* The max-age timer is re-armed every wc->max_age / MAX_AGE_DIV jiffies. */
#define MAX_AGE_DIV			16
/*
 * Value of wc->max_age for which the max-age timer is never armed.
 * Parenthesized so the expansion is a single primary expression.
 */
#define MAX_AGE_UNSPECIFIED		(-1UL)
48debafe
MP
31
32#define BITMAP_GRANULARITY 65536
33#if BITMAP_GRANULARITY < PAGE_SIZE
34#undef BITMAP_GRANULARITY
35#define BITMAP_GRANULARITY PAGE_SIZE
36#endif
37
38#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
39#define DM_WRITECACHE_HAS_PMEM
40#endif
41
42#ifdef DM_WRITECACHE_HAS_PMEM
43#define pmem_assign(dest, src) \
44do { \
45 typeof(dest) uniq = (src); \
46 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
47} while (0)
48#else
49#define pmem_assign(dest, src) ((dest) = (src))
50#endif
51
ec6347bb 52#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
48debafe
MP
53#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
54#endif
55
56#define MEMORY_SUPERBLOCK_MAGIC 0x23489321
57#define MEMORY_SUPERBLOCK_VERSION 1
58
59struct wc_memory_entry {
60 __le64 original_sector;
61 __le64 seq_count;
62};
63
64struct wc_memory_superblock {
65 union {
66 struct {
67 __le32 magic;
68 __le32 version;
69 __le32 block_size;
70 __le32 pad;
71 __le64 n_blocks;
72 __le64 seq_count;
73 };
74 __le64 padding[8];
75 };
c40819f2 76 struct wc_memory_entry entries[];
48debafe
MP
77};
78
79struct wc_entry {
80 struct rb_node rb_node;
81 struct list_head lru;
82 unsigned short wc_list_contiguous;
83 bool write_in_progress
84#if BITS_PER_LONG == 64
85 :1
86#endif
87 ;
88 unsigned long index
89#if BITS_PER_LONG == 64
90 :47
91#endif
92 ;
3923d485 93 unsigned long age;
48debafe
MP
94#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
95 uint64_t original_sector;
96 uint64_t seq_count;
97#endif
98};
99
/*
 * Mode predicates. Without pmem support both PMEM and FUA modes are
 * compile-time false, so the corresponding branches can be dropped.
 */
#ifndef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)		false
#define WC_MODE_FUA(wc)			false
#else
#define WC_MODE_PMEM(wc)		((wc)->pmem_mode)
#define WC_MODE_FUA(wc)			((wc)->writeback_fua)
#endif
/* The free entries are kept in a sorted tree whenever PMEM mode is off. */
#define WC_MODE_SORT_FREELIST(wc)	(!WC_MODE_PMEM(wc))
108
109struct dm_writecache {
110 struct mutex lock;
111 struct list_head lru;
112 union {
113 struct list_head freelist;
114 struct {
115 struct rb_root freetree;
116 struct wc_entry *current_free;
117 };
118 };
119 struct rb_root tree;
120
121 size_t freelist_size;
122 size_t writeback_size;
123 size_t freelist_high_watermark;
124 size_t freelist_low_watermark;
3923d485 125 unsigned long max_age;
48debafe
MP
126
127 unsigned uncommitted_blocks;
128 unsigned autocommit_blocks;
129 unsigned max_writeback_jobs;
130
131 int error;
132
133 unsigned long autocommit_jiffies;
134 struct timer_list autocommit_timer;
135 struct wait_queue_head freelist_wait;
136
3923d485
MP
137 struct timer_list max_age_timer;
138
48debafe
MP
139 atomic_t bio_in_progress[2];
140 struct wait_queue_head bio_in_progress_wait[2];
141
142 struct dm_target *ti;
143 struct dm_dev *dev;
144 struct dm_dev *ssd_dev;
d284f824 145 sector_t start_sector;
48debafe
MP
146 void *memory_map;
147 uint64_t memory_map_size;
148 size_t metadata_sectors;
149 size_t n_blocks;
150 uint64_t seq_count;
4134455f 151 sector_t data_device_sectors;
48debafe
MP
152 void *block_start;
153 struct wc_entry *entries;
154 unsigned block_size;
155 unsigned char block_size_bits;
156
157 bool pmem_mode:1;
158 bool writeback_fua:1;
159
160 bool overwrote_committed:1;
161 bool memory_vmapped:1;
162
054bee16 163 bool start_sector_set:1;
48debafe
MP
164 bool high_wm_percent_set:1;
165 bool low_wm_percent_set:1;
166 bool max_writeback_jobs_set:1;
167 bool autocommit_blocks_set:1;
168 bool autocommit_time_set:1;
054bee16 169 bool max_age_set:1;
48debafe
MP
170 bool writeback_fua_set:1;
171 bool flush_on_suspend:1;
93de44eb 172 bool cleaner:1;
054bee16
MP
173 bool cleaner_set:1;
174
175 unsigned high_wm_percent_value;
176 unsigned low_wm_percent_value;
177 unsigned autocommit_time_value;
178 unsigned max_age_value;
48debafe
MP
179
180 unsigned writeback_all;
181 struct workqueue_struct *writeback_wq;
182 struct work_struct writeback_work;
183 struct work_struct flush_work;
184
185 struct dm_io_client *dm_io;
186
187 raw_spinlock_t endio_list_lock;
188 struct list_head endio_list;
189 struct task_struct *endio_thread;
190
191 struct task_struct *flush_thread;
192 struct bio_list flush_list;
193
194 struct dm_kcopyd_client *dm_kcopyd;
195 unsigned long *dirty_bitmap;
196 unsigned dirty_bitmap_size;
197
198 struct bio_set bio_set;
199 mempool_t copy_pool;
200};
201
202#define WB_LIST_INLINE 16
203
204struct writeback_struct {
205 struct list_head endio_entry;
206 struct dm_writecache *wc;
207 struct wc_entry **wc_list;
208 unsigned wc_list_n;
48debafe
MP
209 struct wc_entry *wc_list_inline[WB_LIST_INLINE];
210 struct bio bio;
211};
212
213struct copy_struct {
214 struct list_head endio_entry;
215 struct dm_writecache *wc;
216 struct wc_entry *e;
217 unsigned n_entries;
218 int error;
219};
220
221DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
222 "A percentage of time allocated for data copying");
223
224static void wc_lock(struct dm_writecache *wc)
225{
226 mutex_lock(&wc->lock);
227}
228
229static void wc_unlock(struct dm_writecache *wc)
230{
231 mutex_unlock(&wc->lock);
232}
233
234#ifdef DM_WRITECACHE_HAS_PMEM
235static int persistent_memory_claim(struct dm_writecache *wc)
236{
237 int r;
238 loff_t s;
239 long p, da;
240 pfn_t pfn;
241 int id;
242 struct page **pages;
f9e040ef 243 sector_t offset;
48debafe
MP
244
245 wc->memory_vmapped = false;
246
48debafe
MP
247 s = wc->memory_map_size;
248 p = s >> PAGE_SHIFT;
249 if (!p) {
250 r = -EINVAL;
251 goto err1;
252 }
253 if (p != s >> PAGE_SHIFT) {
254 r = -EOVERFLOW;
255 goto err1;
256 }
257
f9e040ef
MP
258 offset = get_start_sect(wc->ssd_dev->bdev);
259 if (offset & (PAGE_SIZE / 512 - 1)) {
260 r = -EINVAL;
261 goto err1;
262 }
263 offset >>= PAGE_SHIFT - 9;
264
48debafe
MP
265 id = dax_read_lock();
266
f9e040ef 267 da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
48debafe
MP
268 if (da < 0) {
269 wc->memory_map = NULL;
270 r = da;
271 goto err2;
272 }
273 if (!pfn_t_has_page(pfn)) {
274 wc->memory_map = NULL;
275 r = -EOPNOTSUPP;
276 goto err2;
277 }
278 if (da != p) {
279 long i;
280 wc->memory_map = NULL;
50a7d3ba 281 pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
48debafe
MP
282 if (!pages) {
283 r = -ENOMEM;
284 goto err2;
285 }
286 i = 0;
287 do {
288 long daa;
f9e040ef 289 daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
f742267a 290 NULL, &pfn);
48debafe
MP
291 if (daa <= 0) {
292 r = daa ? daa : -EINVAL;
293 goto err3;
294 }
295 if (!pfn_t_has_page(pfn)) {
296 r = -EOPNOTSUPP;
297 goto err3;
298 }
299 while (daa-- && i < p) {
300 pages[i++] = pfn_t_to_page(pfn);
301 pfn.val++;
d35bd764
MP
302 if (!(i & 15))
303 cond_resched();
48debafe
MP
304 }
305 } while (i < p);
306 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
307 if (!wc->memory_map) {
308 r = -ENOMEM;
309 goto err3;
310 }
311 kvfree(pages);
312 wc->memory_vmapped = true;
313 }
314
315 dax_read_unlock(id);
d284f824
MP
316
317 wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
318 wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
319
48debafe
MP
320 return 0;
321err3:
322 kvfree(pages);
323err2:
324 dax_read_unlock(id);
325err1:
326 return r;
327}
328#else
329static int persistent_memory_claim(struct dm_writecache *wc)
330{
857c4c0a 331 return -EOPNOTSUPP;
48debafe
MP
332}
333#endif
334
335static void persistent_memory_release(struct dm_writecache *wc)
336{
337 if (wc->memory_vmapped)
d284f824 338 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
48debafe
MP
339}
340
/* Return the struct page backing addr, for vmalloc and other addresses. */
static struct page *persistent_memory_page(void *addr)
{
	return is_vmalloc_addr(addr) ? vmalloc_to_page(addr) : virt_to_page(addr);
}
348
349static unsigned persistent_memory_page_offset(void *addr)
350{
351 return (unsigned long)addr & (PAGE_SIZE - 1);
352}
353
354static void persistent_memory_flush_cache(void *ptr, size_t size)
355{
356 if (is_vmalloc_addr(ptr))
357 flush_kernel_vmap_range(ptr, size);
358}
359
360static void persistent_memory_invalidate_cache(void *ptr, size_t size)
361{
362 if (is_vmalloc_addr(ptr))
363 invalidate_kernel_vmap_range(ptr, size);
364}
365
366static struct wc_memory_superblock *sb(struct dm_writecache *wc)
367{
368 return wc->memory_map;
369}
370
371static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
372{
da4ad3a2 373 return &sb(wc)->entries[e->index];
48debafe
MP
374}
375
376static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
377{
378 return (char *)wc->block_start + (e->index << wc->block_size_bits);
379}
380
381static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
382{
d284f824 383 return wc->start_sector + wc->metadata_sectors +
48debafe
MP
384 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
385}
386
387static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
388{
389#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
390 return e->original_sector;
391#else
392 return le64_to_cpu(memory_entry(wc, e)->original_sector);
393#endif
394}
395
396static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
397{
398#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
399 return e->seq_count;
400#else
401 return le64_to_cpu(memory_entry(wc, e)->seq_count);
402#endif
403}
404
405static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
406{
407#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
408 e->seq_count = -1;
409#endif
410 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
411}
412
413static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
414 uint64_t original_sector, uint64_t seq_count)
415{
416 struct wc_memory_entry me;
417#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
418 e->original_sector = original_sector;
419 e->seq_count = seq_count;
420#endif
421 me.original_sector = cpu_to_le64(original_sector);
422 me.seq_count = cpu_to_le64(seq_count);
423 pmem_assign(*memory_entry(wc, e), me);
424}
425
/*
 * Record an error on the target. Only the call that changes wc->error from 0
 * stores its code and logs the message; later calls keep the first code.
 * Waiters on freelist_wait are woken on every call.
 */
#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

/* Non-zero once writecache_error() has recorded an error. */
#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
434
435static void writecache_flush_all_metadata(struct dm_writecache *wc)
436{
437 if (!WC_MODE_PMEM(wc))
438 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
439}
440
441static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
442{
443 if (!WC_MODE_PMEM(wc))
444 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
445 wc->dirty_bitmap);
446}
447
448static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
449
450struct io_notify {
451 struct dm_writecache *wc;
452 struct completion c;
453 atomic_t count;
454};
455
456static void writecache_notify_io(unsigned long error, void *context)
457{
458 struct io_notify *endio = context;
459
460 if (unlikely(error != 0))
461 writecache_error(endio->wc, -EIO, "error writing metadata");
462 BUG_ON(atomic_read(&endio->count) <= 0);
463 if (atomic_dec_and_test(&endio->count))
464 complete(&endio->c);
465}
466
aa950920
MP
467static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
468{
469 wait_event(wc->bio_in_progress_wait[direction],
470 !atomic_read(&wc->bio_in_progress[direction]));
471}
472
473static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
48debafe
MP
474{
475 struct dm_io_region region;
476 struct dm_io_request req;
477 struct io_notify endio = {
478 wc,
479 COMPLETION_INITIALIZER_ONSTACK(endio.c),
480 ATOMIC_INIT(1),
481 };
1e1132ea 482 unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
48debafe
MP
483 unsigned i = 0;
484
485 while (1) {
486 unsigned j;
487 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
488 if (unlikely(i == bitmap_bits))
489 break;
490 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
491
492 region.bdev = wc->ssd_dev->bdev;
493 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
494 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
495
496 if (unlikely(region.sector >= wc->metadata_sectors))
497 break;
498 if (unlikely(region.sector + region.count > wc->metadata_sectors))
499 region.count = wc->metadata_sectors - region.sector;
500
d284f824 501 region.sector += wc->start_sector;
48debafe
MP
502 atomic_inc(&endio.count);
503 req.bi_op = REQ_OP_WRITE;
504 req.bi_op_flags = REQ_SYNC;
505 req.mem.type = DM_IO_VMA;
506 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
507 req.client = wc->dm_io;
508 req.notify.fn = writecache_notify_io;
509 req.notify.context = &endio;
510
511 /* writing via async dm-io (implied by notify.fn above) won't return an error */
512 (void) dm_io(&req, 1, &region, NULL);
513 i = j;
514 }
515
516 writecache_notify_io(0, &endio);
517 wait_for_completion_io(&endio.c);
518
aa950920
MP
519 if (wait_for_ios)
520 writecache_wait_for_ios(wc, WRITE);
521
48debafe
MP
522 writecache_disk_flush(wc, wc->ssd_dev);
523
524 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
525}
526
dc8a01ae
MP
527static void ssd_commit_superblock(struct dm_writecache *wc)
528{
529 int r;
530 struct dm_io_region region;
531 struct dm_io_request req;
532
533 region.bdev = wc->ssd_dev->bdev;
534 region.sector = 0;
991bd8d7 535 region.count = wc->block_size >> SECTOR_SHIFT;
dc8a01ae
MP
536 region.sector += wc->start_sector;
537
538 req.bi_op = REQ_OP_WRITE;
539 req.bi_op_flags = REQ_SYNC | REQ_FUA;
540 req.mem.type = DM_IO_VMA;
541 req.mem.ptr.vma = (char *)wc->memory_map;
542 req.client = wc->dm_io;
543 req.notify.fn = NULL;
544 req.notify.context = NULL;
545
546 r = dm_io(&req, 1, &region, NULL);
547 if (unlikely(r))
548 writecache_error(wc, r, "error writing superblock");
549}
550
aa950920 551static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
48debafe
MP
552{
553 if (WC_MODE_PMEM(wc))
3e79f082 554 pmem_wmb();
48debafe 555 else
aa950920 556 ssd_commit_flushed(wc, wait_for_ios);
48debafe
MP
557}
558
559static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
560{
561 int r;
562 struct dm_io_region region;
563 struct dm_io_request req;
564
565 region.bdev = dev->bdev;
566 region.sector = 0;
567 region.count = 0;
568 req.bi_op = REQ_OP_WRITE;
569 req.bi_op_flags = REQ_PREFLUSH;
570 req.mem.type = DM_IO_KMEM;
571 req.mem.ptr.addr = NULL;
572 req.client = wc->dm_io;
573 req.notify.fn = NULL;
574
575 r = dm_io(&req, 1, &region, NULL);
576 if (unlikely(r))
577 writecache_error(wc, r, "error flushing metadata: %d", r);
578}
579
48debafe
MP
580#define WFE_RETURN_FOLLOWING 1
581#define WFE_LOWEST_SEQ 2
582
583static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
584 uint64_t block, int flags)
585{
586 struct wc_entry *e;
587 struct rb_node *node = wc->tree.rb_node;
588
589 if (unlikely(!node))
590 return NULL;
591
592 while (1) {
593 e = container_of(node, struct wc_entry, rb_node);
594 if (read_original_sector(wc, e) == block)
595 break;
f8011d33 596
48debafe
MP
597 node = (read_original_sector(wc, e) >= block ?
598 e->rb_node.rb_left : e->rb_node.rb_right);
599 if (unlikely(!node)) {
f8011d33 600 if (!(flags & WFE_RETURN_FOLLOWING))
48debafe 601 return NULL;
48debafe 602 if (read_original_sector(wc, e) >= block) {
f8011d33 603 return e;
48debafe
MP
604 } else {
605 node = rb_next(&e->rb_node);
f8011d33 606 if (unlikely(!node))
48debafe 607 return NULL;
48debafe 608 e = container_of(node, struct wc_entry, rb_node);
f8011d33 609 return e;
48debafe
MP
610 }
611 }
612 }
613
614 while (1) {
615 struct wc_entry *e2;
616 if (flags & WFE_LOWEST_SEQ)
617 node = rb_prev(&e->rb_node);
618 else
619 node = rb_next(&e->rb_node);
84420b1e 620 if (unlikely(!node))
48debafe
MP
621 return e;
622 e2 = container_of(node, struct wc_entry, rb_node);
623 if (read_original_sector(wc, e2) != block)
624 return e;
625 e = e2;
626 }
627}
628
629static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
630{
631 struct wc_entry *e;
632 struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
633
634 while (*node) {
635 e = container_of(*node, struct wc_entry, rb_node);
636 parent = &e->rb_node;
637 if (read_original_sector(wc, e) > read_original_sector(wc, ins))
638 node = &parent->rb_left;
639 else
640 node = &parent->rb_right;
641 }
642 rb_link_node(&ins->rb_node, parent, node);
643 rb_insert_color(&ins->rb_node, &wc->tree);
644 list_add(&ins->lru, &wc->lru);
3923d485 645 ins->age = jiffies;
48debafe
MP
646}
647
648static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
649{
650 list_del(&e->lru);
651 rb_erase(&e->rb_node, &wc->tree);
652}
653
654static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
655{
656 if (WC_MODE_SORT_FREELIST(wc)) {
657 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
658 if (unlikely(!*node))
659 wc->current_free = e;
660 while (*node) {
661 parent = *node;
662 if (&e->rb_node < *node)
663 node = &parent->rb_left;
664 else
665 node = &parent->rb_right;
666 }
667 rb_link_node(&e->rb_node, parent, node);
668 rb_insert_color(&e->rb_node, &wc->freetree);
669 } else {
670 list_add_tail(&e->lru, &wc->freelist);
671 }
672 wc->freelist_size++;
673}
674
41c526c5
MP
675static inline void writecache_verify_watermark(struct dm_writecache *wc)
676{
677 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
678 queue_work(wc->writeback_wq, &wc->writeback_work);
679}
680
3923d485
MP
681static void writecache_max_age_timer(struct timer_list *t)
682{
683 struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
684
685 if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
686 queue_work(wc->writeback_wq, &wc->writeback_work);
687 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
688 }
689}
690
dcd19507 691static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
48debafe
MP
692{
693 struct wc_entry *e;
694
695 if (WC_MODE_SORT_FREELIST(wc)) {
696 struct rb_node *next;
697 if (unlikely(!wc->current_free))
698 return NULL;
699 e = wc->current_free;
dcd19507
MP
700 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
701 return NULL;
48debafe
MP
702 next = rb_next(&e->rb_node);
703 rb_erase(&e->rb_node, &wc->freetree);
704 if (unlikely(!next))
705 next = rb_first(&wc->freetree);
706 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
707 } else {
708 if (unlikely(list_empty(&wc->freelist)))
709 return NULL;
710 e = container_of(wc->freelist.next, struct wc_entry, lru);
dcd19507
MP
711 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
712 return NULL;
48debafe
MP
713 list_del(&e->lru);
714 }
715 wc->freelist_size--;
41c526c5
MP
716
717 writecache_verify_watermark(wc);
48debafe
MP
718
719 return e;
720}
721
722static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
723{
724 writecache_unlink(wc, e);
725 writecache_add_to_freelist(wc, e);
726 clear_seq_count(wc, e);
727 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
728 if (unlikely(waitqueue_active(&wc->freelist_wait)))
729 wake_up(&wc->freelist_wait);
730}
731
732static void writecache_wait_on_freelist(struct dm_writecache *wc)
733{
734 DEFINE_WAIT(wait);
735
736 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
737 wc_unlock(wc);
738 io_schedule();
739 finish_wait(&wc->freelist_wait, &wait);
740 wc_lock(wc);
741}
742
743static void writecache_poison_lists(struct dm_writecache *wc)
744{
745 /*
746 * Catch incorrect access to these values while the device is suspended.
747 */
748 memset(&wc->tree, -1, sizeof wc->tree);
749 wc->lru.next = LIST_POISON1;
750 wc->lru.prev = LIST_POISON2;
751 wc->freelist.next = LIST_POISON1;
752 wc->freelist.prev = LIST_POISON2;
753}
754
755static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
756{
757 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
758 if (WC_MODE_PMEM(wc))
759 writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
760}
761
762static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
763{
764 return read_seq_count(wc, e) < wc->seq_count;
765}
766
767static void writecache_flush(struct dm_writecache *wc)
768{
769 struct wc_entry *e, *e2;
770 bool need_flush_after_free;
771
772 wc->uncommitted_blocks = 0;
773 del_timer(&wc->autocommit_timer);
774
775 if (list_empty(&wc->lru))
776 return;
777
778 e = container_of(wc->lru.next, struct wc_entry, lru);
779 if (writecache_entry_is_committed(wc, e)) {
780 if (wc->overwrote_committed) {
781 writecache_wait_for_ios(wc, WRITE);
782 writecache_disk_flush(wc, wc->ssd_dev);
783 wc->overwrote_committed = false;
784 }
785 return;
786 }
787 while (1) {
788 writecache_flush_entry(wc, e);
789 if (unlikely(e->lru.next == &wc->lru))
790 break;
791 e2 = container_of(e->lru.next, struct wc_entry, lru);
792 if (writecache_entry_is_committed(wc, e2))
793 break;
794 e = e2;
795 cond_resched();
796 }
aa950920 797 writecache_commit_flushed(wc, true);
48debafe
MP
798
799 wc->seq_count++;
800 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
dc8a01ae
MP
801 if (WC_MODE_PMEM(wc))
802 writecache_commit_flushed(wc, false);
803 else
804 ssd_commit_superblock(wc);
48debafe
MP
805
806 wc->overwrote_committed = false;
807
808 need_flush_after_free = false;
809 while (1) {
810 /* Free another committed entry with lower seq-count */
811 struct rb_node *rb_node = rb_prev(&e->rb_node);
812
813 if (rb_node) {
814 e2 = container_of(rb_node, struct wc_entry, rb_node);
815 if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
816 likely(!e2->write_in_progress)) {
817 writecache_free_entry(wc, e2);
818 need_flush_after_free = true;
819 }
820 }
821 if (unlikely(e->lru.prev == &wc->lru))
822 break;
823 e = container_of(e->lru.prev, struct wc_entry, lru);
824 cond_resched();
825 }
826
827 if (need_flush_after_free)
aa950920 828 writecache_commit_flushed(wc, false);
48debafe
MP
829}
830
831static void writecache_flush_work(struct work_struct *work)
832{
833 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
834
835 wc_lock(wc);
836 writecache_flush(wc);
837 wc_unlock(wc);
838}
839
840static void writecache_autocommit_timer(struct timer_list *t)
841{
842 struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
843 if (!writecache_has_error(wc))
844 queue_work(wc->writeback_wq, &wc->flush_work);
845}
846
847static void writecache_schedule_autocommit(struct dm_writecache *wc)
848{
849 if (!timer_pending(&wc->autocommit_timer))
850 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
851}
852
853static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
854{
855 struct wc_entry *e;
856 bool discarded_something = false;
857
858 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
859 if (unlikely(!e))
860 return;
861
862 while (read_original_sector(wc, e) < end) {
863 struct rb_node *node = rb_next(&e->rb_node);
864
865 if (likely(!e->write_in_progress)) {
866 if (!discarded_something) {
a143e172
HY
867 if (!WC_MODE_PMEM(wc)) {
868 writecache_wait_for_ios(wc, READ);
869 writecache_wait_for_ios(wc, WRITE);
870 }
48debafe
MP
871 discarded_something = true;
872 }
39495b12
HY
873 if (!writecache_entry_is_committed(wc, e))
874 wc->uncommitted_blocks--;
48debafe
MP
875 writecache_free_entry(wc, e);
876 }
877
84420b1e 878 if (unlikely(!node))
48debafe
MP
879 break;
880
881 e = container_of(node, struct wc_entry, rb_node);
882 }
883
884 if (discarded_something)
aa950920 885 writecache_commit_flushed(wc, false);
48debafe
MP
886}
887
888static bool writecache_wait_for_writeback(struct dm_writecache *wc)
889{
890 if (wc->writeback_size) {
891 writecache_wait_on_freelist(wc);
892 return true;
893 }
894 return false;
895}
896
897static void writecache_suspend(struct dm_target *ti)
898{
899 struct dm_writecache *wc = ti->private;
900 bool flush_on_suspend;
901
902 del_timer_sync(&wc->autocommit_timer);
3923d485 903 del_timer_sync(&wc->max_age_timer);
48debafe
MP
904
905 wc_lock(wc);
906 writecache_flush(wc);
907 flush_on_suspend = wc->flush_on_suspend;
908 if (flush_on_suspend) {
909 wc->flush_on_suspend = false;
910 wc->writeback_all++;
911 queue_work(wc->writeback_wq, &wc->writeback_work);
912 }
913 wc_unlock(wc);
914
adc0daad 915 drain_workqueue(wc->writeback_wq);
48debafe
MP
916
917 wc_lock(wc);
918 if (flush_on_suspend)
919 wc->writeback_all--;
920 while (writecache_wait_for_writeback(wc));
921
922 if (WC_MODE_PMEM(wc))
923 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
924
925 writecache_poison_lists(wc);
926
927 wc_unlock(wc);
928}
929
930static int writecache_alloc_entries(struct dm_writecache *wc)
931{
932 size_t b;
933
934 if (wc->entries)
935 return 0;
50a7d3ba 936 wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
48debafe
MP
937 if (!wc->entries)
938 return -ENOMEM;
939 for (b = 0; b < wc->n_blocks; b++) {
940 struct wc_entry *e = &wc->entries[b];
941 e->index = b;
942 e->write_in_progress = false;
1edaa447 943 cond_resched();
48debafe
MP
944 }
945
946 return 0;
947}
948
31b22120
MP
949static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
950{
951 struct dm_io_region region;
952 struct dm_io_request req;
953
954 region.bdev = wc->ssd_dev->bdev;
955 region.sector = wc->start_sector;
956 region.count = n_sectors;
957 req.bi_op = REQ_OP_READ;
958 req.bi_op_flags = REQ_SYNC;
959 req.mem.type = DM_IO_VMA;
960 req.mem.ptr.vma = (char *)wc->memory_map;
961 req.client = wc->dm_io;
962 req.notify.fn = NULL;
963
964 return dm_io(&req, 1, &region, NULL);
965}
966
48debafe
MP
/*
 * Resume hook: rebuild the in-core index from the on-media metadata.
 *
 * Runs entirely under the target lock. The sector tree, the LRU list and the
 * freelist are re-initialized, the sequence count is read from the
 * superblock, and every entry is then either put on the freelist or inserted
 * into the tree. If any metadata was changed in the process it is committed
 * before returning.
 */
967static void writecache_resume(struct dm_target *ti)
968{
969 struct dm_writecache *wc = ti->private;
970 size_t b;
971 bool need_flush = false;
972 __le64 sb_seq_count;
973 int r;
974
975 wc_lock(wc);
976
d9928ac5 977 wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
4134455f 978
31b22120 979 if (WC_MODE_PMEM(wc)) {
48debafe 980 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
31b22120
MP
981 } else {
/*
 * SSD mode: re-read the metadata. On failure the error is recorded and
 * everything after the superblock header is filled with 0xff bytes.
 */
982 r = writecache_read_metadata(wc, wc->metadata_sectors);
983 if (r) {
984 size_t sb_entries_offset;
985 writecache_error(wc, r, "unable to read metadata: %d", r);
986 sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
987 memset((char *)wc->memory_map + sb_entries_offset, -1,
988 (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
989 }
990 }
48debafe
MP
991
/* Start from empty in-core structures. */
992 wc->tree = RB_ROOT;
993 INIT_LIST_HEAD(&wc->lru);
994 if (WC_MODE_SORT_FREELIST(wc)) {
995 wc->freetree = RB_ROOT;
996 wc->current_free = NULL;
997 } else {
998 INIT_LIST_HEAD(&wc->freelist);
999 }
1000 wc->freelist_size = 0;
1001
ec6347bb
DW
/* Read the superblock sequence count; fall back to 0 on a copy error. */
1002 r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
1003 sizeof(uint64_t));
48debafe
MP
1004 if (r) {
1005 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
1006 sb_seq_count = cpu_to_le64(0);
1007 }
1008 wc->seq_count = le64_to_cpu(sb_seq_count);
1009
1010#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
/*
 * Fill the in-core copies of each entry's metadata. Once an error has been
 * recorded, the remaining entries are set to -1 without reading them.
 */
1011 for (b = 0; b < wc->n_blocks; b++) {
1012 struct wc_entry *e = &wc->entries[b];
1013 struct wc_memory_entry wme;
1014 if (writecache_has_error(wc)) {
1015 e->original_sector = -1;
1016 e->seq_count = -1;
1017 continue;
1018 }
ec6347bb
DW
1019 r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
1020 sizeof(struct wc_memory_entry));
48debafe
MP
1021 if (r) {
1022 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
1023 (unsigned long)b, r);
1024 e->original_sector = -1;
1025 e->seq_count = -1;
1026 } else {
1027 e->original_sector = le64_to_cpu(wme.original_sector);
1028 e->seq_count = le64_to_cpu(wme.seq_count);
1029 }
1edaa447 1030 cond_resched();
48debafe
MP
1031 }
1032#endif
/*
 * Classify every entry. Uncommitted entries go to the freelist, and their
 * sequence count is cleared first unless it is already -1. Committed
 * entries are inserted into the tree; when two share a sector, the one with
 * the lower sequence count is erased and freed.
 */
1033 for (b = 0; b < wc->n_blocks; b++) {
1034 struct wc_entry *e = &wc->entries[b];
1035 if (!writecache_entry_is_committed(wc, e)) {
1036 if (read_seq_count(wc, e) != -1) {
/* Also entered by goto from the duplicate-sector case below. */
1037erase_this:
1038 clear_seq_count(wc, e);
1039 need_flush = true;
1040 }
1041 writecache_add_to_freelist(wc, e);
1042 } else {
1043 struct wc_entry *old;
1044
1045 old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
1046 if (!old) {
1047 writecache_insert_entry(wc, e);
1048 } else {
/* Equal sequence counts are reported; e then replaces old below. */
1049 if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
1050 writecache_error(wc, -EINVAL,
1051 "two identical entries, position %llu, sector %llu, sequence %llu",
1052 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
1053 (unsigned long long)read_seq_count(wc, e));
1054 }
1055 if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
1056 goto erase_this;
1057 } else {
1058 writecache_free_entry(wc, old);
1059 writecache_insert_entry(wc, e);
1060 need_flush = true;
1061 }
1062 }
1063 }
1064 cond_resched();
1065 }
1066
/* Persist any metadata changed above. */
1067 if (need_flush) {
1068 writecache_flush_all_metadata(wc);
aa950920 1069 writecache_commit_flushed(wc, false);
48debafe
MP
1070 }
1071
41c526c5
MP
1072 writecache_verify_watermark(wc);
1073
3923d485
MP
/* Arm the max-age timer only when an age limit is set. */
1074 if (wc->max_age != MAX_AGE_UNSPECIFIED)
1075 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
1076
48debafe
MP
1077 wc_unlock(wc);
1078}
1079
1080static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1081{
1082 if (argc != 1)
1083 return -EINVAL;
1084
1085 wc_lock(wc);
1086 if (dm_suspended(wc->ti)) {
1087 wc_unlock(wc);
1088 return -EBUSY;
1089 }
1090 if (writecache_has_error(wc)) {
1091 wc_unlock(wc);
1092 return -EIO;
1093 }
1094
1095 writecache_flush(wc);
1096 wc->writeback_all++;
1097 queue_work(wc->writeback_wq, &wc->writeback_work);
1098 wc_unlock(wc);
1099
1100 flush_workqueue(wc->writeback_wq);
1101
1102 wc_lock(wc);
1103 wc->writeback_all--;
1104 if (writecache_has_error(wc)) {
1105 wc_unlock(wc);
1106 return -EIO;
1107 }
1108 wc_unlock(wc);
1109
1110 return 0;
1111}
1112
1113static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1114{
1115 if (argc != 1)
1116 return -EINVAL;
1117
1118 wc_lock(wc);
1119 wc->flush_on_suspend = true;
1120 wc_unlock(wc);
1121
1122 return 0;
1123}
1124
93de44eb
MP
1125static void activate_cleaner(struct dm_writecache *wc)
1126{
1127 wc->flush_on_suspend = true;
1128 wc->cleaner = true;
1129 wc->freelist_high_watermark = wc->n_blocks;
1130 wc->freelist_low_watermark = wc->n_blocks;
1131}
1132
1133static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1134{
1135 if (argc != 1)
1136 return -EINVAL;
1137
1138 wc_lock(wc);
1139 activate_cleaner(wc);
1140 if (!dm_suspended(wc->ti))
1141 writecache_verify_watermark(wc);
1142 wc_unlock(wc);
1143
1144 return 0;
1145}
1146
48debafe
MP
1147static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1148 char *result, unsigned maxlen)
1149{
1150 int r = -EINVAL;
1151 struct dm_writecache *wc = ti->private;
1152
1153 if (!strcasecmp(argv[0], "flush"))
1154 r = process_flush_mesg(argc, argv, wc);
1155 else if (!strcasecmp(argv[0], "flush_on_suspend"))
1156 r = process_flush_on_suspend_mesg(argc, argv, wc);
93de44eb
MP
1157 else if (!strcasecmp(argv[0], "cleaner"))
1158 r = process_cleaner_mesg(argc, argv, wc);
48debafe
MP
1159 else
1160 DMERR("unrecognised message received: %s", argv[0]);
1161
1162 return r;
1163}
1164
48338daa
MP
1165static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
1166{
1167 /*
1168 * clflushopt performs better with block size 1024, 2048, 4096
1169 * non-temporal stores perform better with block size 512
1170 *
1171 * block size 512 1024 2048 4096
1172 * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s
1173 * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s
1174 *
1175 * We see that movnti performs better for 512-byte blocks, and
1176 * clflushopt performs better for 1024-byte and larger blocks. So, we
1177 * prefer clflushopt for sizes >= 768.
1178 *
1179 * NOTE: this happens to be the case now (with dm-writecache's single
1180 * threaded model) but re-evaluate this once memcpy_flushcache() is
1181 * enabled to use movdir64b which might invalidate this performance
1182 * advantage seen with cache-allocating-writes plus flushing.
1183 */
1184#ifdef CONFIG_X86
1185 if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
1186 likely(boot_cpu_data.x86_clflush_size == 64) &&
1187 likely(size >= 768)) {
1188 do {
1189 memcpy((void *)dest, (void *)source, 64);
1190 clflushopt((void *)dest);
1191 dest += 64;
1192 source += 64;
1193 size -= 64;
1194 } while (size >= 64);
1195 return;
1196 }
1197#endif
1198 memcpy_flushcache(dest, source, size);
1199}
1200
48debafe
MP
1201static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1202{
1203 void *buf;
1204 unsigned long flags;
1205 unsigned size;
1206 int rw = bio_data_dir(bio);
1207 unsigned remaining_size = wc->block_size;
1208
1209 do {
1210 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1211 buf = bvec_kmap_irq(&bv, &flags);
1212 size = bv.bv_len;
1213 if (unlikely(size > remaining_size))
1214 size = remaining_size;
1215
1216 if (rw == READ) {
1217 int r;
ec6347bb 1218 r = copy_mc_to_kernel(buf, data, size);
48debafe
MP
1219 flush_dcache_page(bio_page(bio));
1220 if (unlikely(r)) {
1221 writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1222 bio->bi_status = BLK_STS_IOERR;
1223 }
1224 } else {
1225 flush_dcache_page(bio_page(bio));
48338daa 1226 memcpy_flushcache_optimized(data, buf, size);
48debafe
MP
1227 }
1228
1229 bvec_kunmap_irq(buf, &flags);
1230
1231 data = (char *)data + size;
1232 remaining_size -= size;
1233 bio_advance(bio, size);
1234 } while (unlikely(remaining_size));
1235}
1236
/*
 * Thread loop that services the bios queued on wc->flush_list (filled by
 * writecache_offload_bio()).
 *
 * Each bio popped under the writecache lock is handled as follows:
 *  - REQ_OP_DISCARD: the matching cache range is discarded, then the bio is
 *    pointed at the origin device (wc->dev) and resubmitted;
 *  - anything else: writecache_flush() is run and the bio is completed here,
 *    with BLK_STS_IOERR if the cache is in an error state.
 *
 * When the list is empty the thread sleeps; it leaves the loop and returns 0
 * once kthread_should_stop() is true.
 */
1237static int writecache_flush_thread(void *data)
1238{
1239 struct dm_writecache *wc = data;
1240
1241 while (1) {
1242 struct bio *bio;
1243
1244 wc_lock(wc);
1245 bio = bio_list_pop(&wc->flush_list);
1246 if (!bio) {
/*
 * The task state is changed before the lock is dropped.
 * NOTE(review): writecache_map() calls writecache_offload_bio() with the
 * lock held, so its wake_up_process() should not be able to land between
 * the empty check above and this state change -- confirm for other callers.
 */
1247 set_current_state(TASK_INTERRUPTIBLE);
1248 wc_unlock(wc);
1249
1250 if (unlikely(kthread_should_stop())) {
1251 set_current_state(TASK_RUNNING);
1252 break;
1253 }
1254
1255 schedule();
1256 continue;
1257 }
1258
1259 if (bio_op(bio) == REQ_OP_DISCARD) {
1260 writecache_discard(wc, bio->bi_iter.bi_sector,
1261 bio_end_sector(bio));
1262 wc_unlock(wc);
/* forward the discard itself to the origin device */
1263 bio_set_dev(bio, wc->dev->bdev);
ed00aabd 1264 submit_bio_noacct(bio);
48debafe
MP
1265 } else {
1266 writecache_flush(wc);
1267 wc_unlock(wc);
1268 if (writecache_has_error(wc))
1269 bio->bi_status = BLK_STS_IOERR;
1270 bio_endio(bio);
1271 }
1272 }
1273
1274 return 0;
1275}
1276
1277static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1278{
1279 if (bio_list_empty(&wc->flush_list))
1280 wake_up_process(wc->flush_thread);
1281 bio_list_add(&wc->flush_list, bio);
1282}
1283
/*
 * Map an incoming bio.  All decisions are made under the writecache lock.
 *
 * Paths, in the order they are tested:
 *  - REQ_PREFLUSH: in pmem mode the cache is flushed inline and the bio is
 *    completed; otherwise the bio is queued for the flush thread.
 *  - Misaligned I/O (start sector or length not a multiple of the cache
 *    block size) is failed.
 *  - REQ_OP_DISCARD: in pmem mode the cached range is discarded and the bio
 *    is remapped to the origin device; otherwise it is queued for the flush
 *    thread.
 *  - READ: on a cache hit the data is copied from pmem block by block, or
 *    (ssd mode) one block is remapped to the cache device.  On a miss the
 *    bio is trimmed so that it ends before the next cached block and is
 *    remapped to the origin device.
 *  - WRITE: a cache entry is reused or taken from the freelist and the data
 *    is copied (pmem) or the bio is remapped to the cache device (ssd).  If
 *    no entry is available the write either goes straight to the origin
 *    device or waits on the freelist.
 *
 * Returns DM_MAPIO_REMAPPED when the bio was redirected to a device, or
 * DM_MAPIO_SUBMITTED when it was completed, failed or queued here.
 */
1284static int writecache_map(struct dm_target *ti, struct bio *bio)
1285{
1286 struct wc_entry *e;
1287 struct dm_writecache *wc = ti->private;
1288
/* bi_private != NULL is the marker writecache_end_io() looks for */
1289 bio->bi_private = NULL;
1290
1291 wc_lock(wc);
1292
1293 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1294 if (writecache_has_error(wc))
1295 goto unlock_error;
1296 if (WC_MODE_PMEM(wc)) {
1297 writecache_flush(wc);
1298 if (writecache_has_error(wc))
1299 goto unlock_error;
1300 goto unlock_submit;
1301 } else {
1302 writecache_offload_bio(wc, bio);
1303 goto unlock_return;
1304 }
1305 }
1306
1307 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1308
/* both the start sector and the length must be multiples of the block size */
1309 if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1310 (wc->block_size / 512 - 1)) != 0)) {
1311 DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1312 (unsigned long long)bio->bi_iter.bi_sector,
1313 bio->bi_iter.bi_size, wc->block_size);
1314 goto unlock_error;
1315 }
1316
1317 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1318 if (writecache_has_error(wc))
1319 goto unlock_error;
1320 if (WC_MODE_PMEM(wc)) {
1321 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1322 goto unlock_remap_origin;
1323 } else {
1324 writecache_offload_bio(wc, bio);
1325 goto unlock_return;
1326 }
1327 }
1328
1329 if (bio_data_dir(bio) == READ) {
1330read_next_block:
1331 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1332 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
/* cache hit for the block at the current position */
1333 if (WC_MODE_PMEM(wc)) {
1334 bio_copy_block(wc, bio, memory_data(wc, e));
1335 if (bio->bi_iter.bi_size)
1336 goto read_next_block;
1337 goto unlock_submit;
1338 } else {
1339 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1340 bio_set_dev(bio, wc->ssd_dev->bdev);
1341 bio->bi_iter.bi_sector = cache_sector(wc, e);
1342 if (!writecache_entry_is_committed(wc, e))
1343 writecache_wait_for_ios(wc, WRITE);
1344 goto unlock_remap;
1345 }
1346 } else {
/* miss: read from the origin, but stop before the next cached block */
1347 if (e) {
1348 sector_t next_boundary =
1349 read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1350 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1351 dm_accept_partial_bio(bio, next_boundary);
1352 }
1353 }
1354 goto unlock_remap_origin;
1355 }
1356 } else {
1357 do {
/*
 * found_entry: an entry for this sector exists but cannot be overwritten
 * in place, so a fresh entry is needed.
 * search_used: the entry at bio_copy was found in the tree rather than
 * taken from the freelist.
 */
d53f1faf 1358 bool found_entry = false;
ee50cc19 1359 bool search_used = false;
48debafe
MP
1360 if (writecache_has_error(wc))
1361 goto unlock_error;
1362 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1363 if (e) {
ee50cc19
MP
1364 if (!writecache_entry_is_committed(wc, e)) {
1365 search_used = true;
48debafe 1366 goto bio_copy;
ee50cc19 1367 }
48debafe
MP
1368 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1369 wc->overwrote_committed = true;
ee50cc19 1370 search_used = true;
48debafe
MP
1371 goto bio_copy;
1372 }
d53f1faf 1373 found_entry = true;
93de44eb
MP
1374 } else {
/* in cleaner mode no new entries are created for uncached sectors */
1375 if (unlikely(wc->cleaner))
1376 goto direct_write;
48debafe 1377 }
dcd19507 1378 e = writecache_pop_from_freelist(wc, (sector_t)-1);
48debafe 1379 if (unlikely(!e)) {
d53f1faf 1380 if (!found_entry) {
93de44eb 1381direct_write:
/* write to the origin, trimmed so it ends before the next cached block */
d53f1faf
MP
1382 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1383 if (e) {
1384 sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1385 BUG_ON(!next_boundary);
1386 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1387 dm_accept_partial_bio(bio, next_boundary);
1388 }
1389 }
1390 goto unlock_remap_origin;
1391 }
48debafe
MP
1392 writecache_wait_on_freelist(wc);
1393 continue;
1394 }
1395 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1396 writecache_insert_entry(wc, e);
1397 wc->uncommitted_blocks++;
1398bio_copy:
1399 if (WC_MODE_PMEM(wc)) {
1400 bio_copy_block(wc, bio, memory_data(wc, e));
1401 } else {
/*
 * ssd mode: extend the write over as many further blocks as possible,
 * either by popping freelist entries at the following cache sectors or
 * (search_used) by walking existing entries that are adjacent both in the
 * entry array and in original-sector order.
 */
dcd19507
MP
1402 unsigned bio_size = wc->block_size;
1403 sector_t start_cache_sec = cache_sector(wc, e);
1404 sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
1405
1406 while (bio_size < bio->bi_iter.bi_size) {
ee50cc19
MP
1407 if (!search_used) {
1408 struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
1409 if (!f)
1410 break;
1411 write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
1412 (bio_size >> SECTOR_SHIFT), wc->seq_count);
1413 writecache_insert_entry(wc, f);
1414 wc->uncommitted_blocks++;
1415 } else {
1416 struct wc_entry *f;
1417 struct rb_node *next = rb_next(&e->rb_node);
1418 if (!next)
1419 break;
1420 f = container_of(next, struct wc_entry, rb_node);
1421 if (f != e + 1)
1422 break;
1423 if (read_original_sector(wc, f) !=
1424 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1425 break;
1426 if (unlikely(f->write_in_progress))
1427 break;
1428 if (writecache_entry_is_committed(wc, f))
1429 wc->overwrote_committed = true;
1430 e = f;
1431 }
dcd19507
MP
1432 bio_size += wc->block_size;
1433 current_cache_sec += wc->block_size >> SECTOR_SHIFT;
1434 }
1435
48debafe 1436 bio_set_dev(bio, wc->ssd_dev->bdev);
dcd19507
MP
1437 bio->bi_iter.bi_sector = start_cache_sec;
1438 dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
1439
48debafe
MP
1440 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1441 wc->uncommitted_blocks = 0;
1442 queue_work(wc->writeback_wq, &wc->flush_work);
1443 } else {
1444 writecache_schedule_autocommit(wc);
1445 }
1446 goto unlock_remap;
1447 }
1448 } while (bio->bi_iter.bi_size);
1449
/* pmem mode: the whole bio has been copied into the cache */
c1005322
MM
1450 if (unlikely(bio->bi_opf & REQ_FUA ||
1451 wc->uncommitted_blocks >= wc->autocommit_blocks))
48debafe
MP
1452 writecache_flush(wc);
1453 else
1454 writecache_schedule_autocommit(wc);
1455 goto unlock_submit;
1456 }
1457
/* redirect the bio to the origin device */
1458unlock_remap_origin:
1459 bio_set_dev(bio, wc->dev->bdev);
1460 wc_unlock(wc);
1461 return DM_MAPIO_REMAPPED;
1462
/* bio was redirected to the cache device; count it as in progress */
1463unlock_remap:
1464 /* make sure that writecache_end_io decrements bio_in_progress: */
1465 bio->bi_private = (void *)1;
1466 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1467 wc_unlock(wc);
1468 return DM_MAPIO_REMAPPED;
1469
/* bio was fully handled here; complete it */
1470unlock_submit:
1471 wc_unlock(wc);
1472 bio_endio(bio);
1473 return DM_MAPIO_SUBMITTED;
1474
/* bio was queued for the flush thread, which now owns it */
1475unlock_return:
1476 wc_unlock(wc);
1477 return DM_MAPIO_SUBMITTED;
1478
/* fail the bio */
1479unlock_error:
1480 wc_unlock(wc);
1481 bio_io_error(bio);
1482 return DM_MAPIO_SUBMITTED;
1483}
1484
1485static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1486{
1487 struct dm_writecache *wc = ti->private;
1488
1489 if (bio->bi_private != NULL) {
1490 int dir = bio_data_dir(bio);
1491 if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1492 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1493 wake_up(&wc->bio_in_progress_wait[dir]);
1494 }
1495 return 0;
1496}
1497
1498static int writecache_iterate_devices(struct dm_target *ti,
1499 iterate_devices_callout_fn fn, void *data)
1500{
1501 struct dm_writecache *wc = ti->private;
1502
1503 return fn(ti, wc->dev, 0, ti->len, data);
1504}
1505
1506static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1507{
1508 struct dm_writecache *wc = ti->private;
1509
1510 if (limits->logical_block_size < wc->block_size)
1511 limits->logical_block_size = wc->block_size;
1512
1513 if (limits->physical_block_size < wc->block_size)
1514 limits->physical_block_size = wc->block_size;
1515
1516 if (limits->io_min < wc->block_size)
1517 limits->io_min = wc->block_size;
1518}
1519
1520
1521static void writecache_writeback_endio(struct bio *bio)
1522{
1523 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1524 struct dm_writecache *wc = wb->wc;
1525 unsigned long flags;
1526
1527 raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1528 if (unlikely(list_empty(&wc->endio_list)))
1529 wake_up_process(wc->endio_thread);
1530 list_add_tail(&wb->endio_entry, &wc->endio_list);
1531 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1532}
1533
1534static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1535{
1536 struct copy_struct *c = ptr;
1537 struct dm_writecache *wc = c->wc;
1538
1539 c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1540
1541 raw_spin_lock_irq(&wc->endio_list_lock);
1542 if (unlikely(list_empty(&wc->endio_list)))
1543 wake_up_process(wc->endio_thread);
1544 list_add_tail(&c->endio_entry, &wc->endio_list);
1545 raw_spin_unlock_irq(&wc->endio_list_lock);
1546}
1547
/*
 * Finish a batch of completed writeback bios (pmem mode).
 *
 * @list must be non-empty: the first element is taken before the emptiness
 * check.  The caller in this file, writecache_endio_thread(), holds the
 * writecache lock.
 *
 * For every writeback_struct on @list: report a write error if the bio
 * failed, then for each entry it carried clear write_in_progress, free the
 * entry unless the cache is in an error state, and decrement
 * wc->writeback_size.  Finally the out-of-line entry array (if any) is freed
 * and the bio reference is dropped.
 */
1548static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1549{
1550 unsigned i;
1551 struct writeback_struct *wb;
1552 struct wc_entry *e;
1553 unsigned long n_walked = 0;
1554
1555 do {
1556 wb = list_entry(list->next, struct writeback_struct, endio_entry);
1557 list_del(&wb->endio_entry);
1558
1559 if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1560 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1561 "write error %d", wb->bio.bi_status);
1562 i = 0;
1563 do {
1564 e = wb->wc_list[i];
1565 BUG_ON(!e->write_in_progress);
1566 e->write_in_progress = false;
1567 INIT_LIST_HEAD(&e->lru);
1568 if (!writecache_has_error(wc))
1569 writecache_free_entry(wc, e);
1570 BUG_ON(!wc->writeback_size);
1571 wc->writeback_size--;
1572 n_walked++;
/*
 * Every ENDIO_LATENCY entries: commit, then drop and retake the lock.
 * NOTE(review): presumably this bounds how long other lock users wait.
 */
1573 if (unlikely(n_walked >= ENDIO_LATENCY)) {
aa950920 1574 writecache_commit_flushed(wc, false);
48debafe
MP
1575 wc_unlock(wc);
1576 wc_lock(wc);
1577 n_walked = 0;
1578 }
1579 } while (++i < wb->wc_list_n);
1580
/* wc_list was kmalloc'ed only when it did not fit the inline array */
1581 if (wb->wc_list != wb->wc_list_inline)
1582 kfree(wb->wc_list);
1583 bio_put(&wb->bio);
1584 } while (!list_empty(list));
1585}
1586
/*
 * Finish a batch of completed copy jobs (ssd mode).
 *
 * @list must be non-empty: the first element is taken before the emptiness
 * check.  The caller in this file, writecache_endio_thread(), holds the
 * writecache lock.
 *
 * For every copy_struct on @list: report a copy error if one was recorded,
 * then walk its c->n_entries entries, which are consecutive in memory
 * starting at c->e, clearing write_in_progress, freeing each entry unless
 * the cache is in an error state, and decrementing wc->writeback_size.  The
 * copy_struct is then returned to wc->copy_pool.
 */
1587static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1588{
1589 struct copy_struct *c;
1590 struct wc_entry *e;
1591
1592 do {
1593 c = list_entry(list->next, struct copy_struct, endio_entry);
1594 list_del(&c->endio_entry);
1595
1596 if (unlikely(c->error))
1597 writecache_error(wc, c->error, "copy error");
1598
1599 e = c->e;
1600 do {
1601 BUG_ON(!e->write_in_progress);
1602 e->write_in_progress = false;
1603 INIT_LIST_HEAD(&e->lru);
1604 if (!writecache_has_error(wc))
1605 writecache_free_entry(wc, e);
1606
1607 BUG_ON(!wc->writeback_size);
1608 wc->writeback_size--;
/* entries of one copy job are adjacent in the entry array */
1609 e++;
1610 } while (--c->n_entries);
1611 mempool_free(c, &wc->copy_pool);
1612 } while (!list_empty(list));
1613}
1614
/*
 * Thread loop that post-processes completed writeback I/O.
 *
 * Completions are queued on wc->endio_list by writecache_writeback_endio()
 * and writecache_copy_endio().  Each pass takes the whole list in one go,
 * calls writecache_disk_flush() on the origin device unless FUA mode is in
 * use, then under the writecache lock runs the pmem or ssd completion
 * handler and commits.
 *
 * Sleeps when the list is empty; returns 0 once kthread_should_stop() is
 * true.
 */
1615static int writecache_endio_thread(void *data)
1616{
1617 struct dm_writecache *wc = data;
1618
1619 while (1) {
1620 struct list_head list;
1621
1622 raw_spin_lock_irq(&wc->endio_list_lock);
1623 if (!list_empty(&wc->endio_list))
1624 goto pop_from_list;
/* state is changed while still holding the list lock the producers take */
1625 set_current_state(TASK_INTERRUPTIBLE);
1626 raw_spin_unlock_irq(&wc->endio_list_lock);
1627
1628 if (unlikely(kthread_should_stop())) {
1629 set_current_state(TASK_RUNNING);
1630 break;
1631 }
1632
1633 schedule();
1634
1635 continue;
1636
1637pop_from_list:
/*
 * Move the whole list onto the local head: copy the head, re-point the
 * first and last elements at the copy, then reset the shared head.
 */
1638 list = wc->endio_list;
1639 list.next->prev = list.prev->next = &list;
1640 INIT_LIST_HEAD(&wc->endio_list);
1641 raw_spin_unlock_irq(&wc->endio_list_lock);
1642
1643 if (!WC_MODE_FUA(wc))
1644 writecache_disk_flush(wc, wc->dev);
1645
1646 wc_lock(wc);
1647
1648 if (WC_MODE_PMEM(wc)) {
1649 __writecache_endio_pmem(wc, &list);
1650 } else {
1651 __writecache_endio_ssd(wc, &list);
1652 writecache_wait_for_ios(wc, READ);
1653 }
1654
aa950920 1655 writecache_commit_flushed(wc, false);
48debafe
MP
1656
1657 wc_unlock(wc);
1658 }
1659
1660 return 0;
1661}
1662
620cbe40 1663static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
48debafe
MP
1664{
1665 struct dm_writecache *wc = wb->wc;
1666 unsigned block_size = wc->block_size;
1667 void *address = memory_data(wc, e);
1668
1669 persistent_memory_flush_cache(address, block_size);
4134455f
MP
1670
1671 if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
1672 return true;
1673
48debafe
MP
1674 return bio_add_page(&wb->bio, persistent_memory_page(address),
1675 block_size, persistent_memory_page_offset(address)) != 0;
1676}
1677
/*
 * Batch of cache entries selected for writeback.  writecache_writeback()
 * fills it; the __writecache_writeback_{pmem,ssd}() helpers consume it.
 */
1678struct writeback_list {
/* entries, linked through wc_entry.lru */
1679 struct list_head list;
/* number of entries currently on the list */
1680 size_t size;
1681};
1682
1683static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1684{
1685 if (unlikely(wc->max_writeback_jobs)) {
1686 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1687 wc_lock(wc);
1688 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1689 writecache_wait_on_freelist(wc);
1690 wc_unlock(wc);
1691 }
1692 }
1693 cond_resched();
1694}
1695
/*
 * Build and submit writeback bios for the entries in @wbl (pmem mode).
 *
 * Entries are taken from the tail of wbl->list.  For each starting entry a
 * bio is allocated from wc->bio_set, aimed at the origin device at the
 * entry's original sector, and then extended with following list entries
 * for as long as their original sectors are consecutive, the entry array
 * has room, and wc_add_block() succeeds.  The bio is a write, with REQ_FUA
 * set when WC_MODE_FUA() is true.
 *
 * If the cache is in an error state the bio is completed with
 * BLK_STS_IOERR instead of being submitted; a bio that ended up with no
 * sectors is completed with BLK_STS_OK.
 */
1696static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1697{
1698 struct wc_entry *e, *f;
1699 struct bio *bio;
1700 struct writeback_struct *wb;
1701 unsigned max_pages;
1702
1703 while (wbl->size) {
1704 wbl->size--;
1705 e = container_of(wbl->list.prev, struct wc_entry, lru);
1706 list_del(&e->lru);
1707
1708 max_pages = e->wc_list_contiguous;
1709
1710 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1711 wb = container_of(bio, struct writeback_struct, bio);
1712 wb->wc = wc;
09f2d656
HY
1713 bio->bi_end_io = writecache_writeback_endio;
1714 bio_set_dev(bio, wc->dev->bdev);
1715 bio->bi_iter.bi_sector = read_original_sector(wc, e);
/*
 * Use the inline entry array when it is large enough, or as a fallback
 * when the larger array cannot be allocated; in both cases the bio is then
 * limited to WB_LIST_INLINE entries.
 */
48debafe 1716 if (max_pages <= WB_LIST_INLINE ||
50a7d3ba
KC
1717 unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1718 GFP_NOIO | __GFP_NORETRY |
1719 __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
48debafe
MP
1720 wb->wc_list = wb->wc_list_inline;
1721 max_pages = WB_LIST_INLINE;
1722 }
1723
620cbe40 1724 BUG_ON(!wc_add_block(wb, e));
48debafe
MP
1725
1726 wb->wc_list[0] = e;
1727 wb->wc_list_n = 1;
1728
1729 while (wbl->size && wb->wc_list_n < max_pages) {
1730 f = container_of(wbl->list.prev, struct wc_entry, lru);
1731 if (read_original_sector(wc, f) !=
1732 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1733 break;
620cbe40 1734 if (!wc_add_block(wb, f))
48debafe
MP
1735 break;
1736 wbl->size--;
1737 list_del(&f->lru);
1738 wb->wc_list[wb->wc_list_n++] = f;
1739 e = f;
1740 }
09f2d656 1741 bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
48debafe
MP
1742 if (writecache_has_error(wc)) {
1743 bio->bi_status = BLK_STS_IOERR;
09f2d656 1744 bio_endio(bio);
4134455f
MP
/* nothing was added, e.g. wc_add_block() skipped blocks past the device end */
1745 } else if (unlikely(!bio_sectors(bio))) {
1746 bio->bi_status = BLK_STS_OK;
1747 bio_endio(bio);
48debafe 1748 } else {
09f2d656 1749 submit_bio(bio);
48debafe
MP
1750 }
1751
1752 __writeback_throttle(wc, wbl);
1753 }
1754}
1755
/*
 * Start copy jobs for the entries in @wbl (ssd mode).
 *
 * Entries are taken from the tail of wbl->list.  Each starting entry
 * describes a run of e->wc_list_contiguous blocks; the run is copied from
 * the cache device to the origin device with dm_kcopyd_copy(), and
 * writecache_copy_endio() is called on completion.  The remaining entries
 * of the run are removed from the list here and must be adjacent in the
 * entry array (BUG_ON otherwise).
 *
 * A copy that would extend past wc->data_device_sectors is shortened; one
 * that starts at or beyond it is completed immediately without I/O.
 */
1756static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1757{
1758 struct wc_entry *e, *f;
1759 struct dm_io_region from, to;
1760 struct copy_struct *c;
1761
1762 while (wbl->size) {
1763 unsigned n_sectors;
1764
1765 wbl->size--;
1766 e = container_of(wbl->list.prev, struct wc_entry, lru);
1767 list_del(&e->lru);
1768
1769 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1770
1771 from.bdev = wc->ssd_dev->bdev;
1772 from.sector = cache_sector(wc, e);
1773 from.count = n_sectors;
1774 to.bdev = wc->dev->bdev;
1775 to.sector = read_original_sector(wc, e);
1776 to.count = n_sectors;
1777
1778 c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1779 c->wc = wc;
1780 c->e = e;
1781 c->n_entries = e->wc_list_contiguous;
1782
/* drop the rest of the run from the list; one block per iteration */
1783 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1784 wbl->size--;
1785 f = container_of(wbl->list.prev, struct wc_entry, lru);
1786 BUG_ON(f != e + 1);
1787 list_del(&f->lru);
1788 e = f;
1789 }
1790
4134455f
MP
1791 if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
1792 if (to.sector >= wc->data_device_sectors) {
/* entirely past the device end: report success without copying */
1793 writecache_copy_endio(0, 0, c);
1794 continue;
1795 }
1796 from.count = to.count = wc->data_device_sectors - to.sector;
1797 }
1798
48debafe
MP
1799 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1800
1801 __writeback_throttle(wc, wbl);
1802 }
1803}
1804
/*
 * Work function for wc->writeback_work: pick cache entries to write back
 * and hand them to the pmem or ssd submission helper.
 *
 * Under the writecache lock, entries are selected while the LRU is
 * non-empty and any of these holds: wc->writeback_all is set, the free plus
 * in-writeback count is at or below the low watermark, or the oldest LRU
 * entry has reached the age limit.  In writeback_all mode entries are
 * visited in tree order starting from the first node; otherwise the LRU
 * tail is used.  Each selected entry is marked write_in_progress and moved
 * to a local list, together with following entries whose original sectors
 * are consecutive (up to BIO_MAX_VECS per run).
 *
 * Outside writeback_all mode the scan stops after WRITEBACK_LATENCY steps
 * and the work is requeued (unless the target is suspended).
 *
 * The collected list is submitted with the lock dropped, inside a block
 * plug.  In writeback_all mode the function then waits until all writeback
 * has finished.
 */
1805static void writecache_writeback(struct work_struct *work)
1806{
1807 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1808 struct blk_plug plug;
3f649ab7 1809 struct wc_entry *f, *g, *e = NULL;
48debafe
MP
1810 struct rb_node *node, *next_node;
1811 struct list_head skipped;
1812 struct writeback_list wbl;
1813 unsigned long n_walked;
1814
293128b1
MP
1815 if (!WC_MODE_PMEM(wc)) {
1816 /* Wait for any active kcopyd work on behalf of ssd writeback */
1817 dm_kcopyd_client_flush(wc->dm_kcopyd);
1818 }
1819
48debafe
MP
1820 wc_lock(wc);
1821restart:
1822 if (writecache_has_error(wc)) {
1823 wc_unlock(wc);
1824 return;
1825 }
1826
1827 if (unlikely(wc->writeback_all)) {
1828 if (writecache_wait_for_writeback(wc))
1829 goto restart;
1830 }
1831
1832 if (wc->overwrote_committed) {
1833 writecache_wait_for_ios(wc, WRITE);
1834 }
1835
1836 n_walked = 0;
1837 INIT_LIST_HEAD(&skipped);
1838 INIT_LIST_HEAD(&wbl.list);
1839 wbl.size = 0;
1840 while (!list_empty(&wc->lru) &&
1841 (wc->writeback_all ||
3923d485
MP
1842 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
1843 (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
1844 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
48debafe
MP
1845
1846 n_walked++;
1847 if (unlikely(n_walked > WRITEBACK_LATENCY) &&
af4f6cab
MP
1848 likely(!wc->writeback_all)) {
1849 if (likely(!dm_suspended(wc->ti)))
1850 queue_work(wc->writeback_wq, &wc->writeback_work);
48debafe
MP
1851 break;
1852 }
1853
/*
 * writeback_all: start at the first tree node, afterwards continue from g
 * as left behind by the inner loop of the previous iteration.
 * NOTE(review): g is only assigned inside that inner loop -- confirm it is
 * always set before it is read here.
 */
5229b489
HY
1854 if (unlikely(wc->writeback_all)) {
1855 if (unlikely(!e)) {
1856 writecache_flush(wc);
1857 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
1858 } else
1859 e = g;
1860 } else
1861 e = container_of(wc->lru.prev, struct wc_entry, lru);
48debafe
MP
1862 BUG_ON(e->write_in_progress);
1863 if (unlikely(!writecache_entry_is_committed(wc, e))) {
1864 writecache_flush(wc);
1865 }
/*
 * If the previous tree node maps the same original sector, that node is
 * expected to be under writeback already (BUG_ON otherwise); park this
 * entry on the skipped list for now.
 */
1866 node = rb_prev(&e->rb_node);
1867 if (node) {
1868 f = container_of(node, struct wc_entry, rb_node);
1869 if (unlikely(read_original_sector(wc, f) ==
1870 read_original_sector(wc, e))) {
1871 BUG_ON(!f->write_in_progress);
8c77f1cb 1872 list_move(&e->lru, &skipped);
48debafe
MP
1873 cond_resched();
1874 continue;
1875 }
1876 }
1877 wc->writeback_size++;
8c77f1cb 1878 list_move(&e->lru, &wbl.list);
48debafe
MP
1879 wbl.size++;
1880 e->write_in_progress = true;
1881 e->wc_list_contiguous = 1;
1882
1883 f = e;
1884
/* extend the run starting at e with following tree entries */
1885 while (1) {
1886 next_node = rb_next(&f->rb_node);
1887 if (unlikely(!next_node))
1888 break;
1889 g = container_of(next_node, struct wc_entry, rb_node);
62421b38
HY
/* same original sector as f: step over it without adding it */
1890 if (unlikely(read_original_sector(wc, g) ==
1891 read_original_sector(wc, f))) {
48debafe
MP
1892 f = g;
1893 continue;
1894 }
1895 if (read_original_sector(wc, g) !=
1896 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1897 break;
1898 if (unlikely(g->write_in_progress))
1899 break;
1900 if (unlikely(!writecache_entry_is_committed(wc, g)))
1901 break;
1902
/* ssd mode additionally needs the entries adjacent in the entry array */
1903 if (!WC_MODE_PMEM(wc)) {
1904 if (g != f + 1)
1905 break;
1906 }
1907
1908 n_walked++;
1909 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1910 // break;
1911
1912 wc->writeback_size++;
8c77f1cb 1913 list_move(&g->lru, &wbl.list);
48debafe
MP
1914 wbl.size++;
1915 g->write_in_progress = true;
a8affc03 1916 g->wc_list_contiguous = BIO_MAX_VECS;
48debafe
MP
1917 f = g;
1918 e->wc_list_contiguous++;
a8affc03 1919 if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
5229b489
HY
/* run is full: in writeback_all mode advance g to the next node, if any */
1920 if (unlikely(wc->writeback_all)) {
1921 next_node = rb_next(&f->rb_node);
1922 if (likely(next_node))
1923 g = container_of(next_node, struct wc_entry, rb_node);
1924 }
48debafe 1925 break;
5229b489 1926 }
48debafe
MP
1927 }
1928 cond_resched();
1929 }
1930
1931 if (!list_empty(&skipped)) {
1932 list_splice_tail(&skipped, &wc->lru);
1933 /*
1934 * If we didn't do any progress, we must wait until some
1935 * writeback finishes to avoid burning CPU in a loop
1936 */
1937 if (unlikely(!wbl.size))
1938 writecache_wait_for_writeback(wc);
1939 }
1940
1941 wc_unlock(wc);
1942
1943 blk_start_plug(&plug);
1944
1945 if (WC_MODE_PMEM(wc))
1946 __writecache_writeback_pmem(wc, &wbl);
1947 else
1948 __writecache_writeback_ssd(wc, &wbl);
1949
1950 blk_finish_plug(&plug);
1951
1952 if (unlikely(wc->writeback_all)) {
1953 wc_lock(wc);
1954 while (writecache_wait_for_writeback(wc));
1955 wc_unlock(wc);
1956 }
1957}
1958
1959static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1960 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1961{
1962 uint64_t n_blocks, offset;
1963 struct wc_entry e;
1964
1965 n_blocks = device_size;
1966 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1967
1968 while (1) {
1969 if (!n_blocks)
1970 return -ENOSPC;
1971 /* Verify the following entries[n_blocks] won't overflow */
1972 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
1973 sizeof(struct wc_memory_entry)))
1974 return -EFBIG;
1975 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1976 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1977 if (offset + n_blocks * block_size <= device_size)
1978 break;
1979 n_blocks--;
1980 }
1981
1982 /* check if the bit field overflows */
1983 e.index = n_blocks;
1984 if (e.index != n_blocks)
1985 return -EFBIG;
1986
1987 if (n_blocks_p)
1988 *n_blocks_p = n_blocks;
1989 if (n_metadata_blocks_p)
1990 *n_metadata_blocks_p = offset >> __ffs(block_size);
1991 return 0;
1992}
1993
1994static int init_memory(struct dm_writecache *wc)
1995{
1996 size_t b;
1997 int r;
1998
1999 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
2000 if (r)
2001 return r;
2002
2003 r = writecache_alloc_entries(wc);
2004 if (r)
2005 return r;
2006
2007 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
2008 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
2009 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
2010 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
2011 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
2012 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
2013
1edaa447 2014 for (b = 0; b < wc->n_blocks; b++) {
48debafe 2015 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1edaa447
MP
2016 cond_resched();
2017 }
48debafe
MP
2018
2019 writecache_flush_all_metadata(wc);
aa950920 2020 writecache_commit_flushed(wc, false);
48debafe
MP
2021 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
2022 writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
aa950920 2023 writecache_commit_flushed(wc, false);
48debafe
MP
2024
2025 return 0;
2026}
2027
/*
 * Target destructor: release everything hanging off ti->private.
 *
 * Does nothing if ti->private is NULL.  Most resources are checked before
 * being released.
 * NOTE(review): presumably this lets the constructor's error path call it
 * on a partially set-up target -- confirm against writecache_ctr().
 */
2028static void writecache_dtr(struct dm_target *ti)
2029{
2030 struct dm_writecache *wc = ti->private;
2031
2032 if (!wc)
2033 return;
2034
/* stop the helper threads first */
2035 if (wc->endio_thread)
2036 kthread_stop(wc->endio_thread);
2037
2038 if (wc->flush_thread)
2039 kthread_stop(wc->flush_thread);
2040
2041 bioset_exit(&wc->bio_set);
2042
2043 mempool_exit(&wc->copy_pool);
2044
2045 if (wc->writeback_wq)
2046 destroy_workqueue(wc->writeback_wq);
2047
/* origin device */
2048 if (wc->dev)
2049 dm_put_device(ti, wc->dev);
2050
/* cache device */
2051 if (wc->ssd_dev)
2052 dm_put_device(ti, wc->ssd_dev);
2053
21ec672e 2054 vfree(wc->entries);
48debafe
MP
2055
/* the memory map is released differently in pmem mode and in ssd mode */
2056 if (wc->memory_map) {
2057 if (WC_MODE_PMEM(wc))
2058 persistent_memory_release(wc);
2059 else
2060 vfree(wc->memory_map);
2061 }
2062
2063 if (wc->dm_kcopyd)
2064 dm_kcopyd_client_destroy(wc->dm_kcopyd);
2065
2066 if (wc->dm_io)
2067 dm_io_client_destroy(wc->dm_io);
2068
21ec672e 2069 vfree(wc->dirty_bitmap);
48debafe
MP
2070
2071 kfree(wc);
2072}
2073
2074static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2075{
2076 struct dm_writecache *wc;
2077 struct dm_arg_set as;
2078 const char *string;
2079 unsigned opt_params;
2080 size_t offset, data_size;
2081 int i, r;
2082 char dummy;
2083 int high_wm_percent = HIGH_WATERMARK;
2084 int low_wm_percent = LOW_WATERMARK;
2085 uint64_t x;
2086 struct wc_memory_superblock s;
2087
2088 static struct dm_arg _args[] = {
67aa3ec3 2089 {0, 16, "Invalid number of feature args"},
48debafe
MP
2090 };
2091
2092 as.argc = argc;
2093 as.argv = argv;
2094
2095 wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
2096 if (!wc) {
2097 ti->error = "Cannot allocate writecache structure";
2098 r = -ENOMEM;
2099 goto bad;
2100 }
2101 ti->private = wc;
2102 wc->ti = ti;
2103
2104 mutex_init(&wc->lock);
3923d485 2105 wc->max_age = MAX_AGE_UNSPECIFIED;
48debafe
MP
2106 writecache_poison_lists(wc);
2107 init_waitqueue_head(&wc->freelist_wait);
2108 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
3923d485 2109 timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
48debafe
MP
2110
2111 for (i = 0; i < 2; i++) {
2112 atomic_set(&wc->bio_in_progress[i], 0);
2113 init_waitqueue_head(&wc->bio_in_progress_wait[i]);
2114 }
2115
2116 wc->dm_io = dm_io_client_create();
2117 if (IS_ERR(wc->dm_io)) {
2118 r = PTR_ERR(wc->dm_io);
2119 ti->error = "Unable to allocate dm-io client";
2120 wc->dm_io = NULL;
2121 goto bad;
2122 }
2123
f87e033b 2124 wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
48debafe
MP
2125 if (!wc->writeback_wq) {
2126 r = -ENOMEM;
2127 ti->error = "Could not allocate writeback workqueue";
2128 goto bad;
2129 }
2130 INIT_WORK(&wc->writeback_work, writecache_writeback);
2131 INIT_WORK(&wc->flush_work, writecache_flush_work);
2132
2133 raw_spin_lock_init(&wc->endio_list_lock);
2134 INIT_LIST_HEAD(&wc->endio_list);
2135 wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
2136 if (IS_ERR(wc->endio_thread)) {
2137 r = PTR_ERR(wc->endio_thread);
2138 wc->endio_thread = NULL;
2139 ti->error = "Couldn't spawn endio thread";
2140 goto bad;
2141 }
2142 wake_up_process(wc->endio_thread);
2143
2144 /*
2145 * Parse the mode (pmem or ssd)
2146 */
2147 string = dm_shift_arg(&as);
2148 if (!string)
2149 goto bad_arguments;
2150
2151 if (!strcasecmp(string, "s")) {
2152 wc->pmem_mode = false;
2153 } else if (!strcasecmp(string, "p")) {
2154#ifdef DM_WRITECACHE_HAS_PMEM
2155 wc->pmem_mode = true;
2156 wc->writeback_fua = true;
2157#else
2158 /*
2159 * If the architecture doesn't support persistent memory or
2160 * the kernel doesn't support any DAX drivers, this driver can
2161 * only be used in SSD-only mode.
2162 */
2163 r = -EOPNOTSUPP;
2164 ti->error = "Persistent memory or DAX not supported on this system";
2165 goto bad;
2166#endif
2167 } else {
2168 goto bad_arguments;
2169 }
2170
2171 if (WC_MODE_PMEM(wc)) {
2172 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
2173 offsetof(struct writeback_struct, bio),
2174 BIOSET_NEED_BVECS);
2175 if (r) {
2176 ti->error = "Could not allocate bio set";
2177 goto bad;
2178 }
2179 } else {
2180 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
2181 if (r) {
2182 ti->error = "Could not allocate mempool";
2183 goto bad;
2184 }
2185 }
2186
2187 /*
2188 * Parse the origin data device
2189 */
2190 string = dm_shift_arg(&as);
2191 if (!string)
2192 goto bad_arguments;
2193 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
2194 if (r) {
2195 ti->error = "Origin data device lookup failed";
2196 goto bad;
2197 }
2198
2199 /*
2200 * Parse cache data device (be it pmem or ssd)
2201 */
2202 string = dm_shift_arg(&as);
2203 if (!string)
2204 goto bad_arguments;
2205
2206 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
2207 if (r) {
2208 ti->error = "Cache data device lookup failed";
2209 goto bad;
2210 }
2211 wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
2212
48debafe
MP
2213 /*
2214 * Parse the cache block size
2215 */
2216 string = dm_shift_arg(&as);
2217 if (!string)
2218 goto bad_arguments;
2219 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2220 wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2221 (wc->block_size & (wc->block_size - 1))) {
2222 r = -EINVAL;
2223 ti->error = "Invalid block size";
2224 goto bad;
2225 }
31b22120
MP
2226 if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2227 wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2228 r = -EINVAL;
2229 ti->error = "Block size is smaller than device logical block size";
2230 goto bad;
2231 }
48debafe
MP
2232 wc->block_size_bits = __ffs(wc->block_size);
2233
2234 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2235 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2236 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2237
2238 /*
2239 * Parse optional arguments
2240 */
2241 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2242 if (r)
2243 goto bad;
2244
2245 while (opt_params) {
2246 string = dm_shift_arg(&as), opt_params--;
d284f824
MP
2247 if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2248 unsigned long long start_sector;
2249 string = dm_shift_arg(&as), opt_params--;
2250 if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2251 goto invalid_optional;
2252 wc->start_sector = start_sector;
054bee16 2253 wc->start_sector_set = true;
d284f824
MP
2254 if (wc->start_sector != start_sector ||
2255 wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2256 goto invalid_optional;
2257 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
48debafe
MP
2258 string = dm_shift_arg(&as), opt_params--;
2259 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2260 goto invalid_optional;
2261 if (high_wm_percent < 0 || high_wm_percent > 100)
2262 goto invalid_optional;
054bee16 2263 wc->high_wm_percent_value = high_wm_percent;
48debafe
MP
2264 wc->high_wm_percent_set = true;
2265 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2266 string = dm_shift_arg(&as), opt_params--;
2267 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2268 goto invalid_optional;
2269 if (low_wm_percent < 0 || low_wm_percent > 100)
2270 goto invalid_optional;
054bee16 2271 wc->low_wm_percent_value = low_wm_percent;
48debafe
MP
2272 wc->low_wm_percent_set = true;
2273 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2274 string = dm_shift_arg(&as), opt_params--;
2275 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2276 goto invalid_optional;
2277 wc->max_writeback_jobs_set = true;
2278 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2279 string = dm_shift_arg(&as), opt_params--;
2280 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2281 goto invalid_optional;
2282 wc->autocommit_blocks_set = true;
2283 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2284 unsigned autocommit_msecs;
2285 string = dm_shift_arg(&as), opt_params--;
2286 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2287 goto invalid_optional;
2288 if (autocommit_msecs > 3600000)
2289 goto invalid_optional;
2290 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
054bee16 2291 wc->autocommit_time_value = autocommit_msecs;
48debafe 2292 wc->autocommit_time_set = true;
3923d485
MP
2293 } else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
2294 unsigned max_age_msecs;
2295 string = dm_shift_arg(&as), opt_params--;
2296 if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
2297 goto invalid_optional;
2298 if (max_age_msecs > 86400000)
2299 goto invalid_optional;
2300 wc->max_age = msecs_to_jiffies(max_age_msecs);
054bee16
MP
2301 wc->max_age_set = true;
2302 wc->max_age_value = max_age_msecs;
93de44eb 2303 } else if (!strcasecmp(string, "cleaner")) {
054bee16 2304 wc->cleaner_set = true;
93de44eb 2305 wc->cleaner = true;
48debafe
MP
2306 } else if (!strcasecmp(string, "fua")) {
2307 if (WC_MODE_PMEM(wc)) {
2308 wc->writeback_fua = true;
2309 wc->writeback_fua_set = true;
2310 } else goto invalid_optional;
2311 } else if (!strcasecmp(string, "nofua")) {
2312 if (WC_MODE_PMEM(wc)) {
2313 wc->writeback_fua = false;
2314 wc->writeback_fua_set = true;
2315 } else goto invalid_optional;
2316 } else {
2317invalid_optional:
2318 r = -EINVAL;
2319 ti->error = "Invalid optional argument";
2320 goto bad;
2321 }
2322 }
2323
2324 if (high_wm_percent < low_wm_percent) {
2325 r = -EINVAL;
2326 ti->error = "High watermark must be greater than or equal to low watermark";
2327 goto bad;
2328 }
2329
d284f824 2330 if (WC_MODE_PMEM(wc)) {
a4662458
MS
2331 if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
2332 r = -EOPNOTSUPP;
2333 ti->error = "Asynchronous persistent memory not supported as pmem cache";
2334 goto bad;
2335 }
2336
d284f824
MP
2337 r = persistent_memory_claim(wc);
2338 if (r) {
2339 ti->error = "Unable to map persistent memory for cache";
2340 goto bad;
2341 }
2342 } else {
48debafe
MP
2343 size_t n_blocks, n_metadata_blocks;
2344 uint64_t n_bitmap_bits;
2345
d284f824
MP
2346 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2347
48debafe
MP
2348 bio_list_init(&wc->flush_list);
2349 wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2350 if (IS_ERR(wc->flush_thread)) {
2351 r = PTR_ERR(wc->flush_thread);
2352 wc->flush_thread = NULL;
e8ea141a 2353 ti->error = "Couldn't spawn flush thread";
48debafe
MP
2354 goto bad;
2355 }
2356 wake_up_process(wc->flush_thread);
2357
2358 r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2359 &n_blocks, &n_metadata_blocks);
2360 if (r) {
2361 ti->error = "Invalid device size";
2362 goto bad;
2363 }
2364
2365 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2366 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2367 /* this is limitation of test_bit functions */
2368 if (n_bitmap_bits > 1U << 31) {
2369 r = -EFBIG;
2370 ti->error = "Invalid device size";
2371 goto bad;
2372 }
2373
2374 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2375 if (!wc->memory_map) {
2376 r = -ENOMEM;
2377 ti->error = "Unable to allocate memory for metadata";
2378 goto bad;
2379 }
2380
2381 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2382 if (IS_ERR(wc->dm_kcopyd)) {
2383 r = PTR_ERR(wc->dm_kcopyd);
2384 ti->error = "Unable to allocate dm-kcopyd client";
2385 wc->dm_kcopyd = NULL;
2386 goto bad;
2387 }
2388
2389 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2390 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2391 BITS_PER_LONG * sizeof(unsigned long);
2392 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2393 if (!wc->dirty_bitmap) {
2394 r = -ENOMEM;
2395 ti->error = "Unable to allocate dirty bitmap";
2396 goto bad;
2397 }
2398
31b22120 2399 r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
48debafe 2400 if (r) {
31b22120 2401 ti->error = "Unable to read first block of metadata";
48debafe
MP
2402 goto bad;
2403 }
2404 }
2405
ec6347bb 2406 r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
48debafe
MP
2407 if (r) {
2408 ti->error = "Hardware memory error when reading superblock";
2409 goto bad;
2410 }
2411 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2412 r = init_memory(wc);
2413 if (r) {
2414 ti->error = "Unable to initialize device";
2415 goto bad;
2416 }
ec6347bb
DW
2417 r = copy_mc_to_kernel(&s, sb(wc),
2418 sizeof(struct wc_memory_superblock));
48debafe
MP
2419 if (r) {
2420 ti->error = "Hardware memory error when reading superblock";
2421 goto bad;
2422 }
2423 }
2424
2425 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2426 ti->error = "Invalid magic in the superblock";
2427 r = -EINVAL;
2428 goto bad;
2429 }
2430
2431 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2432 ti->error = "Invalid version in the superblock";
2433 r = -EINVAL;
2434 goto bad;
2435 }
2436
2437 if (le32_to_cpu(s.block_size) != wc->block_size) {
2438 ti->error = "Block size does not match superblock";
2439 r = -EINVAL;
2440 goto bad;
2441 }
2442
2443 wc->n_blocks = le64_to_cpu(s.n_blocks);
2444
2445 offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2446 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2447overflow:
2448 ti->error = "Overflow in size calculation";
2449 r = -EINVAL;
2450 goto bad;
2451 }
2452 offset += sizeof(struct wc_memory_superblock);
2453 if (offset < sizeof(struct wc_memory_superblock))
2454 goto overflow;
2455 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2456 data_size = wc->n_blocks * (size_t)wc->block_size;
2457 if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2458 (offset + data_size < offset))
2459 goto overflow;
2460 if (offset + data_size > wc->memory_map_size) {
2461 ti->error = "Memory area is too small";
2462 r = -EINVAL;
2463 goto bad;
2464 }
2465
2466 wc->metadata_sectors = offset >> SECTOR_SHIFT;
2467 wc->block_start = (char *)sb(wc) + offset;
2468
2469 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2470 x += 50;
2471 do_div(x, 100);
2472 wc->freelist_high_watermark = x;
2473 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2474 x += 50;
2475 do_div(x, 100);
2476 wc->freelist_low_watermark = x;
2477
93de44eb
MP
2478 if (wc->cleaner)
2479 activate_cleaner(wc);
2480
48debafe
MP
2481 r = writecache_alloc_entries(wc);
2482 if (r) {
2483 ti->error = "Cannot allocate memory";
2484 goto bad;
2485 }
2486
2487 ti->num_flush_bios = 1;
2488 ti->flush_supported = true;
2489 ti->num_discard_bios = 1;
2490
2491 if (WC_MODE_PMEM(wc))
2492 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2493
2494 return 0;
2495
2496bad_arguments:
2497 r = -EINVAL;
2498 ti->error = "Bad arguments";
2499bad:
2500 writecache_dtr(ti);
2501 return r;
2502}
2503
2504static void writecache_status(struct dm_target *ti, status_type_t type,
2505 unsigned status_flags, char *result, unsigned maxlen)
2506{
2507 struct dm_writecache *wc = ti->private;
2508 unsigned extra_args;
2509 unsigned sz = 0;
48debafe
MP
2510
2511 switch (type) {
2512 case STATUSTYPE_INFO:
2513 DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2514 (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2515 (unsigned long long)wc->writeback_size);
2516 break;
2517 case STATUSTYPE_TABLE:
2518 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2519 wc->dev->name, wc->ssd_dev->name, wc->block_size);
2520 extra_args = 0;
054bee16 2521 if (wc->start_sector_set)
9ff07e7d 2522 extra_args += 2;
054bee16 2523 if (wc->high_wm_percent_set)
48debafe 2524 extra_args += 2;
054bee16 2525 if (wc->low_wm_percent_set)
48debafe
MP
2526 extra_args += 2;
2527 if (wc->max_writeback_jobs_set)
2528 extra_args += 2;
2529 if (wc->autocommit_blocks_set)
2530 extra_args += 2;
2531 if (wc->autocommit_time_set)
2532 extra_args += 2;
054bee16 2533 if (wc->max_age_set)
e5d41cbc 2534 extra_args += 2;
054bee16 2535 if (wc->cleaner_set)
93de44eb 2536 extra_args++;
48debafe
MP
2537 if (wc->writeback_fua_set)
2538 extra_args++;
2539
2540 DMEMIT("%u", extra_args);
054bee16 2541 if (wc->start_sector_set)
9ff07e7d 2542 DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
054bee16
MP
2543 if (wc->high_wm_percent_set)
2544 DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
2545 if (wc->low_wm_percent_set)
2546 DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
48debafe
MP
2547 if (wc->max_writeback_jobs_set)
2548 DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2549 if (wc->autocommit_blocks_set)
2550 DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2551 if (wc->autocommit_time_set)
054bee16
MP
2552 DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
2553 if (wc->max_age_set)
2554 DMEMIT(" max_age %u", wc->max_age_value);
2555 if (wc->cleaner_set)
93de44eb 2556 DMEMIT(" cleaner");
48debafe
MP
2557 if (wc->writeback_fua_set)
2558 DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2559 break;
2560 }
2561}
2562
2563static struct target_type writecache_target = {
2564 .name = "writecache",
054bee16 2565 .version = {1, 4, 0},
48debafe
MP
2566 .module = THIS_MODULE,
2567 .ctr = writecache_ctr,
2568 .dtr = writecache_dtr,
2569 .status = writecache_status,
2570 .postsuspend = writecache_suspend,
2571 .resume = writecache_resume,
2572 .message = writecache_message,
2573 .map = writecache_map,
2574 .end_io = writecache_end_io,
2575 .iterate_devices = writecache_iterate_devices,
2576 .io_hints = writecache_io_hints,
2577};
2578
2579static int __init dm_writecache_init(void)
2580{
2581 int r;
2582
2583 r = dm_register_target(&writecache_target);
2584 if (r < 0) {
2585 DMERR("register failed %d", r);
2586 return r;
2587 }
2588
2589 return 0;
2590}
2591
2592static void __exit dm_writecache_exit(void)
2593{
2594 dm_unregister_target(&writecache_target);
2595}
2596
2597module_init(dm_writecache_init);
2598module_exit(dm_writecache_exit);
2599
2600MODULE_DESCRIPTION(DM_NAME " writecache target");
2601MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2602MODULE_LICENSE("GPL");