dm cache: pass a new 'critical' flag to the policies when requesting writeback work
[linux-2.6-block.git] / drivers / md / dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/jiffies.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20
21 #define DM_MSG_PREFIX "cache"
22
23 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
24         "A percentage of time allocated for copying to and/or from cache");
25
26 /*----------------------------------------------------------------*/
27
28 #define IOT_RESOLUTION 4
29
30 struct io_tracker {
31         spinlock_t lock;
32
33         /*
34          * Sectors of in-flight IO.
35          */
36         sector_t in_flight;
37
38         /*
39          * The time, in jiffies, when this device became idle (if it is
40          * indeed idle).
41          */
42         unsigned long idle_time;
43         unsigned long last_update_time;
44 };
45
46 static void iot_init(struct io_tracker *iot)
47 {
48         spin_lock_init(&iot->lock);
49         iot->in_flight = 0ul;
50         iot->idle_time = 0ul;
51         iot->last_update_time = jiffies;
52 }
53
54 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
55 {
56         if (iot->in_flight)
57                 return false;
58
59         return time_after(jiffies, iot->idle_time + jifs);
60 }
61
62 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
63 {
64         bool r;
65         unsigned long flags;
66
67         spin_lock_irqsave(&iot->lock, flags);
68         r = __iot_idle_for(iot, jifs);
69         spin_unlock_irqrestore(&iot->lock, flags);
70
71         return r;
72 }
73
74 static void iot_io_begin(struct io_tracker *iot, sector_t len)
75 {
76         unsigned long flags;
77
78         spin_lock_irqsave(&iot->lock, flags);
79         iot->in_flight += len;
80         spin_unlock_irqrestore(&iot->lock, flags);
81 }
82
83 static void __iot_io_end(struct io_tracker *iot, sector_t len)
84 {
85         iot->in_flight -= len;
86         if (!iot->in_flight)
87                 iot->idle_time = jiffies;
88 }
89
90 static void iot_io_end(struct io_tracker *iot, sector_t len)
91 {
92         unsigned long flags;
93
94         spin_lock_irqsave(&iot->lock, flags);
95         __iot_io_end(iot, len);
96         spin_unlock_irqrestore(&iot->lock, flags);
97 }
98
99 /*----------------------------------------------------------------*/
100
101 /*
102  * Glossary:
103  *
104  * oblock: index of an origin block
105  * cblock: index of a cache block
106  * promotion: movement of a block from origin to cache
107  * demotion: movement of a block from cache to origin
108  * migration: movement of a block between the origin and cache device,
109  *            either direction
110  */
111
112 /*----------------------------------------------------------------*/
113
114 static size_t bitset_size_in_bytes(unsigned nr_entries)
115 {
116         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
117 }
118
119 static unsigned long *alloc_bitset(unsigned nr_entries)
120 {
121         size_t s = bitset_size_in_bytes(nr_entries);
122         return vzalloc(s);
123 }
124
125 static void clear_bitset(void *bitset, unsigned nr_entries)
126 {
127         size_t s = bitset_size_in_bytes(nr_entries);
128         memset(bitset, 0, s);
129 }
130
131 static void free_bitset(unsigned long *bits)
132 {
133         vfree(bits);
134 }
135
136 /*----------------------------------------------------------------*/
137
138 /*
139  * There are a couple of places where we let a bio run, but want to do some
140  * work before calling its endio function.  We do this by temporarily
141  * changing the endio fn.
142  */
143 struct dm_hook_info {
144         bio_end_io_t *bi_end_io;
145         void *bi_private;
146 };
147
148 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
149                         bio_end_io_t *bi_end_io, void *bi_private)
150 {
151         h->bi_end_io = bio->bi_end_io;
152         h->bi_private = bio->bi_private;
153
154         bio->bi_end_io = bi_end_io;
155         bio->bi_private = bi_private;
156 }
157
158 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
159 {
160         bio->bi_end_io = h->bi_end_io;
161         bio->bi_private = h->bi_private;
162 }
163
164 /*----------------------------------------------------------------*/
165
166 #define MIGRATION_POOL_SIZE 128
167 #define COMMIT_PERIOD HZ
168 #define MIGRATION_COUNT_WINDOW 10
169
170 /*
171  * The block size of the device holding cache data must be
172  * between 32KB and 1GB.
173  */
174 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
175 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
176
177 /*
178  * FIXME: the cache is read/write for the time being.
179  */
180 enum cache_metadata_mode {
181         CM_WRITE,               /* metadata may be changed */
182         CM_READ_ONLY,           /* metadata may not be changed */
183 };
184
185 enum cache_io_mode {
186         /*
187          * Data is written to cached blocks only.  These blocks are marked
188          * dirty.  If you lose the cache device you will lose data.
189          * Potential performance increase for both reads and writes.
190          */
191         CM_IO_WRITEBACK,
192
193         /*
194          * Data is written to both cache and origin.  Blocks are never
195          * dirty.  Potential performance benfit for reads only.
196          */
197         CM_IO_WRITETHROUGH,
198
199         /*
200          * A degraded mode useful for various cache coherency situations
201          * (eg, rolling back snapshots).  Reads and writes always go to the
202          * origin.  If a write goes to a cached oblock, then the cache
203          * block is invalidated.
204          */
205         CM_IO_PASSTHROUGH
206 };
207
208 struct cache_features {
209         enum cache_metadata_mode mode;
210         enum cache_io_mode io_mode;
211 };
212
213 struct cache_stats {
214         atomic_t read_hit;
215         atomic_t read_miss;
216         atomic_t write_hit;
217         atomic_t write_miss;
218         atomic_t demotion;
219         atomic_t promotion;
220         atomic_t copies_avoided;
221         atomic_t cache_cell_clash;
222         atomic_t commit_count;
223         atomic_t discard_count;
224 };
225
226 /*
227  * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
228  * the one-past-the-end value.
229  */
230 struct cblock_range {
231         dm_cblock_t begin;
232         dm_cblock_t end;
233 };
234
235 struct invalidation_request {
236         struct list_head list;
237         struct cblock_range *cblocks;
238
239         atomic_t complete;
240         int err;
241
242         wait_queue_head_t result_wait;
243 };
244
245 struct cache {
246         struct dm_target *ti;
247         struct dm_target_callbacks callbacks;
248
249         struct dm_cache_metadata *cmd;
250
251         /*
252          * Metadata is written to this device.
253          */
254         struct dm_dev *metadata_dev;
255
256         /*
257          * The slower of the two data devices.  Typically a spindle.
258          */
259         struct dm_dev *origin_dev;
260
261         /*
262          * The faster of the two data devices.  Typically an SSD.
263          */
264         struct dm_dev *cache_dev;
265
266         /*
267          * Size of the origin device in _complete_ blocks and native sectors.
268          */
269         dm_oblock_t origin_blocks;
270         sector_t origin_sectors;
271
272         /*
273          * Size of the cache device in blocks.
274          */
275         dm_cblock_t cache_size;
276
277         /*
278          * Fields for converting from sectors to blocks.
279          */
280         uint32_t sectors_per_block;
281         int sectors_per_block_shift;
282
283         spinlock_t lock;
284         struct bio_list deferred_bios;
285         struct bio_list deferred_flush_bios;
286         struct bio_list deferred_writethrough_bios;
287         struct list_head quiesced_migrations;
288         struct list_head completed_migrations;
289         struct list_head need_commit_migrations;
290         sector_t migration_threshold;
291         wait_queue_head_t migration_wait;
292         atomic_t nr_allocated_migrations;
293
294         /*
295          * The number of in flight migrations that are performing
296          * background io. eg, promotion, writeback.
297          */
298         atomic_t nr_io_migrations;
299
300         wait_queue_head_t quiescing_wait;
301         atomic_t quiescing;
302         atomic_t quiescing_ack;
303
304         /*
305          * cache_size entries, dirty if set
306          */
307         atomic_t nr_dirty;
308         unsigned long *dirty_bitset;
309
310         /*
311          * origin_blocks entries, discarded if set.
312          */
313         dm_dblock_t discard_nr_blocks;
314         unsigned long *discard_bitset;
315         uint32_t discard_block_size; /* a power of 2 times sectors per block */
316
317         /*
318          * Rather than reconstructing the table line for the status we just
319          * save it and regurgitate.
320          */
321         unsigned nr_ctr_args;
322         const char **ctr_args;
323
324         struct dm_kcopyd_client *copier;
325         struct workqueue_struct *wq;
326         struct work_struct worker;
327
328         struct delayed_work waker;
329         unsigned long last_commit_jiffies;
330
331         struct dm_bio_prison *prison;
332         struct dm_deferred_set *all_io_ds;
333
334         mempool_t *migration_pool;
335
336         struct dm_cache_policy *policy;
337         unsigned policy_nr_args;
338
339         bool need_tick_bio:1;
340         bool sized:1;
341         bool invalidate:1;
342         bool commit_requested:1;
343         bool loaded_mappings:1;
344         bool loaded_discards:1;
345
346         /*
347          * Cache features such as write-through.
348          */
349         struct cache_features features;
350
351         struct cache_stats stats;
352
353         /*
354          * Invalidation fields.
355          */
356         spinlock_t invalidation_lock;
357         struct list_head invalidation_requests;
358
359         struct io_tracker origin_tracker;
360 };
361
362 struct per_bio_data {
363         bool tick:1;
364         unsigned req_nr:2;
365         struct dm_deferred_entry *all_io_entry;
366         struct dm_hook_info hook_info;
367         sector_t len;
368
369         /*
370          * writethrough fields.  These MUST remain at the end of this
371          * structure and the 'cache' member must be the first as it
372          * is used to determine the offset of the writethrough fields.
373          */
374         struct cache *cache;
375         dm_cblock_t cblock;
376         struct dm_bio_details bio_details;
377 };
378
379 struct dm_cache_migration {
380         struct list_head list;
381         struct cache *cache;
382
383         unsigned long start_jiffies;
384         dm_oblock_t old_oblock;
385         dm_oblock_t new_oblock;
386         dm_cblock_t cblock;
387
388         bool err:1;
389         bool discard:1;
390         bool writeback:1;
391         bool demote:1;
392         bool promote:1;
393         bool requeue_holder:1;
394         bool invalidate:1;
395
396         struct dm_bio_prison_cell *old_ocell;
397         struct dm_bio_prison_cell *new_ocell;
398 };
399
400 /*
401  * Processing a bio in the worker thread may require these memory
402  * allocations.  We prealloc to avoid deadlocks (the same worker thread
403  * frees them back to the mempool).
404  */
405 struct prealloc {
406         struct dm_cache_migration *mg;
407         struct dm_bio_prison_cell *cell1;
408         struct dm_bio_prison_cell *cell2;
409 };
410
411 static void wake_worker(struct cache *cache)
412 {
413         queue_work(cache->wq, &cache->worker);
414 }
415
416 /*----------------------------------------------------------------*/
417
418 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
419 {
420         /* FIXME: change to use a local slab. */
421         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
422 }
423
424 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
425 {
426         dm_bio_prison_free_cell(cache->prison, cell);
427 }
428
429 static struct dm_cache_migration *alloc_migration(struct cache *cache)
430 {
431         struct dm_cache_migration *mg;
432
433         mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
434         if (mg) {
435                 mg->cache = cache;
436                 atomic_inc(&mg->cache->nr_allocated_migrations);
437         }
438
439         return mg;
440 }
441
442 static void free_migration(struct dm_cache_migration *mg)
443 {
444         if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
445                 wake_up(&mg->cache->migration_wait);
446
447         mempool_free(mg, mg->cache->migration_pool);
448 }
449
450 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
451 {
452         if (!p->mg) {
453                 p->mg = alloc_migration(cache);
454                 if (!p->mg)
455                         return -ENOMEM;
456         }
457
458         if (!p->cell1) {
459                 p->cell1 = alloc_prison_cell(cache);
460                 if (!p->cell1)
461                         return -ENOMEM;
462         }
463
464         if (!p->cell2) {
465                 p->cell2 = alloc_prison_cell(cache);
466                 if (!p->cell2)
467                         return -ENOMEM;
468         }
469
470         return 0;
471 }
472
473 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
474 {
475         if (p->cell2)
476                 free_prison_cell(cache, p->cell2);
477
478         if (p->cell1)
479                 free_prison_cell(cache, p->cell1);
480
481         if (p->mg)
482                 free_migration(p->mg);
483 }
484
485 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
486 {
487         struct dm_cache_migration *mg = p->mg;
488
489         BUG_ON(!mg);
490         p->mg = NULL;
491
492         return mg;
493 }
494
495 /*
496  * You must have a cell within the prealloc struct to return.  If not this
497  * function will BUG() rather than returning NULL.
498  */
499 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
500 {
501         struct dm_bio_prison_cell *r = NULL;
502
503         if (p->cell1) {
504                 r = p->cell1;
505                 p->cell1 = NULL;
506
507         } else if (p->cell2) {
508                 r = p->cell2;
509                 p->cell2 = NULL;
510         } else
511                 BUG();
512
513         return r;
514 }
515
516 /*
517  * You can't have more than two cells in a prealloc struct.  BUG() will be
518  * called if you try and overfill.
519  */
520 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
521 {
522         if (!p->cell2)
523                 p->cell2 = cell;
524
525         else if (!p->cell1)
526                 p->cell1 = cell;
527
528         else
529                 BUG();
530 }
531
532 /*----------------------------------------------------------------*/
533
534 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
535 {
536         key->virtual = 0;
537         key->dev = 0;
538         key->block_begin = from_oblock(begin);
539         key->block_end = from_oblock(end);
540 }
541
542 /*
543  * The caller hands in a preallocated cell, and a free function for it.
544  * The cell will be freed if there's an error, or if it wasn't used because
545  * a cell with that key already exists.
546  */
547 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
548
549 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
550                             struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
551                             cell_free_fn free_fn, void *free_context,
552                             struct dm_bio_prison_cell **cell_result)
553 {
554         int r;
555         struct dm_cell_key key;
556
557         build_key(oblock_begin, oblock_end, &key);
558         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
559         if (r)
560                 free_fn(free_context, cell_prealloc);
561
562         return r;
563 }
564
565 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
566                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
567                       cell_free_fn free_fn, void *free_context,
568                       struct dm_bio_prison_cell **cell_result)
569 {
570         dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
571         return bio_detain_range(cache, oblock, end, bio,
572                                 cell_prealloc, free_fn, free_context, cell_result);
573 }
574
575 static int get_cell(struct cache *cache,
576                     dm_oblock_t oblock,
577                     struct prealloc *structs,
578                     struct dm_bio_prison_cell **cell_result)
579 {
580         int r;
581         struct dm_cell_key key;
582         struct dm_bio_prison_cell *cell_prealloc;
583
584         cell_prealloc = prealloc_get_cell(structs);
585
586         build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
587         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
588         if (r)
589                 prealloc_put_cell(structs, cell_prealloc);
590
591         return r;
592 }
593
594 /*----------------------------------------------------------------*/
595
596 static bool is_dirty(struct cache *cache, dm_cblock_t b)
597 {
598         return test_bit(from_cblock(b), cache->dirty_bitset);
599 }
600
601 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
602 {
603         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
604                 atomic_inc(&cache->nr_dirty);
605                 policy_set_dirty(cache->policy, oblock);
606         }
607 }
608
609 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
610 {
611         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
612                 policy_clear_dirty(cache->policy, oblock);
613                 if (atomic_dec_return(&cache->nr_dirty) == 0)
614                         dm_table_event(cache->ti->table);
615         }
616 }
617
618 /*----------------------------------------------------------------*/
619
620 static bool block_size_is_power_of_two(struct cache *cache)
621 {
622         return cache->sectors_per_block_shift >= 0;
623 }
624
625 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
626 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
627 __always_inline
628 #endif
629 static dm_block_t block_div(dm_block_t b, uint32_t n)
630 {
631         do_div(b, n);
632
633         return b;
634 }
635
636 static dm_block_t oblocks_per_dblock(struct cache *cache)
637 {
638         dm_block_t oblocks = cache->discard_block_size;
639
640         if (block_size_is_power_of_two(cache))
641                 oblocks >>= cache->sectors_per_block_shift;
642         else
643                 oblocks = block_div(oblocks, cache->sectors_per_block);
644
645         return oblocks;
646 }
647
648 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
649 {
650         return to_dblock(block_div(from_oblock(oblock),
651                                    oblocks_per_dblock(cache)));
652 }
653
654 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
655 {
656         return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
657 }
658
659 static void set_discard(struct cache *cache, dm_dblock_t b)
660 {
661         unsigned long flags;
662
663         BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
664         atomic_inc(&cache->stats.discard_count);
665
666         spin_lock_irqsave(&cache->lock, flags);
667         set_bit(from_dblock(b), cache->discard_bitset);
668         spin_unlock_irqrestore(&cache->lock, flags);
669 }
670
671 static void clear_discard(struct cache *cache, dm_dblock_t b)
672 {
673         unsigned long flags;
674
675         spin_lock_irqsave(&cache->lock, flags);
676         clear_bit(from_dblock(b), cache->discard_bitset);
677         spin_unlock_irqrestore(&cache->lock, flags);
678 }
679
680 static bool is_discarded(struct cache *cache, dm_dblock_t b)
681 {
682         int r;
683         unsigned long flags;
684
685         spin_lock_irqsave(&cache->lock, flags);
686         r = test_bit(from_dblock(b), cache->discard_bitset);
687         spin_unlock_irqrestore(&cache->lock, flags);
688
689         return r;
690 }
691
692 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
693 {
694         int r;
695         unsigned long flags;
696
697         spin_lock_irqsave(&cache->lock, flags);
698         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
699                      cache->discard_bitset);
700         spin_unlock_irqrestore(&cache->lock, flags);
701
702         return r;
703 }
704
705 /*----------------------------------------------------------------*/
706
707 static void load_stats(struct cache *cache)
708 {
709         struct dm_cache_statistics stats;
710
711         dm_cache_metadata_get_stats(cache->cmd, &stats);
712         atomic_set(&cache->stats.read_hit, stats.read_hits);
713         atomic_set(&cache->stats.read_miss, stats.read_misses);
714         atomic_set(&cache->stats.write_hit, stats.write_hits);
715         atomic_set(&cache->stats.write_miss, stats.write_misses);
716 }
717
718 static void save_stats(struct cache *cache)
719 {
720         struct dm_cache_statistics stats;
721
722         stats.read_hits = atomic_read(&cache->stats.read_hit);
723         stats.read_misses = atomic_read(&cache->stats.read_miss);
724         stats.write_hits = atomic_read(&cache->stats.write_hit);
725         stats.write_misses = atomic_read(&cache->stats.write_miss);
726
727         dm_cache_metadata_set_stats(cache->cmd, &stats);
728 }
729
730 /*----------------------------------------------------------------
731  * Per bio data
732  *--------------------------------------------------------------*/
733
734 /*
735  * If using writeback, leave out struct per_bio_data's writethrough fields.
736  */
737 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
738 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
739
740 static bool writethrough_mode(struct cache_features *f)
741 {
742         return f->io_mode == CM_IO_WRITETHROUGH;
743 }
744
745 static bool writeback_mode(struct cache_features *f)
746 {
747         return f->io_mode == CM_IO_WRITEBACK;
748 }
749
750 static bool passthrough_mode(struct cache_features *f)
751 {
752         return f->io_mode == CM_IO_PASSTHROUGH;
753 }
754
755 static size_t get_per_bio_data_size(struct cache *cache)
756 {
757         return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
758 }
759
760 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
761 {
762         struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
763         BUG_ON(!pb);
764         return pb;
765 }
766
767 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
768 {
769         struct per_bio_data *pb = get_per_bio_data(bio, data_size);
770
771         pb->tick = false;
772         pb->req_nr = dm_bio_get_target_bio_nr(bio);
773         pb->all_io_entry = NULL;
774         pb->len = 0;
775
776         return pb;
777 }
778
779 /*----------------------------------------------------------------
780  * Remapping
781  *--------------------------------------------------------------*/
782 static void remap_to_origin(struct cache *cache, struct bio *bio)
783 {
784         bio->bi_bdev = cache->origin_dev->bdev;
785 }
786
787 static void remap_to_cache(struct cache *cache, struct bio *bio,
788                            dm_cblock_t cblock)
789 {
790         sector_t bi_sector = bio->bi_iter.bi_sector;
791         sector_t block = from_cblock(cblock);
792
793         bio->bi_bdev = cache->cache_dev->bdev;
794         if (!block_size_is_power_of_two(cache))
795                 bio->bi_iter.bi_sector =
796                         (block * cache->sectors_per_block) +
797                         sector_div(bi_sector, cache->sectors_per_block);
798         else
799                 bio->bi_iter.bi_sector =
800                         (block << cache->sectors_per_block_shift) |
801                         (bi_sector & (cache->sectors_per_block - 1));
802 }
803
804 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
805 {
806         unsigned long flags;
807         size_t pb_data_size = get_per_bio_data_size(cache);
808         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
809
810         spin_lock_irqsave(&cache->lock, flags);
811         if (cache->need_tick_bio &&
812             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
813                 pb->tick = true;
814                 cache->need_tick_bio = false;
815         }
816         spin_unlock_irqrestore(&cache->lock, flags);
817 }
818
819 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
820                                   dm_oblock_t oblock)
821 {
822         check_if_tick_bio_needed(cache, bio);
823         remap_to_origin(cache, bio);
824         if (bio_data_dir(bio) == WRITE)
825                 clear_discard(cache, oblock_to_dblock(cache, oblock));
826 }
827
828 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
829                                  dm_oblock_t oblock, dm_cblock_t cblock)
830 {
831         check_if_tick_bio_needed(cache, bio);
832         remap_to_cache(cache, bio, cblock);
833         if (bio_data_dir(bio) == WRITE) {
834                 set_dirty(cache, oblock, cblock);
835                 clear_discard(cache, oblock_to_dblock(cache, oblock));
836         }
837 }
838
839 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
840 {
841         sector_t block_nr = bio->bi_iter.bi_sector;
842
843         if (!block_size_is_power_of_two(cache))
844                 (void) sector_div(block_nr, cache->sectors_per_block);
845         else
846                 block_nr >>= cache->sectors_per_block_shift;
847
848         return to_oblock(block_nr);
849 }
850
851 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
852 {
853         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
854 }
855
856 /*
857  * You must increment the deferred set whilst the prison cell is held.  To
858  * encourage this, we ask for 'cell' to be passed in.
859  */
860 static void inc_ds(struct cache *cache, struct bio *bio,
861                    struct dm_bio_prison_cell *cell)
862 {
863         size_t pb_data_size = get_per_bio_data_size(cache);
864         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
865
866         BUG_ON(!cell);
867         BUG_ON(pb->all_io_entry);
868
869         pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
870 }
871
872 static bool accountable_bio(struct cache *cache, struct bio *bio)
873 {
874         return ((bio->bi_bdev == cache->origin_dev->bdev) &&
875                 !(bio->bi_rw & REQ_DISCARD));
876 }
877
878 static void accounted_begin(struct cache *cache, struct bio *bio)
879 {
880         size_t pb_data_size = get_per_bio_data_size(cache);
881         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
882
883         if (accountable_bio(cache, bio)) {
884                 pb->len = bio_sectors(bio);
885                 iot_io_begin(&cache->origin_tracker, pb->len);
886         }
887 }
888
889 static void accounted_complete(struct cache *cache, struct bio *bio)
890 {
891         size_t pb_data_size = get_per_bio_data_size(cache);
892         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
893
894         iot_io_end(&cache->origin_tracker, pb->len);
895 }
896
897 static void accounted_request(struct cache *cache, struct bio *bio)
898 {
899         accounted_begin(cache, bio);
900         generic_make_request(bio);
901 }
902
903 static void issue(struct cache *cache, struct bio *bio)
904 {
905         unsigned long flags;
906
907         if (!bio_triggers_commit(cache, bio)) {
908                 accounted_request(cache, bio);
909                 return;
910         }
911
912         /*
913          * Batch together any bios that trigger commits and then issue a
914          * single commit for them in do_worker().
915          */
916         spin_lock_irqsave(&cache->lock, flags);
917         cache->commit_requested = true;
918         bio_list_add(&cache->deferred_flush_bios, bio);
919         spin_unlock_irqrestore(&cache->lock, flags);
920 }
921
922 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
923 {
924         inc_ds(cache, bio, cell);
925         issue(cache, bio);
926 }
927
928 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
929 {
930         unsigned long flags;
931
932         spin_lock_irqsave(&cache->lock, flags);
933         bio_list_add(&cache->deferred_writethrough_bios, bio);
934         spin_unlock_irqrestore(&cache->lock, flags);
935
936         wake_worker(cache);
937 }
938
939 static void writethrough_endio(struct bio *bio, int err)
940 {
941         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
942
943         dm_unhook_bio(&pb->hook_info, bio);
944
945         if (err) {
946                 bio_endio(bio, err);
947                 return;
948         }
949
950         dm_bio_restore(&pb->bio_details, bio);
951         remap_to_cache(pb->cache, bio, pb->cblock);
952
953         /*
954          * We can't issue this bio directly, since we're in interrupt
955          * context.  So it gets put on a bio list for processing by the
956          * worker thread.
957          */
958         defer_writethrough_bio(pb->cache, bio);
959 }
960
961 /*
962  * When running in writethrough mode we need to send writes to clean blocks
963  * to both the cache and origin devices.  In future we'd like to clone the
964  * bio and send them in parallel, but for now we're doing them in
965  * series as this is easier.
966  */
967 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
968                                        dm_oblock_t oblock, dm_cblock_t cblock)
969 {
970         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
971
972         pb->cache = cache;
973         pb->cblock = cblock;
974         dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
975         dm_bio_record(&pb->bio_details, bio);
976
977         remap_to_origin_clear_discard(pb->cache, bio, oblock);
978 }
979
980 /*----------------------------------------------------------------
981  * Migration processing
982  *
983  * Migration covers moving data from the origin device to the cache, or
984  * vice versa.
985  *--------------------------------------------------------------*/
986 static void inc_io_migrations(struct cache *cache)
987 {
988         atomic_inc(&cache->nr_io_migrations);
989 }
990
991 static void dec_io_migrations(struct cache *cache)
992 {
993         atomic_dec(&cache->nr_io_migrations);
994 }
995
996 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
997                          bool holder)
998 {
999         (holder ? dm_cell_release : dm_cell_release_no_holder)
1000                 (cache->prison, cell, &cache->deferred_bios);
1001         free_prison_cell(cache, cell);
1002 }
1003
1004 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
1005                        bool holder)
1006 {
1007         unsigned long flags;
1008
1009         spin_lock_irqsave(&cache->lock, flags);
1010         __cell_defer(cache, cell, holder);
1011         spin_unlock_irqrestore(&cache->lock, flags);
1012
1013         wake_worker(cache);
1014 }
1015
1016 static void free_io_migration(struct dm_cache_migration *mg)
1017 {
1018         dec_io_migrations(mg->cache);
1019         free_migration(mg);
1020 }
1021
1022 static void migration_failure(struct dm_cache_migration *mg)
1023 {
1024         struct cache *cache = mg->cache;
1025
1026         if (mg->writeback) {
1027                 DMWARN_LIMIT("writeback failed; couldn't copy block");
1028                 set_dirty(cache, mg->old_oblock, mg->cblock);
1029                 cell_defer(cache, mg->old_ocell, false);
1030
1031         } else if (mg->demote) {
1032                 DMWARN_LIMIT("demotion failed; couldn't copy block");
1033                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1034
1035                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1036                 if (mg->promote)
1037                         cell_defer(cache, mg->new_ocell, true);
1038         } else {
1039                 DMWARN_LIMIT("promotion failed; couldn't copy block");
1040                 policy_remove_mapping(cache->policy, mg->new_oblock);
1041                 cell_defer(cache, mg->new_ocell, true);
1042         }
1043
1044         free_io_migration(mg);
1045 }
1046
1047 static void migration_success_pre_commit(struct dm_cache_migration *mg)
1048 {
1049         unsigned long flags;
1050         struct cache *cache = mg->cache;
1051
1052         if (mg->writeback) {
1053                 clear_dirty(cache, mg->old_oblock, mg->cblock);
1054                 cell_defer(cache, mg->old_ocell, false);
1055                 free_io_migration(mg);
1056                 return;
1057
1058         } else if (mg->demote) {
1059                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
1060                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
1061                         policy_force_mapping(cache->policy, mg->new_oblock,
1062                                              mg->old_oblock);
1063                         if (mg->promote)
1064                                 cell_defer(cache, mg->new_ocell, true);
1065                         free_io_migration(mg);
1066                         return;
1067                 }
1068         } else {
1069                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
1070                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
1071                         policy_remove_mapping(cache->policy, mg->new_oblock);
1072                         free_io_migration(mg);
1073                         return;
1074                 }
1075         }
1076
1077         spin_lock_irqsave(&cache->lock, flags);
1078         list_add_tail(&mg->list, &cache->need_commit_migrations);
1079         cache->commit_requested = true;
1080         spin_unlock_irqrestore(&cache->lock, flags);
1081 }
1082
1083 static void migration_success_post_commit(struct dm_cache_migration *mg)
1084 {
1085         unsigned long flags;
1086         struct cache *cache = mg->cache;
1087
1088         if (mg->writeback) {
1089                 DMWARN("writeback unexpectedly triggered commit");
1090                 return;
1091
1092         } else if (mg->demote) {
1093                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1094
1095                 if (mg->promote) {
1096                         mg->demote = false;
1097
1098                         spin_lock_irqsave(&cache->lock, flags);
1099                         list_add_tail(&mg->list, &cache->quiesced_migrations);
1100                         spin_unlock_irqrestore(&cache->lock, flags);
1101
1102                 } else {
1103                         if (mg->invalidate)
1104                                 policy_remove_mapping(cache->policy, mg->old_oblock);
1105                         free_io_migration(mg);
1106                 }
1107
1108         } else {
1109                 if (mg->requeue_holder) {
1110                         clear_dirty(cache, mg->new_oblock, mg->cblock);
1111                         cell_defer(cache, mg->new_ocell, true);
1112                 } else {
1113                         /*
1114                          * The block was promoted via an overwrite, so it's dirty.
1115                          */
1116                         set_dirty(cache, mg->new_oblock, mg->cblock);
1117                         bio_endio(mg->new_ocell->holder, 0);
1118                         cell_defer(cache, mg->new_ocell, false);
1119                 }
1120                 free_io_migration(mg);
1121         }
1122 }
1123
1124 static void copy_complete(int read_err, unsigned long write_err, void *context)
1125 {
1126         unsigned long flags;
1127         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1128         struct cache *cache = mg->cache;
1129
1130         if (read_err || write_err)
1131                 mg->err = true;
1132
1133         spin_lock_irqsave(&cache->lock, flags);
1134         list_add_tail(&mg->list, &cache->completed_migrations);
1135         spin_unlock_irqrestore(&cache->lock, flags);
1136
1137         wake_worker(cache);
1138 }
1139
1140 static void issue_copy(struct dm_cache_migration *mg)
1141 {
1142         int r;
1143         struct dm_io_region o_region, c_region;
1144         struct cache *cache = mg->cache;
1145         sector_t cblock = from_cblock(mg->cblock);
1146
1147         o_region.bdev = cache->origin_dev->bdev;
1148         o_region.count = cache->sectors_per_block;
1149
1150         c_region.bdev = cache->cache_dev->bdev;
1151         c_region.sector = cblock * cache->sectors_per_block;
1152         c_region.count = cache->sectors_per_block;
1153
1154         if (mg->writeback || mg->demote) {
1155                 /* demote */
1156                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
1157                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
1158         } else {
1159                 /* promote */
1160                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1161                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1162         }
1163
1164         if (r < 0) {
1165                 DMERR_LIMIT("issuing migration failed");
1166                 migration_failure(mg);
1167         }
1168 }
1169
1170 static void overwrite_endio(struct bio *bio, int err)
1171 {
1172         struct dm_cache_migration *mg = bio->bi_private;
1173         struct cache *cache = mg->cache;
1174         size_t pb_data_size = get_per_bio_data_size(cache);
1175         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1176         unsigned long flags;
1177
1178         dm_unhook_bio(&pb->hook_info, bio);
1179
1180         if (err)
1181                 mg->err = true;
1182
1183         mg->requeue_holder = false;
1184
1185         spin_lock_irqsave(&cache->lock, flags);
1186         list_add_tail(&mg->list, &cache->completed_migrations);
1187         spin_unlock_irqrestore(&cache->lock, flags);
1188
1189         wake_worker(cache);
1190 }
1191
1192 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1193 {
1194         size_t pb_data_size = get_per_bio_data_size(mg->cache);
1195         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1196
1197         dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1198         remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1199
1200         /*
1201          * No need to inc_ds() here, since the cell will be held for the
1202          * duration of the io.
1203          */
1204         accounted_request(mg->cache, bio);
1205 }
1206
1207 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1208 {
1209         return (bio_data_dir(bio) == WRITE) &&
1210                 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1211 }
1212
1213 static void avoid_copy(struct dm_cache_migration *mg)
1214 {
1215         atomic_inc(&mg->cache->stats.copies_avoided);
1216         migration_success_pre_commit(mg);
1217 }
1218
1219 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1220                                      dm_dblock_t *b, dm_dblock_t *e)
1221 {
1222         sector_t sb = bio->bi_iter.bi_sector;
1223         sector_t se = bio_end_sector(bio);
1224
1225         *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1226
1227         if (se - sb < cache->discard_block_size)
1228                 *e = *b;
1229         else
1230                 *e = to_dblock(block_div(se, cache->discard_block_size));
1231 }
1232
1233 static void issue_discard(struct dm_cache_migration *mg)
1234 {
1235         dm_dblock_t b, e;
1236         struct bio *bio = mg->new_ocell->holder;
1237
1238         calc_discard_block_range(mg->cache, bio, &b, &e);
1239         while (b != e) {
1240                 set_discard(mg->cache, b);
1241                 b = to_dblock(from_dblock(b) + 1);
1242         }
1243
1244         bio_endio(bio, 0);
1245         cell_defer(mg->cache, mg->new_ocell, false);
1246         free_migration(mg);
1247 }
1248
1249 static void issue_copy_or_discard(struct dm_cache_migration *mg)
1250 {
1251         bool avoid;
1252         struct cache *cache = mg->cache;
1253
1254         if (mg->discard) {
1255                 issue_discard(mg);
1256                 return;
1257         }
1258
1259         if (mg->writeback || mg->demote)
1260                 avoid = !is_dirty(cache, mg->cblock) ||
1261                         is_discarded_oblock(cache, mg->old_oblock);
1262         else {
1263                 struct bio *bio = mg->new_ocell->holder;
1264
1265                 avoid = is_discarded_oblock(cache, mg->new_oblock);
1266
1267                 if (writeback_mode(&cache->features) &&
1268                     !avoid && bio_writes_complete_block(cache, bio)) {
1269                         issue_overwrite(mg, bio);
1270                         return;
1271                 }
1272         }
1273
1274         avoid ? avoid_copy(mg) : issue_copy(mg);
1275 }
1276
1277 static void complete_migration(struct dm_cache_migration *mg)
1278 {
1279         if (mg->err)
1280                 migration_failure(mg);
1281         else
1282                 migration_success_pre_commit(mg);
1283 }
1284
1285 static void process_migrations(struct cache *cache, struct list_head *head,
1286                                void (*fn)(struct dm_cache_migration *))
1287 {
1288         unsigned long flags;
1289         struct list_head list;
1290         struct dm_cache_migration *mg, *tmp;
1291
1292         INIT_LIST_HEAD(&list);
1293         spin_lock_irqsave(&cache->lock, flags);
1294         list_splice_init(head, &list);
1295         spin_unlock_irqrestore(&cache->lock, flags);
1296
1297         list_for_each_entry_safe(mg, tmp, &list, list)
1298                 fn(mg);
1299 }
1300
1301 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1302 {
1303         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1304 }
1305
1306 static void queue_quiesced_migration(struct dm_cache_migration *mg)
1307 {
1308         unsigned long flags;
1309         struct cache *cache = mg->cache;
1310
1311         spin_lock_irqsave(&cache->lock, flags);
1312         __queue_quiesced_migration(mg);
1313         spin_unlock_irqrestore(&cache->lock, flags);
1314
1315         wake_worker(cache);
1316 }
1317
1318 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1319 {
1320         unsigned long flags;
1321         struct dm_cache_migration *mg, *tmp;
1322
1323         spin_lock_irqsave(&cache->lock, flags);
1324         list_for_each_entry_safe(mg, tmp, work, list)
1325                 __queue_quiesced_migration(mg);
1326         spin_unlock_irqrestore(&cache->lock, flags);
1327
1328         wake_worker(cache);
1329 }
1330
1331 static void check_for_quiesced_migrations(struct cache *cache,
1332                                           struct per_bio_data *pb)
1333 {
1334         struct list_head work;
1335
1336         if (!pb->all_io_entry)
1337                 return;
1338
1339         INIT_LIST_HEAD(&work);
1340         dm_deferred_entry_dec(pb->all_io_entry, &work);
1341
1342         if (!list_empty(&work))
1343                 queue_quiesced_migrations(cache, &work);
1344 }
1345
1346 static void quiesce_migration(struct dm_cache_migration *mg)
1347 {
1348         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1349                 queue_quiesced_migration(mg);
1350 }
1351
1352 static void promote(struct cache *cache, struct prealloc *structs,
1353                     dm_oblock_t oblock, dm_cblock_t cblock,
1354                     struct dm_bio_prison_cell *cell)
1355 {
1356         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1357
1358         mg->err = false;
1359         mg->discard = false;
1360         mg->writeback = false;
1361         mg->demote = false;
1362         mg->promote = true;
1363         mg->requeue_holder = true;
1364         mg->invalidate = false;
1365         mg->cache = cache;
1366         mg->new_oblock = oblock;
1367         mg->cblock = cblock;
1368         mg->old_ocell = NULL;
1369         mg->new_ocell = cell;
1370         mg->start_jiffies = jiffies;
1371
1372         inc_io_migrations(cache);
1373         quiesce_migration(mg);
1374 }
1375
1376 static void writeback(struct cache *cache, struct prealloc *structs,
1377                       dm_oblock_t oblock, dm_cblock_t cblock,
1378                       struct dm_bio_prison_cell *cell)
1379 {
1380         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1381
1382         mg->err = false;
1383         mg->discard = false;
1384         mg->writeback = true;
1385         mg->demote = false;
1386         mg->promote = false;
1387         mg->requeue_holder = true;
1388         mg->invalidate = false;
1389         mg->cache = cache;
1390         mg->old_oblock = oblock;
1391         mg->cblock = cblock;
1392         mg->old_ocell = cell;
1393         mg->new_ocell = NULL;
1394         mg->start_jiffies = jiffies;
1395
1396         inc_io_migrations(cache);
1397         quiesce_migration(mg);
1398 }
1399
1400 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1401                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1402                                 dm_cblock_t cblock,
1403                                 struct dm_bio_prison_cell *old_ocell,
1404                                 struct dm_bio_prison_cell *new_ocell)
1405 {
1406         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1407
1408         mg->err = false;
1409         mg->discard = false;
1410         mg->writeback = false;
1411         mg->demote = true;
1412         mg->promote = true;
1413         mg->requeue_holder = true;
1414         mg->invalidate = false;
1415         mg->cache = cache;
1416         mg->old_oblock = old_oblock;
1417         mg->new_oblock = new_oblock;
1418         mg->cblock = cblock;
1419         mg->old_ocell = old_ocell;
1420         mg->new_ocell = new_ocell;
1421         mg->start_jiffies = jiffies;
1422
1423         inc_io_migrations(cache);
1424         quiesce_migration(mg);
1425 }
1426
1427 /*
1428  * Invalidate a cache entry.  No writeback occurs; any changes in the cache
1429  * block are thrown away.
1430  */
1431 static void invalidate(struct cache *cache, struct prealloc *structs,
1432                        dm_oblock_t oblock, dm_cblock_t cblock,
1433                        struct dm_bio_prison_cell *cell)
1434 {
1435         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1436
1437         mg->err = false;
1438         mg->discard = false;
1439         mg->writeback = false;
1440         mg->demote = true;
1441         mg->promote = false;
1442         mg->requeue_holder = true;
1443         mg->invalidate = true;
1444         mg->cache = cache;
1445         mg->old_oblock = oblock;
1446         mg->cblock = cblock;
1447         mg->old_ocell = cell;
1448         mg->new_ocell = NULL;
1449         mg->start_jiffies = jiffies;
1450
1451         inc_io_migrations(cache);
1452         quiesce_migration(mg);
1453 }
1454
1455 static void discard(struct cache *cache, struct prealloc *structs,
1456                     struct dm_bio_prison_cell *cell)
1457 {
1458         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1459
1460         mg->err = false;
1461         mg->discard = true;
1462         mg->writeback = false;
1463         mg->demote = false;
1464         mg->promote = false;
1465         mg->requeue_holder = false;
1466         mg->invalidate = false;
1467         mg->cache = cache;
1468         mg->old_ocell = NULL;
1469         mg->new_ocell = cell;
1470         mg->start_jiffies = jiffies;
1471
1472         quiesce_migration(mg);
1473 }
1474
1475 /*----------------------------------------------------------------
1476  * bio processing
1477  *--------------------------------------------------------------*/
1478 static void defer_bio(struct cache *cache, struct bio *bio)
1479 {
1480         unsigned long flags;
1481
1482         spin_lock_irqsave(&cache->lock, flags);
1483         bio_list_add(&cache->deferred_bios, bio);
1484         spin_unlock_irqrestore(&cache->lock, flags);
1485
1486         wake_worker(cache);
1487 }
1488
1489 static void process_flush_bio(struct cache *cache, struct bio *bio)
1490 {
1491         size_t pb_data_size = get_per_bio_data_size(cache);
1492         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1493
1494         BUG_ON(bio->bi_iter.bi_size);
1495         if (!pb->req_nr)
1496                 remap_to_origin(cache, bio);
1497         else
1498                 remap_to_cache(cache, bio, 0);
1499
1500         /*
1501          * REQ_FLUSH is not directed at any particular block so we don't
1502          * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
1503          * by dm-core.
1504          */
1505         issue(cache, bio);
1506 }
1507
1508 static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1509                                 struct bio *bio)
1510 {
1511         int r;
1512         dm_dblock_t b, e;
1513         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1514
1515         calc_discard_block_range(cache, bio, &b, &e);
1516         if (b == e) {
1517                 bio_endio(bio, 0);
1518                 return;
1519         }
1520
1521         cell_prealloc = prealloc_get_cell(structs);
1522         r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1523                              (cell_free_fn) prealloc_put_cell,
1524                              structs, &new_ocell);
1525         if (r > 0)
1526                 return;
1527
1528         discard(cache, structs, new_ocell);
1529 }
1530
1531 static bool spare_migration_bandwidth(struct cache *cache)
1532 {
1533         sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1534                 cache->sectors_per_block;
1535         return current_volume < cache->migration_threshold;
1536 }
1537
1538 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1539 {
1540         atomic_inc(bio_data_dir(bio) == READ ?
1541                    &cache->stats.read_hit : &cache->stats.write_hit);
1542 }
1543
1544 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1545 {
1546         atomic_inc(bio_data_dir(bio) == READ ?
1547                    &cache->stats.read_miss : &cache->stats.write_miss);
1548 }
1549
1550 /*----------------------------------------------------------------*/
1551
1552 struct old_oblock_lock {
1553         struct policy_locker locker;
1554         struct cache *cache;
1555         struct prealloc *structs;
1556         struct dm_bio_prison_cell *cell;
1557 };
1558
1559 static int null_locker(struct policy_locker *locker, dm_oblock_t b)
1560 {
1561         /* This should never be called */
1562         BUG();
1563         return 0;
1564 }
1565
1566 static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
1567 {
1568         struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
1569         struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
1570
1571         return bio_detain(l->cache, b, NULL, cell_prealloc,
1572                           (cell_free_fn) prealloc_put_cell,
1573                           l->structs, &l->cell);
1574 }
1575
1576 static void process_bio(struct cache *cache, struct prealloc *structs,
1577                         struct bio *bio)
1578 {
1579         int r;
1580         bool release_cell = true;
1581         dm_oblock_t block = get_bio_block(cache, bio);
1582         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1583         struct policy_result lookup_result;
1584         bool passthrough = passthrough_mode(&cache->features);
1585         bool discarded_block, can_migrate;
1586         struct old_oblock_lock ool;
1587
1588         /*
1589          * Check to see if that block is currently migrating.
1590          */
1591         cell_prealloc = prealloc_get_cell(structs);
1592         r = bio_detain(cache, block, bio, cell_prealloc,
1593                        (cell_free_fn) prealloc_put_cell,
1594                        structs, &new_ocell);
1595         if (r > 0)
1596                 return;
1597
1598         discarded_block = is_discarded_oblock(cache, block);
1599         can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1600
1601         ool.locker.fn = cell_locker;
1602         ool.cache = cache;
1603         ool.structs = structs;
1604         ool.cell = NULL;
1605         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1606                        bio, &ool.locker, &lookup_result);
1607
1608         if (r == -EWOULDBLOCK)
1609                 /* migration has been denied */
1610                 lookup_result.op = POLICY_MISS;
1611
1612         switch (lookup_result.op) {
1613         case POLICY_HIT:
1614                 if (passthrough) {
1615                         inc_miss_counter(cache, bio);
1616
1617                         /*
1618                          * Passthrough always maps to the origin,
1619                          * invalidating any cache blocks that are written
1620                          * to.
1621                          */
1622
1623                         if (bio_data_dir(bio) == WRITE) {
1624                                 atomic_inc(&cache->stats.demotion);
1625                                 invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1626                                 release_cell = false;
1627
1628                         } else {
1629                                 /* FIXME: factor out issue_origin() */
1630                                 remap_to_origin_clear_discard(cache, bio, block);
1631                                 inc_and_issue(cache, bio, new_ocell);
1632                         }
1633                 } else {
1634                         inc_hit_counter(cache, bio);
1635
1636                         if (bio_data_dir(bio) == WRITE &&
1637                             writethrough_mode(&cache->features) &&
1638                             !is_dirty(cache, lookup_result.cblock)) {
1639                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1640                                 inc_and_issue(cache, bio, new_ocell);
1641
1642                         } else  {
1643                                 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1644                                 inc_and_issue(cache, bio, new_ocell);
1645                         }
1646                 }
1647
1648                 break;
1649
1650         case POLICY_MISS:
1651                 inc_miss_counter(cache, bio);
1652                 remap_to_origin_clear_discard(cache, bio, block);
1653                 inc_and_issue(cache, bio, new_ocell);
1654                 break;
1655
1656         case POLICY_NEW:
1657                 atomic_inc(&cache->stats.promotion);
1658                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1659                 release_cell = false;
1660                 break;
1661
1662         case POLICY_REPLACE:
1663                 atomic_inc(&cache->stats.demotion);
1664                 atomic_inc(&cache->stats.promotion);
1665                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1666                                     block, lookup_result.cblock,
1667                                     ool.cell, new_ocell);
1668                 release_cell = false;
1669                 break;
1670
1671         default:
1672                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1673                             (unsigned) lookup_result.op);
1674                 bio_io_error(bio);
1675         }
1676
1677         if (release_cell)
1678                 cell_defer(cache, new_ocell, false);
1679 }
1680
1681 static int need_commit_due_to_time(struct cache *cache)
1682 {
1683         return !time_in_range(jiffies, cache->last_commit_jiffies,
1684                               cache->last_commit_jiffies + COMMIT_PERIOD);
1685 }
1686
1687 static int commit_if_needed(struct cache *cache)
1688 {
1689         int r = 0;
1690
1691         if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1692             dm_cache_changed_this_transaction(cache->cmd)) {
1693                 atomic_inc(&cache->stats.commit_count);
1694                 cache->commit_requested = false;
1695                 r = dm_cache_commit(cache->cmd, false);
1696                 cache->last_commit_jiffies = jiffies;
1697         }
1698
1699         return r;
1700 }
1701
1702 static void process_deferred_bios(struct cache *cache)
1703 {
1704         unsigned long flags;
1705         struct bio_list bios;
1706         struct bio *bio;
1707         struct prealloc structs;
1708
1709         memset(&structs, 0, sizeof(structs));
1710         bio_list_init(&bios);
1711
1712         spin_lock_irqsave(&cache->lock, flags);
1713         bio_list_merge(&bios, &cache->deferred_bios);
1714         bio_list_init(&cache->deferred_bios);
1715         spin_unlock_irqrestore(&cache->lock, flags);
1716
1717         while (!bio_list_empty(&bios)) {
1718                 /*
1719                  * If we've got no free migration structs, and processing
1720                  * this bio might require one, we pause until there are some
1721                  * prepared mappings to process.
1722                  */
1723                 if (prealloc_data_structs(cache, &structs)) {
1724                         spin_lock_irqsave(&cache->lock, flags);
1725                         bio_list_merge(&cache->deferred_bios, &bios);
1726                         spin_unlock_irqrestore(&cache->lock, flags);
1727                         break;
1728                 }
1729
1730                 bio = bio_list_pop(&bios);
1731
1732                 if (bio->bi_rw & REQ_FLUSH)
1733                         process_flush_bio(cache, bio);
1734                 else if (bio->bi_rw & REQ_DISCARD)
1735                         process_discard_bio(cache, &structs, bio);
1736                 else
1737                         process_bio(cache, &structs, bio);
1738         }
1739
1740         prealloc_free_structs(cache, &structs);
1741 }
1742
1743 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1744 {
1745         unsigned long flags;
1746         struct bio_list bios;
1747         struct bio *bio;
1748
1749         bio_list_init(&bios);
1750
1751         spin_lock_irqsave(&cache->lock, flags);
1752         bio_list_merge(&bios, &cache->deferred_flush_bios);
1753         bio_list_init(&cache->deferred_flush_bios);
1754         spin_unlock_irqrestore(&cache->lock, flags);
1755
1756         /*
1757          * These bios have already been through inc_ds()
1758          */
1759         while ((bio = bio_list_pop(&bios)))
1760                 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
1761 }
1762
1763 static void process_deferred_writethrough_bios(struct cache *cache)
1764 {
1765         unsigned long flags;
1766         struct bio_list bios;
1767         struct bio *bio;
1768
1769         bio_list_init(&bios);
1770
1771         spin_lock_irqsave(&cache->lock, flags);
1772         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1773         bio_list_init(&cache->deferred_writethrough_bios);
1774         spin_unlock_irqrestore(&cache->lock, flags);
1775
1776         /*
1777          * These bios have already been through inc_ds()
1778          */
1779         while ((bio = bio_list_pop(&bios)))
1780                 accounted_request(cache, bio);
1781 }
1782
1783 static void writeback_some_dirty_blocks(struct cache *cache)
1784 {
1785         int r = 0;
1786         dm_oblock_t oblock;
1787         dm_cblock_t cblock;
1788         struct prealloc structs;
1789         struct dm_bio_prison_cell *old_ocell;
1790         bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
1791
1792         memset(&structs, 0, sizeof(structs));
1793
1794         while (spare_migration_bandwidth(cache)) {
1795                 if (prealloc_data_structs(cache, &structs))
1796                         break;
1797
1798                 r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
1799                 if (r)
1800                         break;
1801
1802                 r = get_cell(cache, oblock, &structs, &old_ocell);
1803                 if (r) {
1804                         policy_set_dirty(cache->policy, oblock);
1805                         break;
1806                 }
1807
1808                 writeback(cache, &structs, oblock, cblock, old_ocell);
1809         }
1810
1811         prealloc_free_structs(cache, &structs);
1812 }
1813
1814 /*----------------------------------------------------------------
1815  * Invalidations.
1816  * Dropping something from the cache *without* writing back.
1817  *--------------------------------------------------------------*/
1818
1819 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1820 {
1821         int r = 0;
1822         uint64_t begin = from_cblock(req->cblocks->begin);
1823         uint64_t end = from_cblock(req->cblocks->end);
1824
1825         while (begin != end) {
1826                 r = policy_remove_cblock(cache->policy, to_cblock(begin));
1827                 if (!r) {
1828                         r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1829                         if (r)
1830                                 break;
1831
1832                 } else if (r == -ENODATA) {
1833                         /* harmless, already unmapped */
1834                         r = 0;
1835
1836                 } else {
1837                         DMERR("policy_remove_cblock failed");
1838                         break;
1839                 }
1840
1841                 begin++;
1842         }
1843
1844         cache->commit_requested = true;
1845
1846         req->err = r;
1847         atomic_set(&req->complete, 1);
1848
1849         wake_up(&req->result_wait);
1850 }
1851
1852 static void process_invalidation_requests(struct cache *cache)
1853 {
1854         struct list_head list;
1855         struct invalidation_request *req, *tmp;
1856
1857         INIT_LIST_HEAD(&list);
1858         spin_lock(&cache->invalidation_lock);
1859         list_splice_init(&cache->invalidation_requests, &list);
1860         spin_unlock(&cache->invalidation_lock);
1861
1862         list_for_each_entry_safe (req, tmp, &list, list)
1863                 process_invalidation_request(cache, req);
1864 }
1865
1866 /*----------------------------------------------------------------
1867  * Main worker loop
1868  *--------------------------------------------------------------*/
1869 static bool is_quiescing(struct cache *cache)
1870 {
1871         return atomic_read(&cache->quiescing);
1872 }
1873
1874 static void ack_quiescing(struct cache *cache)
1875 {
1876         if (is_quiescing(cache)) {
1877                 atomic_inc(&cache->quiescing_ack);
1878                 wake_up(&cache->quiescing_wait);
1879         }
1880 }
1881
1882 static void wait_for_quiescing_ack(struct cache *cache)
1883 {
1884         wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1885 }
1886
1887 static void start_quiescing(struct cache *cache)
1888 {
1889         atomic_inc(&cache->quiescing);
1890         wait_for_quiescing_ack(cache);
1891 }
1892
1893 static void stop_quiescing(struct cache *cache)
1894 {
1895         atomic_set(&cache->quiescing, 0);
1896         atomic_set(&cache->quiescing_ack, 0);
1897 }
1898
1899 static void wait_for_migrations(struct cache *cache)
1900 {
1901         wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
1902 }
1903
1904 static void stop_worker(struct cache *cache)
1905 {
1906         cancel_delayed_work(&cache->waker);
1907         flush_workqueue(cache->wq);
1908 }
1909
1910 static void requeue_deferred_io(struct cache *cache)
1911 {
1912         struct bio *bio;
1913         struct bio_list bios;
1914
1915         bio_list_init(&bios);
1916         bio_list_merge(&bios, &cache->deferred_bios);
1917         bio_list_init(&cache->deferred_bios);
1918
1919         while ((bio = bio_list_pop(&bios)))
1920                 bio_endio(bio, DM_ENDIO_REQUEUE);
1921 }
1922
1923 static int more_work(struct cache *cache)
1924 {
1925         if (is_quiescing(cache))
1926                 return !list_empty(&cache->quiesced_migrations) ||
1927                         !list_empty(&cache->completed_migrations) ||
1928                         !list_empty(&cache->need_commit_migrations);
1929         else
1930                 return !bio_list_empty(&cache->deferred_bios) ||
1931                         !bio_list_empty(&cache->deferred_flush_bios) ||
1932                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
1933                         !list_empty(&cache->quiesced_migrations) ||
1934                         !list_empty(&cache->completed_migrations) ||
1935                         !list_empty(&cache->need_commit_migrations) ||
1936                         cache->invalidate;
1937 }
1938
1939 static void do_worker(struct work_struct *ws)
1940 {
1941         struct cache *cache = container_of(ws, struct cache, worker);
1942
1943         do {
1944                 if (!is_quiescing(cache)) {
1945                         writeback_some_dirty_blocks(cache);
1946                         process_deferred_writethrough_bios(cache);
1947                         process_deferred_bios(cache);
1948                         process_invalidation_requests(cache);
1949                 }
1950
1951                 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
1952                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1953
1954                 if (commit_if_needed(cache)) {
1955                         process_deferred_flush_bios(cache, false);
1956                         process_migrations(cache, &cache->need_commit_migrations, migration_failure);
1957
1958                         /*
1959                          * FIXME: rollback metadata or just go into a
1960                          * failure mode and error everything
1961                          */
1962                 } else {
1963                         process_deferred_flush_bios(cache, true);
1964                         process_migrations(cache, &cache->need_commit_migrations,
1965                                            migration_success_post_commit);
1966                 }
1967
1968                 ack_quiescing(cache);
1969
1970         } while (more_work(cache));
1971 }
1972
1973 /*
1974  * We want to commit periodically so that not too much
1975  * unwritten metadata builds up.
1976  */
1977 static void do_waker(struct work_struct *ws)
1978 {
1979         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1980         policy_tick(cache->policy);
1981         wake_worker(cache);
1982         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1983 }
1984
1985 /*----------------------------------------------------------------*/
1986
1987 static int is_congested(struct dm_dev *dev, int bdi_bits)
1988 {
1989         struct request_queue *q = bdev_get_queue(dev->bdev);
1990         return bdi_congested(&q->backing_dev_info, bdi_bits);
1991 }
1992
1993 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1994 {
1995         struct cache *cache = container_of(cb, struct cache, callbacks);
1996
1997         return is_congested(cache->origin_dev, bdi_bits) ||
1998                 is_congested(cache->cache_dev, bdi_bits);
1999 }
2000
2001 /*----------------------------------------------------------------
2002  * Target methods
2003  *--------------------------------------------------------------*/
2004
2005 /*
2006  * This function gets called on the error paths of the constructor, so we
2007  * have to cope with a partially initialised struct.
2008  */
2009 static void destroy(struct cache *cache)
2010 {
2011         unsigned i;
2012
2013         if (cache->migration_pool)
2014                 mempool_destroy(cache->migration_pool);
2015
2016         if (cache->all_io_ds)
2017                 dm_deferred_set_destroy(cache->all_io_ds);
2018
2019         if (cache->prison)
2020                 dm_bio_prison_destroy(cache->prison);
2021
2022         if (cache->wq)
2023                 destroy_workqueue(cache->wq);
2024
2025         if (cache->dirty_bitset)
2026                 free_bitset(cache->dirty_bitset);
2027
2028         if (cache->discard_bitset)
2029                 free_bitset(cache->discard_bitset);
2030
2031         if (cache->copier)
2032                 dm_kcopyd_client_destroy(cache->copier);
2033
2034         if (cache->cmd)
2035                 dm_cache_metadata_close(cache->cmd);
2036
2037         if (cache->metadata_dev)
2038                 dm_put_device(cache->ti, cache->metadata_dev);
2039
2040         if (cache->origin_dev)
2041                 dm_put_device(cache->ti, cache->origin_dev);
2042
2043         if (cache->cache_dev)
2044                 dm_put_device(cache->ti, cache->cache_dev);
2045
2046         if (cache->policy)
2047                 dm_cache_policy_destroy(cache->policy);
2048
2049         for (i = 0; i < cache->nr_ctr_args ; i++)
2050                 kfree(cache->ctr_args[i]);
2051         kfree(cache->ctr_args);
2052
2053         kfree(cache);
2054 }
2055
2056 static void cache_dtr(struct dm_target *ti)
2057 {
2058         struct cache *cache = ti->private;
2059
2060         destroy(cache);
2061 }
2062
2063 static sector_t get_dev_size(struct dm_dev *dev)
2064 {
2065         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2066 }
2067
2068 /*----------------------------------------------------------------*/
2069
2070 /*
2071  * Construct a cache device mapping.
2072  *
2073  * cache <metadata dev> <cache dev> <origin dev> <block size>
2074  *       <#feature args> [<feature arg>]*
2075  *       <policy> <#policy args> [<policy arg>]*
2076  *
2077  * metadata dev    : fast device holding the persistent metadata
2078  * cache dev       : fast device holding cached data blocks
2079  * origin dev      : slow device holding original data blocks
2080  * block size      : cache unit size in sectors
2081  *
2082  * #feature args   : number of feature arguments passed
2083  * feature args    : writethrough.  (The default is writeback.)
2084  *
2085  * policy          : the replacement policy to use
2086  * #policy args    : an even number of policy arguments corresponding
2087  *                   to key/value pairs passed to the policy
2088  * policy args     : key/value pairs passed to the policy
2089  *                   E.g. 'sequential_threshold 1024'
2090  *                   See cache-policies.txt for details.
2091  *
2092  * Optional feature arguments are:
2093  *   writethrough  : write through caching that prohibits cache block
2094  *                   content from being different from origin block content.
2095  *                   Without this argument, the default behaviour is to write
2096  *                   back cache block contents later for performance reasons,
2097  *                   so they may differ from the corresponding origin blocks.
2098  */
2099 struct cache_args {
2100         struct dm_target *ti;
2101
2102         struct dm_dev *metadata_dev;
2103
2104         struct dm_dev *cache_dev;
2105         sector_t cache_sectors;
2106
2107         struct dm_dev *origin_dev;
2108         sector_t origin_sectors;
2109
2110         uint32_t block_size;
2111
2112         const char *policy_name;
2113         int policy_argc;
2114         const char **policy_argv;
2115
2116         struct cache_features features;
2117 };
2118
2119 static void destroy_cache_args(struct cache_args *ca)
2120 {
2121         if (ca->metadata_dev)
2122                 dm_put_device(ca->ti, ca->metadata_dev);
2123
2124         if (ca->cache_dev)
2125                 dm_put_device(ca->ti, ca->cache_dev);
2126
2127         if (ca->origin_dev)
2128                 dm_put_device(ca->ti, ca->origin_dev);
2129
2130         kfree(ca);
2131 }
2132
2133 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2134 {
2135         if (!as->argc) {
2136                 *error = "Insufficient args";
2137                 return false;
2138         }
2139
2140         return true;
2141 }
2142
2143 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2144                               char **error)
2145 {
2146         int r;
2147         sector_t metadata_dev_size;
2148         char b[BDEVNAME_SIZE];
2149
2150         if (!at_least_one_arg(as, error))
2151                 return -EINVAL;
2152
2153         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2154                           &ca->metadata_dev);
2155         if (r) {
2156                 *error = "Error opening metadata device";
2157                 return r;
2158         }
2159
2160         metadata_dev_size = get_dev_size(ca->metadata_dev);
2161         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2162                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2163                        bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
2164
2165         return 0;
2166 }
2167
2168 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2169                            char **error)
2170 {
2171         int r;
2172
2173         if (!at_least_one_arg(as, error))
2174                 return -EINVAL;
2175
2176         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2177                           &ca->cache_dev);
2178         if (r) {
2179                 *error = "Error opening cache device";
2180                 return r;
2181         }
2182         ca->cache_sectors = get_dev_size(ca->cache_dev);
2183
2184         return 0;
2185 }
2186
2187 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2188                             char **error)
2189 {
2190         int r;
2191
2192         if (!at_least_one_arg(as, error))
2193                 return -EINVAL;
2194
2195         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2196                           &ca->origin_dev);
2197         if (r) {
2198                 *error = "Error opening origin device";
2199                 return r;
2200         }
2201
2202         ca->origin_sectors = get_dev_size(ca->origin_dev);
2203         if (ca->ti->len > ca->origin_sectors) {
2204                 *error = "Device size larger than cached device";
2205                 return -EINVAL;
2206         }
2207
2208         return 0;
2209 }
2210
2211 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2212                             char **error)
2213 {
2214         unsigned long block_size;
2215
2216         if (!at_least_one_arg(as, error))
2217                 return -EINVAL;
2218
2219         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2220             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2221             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2222             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2223                 *error = "Invalid data block size";
2224                 return -EINVAL;
2225         }
2226
2227         if (block_size > ca->cache_sectors) {
2228                 *error = "Data block size is larger than the cache device";
2229                 return -EINVAL;
2230         }
2231
2232         ca->block_size = block_size;
2233
2234         return 0;
2235 }
2236
2237 static void init_features(struct cache_features *cf)
2238 {
2239         cf->mode = CM_WRITE;
2240         cf->io_mode = CM_IO_WRITEBACK;
2241 }
2242
2243 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2244                           char **error)
2245 {
2246         static struct dm_arg _args[] = {
2247                 {0, 1, "Invalid number of cache feature arguments"},
2248         };
2249
2250         int r;
2251         unsigned argc;
2252         const char *arg;
2253         struct cache_features *cf = &ca->features;
2254
2255         init_features(cf);
2256
2257         r = dm_read_arg_group(_args, as, &argc, error);
2258         if (r)
2259                 return -EINVAL;
2260
2261         while (argc--) {
2262                 arg = dm_shift_arg(as);
2263
2264                 if (!strcasecmp(arg, "writeback"))
2265                         cf->io_mode = CM_IO_WRITEBACK;
2266
2267                 else if (!strcasecmp(arg, "writethrough"))
2268                         cf->io_mode = CM_IO_WRITETHROUGH;
2269
2270                 else if (!strcasecmp(arg, "passthrough"))
2271                         cf->io_mode = CM_IO_PASSTHROUGH;
2272
2273                 else {
2274                         *error = "Unrecognised cache feature requested";
2275                         return -EINVAL;
2276                 }
2277         }
2278
2279         return 0;
2280 }
2281
2282 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2283                         char **error)
2284 {
2285         static struct dm_arg _args[] = {
2286                 {0, 1024, "Invalid number of policy arguments"},
2287         };
2288
2289         int r;
2290
2291         if (!at_least_one_arg(as, error))
2292                 return -EINVAL;
2293
2294         ca->policy_name = dm_shift_arg(as);
2295
2296         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2297         if (r)
2298                 return -EINVAL;
2299
2300         ca->policy_argv = (const char **)as->argv;
2301         dm_consume_args(as, ca->policy_argc);
2302
2303         return 0;
2304 }
2305
2306 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2307                             char **error)
2308 {
2309         int r;
2310         struct dm_arg_set as;
2311
2312         as.argc = argc;
2313         as.argv = argv;
2314
2315         r = parse_metadata_dev(ca, &as, error);
2316         if (r)
2317                 return r;
2318
2319         r = parse_cache_dev(ca, &as, error);
2320         if (r)
2321                 return r;
2322
2323         r = parse_origin_dev(ca, &as, error);
2324         if (r)
2325                 return r;
2326
2327         r = parse_block_size(ca, &as, error);
2328         if (r)
2329                 return r;
2330
2331         r = parse_features(ca, &as, error);
2332         if (r)
2333                 return r;
2334
2335         r = parse_policy(ca, &as, error);
2336         if (r)
2337                 return r;
2338
2339         return 0;
2340 }
2341
2342 /*----------------------------------------------------------------*/
2343
2344 static struct kmem_cache *migration_cache;
2345
2346 #define NOT_CORE_OPTION 1
2347
2348 static int process_config_option(struct cache *cache, const char *key, const char *value)
2349 {
2350         unsigned long tmp;
2351
2352         if (!strcasecmp(key, "migration_threshold")) {
2353                 if (kstrtoul(value, 10, &tmp))
2354                         return -EINVAL;
2355
2356                 cache->migration_threshold = tmp;
2357                 return 0;
2358         }
2359
2360         return NOT_CORE_OPTION;
2361 }
2362
2363 static int set_config_value(struct cache *cache, const char *key, const char *value)
2364 {
2365         int r = process_config_option(cache, key, value);
2366
2367         if (r == NOT_CORE_OPTION)
2368                 r = policy_set_config_value(cache->policy, key, value);
2369
2370         if (r)
2371                 DMWARN("bad config value for %s: %s", key, value);
2372
2373         return r;
2374 }
2375
2376 static int set_config_values(struct cache *cache, int argc, const char **argv)
2377 {
2378         int r = 0;
2379
2380         if (argc & 1) {
2381                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2382                 return -EINVAL;
2383         }
2384
2385         while (argc) {
2386                 r = set_config_value(cache, argv[0], argv[1]);
2387                 if (r)
2388                         break;
2389
2390                 argc -= 2;
2391                 argv += 2;
2392         }
2393
2394         return r;
2395 }
2396
2397 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2398                                char **error)
2399 {
2400         struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2401                                                            cache->cache_size,
2402                                                            cache->origin_sectors,
2403                                                            cache->sectors_per_block);
2404         if (IS_ERR(p)) {
2405                 *error = "Error creating cache's policy";
2406                 return PTR_ERR(p);
2407         }
2408         cache->policy = p;
2409
2410         return 0;
2411 }
2412
2413 /*
2414  * We want the discard block size to be at least the size of the cache
2415  * block size and have no more than 2^14 discard blocks across the origin.
2416  */
2417 #define MAX_DISCARD_BLOCKS (1 << 14)
2418
2419 static bool too_many_discard_blocks(sector_t discard_block_size,
2420                                     sector_t origin_size)
2421 {
2422         (void) sector_div(origin_size, discard_block_size);
2423
2424         return origin_size > MAX_DISCARD_BLOCKS;
2425 }
2426
2427 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2428                                              sector_t origin_size)
2429 {
2430         sector_t discard_block_size = cache_block_size;
2431
2432         if (origin_size)
2433                 while (too_many_discard_blocks(discard_block_size, origin_size))
2434                         discard_block_size *= 2;
2435
2436         return discard_block_size;
2437 }
2438
2439 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2440 {
2441         dm_block_t nr_blocks = from_cblock(size);
2442
2443         if (nr_blocks > (1 << 20) && cache->cache_size != size)
2444                 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2445                              "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2446                              "Please consider increasing the cache block size to reduce the overall cache block count.",
2447                              (unsigned long long) nr_blocks);
2448
2449         cache->cache_size = size;
2450 }
2451
2452 #define DEFAULT_MIGRATION_THRESHOLD 2048
2453
2454 static int cache_create(struct cache_args *ca, struct cache **result)
2455 {
2456         int r = 0;
2457         char **error = &ca->ti->error;
2458         struct cache *cache;
2459         struct dm_target *ti = ca->ti;
2460         dm_block_t origin_blocks;
2461         struct dm_cache_metadata *cmd;
2462         bool may_format = ca->features.mode == CM_WRITE;
2463
2464         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2465         if (!cache)
2466                 return -ENOMEM;
2467
2468         cache->ti = ca->ti;
2469         ti->private = cache;
2470         ti->num_flush_bios = 2;
2471         ti->flush_supported = true;
2472
2473         ti->num_discard_bios = 1;
2474         ti->discards_supported = true;
2475         ti->discard_zeroes_data_unsupported = true;
2476         ti->split_discard_bios = false;
2477
2478         cache->features = ca->features;
2479         ti->per_bio_data_size = get_per_bio_data_size(cache);
2480
2481         cache->callbacks.congested_fn = cache_is_congested;
2482         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2483
2484         cache->metadata_dev = ca->metadata_dev;
2485         cache->origin_dev = ca->origin_dev;
2486         cache->cache_dev = ca->cache_dev;
2487
2488         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2489
2490         /* FIXME: factor out this whole section */
2491         origin_blocks = cache->origin_sectors = ca->origin_sectors;
2492         origin_blocks = block_div(origin_blocks, ca->block_size);
2493         cache->origin_blocks = to_oblock(origin_blocks);
2494
2495         cache->sectors_per_block = ca->block_size;
2496         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2497                 r = -EINVAL;
2498                 goto bad;
2499         }
2500
2501         if (ca->block_size & (ca->block_size - 1)) {
2502                 dm_block_t cache_size = ca->cache_sectors;
2503
2504                 cache->sectors_per_block_shift = -1;
2505                 cache_size = block_div(cache_size, ca->block_size);
2506                 set_cache_size(cache, to_cblock(cache_size));
2507         } else {
2508                 cache->sectors_per_block_shift = __ffs(ca->block_size);
2509                 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2510         }
2511
2512         r = create_cache_policy(cache, ca, error);
2513         if (r)
2514                 goto bad;
2515
2516         cache->policy_nr_args = ca->policy_argc;
2517         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2518
2519         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2520         if (r) {
2521                 *error = "Error setting cache policy's config values";
2522                 goto bad;
2523         }
2524
2525         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2526                                      ca->block_size, may_format,
2527                                      dm_cache_policy_get_hint_size(cache->policy));
2528         if (IS_ERR(cmd)) {
2529                 *error = "Error creating metadata object";
2530                 r = PTR_ERR(cmd);
2531                 goto bad;
2532         }
2533         cache->cmd = cmd;
2534
2535         if (passthrough_mode(&cache->features)) {
2536                 bool all_clean;
2537
2538                 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2539                 if (r) {
2540                         *error = "dm_cache_metadata_all_clean() failed";
2541                         goto bad;
2542                 }
2543
2544                 if (!all_clean) {
2545                         *error = "Cannot enter passthrough mode unless all blocks are clean";
2546                         r = -EINVAL;
2547                         goto bad;
2548                 }
2549         }
2550
2551         spin_lock_init(&cache->lock);
2552         bio_list_init(&cache->deferred_bios);
2553         bio_list_init(&cache->deferred_flush_bios);
2554         bio_list_init(&cache->deferred_writethrough_bios);
2555         INIT_LIST_HEAD(&cache->quiesced_migrations);
2556         INIT_LIST_HEAD(&cache->completed_migrations);
2557         INIT_LIST_HEAD(&cache->need_commit_migrations);
2558         atomic_set(&cache->nr_allocated_migrations, 0);
2559         atomic_set(&cache->nr_io_migrations, 0);
2560         init_waitqueue_head(&cache->migration_wait);
2561
2562         init_waitqueue_head(&cache->quiescing_wait);
2563         atomic_set(&cache->quiescing, 0);
2564         atomic_set(&cache->quiescing_ack, 0);
2565
2566         r = -ENOMEM;
2567         atomic_set(&cache->nr_dirty, 0);
2568         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2569         if (!cache->dirty_bitset) {
2570                 *error = "could not allocate dirty bitset";
2571                 goto bad;
2572         }
2573         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2574
2575         cache->discard_block_size =
2576                 calculate_discard_block_size(cache->sectors_per_block,
2577                                              cache->origin_sectors);
2578         cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2579                                                               cache->discard_block_size));
2580         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2581         if (!cache->discard_bitset) {
2582                 *error = "could not allocate discard bitset";
2583                 goto bad;
2584         }
2585         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2586
2587         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2588         if (IS_ERR(cache->copier)) {
2589                 *error = "could not create kcopyd client";
2590                 r = PTR_ERR(cache->copier);
2591                 goto bad;
2592         }
2593
2594         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2595         if (!cache->wq) {
2596                 *error = "could not create workqueue for metadata object";
2597                 goto bad;
2598         }
2599         INIT_WORK(&cache->worker, do_worker);
2600         INIT_DELAYED_WORK(&cache->waker, do_waker);
2601         cache->last_commit_jiffies = jiffies;
2602
2603         cache->prison = dm_bio_prison_create();
2604         if (!cache->prison) {
2605                 *error = "could not create bio prison";
2606                 goto bad;
2607         }
2608
2609         cache->all_io_ds = dm_deferred_set_create();
2610         if (!cache->all_io_ds) {
2611                 *error = "could not create all_io deferred set";
2612                 goto bad;
2613         }
2614
2615         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2616                                                          migration_cache);
2617         if (!cache->migration_pool) {
2618                 *error = "Error creating cache's migration mempool";
2619                 goto bad;
2620         }
2621
2622         cache->need_tick_bio = true;
2623         cache->sized = false;
2624         cache->invalidate = false;
2625         cache->commit_requested = false;
2626         cache->loaded_mappings = false;
2627         cache->loaded_discards = false;
2628
2629         load_stats(cache);
2630
2631         atomic_set(&cache->stats.demotion, 0);
2632         atomic_set(&cache->stats.promotion, 0);
2633         atomic_set(&cache->stats.copies_avoided, 0);
2634         atomic_set(&cache->stats.cache_cell_clash, 0);
2635         atomic_set(&cache->stats.commit_count, 0);
2636         atomic_set(&cache->stats.discard_count, 0);
2637
2638         spin_lock_init(&cache->invalidation_lock);
2639         INIT_LIST_HEAD(&cache->invalidation_requests);
2640
2641         iot_init(&cache->origin_tracker);
2642
2643         *result = cache;
2644         return 0;
2645
2646 bad:
2647         destroy(cache);
2648         return r;
2649 }
2650
2651 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2652 {
2653         unsigned i;
2654         const char **copy;
2655
2656         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2657         if (!copy)
2658                 return -ENOMEM;
2659         for (i = 0; i < argc; i++) {
2660                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2661                 if (!copy[i]) {
2662                         while (i--)
2663                                 kfree(copy[i]);
2664                         kfree(copy);
2665                         return -ENOMEM;
2666                 }
2667         }
2668
2669         cache->nr_ctr_args = argc;
2670         cache->ctr_args = copy;
2671
2672         return 0;
2673 }
2674
2675 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2676 {
2677         int r = -EINVAL;
2678         struct cache_args *ca;
2679         struct cache *cache = NULL;
2680
2681         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2682         if (!ca) {
2683                 ti->error = "Error allocating memory for cache";
2684                 return -ENOMEM;
2685         }
2686         ca->ti = ti;
2687
2688         r = parse_cache_args(ca, argc, argv, &ti->error);
2689         if (r)
2690                 goto out;
2691
2692         r = cache_create(ca, &cache);
2693         if (r)
2694                 goto out;
2695
2696         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2697         if (r) {
2698                 destroy(cache);
2699                 goto out;
2700         }
2701
2702         ti->private = cache;
2703
2704 out:
2705         destroy_cache_args(ca);
2706         return r;
2707 }
2708
2709 static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
2710 {
2711         int r;
2712         dm_oblock_t block = get_bio_block(cache, bio);
2713         size_t pb_data_size = get_per_bio_data_size(cache);
2714         bool can_migrate = false;
2715         bool discarded_block;
2716         struct policy_result lookup_result;
2717         struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2718         struct old_oblock_lock ool;
2719
2720         ool.locker.fn = null_locker;
2721
2722         if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2723                 /*
2724                  * This can only occur if the io goes to a partial block at
2725                  * the end of the origin device.  We don't cache these.
2726                  * Just remap to the origin and carry on.
2727                  */
2728                 remap_to_origin(cache, bio);
2729                 return DM_MAPIO_REMAPPED;
2730         }
2731
2732         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2733                 defer_bio(cache, bio);
2734                 return DM_MAPIO_SUBMITTED;
2735         }
2736
2737         /*
2738          * Check to see if that block is currently migrating.
2739          */
2740         *cell = alloc_prison_cell(cache);
2741         if (!*cell) {
2742                 defer_bio(cache, bio);
2743                 return DM_MAPIO_SUBMITTED;
2744         }
2745
2746         r = bio_detain(cache, block, bio, *cell,
2747                        (cell_free_fn) free_prison_cell,
2748                        cache, cell);
2749         if (r) {
2750                 if (r < 0)
2751                         defer_bio(cache, bio);
2752
2753                 return DM_MAPIO_SUBMITTED;
2754         }
2755
2756         discarded_block = is_discarded_oblock(cache, block);
2757
2758         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2759                        bio, &ool.locker, &lookup_result);
2760         if (r == -EWOULDBLOCK) {
2761                 cell_defer(cache, *cell, true);
2762                 return DM_MAPIO_SUBMITTED;
2763
2764         } else if (r) {
2765                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2766                 cell_defer(cache, *cell, false);
2767                 bio_io_error(bio);
2768                 return DM_MAPIO_SUBMITTED;
2769         }
2770
2771         r = DM_MAPIO_REMAPPED;
2772         switch (lookup_result.op) {
2773         case POLICY_HIT:
2774                 if (passthrough_mode(&cache->features)) {
2775                         if (bio_data_dir(bio) == WRITE) {
2776                                 /*
2777                                  * We need to invalidate this block, so
2778                                  * defer for the worker thread.
2779                                  */
2780                                 cell_defer(cache, *cell, true);
2781                                 r = DM_MAPIO_SUBMITTED;
2782
2783                         } else {
2784                                 inc_miss_counter(cache, bio);
2785                                 remap_to_origin_clear_discard(cache, bio, block);
2786                         }
2787
2788                 } else {
2789                         inc_hit_counter(cache, bio);
2790                         if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2791                             !is_dirty(cache, lookup_result.cblock))
2792                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2793                         else
2794                                 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2795                 }
2796                 break;
2797
2798         case POLICY_MISS:
2799                 inc_miss_counter(cache, bio);
2800                 if (pb->req_nr != 0) {
2801                         /*
2802                          * This is a duplicate writethrough io that is no
2803                          * longer needed because the block has been demoted.
2804                          */
2805                         bio_endio(bio, 0);
2806                         cell_defer(cache, *cell, false);
2807                         r = DM_MAPIO_SUBMITTED;
2808
2809                 } else
2810                         remap_to_origin_clear_discard(cache, bio, block);
2811
2812                 break;
2813
2814         default:
2815                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2816                             (unsigned) lookup_result.op);
2817                 cell_defer(cache, *cell, false);
2818                 bio_io_error(bio);
2819                 r = DM_MAPIO_SUBMITTED;
2820         }
2821
2822         return r;
2823 }
2824
2825 static int cache_map(struct dm_target *ti, struct bio *bio)
2826 {
2827         int r;
2828         struct dm_bio_prison_cell *cell = NULL;
2829         struct cache *cache = ti->private;
2830
2831         r = __cache_map(cache, bio, &cell);
2832         if (r == DM_MAPIO_REMAPPED) {
2833                 accounted_begin(cache, bio);
2834
2835                 if (cell) {
2836                         inc_ds(cache, bio, cell);
2837                         cell_defer(cache, cell, false);
2838                 }
2839         }
2840
2841         return r;
2842 }
2843
2844 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2845 {
2846         struct cache *cache = ti->private;
2847         unsigned long flags;
2848         size_t pb_data_size = get_per_bio_data_size(cache);
2849         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2850
2851         if (pb->tick) {
2852                 policy_tick(cache->policy);
2853
2854                 spin_lock_irqsave(&cache->lock, flags);
2855                 cache->need_tick_bio = true;
2856                 spin_unlock_irqrestore(&cache->lock, flags);
2857         }
2858
2859         check_for_quiesced_migrations(cache, pb);
2860         accounted_complete(cache, bio);
2861
2862         return 0;
2863 }
2864
2865 static int write_dirty_bitset(struct cache *cache)
2866 {
2867         unsigned i, r;
2868
2869         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2870                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2871                                        is_dirty(cache, to_cblock(i)));
2872                 if (r)
2873                         return r;
2874         }
2875
2876         return 0;
2877 }
2878
2879 static int write_discard_bitset(struct cache *cache)
2880 {
2881         unsigned i, r;
2882
2883         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2884                                            cache->discard_nr_blocks);
2885         if (r) {
2886                 DMERR("could not resize on-disk discard bitset");
2887                 return r;
2888         }
2889
2890         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2891                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2892                                          is_discarded(cache, to_dblock(i)));
2893                 if (r)
2894                         return r;
2895         }
2896
2897         return 0;
2898 }
2899
2900 /*
2901  * returns true on success
2902  */
2903 static bool sync_metadata(struct cache *cache)
2904 {
2905         int r1, r2, r3, r4;
2906
2907         r1 = write_dirty_bitset(cache);
2908         if (r1)
2909                 DMERR("could not write dirty bitset");
2910
2911         r2 = write_discard_bitset(cache);
2912         if (r2)
2913                 DMERR("could not write discard bitset");
2914
2915         save_stats(cache);
2916
2917         r3 = dm_cache_write_hints(cache->cmd, cache->policy);
2918         if (r3)
2919                 DMERR("could not write hints");
2920
2921         /*
2922          * If writing the above metadata failed, we still commit, but don't
2923          * set the clean shutdown flag.  This will effectively force every
2924          * dirty bit to be set on reload.
2925          */
2926         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2927         if (r4)
2928                 DMERR("could not write cache metadata.  Data loss may occur.");
2929
2930         return !r1 && !r2 && !r3 && !r4;
2931 }
2932
2933 static void cache_postsuspend(struct dm_target *ti)
2934 {
2935         struct cache *cache = ti->private;
2936
2937         start_quiescing(cache);
2938         wait_for_migrations(cache);
2939         stop_worker(cache);
2940         requeue_deferred_io(cache);
2941         stop_quiescing(cache);
2942
2943         (void) sync_metadata(cache);
2944 }
2945
2946 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2947                         bool dirty, uint32_t hint, bool hint_valid)
2948 {
2949         int r;
2950         struct cache *cache = context;
2951
2952         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2953         if (r)
2954                 return r;
2955
2956         if (dirty)
2957                 set_dirty(cache, oblock, cblock);
2958         else
2959                 clear_dirty(cache, oblock, cblock);
2960
2961         return 0;
2962 }
2963
2964 /*
2965  * The discard block size in the on disk metadata is not
2966  * neccessarily the same as we're currently using.  So we have to
2967  * be careful to only set the discarded attribute if we know it
2968  * covers a complete block of the new size.
2969  */
2970 struct discard_load_info {
2971         struct cache *cache;
2972
2973         /*
2974          * These blocks are sized using the on disk dblock size, rather
2975          * than the current one.
2976          */
2977         dm_block_t block_size;
2978         dm_block_t discard_begin, discard_end;
2979 };
2980
2981 static void discard_load_info_init(struct cache *cache,
2982                                    struct discard_load_info *li)
2983 {
2984         li->cache = cache;
2985         li->discard_begin = li->discard_end = 0;
2986 }
2987
2988 static void set_discard_range(struct discard_load_info *li)
2989 {
2990         sector_t b, e;
2991
2992         if (li->discard_begin == li->discard_end)
2993                 return;
2994
2995         /*
2996          * Convert to sectors.
2997          */
2998         b = li->discard_begin * li->block_size;
2999         e = li->discard_end * li->block_size;
3000
3001         /*
3002          * Then convert back to the current dblock size.
3003          */
3004         b = dm_sector_div_up(b, li->cache->discard_block_size);
3005         sector_div(e, li->cache->discard_block_size);
3006
3007         /*
3008          * The origin may have shrunk, so we need to check we're still in
3009          * bounds.
3010          */
3011         if (e > from_dblock(li->cache->discard_nr_blocks))
3012                 e = from_dblock(li->cache->discard_nr_blocks);
3013
3014         for (; b < e; b++)
3015                 set_discard(li->cache, to_dblock(b));
3016 }
3017
3018 static int load_discard(void *context, sector_t discard_block_size,
3019                         dm_dblock_t dblock, bool discard)
3020 {
3021         struct discard_load_info *li = context;
3022
3023         li->block_size = discard_block_size;
3024
3025         if (discard) {
3026                 if (from_dblock(dblock) == li->discard_end)
3027                         /*
3028                          * We're already in a discard range, just extend it.
3029                          */
3030                         li->discard_end = li->discard_end + 1ULL;
3031
3032                 else {
3033                         /*
3034                          * Emit the old range and start a new one.
3035                          */
3036                         set_discard_range(li);
3037                         li->discard_begin = from_dblock(dblock);
3038                         li->discard_end = li->discard_begin + 1ULL;
3039                 }
3040         } else {
3041                 set_discard_range(li);
3042                 li->discard_begin = li->discard_end = 0;
3043         }
3044
3045         return 0;
3046 }
3047
3048 static dm_cblock_t get_cache_dev_size(struct cache *cache)
3049 {
3050         sector_t size = get_dev_size(cache->cache_dev);
3051         (void) sector_div(size, cache->sectors_per_block);
3052         return to_cblock(size);
3053 }
3054
3055 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
3056 {
3057         if (from_cblock(new_size) > from_cblock(cache->cache_size))
3058                 return true;
3059
3060         /*
3061          * We can't drop a dirty block when shrinking the cache.
3062          */
3063         while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
3064                 new_size = to_cblock(from_cblock(new_size) + 1);
3065                 if (is_dirty(cache, new_size)) {
3066                         DMERR("unable to shrink cache; cache block %llu is dirty",
3067                               (unsigned long long) from_cblock(new_size));
3068                         return false;
3069                 }
3070         }
3071
3072         return true;
3073 }
3074
3075 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3076 {
3077         int r;
3078
3079         r = dm_cache_resize(cache->cmd, new_size);
3080         if (r) {
3081                 DMERR("could not resize cache metadata");
3082                 return r;
3083         }
3084
3085         set_cache_size(cache, new_size);
3086
3087         return 0;
3088 }
3089
3090 static int cache_preresume(struct dm_target *ti)
3091 {
3092         int r = 0;
3093         struct cache *cache = ti->private;
3094         dm_cblock_t csize = get_cache_dev_size(cache);
3095
3096         /*
3097          * Check to see if the cache has resized.
3098          */
3099         if (!cache->sized) {
3100                 r = resize_cache_dev(cache, csize);
3101                 if (r)
3102                         return r;
3103
3104                 cache->sized = true;
3105
3106         } else if (csize != cache->cache_size) {
3107                 if (!can_resize(cache, csize))
3108                         return -EINVAL;
3109
3110                 r = resize_cache_dev(cache, csize);
3111                 if (r)
3112                         return r;
3113         }
3114
3115         if (!cache->loaded_mappings) {
3116                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
3117                                            load_mapping, cache);
3118                 if (r) {
3119                         DMERR("could not load cache mappings");
3120                         return r;
3121                 }
3122
3123                 cache->loaded_mappings = true;
3124         }
3125
3126         if (!cache->loaded_discards) {
3127                 struct discard_load_info li;
3128
3129                 /*
3130                  * The discard bitset could have been resized, or the
3131                  * discard block size changed.  To be safe we start by
3132                  * setting every dblock to not discarded.
3133                  */
3134                 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3135
3136                 discard_load_info_init(cache, &li);
3137                 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3138                 if (r) {
3139                         DMERR("could not load origin discards");
3140                         return r;
3141                 }
3142                 set_discard_range(&li);
3143
3144                 cache->loaded_discards = true;
3145         }
3146
3147         return r;
3148 }
3149
3150 static void cache_resume(struct dm_target *ti)
3151 {
3152         struct cache *cache = ti->private;
3153
3154         cache->need_tick_bio = true;
3155         do_waker(&cache->waker.work);
3156 }
3157
3158 /*
3159  * Status format:
3160  *
3161  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3162  * <cache block size> <#used cache blocks>/<#total cache blocks>
3163  * <#read hits> <#read misses> <#write hits> <#write misses>
3164  * <#demotions> <#promotions> <#dirty>
3165  * <#features> <features>*
3166  * <#core args> <core args>
3167  * <policy name> <#policy args> <policy args>*
3168  */
3169 static void cache_status(struct dm_target *ti, status_type_t type,
3170                          unsigned status_flags, char *result, unsigned maxlen)
3171 {
3172         int r = 0;
3173         unsigned i;
3174         ssize_t sz = 0;
3175         dm_block_t nr_free_blocks_metadata = 0;
3176         dm_block_t nr_blocks_metadata = 0;
3177         char buf[BDEVNAME_SIZE];
3178         struct cache *cache = ti->private;
3179         dm_cblock_t residency;
3180
3181         switch (type) {
3182         case STATUSTYPE_INFO:
3183                 /* Commit to ensure statistics aren't out-of-date */
3184                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
3185                         r = dm_cache_commit(cache->cmd, false);
3186                         if (r)
3187                                 DMERR("could not commit metadata for accurate status");
3188                 }
3189
3190                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
3191                                                            &nr_free_blocks_metadata);
3192                 if (r) {
3193                         DMERR("could not get metadata free block count");
3194                         goto err;
3195                 }
3196
3197                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3198                 if (r) {
3199                         DMERR("could not get metadata device size");
3200                         goto err;
3201                 }
3202
3203                 residency = policy_residency(cache->policy);
3204
3205                 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
3206                        (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3207                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3208                        (unsigned long long)nr_blocks_metadata,
3209                        cache->sectors_per_block,
3210                        (unsigned long long) from_cblock(residency),
3211                        (unsigned long long) from_cblock(cache->cache_size),
3212                        (unsigned) atomic_read(&cache->stats.read_hit),
3213                        (unsigned) atomic_read(&cache->stats.read_miss),
3214                        (unsigned) atomic_read(&cache->stats.write_hit),
3215                        (unsigned) atomic_read(&cache->stats.write_miss),
3216                        (unsigned) atomic_read(&cache->stats.demotion),
3217                        (unsigned) atomic_read(&cache->stats.promotion),
3218                        (unsigned long) atomic_read(&cache->nr_dirty));
3219
3220                 if (writethrough_mode(&cache->features))
3221                         DMEMIT("1 writethrough ");
3222
3223                 else if (passthrough_mode(&cache->features))
3224                         DMEMIT("1 passthrough ");
3225
3226                 else if (writeback_mode(&cache->features))
3227                         DMEMIT("1 writeback ");
3228
3229                 else {
3230                         DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
3231                         goto err;
3232                 }
3233
3234                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3235
3236                 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3237                 if (sz < maxlen) {
3238                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
3239                         if (r)
3240                                 DMERR("policy_emit_config_values returned %d", r);
3241                 }
3242
3243                 break;
3244
3245         case STATUSTYPE_TABLE:
3246                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3247                 DMEMIT("%s ", buf);
3248                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3249                 DMEMIT("%s ", buf);
3250                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3251                 DMEMIT("%s", buf);
3252
3253                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
3254                         DMEMIT(" %s", cache->ctr_args[i]);
3255                 if (cache->nr_ctr_args)
3256                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3257         }
3258
3259         return;
3260
3261 err:
3262         DMEMIT("Error");
3263 }
3264
3265 /*
3266  * A cache block range can take two forms:
3267  *
3268  * i) A single cblock, eg. '3456'
3269  * ii) A begin and end cblock with dots between, eg. 123-234
3270  */
3271 static int parse_cblock_range(struct cache *cache, const char *str,
3272                               struct cblock_range *result)
3273 {
3274         char dummy;
3275         uint64_t b, e;
3276         int r;
3277
3278         /*
3279          * Try and parse form (ii) first.
3280          */
3281         r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3282         if (r < 0)
3283                 return r;
3284
3285         if (r == 2) {
3286                 result->begin = to_cblock(b);
3287                 result->end = to_cblock(e);
3288                 return 0;
3289         }
3290
3291         /*
3292          * That didn't work, try form (i).
3293          */
3294         r = sscanf(str, "%llu%c", &b, &dummy);
3295         if (r < 0)
3296                 return r;
3297
3298         if (r == 1) {
3299                 result->begin = to_cblock(b);
3300                 result->end = to_cblock(from_cblock(result->begin) + 1u);
3301                 return 0;
3302         }
3303
3304         DMERR("invalid cblock range '%s'", str);
3305         return -EINVAL;
3306 }
3307
3308 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3309 {
3310         uint64_t b = from_cblock(range->begin);
3311         uint64_t e = from_cblock(range->end);
3312         uint64_t n = from_cblock(cache->cache_size);
3313
3314         if (b >= n) {
3315                 DMERR("begin cblock out of range: %llu >= %llu", b, n);
3316                 return -EINVAL;
3317         }
3318
3319         if (e > n) {
3320                 DMERR("end cblock out of range: %llu > %llu", e, n);
3321                 return -EINVAL;
3322         }
3323
3324         if (b >= e) {
3325                 DMERR("invalid cblock range: %llu >= %llu", b, e);
3326                 return -EINVAL;
3327         }
3328
3329         return 0;
3330 }
3331
3332 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3333 {
3334         struct invalidation_request req;
3335
3336         INIT_LIST_HEAD(&req.list);
3337         req.cblocks = range;
3338         atomic_set(&req.complete, 0);
3339         req.err = 0;
3340         init_waitqueue_head(&req.result_wait);
3341
3342         spin_lock(&cache->invalidation_lock);
3343         list_add(&req.list, &cache->invalidation_requests);
3344         spin_unlock(&cache->invalidation_lock);
3345         wake_worker(cache);
3346
3347         wait_event(req.result_wait, atomic_read(&req.complete));
3348         return req.err;
3349 }
3350
3351 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3352                                               const char **cblock_ranges)
3353 {
3354         int r = 0;
3355         unsigned i;
3356         struct cblock_range range;
3357
3358         if (!passthrough_mode(&cache->features)) {
3359                 DMERR("cache has to be in passthrough mode for invalidation");
3360                 return -EPERM;
3361         }
3362
3363         for (i = 0; i < count; i++) {
3364                 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3365                 if (r)
3366                         break;
3367
3368                 r = validate_cblock_range(cache, &range);
3369                 if (r)
3370                         break;
3371
3372                 /*
3373                  * Pass begin and end origin blocks to the worker and wake it.
3374                  */
3375                 r = request_invalidation(cache, &range);
3376                 if (r)
3377                         break;
3378         }
3379
3380         return r;
3381 }
3382
3383 /*
3384  * Supports
3385  *      "<key> <value>"
3386  * and
3387  *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
3388  *
3389  * The key migration_threshold is supported by the cache target core.
3390  */
3391 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3392 {
3393         struct cache *cache = ti->private;
3394
3395         if (!argc)
3396                 return -EINVAL;
3397
3398         if (!strcasecmp(argv[0], "invalidate_cblocks"))
3399                 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3400
3401         if (argc != 2)
3402                 return -EINVAL;
3403
3404         return set_config_value(cache, argv[0], argv[1]);
3405 }
3406
3407 static int cache_iterate_devices(struct dm_target *ti,
3408                                  iterate_devices_callout_fn fn, void *data)
3409 {
3410         int r = 0;
3411         struct cache *cache = ti->private;
3412
3413         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3414         if (!r)
3415                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3416
3417         return r;
3418 }
3419
3420 /*
3421  * We assume I/O is going to the origin (which is the volume
3422  * more likely to have restrictions e.g. by being striped).
3423  * (Looking up the exact location of the data would be expensive
3424  * and could always be out of date by the time the bio is submitted.)
3425  */
3426 static int cache_bvec_merge(struct dm_target *ti,
3427                             struct bvec_merge_data *bvm,
3428                             struct bio_vec *biovec, int max_size)
3429 {
3430         struct cache *cache = ti->private;
3431         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
3432
3433         if (!q->merge_bvec_fn)
3434                 return max_size;
3435
3436         bvm->bi_bdev = cache->origin_dev->bdev;
3437         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3438 }
3439
3440 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3441 {
3442         /*
3443          * FIXME: these limits may be incompatible with the cache device
3444          */
3445         limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3446                                             cache->origin_sectors);
3447         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3448 }
3449
3450 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3451 {
3452         struct cache *cache = ti->private;
3453         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3454
3455         /*
3456          * If the system-determined stacked limits are compatible with the
3457          * cache's blocksize (io_opt is a factor) do not override them.
3458          */
3459         if (io_opt_sectors < cache->sectors_per_block ||
3460             do_div(io_opt_sectors, cache->sectors_per_block)) {
3461                 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3462                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3463         }
3464         set_discard_limits(cache, limits);
3465 }
3466
3467 /*----------------------------------------------------------------*/
3468
3469 static struct target_type cache_target = {
3470         .name = "cache",
3471         .version = {1, 6, 0},
3472         .module = THIS_MODULE,
3473         .ctr = cache_ctr,
3474         .dtr = cache_dtr,
3475         .map = cache_map,
3476         .end_io = cache_end_io,
3477         .postsuspend = cache_postsuspend,
3478         .preresume = cache_preresume,
3479         .resume = cache_resume,
3480         .status = cache_status,
3481         .message = cache_message,
3482         .iterate_devices = cache_iterate_devices,
3483         .merge = cache_bvec_merge,
3484         .io_hints = cache_io_hints,
3485 };
3486
3487 static int __init dm_cache_init(void)
3488 {
3489         int r;
3490
3491         r = dm_register_target(&cache_target);
3492         if (r) {
3493                 DMERR("cache target registration failed: %d", r);
3494                 return r;
3495         }
3496
3497         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3498         if (!migration_cache) {
3499                 dm_unregister_target(&cache_target);
3500                 return -ENOMEM;
3501         }
3502
3503         return 0;
3504 }
3505
3506 static void __exit dm_cache_exit(void)
3507 {
3508         dm_unregister_target(&cache_target);
3509         kmem_cache_destroy(migration_cache);
3510 }
3511
3512 module_init(dm_cache_init);
3513 module_exit(dm_cache_exit);
3514
3515 MODULE_DESCRIPTION(DM_NAME " cache target");
3516 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3517 MODULE_LICENSE("GPL");