Merge branch 'dm-4.12' into dm-4.12-post-merge
author Mike Snitzer <snitzer@redhat.com>
Mon, 1 May 2017 22:18:04 +0000 (18:18 -0400)
committer Mike Snitzer <snitzer@redhat.com>
Mon, 1 May 2017 22:18:04 +0000 (18:18 -0400)
12 files changed:
drivers/md/dm-cache-target.c
drivers/md/dm-core.h
drivers/md/dm-crypt.c
drivers/md/dm-linear.c
drivers/md/dm-mpath.c
drivers/md/dm-raid.c
drivers/md/dm-rq.c
drivers/md/dm-stripe.c
drivers/md/dm-table.c
drivers/md/dm-thin.c
drivers/md/dm.c
include/linux/device-mapper.h

index 975922c8f23143c4fcf586714015343f9ed0b41d,6e747fcbdf0f88a4a6c4f6c5e25e3c3fa0795a37..1db375f50a1321aae81492d1b3ac6392c52a00c2
@@@ -5,7 -5,7 +5,7 @@@
   */
  
  #include "dm.h"
- #include "dm-bio-prison.h"
+ #include "dm-bio-prison-v2.h"
  #include "dm-bio-record.h"
  #include "dm-cache-metadata.h"
  
@@@ -15,6 -15,7 +15,7 @@@
  #include <linux/init.h>
  #include <linux/mempool.h>
  #include <linux/module.h>
+ #include <linux/rwsem.h>
  #include <linux/slab.h>
  #include <linux/vmalloc.h>
  
@@@ -25,7 -26,18 +26,18 @@@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_
  
  /*----------------------------------------------------------------*/
  
- #define IOT_RESOLUTION 4
+ /*
+  * Glossary:
+  *
+  * oblock: index of an origin block
+  * cblock: index of a cache block
+  * promotion: movement of a block from origin to cache
+  * demotion: movement of a block from cache to origin
+  * migration: movement of a block between the origin and cache device,
+  *          either direction
+  */
+ /*----------------------------------------------------------------*/
  
  struct io_tracker {
        spinlock_t lock;
@@@ -99,18 -111,177 +111,177 @@@ static void iot_io_end(struct io_tracke
  /*----------------------------------------------------------------*/
  
  /*
-  * Glossary:
-  *
-  * oblock: index of an origin block
-  * cblock: index of a cache block
-  * promotion: movement of a block from origin to cache
-  * demotion: movement of a block from cache to origin
-  * migration: movement of a block between the origin and cache device,
-  *          either direction
+  * Represents a chunk of future work.  'input' allows continuations to pass
+  * values between themselves, typically error values.
   */
+ struct continuation {
+       struct work_struct ws;
+       int input;
+ };
+ static inline void init_continuation(struct continuation *k,
+                                    void (*fn)(struct work_struct *))
+ {
+       INIT_WORK(&k->ws, fn);
+       k->input = 0;
+ }
+ static inline void queue_continuation(struct workqueue_struct *wq,
+                                     struct continuation *k)
+ {
+       queue_work(wq, &k->ws);
+ }
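
A continuation is the glue for the asynchronous pipelines below: each stage records its result in 'input', re-arms the embedded work_struct for the next stage, and queues it. A minimal sketch of the pattern using only the helpers above (struct demo_job, the demo_stage* functions and the use of system_wq are illustrative, not part of this patch):

	/* Illustrative two-stage pipeline built on struct continuation. */
	struct demo_job {
		struct continuation k;
		int id;		/* hypothetical per-job state */
	};

	static void demo_stage2(struct work_struct *ws)
	{
		struct continuation *k = container_of(ws, struct continuation, ws);
		struct demo_job *job = container_of(k, struct demo_job, k);

		if (k->input)	/* error propagated from the previous stage */
			pr_err("demo job %d failed: %d\n", job->id, k->input);
	}

	static void demo_stage1(struct work_struct *ws)
	{
		struct continuation *k = container_of(ws, struct continuation, ws);

		init_continuation(k, demo_stage2);	/* re-arm for the next stage */
		k->input = 0;				/* or an errno for stage 2 to act on */
		queue_continuation(system_wq, k);	/* hand off to the workqueue */
	}

A job starts the same way: init_continuation(&job->k, demo_stage1) followed by queue_continuation(wq, &job->k).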
  
  /*----------------------------------------------------------------*/
  
+ /*
+  * The batcher collects together pieces of work that need a particular
+  * operation to occur before they can proceed (typically a commit).
+  */
+ struct batcher {
+       /*
+        * The operation that everyone is waiting for.
+        */
+       int (*commit_op)(void *context);
+       void *commit_context;
+       /*
+        * This is how bios should be issued once the commit op is complete
+        * (accounted_request).
+        */
+       void (*issue_op)(struct bio *bio, void *context);
+       void *issue_context;
+       /*
+        * Queued work gets put on here after commit.
+        */
+       struct workqueue_struct *wq;
+       spinlock_t lock;
+       struct list_head work_items;
+       struct bio_list bios;
+       struct work_struct commit_work;
+       bool commit_scheduled;
+ };
+ static void __commit(struct work_struct *_ws)
+ {
+       struct batcher *b = container_of(_ws, struct batcher, commit_work);
+       int r;
+       unsigned long flags;
+       struct list_head work_items;
+       struct work_struct *ws, *tmp;
+       struct continuation *k;
+       struct bio *bio;
+       struct bio_list bios;
+       INIT_LIST_HEAD(&work_items);
+       bio_list_init(&bios);
+       /*
+        * We have to grab these before the commit_op to avoid a race
+        * condition.
+        */
+       spin_lock_irqsave(&b->lock, flags);
+       list_splice_init(&b->work_items, &work_items);
+       bio_list_merge(&bios, &b->bios);
+       bio_list_init(&b->bios);
+       b->commit_scheduled = false;
+       spin_unlock_irqrestore(&b->lock, flags);
+       r = b->commit_op(b->commit_context);
+       list_for_each_entry_safe(ws, tmp, &work_items, entry) {
+               k = container_of(ws, struct continuation, ws);
+               k->input = r;
+               INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
+               queue_work(b->wq, ws);
+       }
+       while ((bio = bio_list_pop(&bios))) {
+               if (r) {
+                       bio->bi_error = r;
+                       bio_endio(bio);
+               } else
+                       b->issue_op(bio, b->issue_context);
+       }
+ }
+ static void batcher_init(struct batcher *b,
+                        int (*commit_op)(void *),
+                        void *commit_context,
+                        void (*issue_op)(struct bio *bio, void *),
+                        void *issue_context,
+                        struct workqueue_struct *wq)
+ {
+       b->commit_op = commit_op;
+       b->commit_context = commit_context;
+       b->issue_op = issue_op;
+       b->issue_context = issue_context;
+       b->wq = wq;
+       spin_lock_init(&b->lock);
+       INIT_LIST_HEAD(&b->work_items);
+       bio_list_init(&b->bios);
+       INIT_WORK(&b->commit_work, __commit);
+       b->commit_scheduled = false;
+ }
+ static void async_commit(struct batcher *b)
+ {
+       queue_work(b->wq, &b->commit_work);
+ }
+ static void continue_after_commit(struct batcher *b, struct continuation *k)
+ {
+       unsigned long flags;
+       bool commit_scheduled;
+       spin_lock_irqsave(&b->lock, flags);
+       commit_scheduled = b->commit_scheduled;
+       list_add_tail(&k->ws.entry, &b->work_items);
+       spin_unlock_irqrestore(&b->lock, flags);
+       if (commit_scheduled)
+               async_commit(b);
+ }
+ /*
+  * Bios are errored if the commit fails.
+  */
+ static void issue_after_commit(struct batcher *b, struct bio *bio)
+ {
+       unsigned long flags;
+       bool commit_scheduled;
+       spin_lock_irqsave(&b->lock, flags);
+       commit_scheduled = b->commit_scheduled;
+       bio_list_add(&b->bios, bio);
+       spin_unlock_irqrestore(&b->lock, flags);
+       if (commit_scheduled)
+               async_commit(b);
+ }
+ /*
+  * Call this if some urgent work is waiting for the commit to complete.
+  */
+ static void schedule_commit(struct batcher *b)
+ {
+       bool immediate;
+       unsigned long flags;
+       spin_lock_irqsave(&b->lock, flags);
+       immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
+       b->commit_scheduled = true;
+       spin_unlock_irqrestore(&b->lock, flags);
+       if (immediate)
+               async_commit(b);
+ }
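
Putting the batcher together: a caller registers a commit callback and a bio-issue callback once, then parks continuations and bios on it; schedule_commit() kicks __commit(), which runs the commit exactly once and releases everything that was waiting on it. A hypothetical wiring sketch (my_commit(), my_issue() and demo_batcher_setup() are illustrative stand-ins for the cache target's real callbacks):

	/* Illustrative wiring; the cache target supplies its own callbacks. */
	static int my_commit(void *context)
	{
		/* persist metadata; a non-zero return errors every waiting bio/work item */
		return 0;
	}

	static void my_issue(struct bio *bio, void *context)
	{
		generic_make_request(bio);	/* how bios proceed after a clean commit */
	}

	static void demo_batcher_setup(struct batcher *b, struct workqueue_struct *wq)
	{
		batcher_init(b, my_commit, NULL, my_issue, NULL, wq);
	}

	/*
	 * Per unit of work:
	 *   continue_after_commit(b, &k);  requeue continuation k once the commit is done
	 *   issue_after_commit(b, bio);    issue bio (or error it) once the commit is done
	 *   schedule_commit(b);            something is now waiting, commit soon
	 */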
  /*
   * There are a couple of places where we let a bio run, but want to do some
   * work before calling its endio function.  We do this by temporarily
@@@ -189,31 -360,13 +360,13 @@@ struct cache_stats 
        atomic_t write_miss;
        atomic_t demotion;
        atomic_t promotion;
+       atomic_t writeback;
        atomic_t copies_avoided;
        atomic_t cache_cell_clash;
        atomic_t commit_count;
        atomic_t discard_count;
  };
  
- /*
-  * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
-  * the one-past-the-end value.
-  */
- struct cblock_range {
-       dm_cblock_t begin;
-       dm_cblock_t end;
- };
- struct invalidation_request {
-       struct list_head list;
-       struct cblock_range *cblocks;
-       atomic_t complete;
-       int err;
-       wait_queue_head_t result_wait;
- };
  struct cache {
        struct dm_target *ti;
        struct dm_target_callbacks callbacks;
        spinlock_t lock;
        struct list_head deferred_cells;
        struct bio_list deferred_bios;
-       struct bio_list deferred_flush_bios;
        struct bio_list deferred_writethrough_bios;
-       struct list_head quiesced_migrations;
-       struct list_head completed_migrations;
-       struct list_head need_commit_migrations;
        sector_t migration_threshold;
        wait_queue_head_t migration_wait;
        atomic_t nr_allocated_migrations;
         */
        atomic_t nr_io_migrations;
  
-       wait_queue_head_t quiescing_wait;
-       atomic_t quiescing;
-       atomic_t quiescing_ack;
+       struct rw_semaphore quiesce_lock;
  
        /*
         * cache_size entries, dirty if set
  
        struct dm_kcopyd_client *copier;
        struct workqueue_struct *wq;
-       struct work_struct worker;
+       struct work_struct deferred_bio_worker;
+       struct work_struct deferred_writethrough_worker;
+       struct work_struct migration_worker;
        struct delayed_work waker;
-       unsigned long last_commit_jiffies;
-       struct dm_bio_prison *prison;
-       struct dm_deferred_set *all_io_ds;
+       struct dm_bio_prison_v2 *prison;
  
        mempool_t *migration_pool;
  
        struct list_head invalidation_requests;
  
        struct io_tracker origin_tracker;
+       struct work_struct commit_ws;
+       struct batcher committer;
+       struct rw_semaphore background_work_lock;
  };
  
  struct per_bio_data {
        bool tick:1;
        unsigned req_nr:2;
-       struct dm_deferred_entry *all_io_entry;
+       struct dm_bio_prison_cell_v2 *cell;
        struct dm_hook_info hook_info;
        sector_t len;
  
  };
  
  struct dm_cache_migration {
-       struct list_head list;
+       struct continuation k;
        struct cache *cache;
  
-       unsigned long start_jiffies;
-       dm_oblock_t old_oblock;
-       dm_oblock_t new_oblock;
-       dm_cblock_t cblock;
-       bool err:1;
-       bool discard:1;
-       bool writeback:1;
-       bool demote:1;
-       bool promote:1;
-       bool requeue_holder:1;
-       bool invalidate:1;
+       struct policy_work *op;
+       struct bio *overwrite_bio;
+       struct dm_bio_prison_cell_v2 *cell;
  
-       struct dm_bio_prison_cell *old_ocell;
-       struct dm_bio_prison_cell *new_ocell;
+       dm_cblock_t invalidate_cblock;
+       dm_oblock_t invalidate_oblock;
  };
  
- /*
-  * Processing a bio in the worker thread may require these memory
-  * allocations.  We prealloc to avoid deadlocks (the same worker thread
-  * frees them back to the mempool).
-  */
- struct prealloc {
-       struct dm_cache_migration *mg;
-       struct dm_bio_prison_cell *cell1;
-       struct dm_bio_prison_cell *cell2;
- };
+ /*----------------------------------------------------------------*/
+ static bool writethrough_mode(struct cache_features *f)
+ {
+       return f->io_mode == CM_IO_WRITETHROUGH;
+ }
+ static bool writeback_mode(struct cache_features *f)
+ {
+       return f->io_mode == CM_IO_WRITEBACK;
+ }
+ static inline bool passthrough_mode(struct cache_features *f)
+ {
+       return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
+ }
+ /*----------------------------------------------------------------*/
+ static void wake_deferred_bio_worker(struct cache *cache)
+ {
+       queue_work(cache->wq, &cache->deferred_bio_worker);
+ }
  
- static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+ static void wake_deferred_writethrough_worker(struct cache *cache)
+ {
+       queue_work(cache->wq, &cache->deferred_writethrough_worker);
+ }
  
- static void wake_worker(struct cache *cache)
+ static void wake_migration_worker(struct cache *cache)
  {
-       queue_work(cache->wq, &cache->worker);
+       if (passthrough_mode(&cache->features))
+               return;
+       queue_work(cache->wq, &cache->migration_worker);
  }
  
  /*----------------------------------------------------------------*/
  
- static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+ static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
  {
-       /* FIXME: change to use a local slab. */
-       return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+       return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
  }
  
- static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+ static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
  {
-       dm_bio_prison_free_cell(cache->prison, cell);
+       dm_bio_prison_free_cell_v2(cache->prison, cell);
  }
  
  static struct dm_cache_migration *alloc_migration(struct cache *cache)
@@@ -424,146 -583,127 +583,127 @@@ static void free_migration(struct dm_ca
        mempool_free(mg, cache->migration_pool);
  }
  
- static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
- {
-       if (!p->mg) {
-               p->mg = alloc_migration(cache);
-               if (!p->mg)
-                       return -ENOMEM;
-       }
-       if (!p->cell1) {
-               p->cell1 = alloc_prison_cell(cache);
-               if (!p->cell1)
-                       return -ENOMEM;
-       }
-       if (!p->cell2) {
-               p->cell2 = alloc_prison_cell(cache);
-               if (!p->cell2)
-                       return -ENOMEM;
-       }
-       return 0;
- }
+ /*----------------------------------------------------------------*/
  
- static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+ static inline dm_oblock_t oblock_succ(dm_oblock_t b)
  {
-       if (p->cell2)
-               free_prison_cell(cache, p->cell2);
-       if (p->cell1)
-               free_prison_cell(cache, p->cell1);
-       if (p->mg)
-               free_migration(p->mg);
+       return to_oblock(from_oblock(b) + 1ull);
  }
  
- static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+ static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
  {
-       struct dm_cache_migration *mg = p->mg;
-       BUG_ON(!mg);
-       p->mg = NULL;
-       return mg;
+       key->virtual = 0;
+       key->dev = 0;
+       key->block_begin = from_oblock(begin);
+       key->block_end = from_oblock(end);
  }
  
  /*
-  * You must have a cell within the prealloc struct to return.  If not this
-  * function will BUG() rather than returning NULL.
+  * We have two lock levels: level 0, which prevents WRITEs, and level 1,
+  * which prevents *both* READs and WRITEs.
   */
- static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+ #define WRITE_LOCK_LEVEL 0
+ #define READ_WRITE_LOCK_LEVEL 1
+ static unsigned lock_level(struct bio *bio)
  {
-       struct dm_bio_prison_cell *r = NULL;
+       return bio_data_dir(bio) == WRITE ?
+               WRITE_LOCK_LEVEL :
+               READ_WRITE_LOCK_LEVEL;
+ }
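
In practice a bio's shared lock at lock_level(bio) means reads only conflict with a level 1 exclusive lock, while writes conflict with either level. Migrations exploit this: mg_lock_writes() below takes the exclusive lock at WRITE_LOCK_LEVEL so reads keep flowing while the block is copied, and mg_upgrade_lock() promotes it to READ_WRITE_LOCK_LEVEL only for the brief metadata update.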
  
-       if (p->cell1) {
-               r = p->cell1;
-               p->cell1 = NULL;
+ /*----------------------------------------------------------------
+  * Per bio data
+  *--------------------------------------------------------------*/
  
-       } else if (p->cell2) {
-               r = p->cell2;
-               p->cell2 = NULL;
-       } else
-               BUG();
+ /*
+  * If using writeback, leave out struct per_bio_data's writethrough fields.
+  */
+ #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+ #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
  
-       return r;
+ static size_t get_per_bio_data_size(struct cache *cache)
+ {
+       return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
  }
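
The size trick above relies on 'cache' being the first of the writethrough-only members of struct per_bio_data, so offsetof() yields a size that simply omits that tail. A standalone userspace illustration of the same idiom (struct pb_demo and its members are hypothetical, not the cache's actual layout):

	#include <stddef.h>
	#include <stdio.h>

	struct pb_demo {
		int always_used;
		/* members below are only needed in one mode */
		void *mode_only_ptr;
		long mode_only_len;
	};

	#define PB_DEMO_SMALL (offsetof(struct pb_demo, mode_only_ptr))
	#define PB_DEMO_FULL  (sizeof(struct pb_demo))

	int main(void)
	{
		/* The small size covers 'always_used' but none of the tail members. */
		printf("small=%zu full=%zu\n", PB_DEMO_SMALL, PB_DEMO_FULL);
		return 0;
	}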
  
- /*
-  * You can't have more than two cells in a prealloc struct.  BUG() will be
-  * called if you try and overfill.
-  */
- static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+ static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
  {
-       if (!p->cell2)
-               p->cell2 = cell;
+       struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
+       BUG_ON(!pb);
+       return pb;
+ }
  
-       else if (!p->cell1)
-               p->cell1 = cell;
+ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+ {
+       struct per_bio_data *pb = get_per_bio_data(bio, data_size);
  
-       else
-               BUG();
+       pb->tick = false;
+       pb->req_nr = dm_bio_get_target_bio_nr(bio);
+       pb->cell = NULL;
+       pb->len = 0;
+       return pb;
  }
  
  /*----------------------------------------------------------------*/
  
- static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
+ static void defer_bio(struct cache *cache, struct bio *bio)
  {
-       key->virtual = 0;
-       key->dev = 0;
-       key->block_begin = from_oblock(begin);
-       key->block_end = from_oblock(end);
- }
+       unsigned long flags;
  
- /*
-  * The caller hands in a preallocated cell, and a free function for it.
-  * The cell will be freed if there's an error, or if it wasn't used because
-  * a cell with that key already exists.
-  */
- typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+       spin_lock_irqsave(&cache->lock, flags);
+       bio_list_add(&cache->deferred_bios, bio);
+       spin_unlock_irqrestore(&cache->lock, flags);
+       wake_deferred_bio_worker(cache);
+ }
  
- static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
-                           struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
-                           cell_free_fn free_fn, void *free_context,
-                           struct dm_bio_prison_cell **cell_result)
+ static void defer_bios(struct cache *cache, struct bio_list *bios)
  {
-       int r;
-       struct dm_cell_key key;
+       unsigned long flags;
  
-       build_key(oblock_begin, oblock_end, &key);
-       r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
-       if (r)
-               free_fn(free_context, cell_prealloc);
+       spin_lock_irqsave(&cache->lock, flags);
+       bio_list_merge(&cache->deferred_bios, bios);
+       bio_list_init(bios);
+       spin_unlock_irqrestore(&cache->lock, flags);
  
-       return r;
+       wake_deferred_bio_worker(cache);
  }
  
- static int bio_detain(struct cache *cache, dm_oblock_t oblock,
-                     struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
-                     cell_free_fn free_fn, void *free_context,
-                     struct dm_bio_prison_cell **cell_result)
+ /*----------------------------------------------------------------*/
+ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
  {
+       bool r;
+       size_t pb_size;
+       struct per_bio_data *pb;
+       struct dm_cell_key_v2 key;
        dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
-       return bio_detain_range(cache, oblock, end, bio,
-                               cell_prealloc, free_fn, free_context, cell_result);
- }
+       struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
  
- static int get_cell(struct cache *cache,
-                   dm_oblock_t oblock,
-                   struct prealloc *structs,
-                   struct dm_bio_prison_cell **cell_result)
- {
-       int r;
-       struct dm_cell_key key;
-       struct dm_bio_prison_cell *cell_prealloc;
+       cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
+       if (!cell_prealloc) {
+               defer_bio(cache, bio);
+               return false;
+       }
+       build_key(oblock, end, &key);
+       r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
+       if (!r) {
+               /*
+                * Failed to get the lock.
+                */
+               free_prison_cell(cache, cell_prealloc);
+               return r;
+       }
  
-       cell_prealloc = prealloc_get_cell(structs);
+       if (cell != cell_prealloc)
+               free_prison_cell(cache, cell_prealloc);
  
-       build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
-       r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
-       if (r)
-               prealloc_put_cell(structs, cell_prealloc);
+       pb_size = get_per_bio_data_size(cache);
+       pb = get_per_bio_data(bio, pb_size);
+       pb->cell = cell;
  
        return r;
  }
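
bio_detain_shared() is one half of the per-bio locking protocol: it takes a shared lock on the block at the bio's level and stashes the cell in the per-bio data; the matching release is bio_drop_shared_lock() further down, driven from the endio path. A sketch of the map-side usage (the map function itself sits outside this hunk):

	/*
	 *	if (!bio_detain_shared(cache, block, bio))
	 *		return DM_MAPIO_SUBMITTED;	(bio deferred or parked in
	 *						the cell; it is reissued later)
	 *
	 *	... remap the bio and return DM_MAPIO_REMAPPED ...
	 */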
@@@ -575,21 -715,33 +715,33 @@@ static bool is_dirty(struct cache *cach
        return test_bit(from_cblock(b), cache->dirty_bitset);
  }
  
- static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+ static void set_dirty(struct cache *cache, dm_cblock_t cblock)
  {
        if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
                atomic_inc(&cache->nr_dirty);
-               policy_set_dirty(cache->policy, oblock);
+               policy_set_dirty(cache->policy, cblock);
        }
  }
  
- static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+ /*
+  * These two are called after migrations to force the policy and the
+  * dirty bitset back into sync.
+  */
+ static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
+ {
+       if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
+               atomic_inc(&cache->nr_dirty);
+       policy_set_dirty(cache->policy, cblock);
+ }
+ static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
  {
        if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
-               policy_clear_dirty(cache->policy, oblock);
                if (atomic_dec_return(&cache->nr_dirty) == 0)
                        dm_table_event(cache->ti->table);
        }
+       policy_clear_dirty(cache->policy, cblock);
  }
  
  /*----------------------------------------------------------------*/
@@@ -628,11 -780,6 +780,6 @@@ static dm_dblock_t oblock_to_dblock(str
                                   oblocks_per_dblock(cache)));
  }
  
- static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
- {
-       return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
- }
  static void set_discard(struct cache *cache, dm_dblock_t b)
  {
        unsigned long flags;
@@@ -679,89 -826,12 +826,12 @@@ static bool is_discarded_oblock(struct 
        return r;
  }
  
- /*----------------------------------------------------------------*/
- static void load_stats(struct cache *cache)
+ /*----------------------------------------------------------------
+  * Remapping
+  *--------------------------------------------------------------*/
+ static void remap_to_origin(struct cache *cache, struct bio *bio)
  {
-       struct dm_cache_statistics stats;
-       dm_cache_metadata_get_stats(cache->cmd, &stats);
-       atomic_set(&cache->stats.read_hit, stats.read_hits);
-       atomic_set(&cache->stats.read_miss, stats.read_misses);
-       atomic_set(&cache->stats.write_hit, stats.write_hits);
-       atomic_set(&cache->stats.write_miss, stats.write_misses);
- }
- static void save_stats(struct cache *cache)
- {
-       struct dm_cache_statistics stats;
-       if (get_cache_mode(cache) >= CM_READ_ONLY)
-               return;
-       stats.read_hits = atomic_read(&cache->stats.read_hit);
-       stats.read_misses = atomic_read(&cache->stats.read_miss);
-       stats.write_hits = atomic_read(&cache->stats.write_hit);
-       stats.write_misses = atomic_read(&cache->stats.write_miss);
-       dm_cache_metadata_set_stats(cache->cmd, &stats);
- }
- /*----------------------------------------------------------------
-  * Per bio data
-  *--------------------------------------------------------------*/
- /*
-  * If using writeback, leave out struct per_bio_data's writethrough fields.
-  */
- #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
- #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
- static bool writethrough_mode(struct cache_features *f)
- {
-       return f->io_mode == CM_IO_WRITETHROUGH;
- }
- static bool writeback_mode(struct cache_features *f)
- {
-       return f->io_mode == CM_IO_WRITEBACK;
- }
- static bool passthrough_mode(struct cache_features *f)
- {
-       return f->io_mode == CM_IO_PASSTHROUGH;
- }
- static size_t get_per_bio_data_size(struct cache *cache)
- {
-       return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
- }
- static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
- {
-       struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
-       BUG_ON(!pb);
-       return pb;
- }
- static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
- {
-       struct per_bio_data *pb = get_per_bio_data(bio, data_size);
-       pb->tick = false;
-       pb->req_nr = dm_bio_get_target_bio_nr(bio);
-       pb->all_io_entry = NULL;
-       pb->len = 0;
-       return pb;
- }
- /*----------------------------------------------------------------
-  * Remapping
-  *--------------------------------------------------------------*/
- static void remap_to_origin(struct cache *cache, struct bio *bio)
- {
-       bio->bi_bdev = cache->origin_dev->bdev;
+       bio->bi_bdev = cache->origin_dev->bdev;
  }
  
  static void remap_to_cache(struct cache *cache, struct bio *bio,
@@@ -797,8 -867,9 +867,9 @@@ static void check_if_tick_bio_needed(st
  }
  
  static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
-                                 dm_oblock_t oblock)
+                                         dm_oblock_t oblock)
  {
+       // FIXME: this is called way too much.
        check_if_tick_bio_needed(cache, bio);
        remap_to_origin(cache, bio);
        if (bio_data_dir(bio) == WRITE)
@@@ -811,7 -882,7 +882,7 @@@ static void remap_to_cache_dirty(struc
        check_if_tick_bio_needed(cache, bio);
        remap_to_cache(cache, bio, cblock);
        if (bio_data_dir(bio) == WRITE) {
-               set_dirty(cache, oblock, cblock);
+               set_dirty(cache, cblock);
                clear_discard(cache, oblock_to_dblock(cache, oblock));
        }
  }
@@@ -828,22 -899,6 +899,6 @@@ static dm_oblock_t get_bio_block(struc
        return to_oblock(block_nr);
  }
  
- /*
-  * You must increment the deferred set whilst the prison cell is held.  To
-  * encourage this, we ask for 'cell' to be passed in.
-  */
- static void inc_ds(struct cache *cache, struct bio *bio,
-                  struct dm_bio_prison_cell *cell)
- {
-       size_t pb_data_size = get_per_bio_data_size(cache);
-       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
-       BUG_ON(!cell);
-       BUG_ON(pb->all_io_entry);
-       pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
- }
  static bool accountable_bio(struct cache *cache, struct bio *bio)
  {
        return ((bio->bi_bdev == cache->origin_dev->bdev) &&
@@@ -875,29 -930,10 +930,10 @@@ static void accounted_request(struct ca
        generic_make_request(bio);
  }
  
- static void issue(struct cache *cache, struct bio *bio)
- {
-       unsigned long flags;
-       if (!op_is_flush(bio->bi_opf)) {
-               accounted_request(cache, bio);
-               return;
-       }
-       /*
-        * Batch together any bios that trigger commits and then issue a
-        * single commit for them in do_worker().
-        */
-       spin_lock_irqsave(&cache->lock, flags);
-       cache->commit_requested = true;
-       bio_list_add(&cache->deferred_flush_bios, bio);
-       spin_unlock_irqrestore(&cache->lock, flags);
- }
- static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+ static void issue_op(struct bio *bio, void *context)
  {
-       inc_ds(cache, bio, cell);
-       issue(cache, bio);
+       struct cache *cache = context;
+       accounted_request(cache, bio);
  }
  
  static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
        bio_list_add(&cache->deferred_writethrough_bios, bio);
        spin_unlock_irqrestore(&cache->lock, flags);
  
-       wake_worker(cache);
+       wake_deferred_writethrough_worker(cache);
  }
  
  static void writethrough_endio(struct bio *bio)
  }
  
  /*
+  * FIXME: send in parallel, huge latency as is.
   * When running in writethrough mode we need to send writes to clean blocks
   * to both the cache and origin devices.  In future we'd like to clone the
   * bio and send them in parallel, but for now we're doing them in
@@@ -1046,12 -1083,58 +1083,58 @@@ static void metadata_operation_failed(s
        set_cache_mode(cache, CM_READ_ONLY);
  }
  
+ /*----------------------------------------------------------------*/
+ static void load_stats(struct cache *cache)
+ {
+       struct dm_cache_statistics stats;
+       dm_cache_metadata_get_stats(cache->cmd, &stats);
+       atomic_set(&cache->stats.read_hit, stats.read_hits);
+       atomic_set(&cache->stats.read_miss, stats.read_misses);
+       atomic_set(&cache->stats.write_hit, stats.write_hits);
+       atomic_set(&cache->stats.write_miss, stats.write_misses);
+ }
+ static void save_stats(struct cache *cache)
+ {
+       struct dm_cache_statistics stats;
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return;
+       stats.read_hits = atomic_read(&cache->stats.read_hit);
+       stats.read_misses = atomic_read(&cache->stats.read_miss);
+       stats.write_hits = atomic_read(&cache->stats.write_hit);
+       stats.write_misses = atomic_read(&cache->stats.write_miss);
+       dm_cache_metadata_set_stats(cache->cmd, &stats);
+ }
+ static void update_stats(struct cache_stats *stats, enum policy_operation op)
+ {
+       switch (op) {
+       case POLICY_PROMOTE:
+               atomic_inc(&stats->promotion);
+               break;
+       case POLICY_DEMOTE:
+               atomic_inc(&stats->demotion);
+               break;
+       case POLICY_WRITEBACK:
+               atomic_inc(&stats->writeback);
+               break;
+       }
+ }
  /*----------------------------------------------------------------
   * Migration processing
   *
   * Migration covers moving data from the origin device to the cache, or
   * vice versa.
   *--------------------------------------------------------------*/
  static void inc_io_migrations(struct cache *cache)
  {
        atomic_inc(&cache->nr_io_migrations);
@@@ -1067,213 -1150,109 +1150,109 @@@ static bool discard_or_flush(struct bi
        return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
  }
  
- static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
- {
-       if (discard_or_flush(cell->holder)) {
-               /*
-                * We have to handle these bios individually.
-                */
-               dm_cell_release(cache->prison, cell, &cache->deferred_bios);
-               free_prison_cell(cache, cell);
-       } else
-               list_add_tail(&cell->user_list, &cache->deferred_cells);
- }
- static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
+ static void calc_discard_block_range(struct cache *cache, struct bio *bio,
+                                    dm_dblock_t *b, dm_dblock_t *e)
  {
-       unsigned long flags;
-       if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
-               /*
-                * There was no prisoner to promote to holder, the
-                * cell has been released.
-                */
-               free_prison_cell(cache, cell);
-               return;
-       }
+       sector_t sb = bio->bi_iter.bi_sector;
+       sector_t se = bio_end_sector(bio);
  
-       spin_lock_irqsave(&cache->lock, flags);
-       __cell_defer(cache, cell);
-       spin_unlock_irqrestore(&cache->lock, flags);
+       *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
  
-       wake_worker(cache);
+       if (se - sb < cache->discard_block_size)
+               *e = *b;
+       else
+               *e = to_dblock(block_div(se, cache->discard_block_size));
  }
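
For illustration: with a discard_block_size of 128 sectors, a discard covering sectors [100, 300) gives b = dm_sector_div_up(100, 128) = 1 and e = block_div(300, 128) = 2, i.e. only discard block 1 (sectors [128, 256)), the single block wholly covered by the bio. Begin rounds up and end rounds down so partially covered blocks are never marked, and a bio shorter than one discard block takes the se - sb < discard_block_size branch and marks nothing (e = b).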
  
- static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
- {
-       dm_cell_error(cache->prison, cell, err);
-       free_prison_cell(cache, cell);
- }
+ /*----------------------------------------------------------------*/
  
- static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+ static void prevent_background_work(struct cache *cache)
  {
-       cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+       lockdep_off();
+       down_write(&cache->background_work_lock);
+       lockdep_on();
  }
  
- static void free_io_migration(struct dm_cache_migration *mg)
+ static void allow_background_work(struct cache *cache)
  {
-       struct cache *cache = mg->cache;
-       dec_io_migrations(cache);
-       free_migration(mg);
-       wake_worker(cache);
+       lockdep_off();
+       up_write(&cache->background_work_lock);
+       lockdep_on();
  }
  
- static void migration_failure(struct dm_cache_migration *mg)
+ static bool background_work_begin(struct cache *cache)
  {
-       struct cache *cache = mg->cache;
-       const char *dev_name = cache_device_name(cache);
-       if (mg->writeback) {
-               DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
-               set_dirty(cache, mg->old_oblock, mg->cblock);
-               cell_defer(cache, mg->old_ocell, false);
-       } else if (mg->demote) {
-               DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
-               policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+       bool r;
  
-               cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
-               if (mg->promote)
-                       cell_defer(cache, mg->new_ocell, true);
-       } else {
-               DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
-               policy_remove_mapping(cache->policy, mg->new_oblock);
-               cell_defer(cache, mg->new_ocell, true);
-       }
+       lockdep_off();
+       r = down_read_trylock(&cache->background_work_lock);
+       lockdep_on();
  
-       free_io_migration(mg);
+       return r;
  }
  
- static void migration_success_pre_commit(struct dm_cache_migration *mg)
+ static void background_work_end(struct cache *cache)
  {
-       int r;
-       unsigned long flags;
-       struct cache *cache = mg->cache;
-       if (mg->writeback) {
-               clear_dirty(cache, mg->old_oblock, mg->cblock);
-               cell_defer(cache, mg->old_ocell, false);
-               free_io_migration(mg);
-               return;
+       lockdep_off();
+       up_read(&cache->background_work_lock);
+       lockdep_on();
+ }
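
The rwsem here is a gate rather than a data lock, hence the lockdep_off()/lockdep_on() wrapping to keep lockdep out of this unconventional usage. The migration side of the protocol is visible in mg_start() further down; the exact control-path callers sit outside this hunk:

	/*
	 * Migration side (see mg_start() below):
	 *
	 *	if (!background_work_begin(cache))
	 *		return -EPERM;          background work is fenced off
	 *	... perform the migration ...
	 *	background_work_end(cache);
	 *
	 * Control side (outside this hunk): prevent_background_work() waits for
	 * in-flight migrations to call background_work_end() and blocks new ones
	 * until allow_background_work() is called.
	 */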
  
-       } else if (mg->demote) {
-               r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
-               if (r) {
-                       DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
-                                   cache_device_name(cache));
-                       metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
-                       policy_force_mapping(cache->policy, mg->new_oblock,
-                                            mg->old_oblock);
-                       if (mg->promote)
-                               cell_defer(cache, mg->new_ocell, true);
-                       free_io_migration(mg);
-                       return;
-               }
-       } else {
-               r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
-               if (r) {
-                       DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
-                                   cache_device_name(cache));
-                       metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
-                       policy_remove_mapping(cache->policy, mg->new_oblock);
-                       free_io_migration(mg);
-                       return;
-               }
-       }
+ /*----------------------------------------------------------------*/
  
-       spin_lock_irqsave(&cache->lock, flags);
-       list_add_tail(&mg->list, &cache->need_commit_migrations);
-       cache->commit_requested = true;
-       spin_unlock_irqrestore(&cache->lock, flags);
+ static void quiesce(struct dm_cache_migration *mg,
+                   void (*continuation)(struct work_struct *))
+ {
+       init_continuation(&mg->k, continuation);
+       dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
  }
  
- static void migration_success_post_commit(struct dm_cache_migration *mg)
+ static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
  {
-       unsigned long flags;
-       struct cache *cache = mg->cache;
-       if (mg->writeback) {
-               DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
-                            cache_device_name(cache));
-               return;
-       } else if (mg->demote) {
-               cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
-               if (mg->promote) {
-                       mg->demote = false;
-                       spin_lock_irqsave(&cache->lock, flags);
-                       list_add_tail(&mg->list, &cache->quiesced_migrations);
-                       spin_unlock_irqrestore(&cache->lock, flags);
-               } else {
-                       if (mg->invalidate)
-                               policy_remove_mapping(cache->policy, mg->old_oblock);
-                       free_io_migration(mg);
-               }
-       } else {
-               if (mg->requeue_holder) {
-                       clear_dirty(cache, mg->new_oblock, mg->cblock);
-                       cell_defer(cache, mg->new_ocell, true);
-               } else {
-                       /*
-                        * The block was promoted via an overwrite, so it's dirty.
-                        */
-                       set_dirty(cache, mg->new_oblock, mg->cblock);
-                       bio_endio(mg->new_ocell->holder);
-                       cell_defer(cache, mg->new_ocell, false);
-               }
-               free_io_migration(mg);
-       }
+       struct continuation *k = container_of(ws, struct continuation, ws);
+       return container_of(k, struct dm_cache_migration, k);
  }
  
  static void copy_complete(int read_err, unsigned long write_err, void *context)
  {
-       unsigned long flags;
-       struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
-       struct cache *cache = mg->cache;
+       struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
  
        if (read_err || write_err)
-               mg->err = true;
-       spin_lock_irqsave(&cache->lock, flags);
-       list_add_tail(&mg->list, &cache->completed_migrations);
-       spin_unlock_irqrestore(&cache->lock, flags);
+               mg->k.input = -EIO;
  
-       wake_worker(cache);
+       queue_continuation(mg->cache->wq, &mg->k);
  }
  
- static void issue_copy(struct dm_cache_migration *mg)
+ static int copy(struct dm_cache_migration *mg, bool promote)
  {
        int r;
        struct dm_io_region o_region, c_region;
        struct cache *cache = mg->cache;
-       sector_t cblock = from_cblock(mg->cblock);
  
        o_region.bdev = cache->origin_dev->bdev;
+       o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
        o_region.count = cache->sectors_per_block;
  
        c_region.bdev = cache->cache_dev->bdev;
-       c_region.sector = cblock * cache->sectors_per_block;
+       c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
        c_region.count = cache->sectors_per_block;
  
-       if (mg->writeback || mg->demote) {
-               /* demote */
-               o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
-               r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
-       } else {
-               /* promote */
-               o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
-               r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
-       }
+       if (promote)
+               r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
+       else
+               r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
  
-       if (r < 0) {
-               DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
-               migration_failure(mg);
-       }
+       return r;
+ }
+ static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
+ {
+       size_t pb_data_size = get_per_bio_data_size(cache);
+       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+       if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
+               free_prison_cell(cache, pb->cell);
+       pb->cell = NULL;
  }
  
  static void overwrite_endio(struct bio *bio)
        struct cache *cache = mg->cache;
        size_t pb_data_size = get_per_bio_data_size(cache);
        struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
-       unsigned long flags;
  
        dm_unhook_bio(&pb->hook_info, bio);
  
        if (bio->bi_error)
-               mg->err = true;
-       mg->requeue_holder = false;
+               mg->k.input = bio->bi_error;
  
-       spin_lock_irqsave(&cache->lock, flags);
-       list_add_tail(&mg->list, &cache->completed_migrations);
-       spin_unlock_irqrestore(&cache->lock, flags);
-       wake_worker(cache);
+       queue_continuation(mg->cache->wq, &mg->k);
  }
  
- static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+ static void overwrite(struct dm_cache_migration *mg,
+                     void (*continuation)(struct work_struct *))
  {
+       struct bio *bio = mg->overwrite_bio;
        size_t pb_data_size = get_per_bio_data_size(mg->cache);
        struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  
        dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
-       remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
  
        /*
-        * No need to inc_ds() here, since the cell will be held for the
-        * duration of the io.
+        * The overwrite bio is part of the copy operation; as such it does
+        * not set/clear discard or dirty flags.
         */
+       if (mg->op->op == POLICY_PROMOTE)
+               remap_to_cache(mg->cache, bio, mg->op->cblock);
+       else
+               remap_to_origin(mg->cache, bio);
+       init_continuation(&mg->k, continuation);
        accounted_request(mg->cache, bio);
  }
  
- static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+ /*
+  * Migration steps:
+  *
+  * 1) exclusive lock preventing WRITEs
+  * 2) quiesce
+  * 3) copy or issue overwrite bio
+  * 4) upgrade to exclusive lock preventing READs and WRITEs
+  * 5) quiesce
+  * 6) update metadata and commit
+  * 7) unlock
+  */
+ static void mg_complete(struct dm_cache_migration *mg, bool success)
  {
-       return (bio_data_dir(bio) == WRITE) &&
-               (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+       struct bio_list bios;
+       struct cache *cache = mg->cache;
+       struct policy_work *op = mg->op;
+       dm_cblock_t cblock = op->cblock;
+       if (success)
+               update_stats(&cache->stats, op->op);
+       switch (op->op) {
+       case POLICY_PROMOTE:
+               clear_discard(cache, oblock_to_dblock(cache, op->oblock));
+               policy_complete_background_work(cache->policy, op, success);
+               if (mg->overwrite_bio) {
+                       if (success)
+                               force_set_dirty(cache, cblock);
+                       else
+                               mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
+                       bio_endio(mg->overwrite_bio);
+               } else {
+                       if (success)
+                               force_clear_dirty(cache, cblock);
+                       dec_io_migrations(cache);
+               }
+               break;
+       case POLICY_DEMOTE:
+               /*
+                * We clear dirty here to update the nr_dirty counter.
+                */
+               if (success)
+                       force_clear_dirty(cache, cblock);
+               policy_complete_background_work(cache->policy, op, success);
+               dec_io_migrations(cache);
+               break;
+       case POLICY_WRITEBACK:
+               if (success)
+                       force_clear_dirty(cache, cblock);
+               policy_complete_background_work(cache->policy, op, success);
+               dec_io_migrations(cache);
+               break;
+       }
+       bio_list_init(&bios);
+       if (mg->cell) {
+               if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+                       free_prison_cell(cache, mg->cell);
+       }
+       free_migration(mg);
+       defer_bios(cache, &bios);
+       wake_migration_worker(cache);
+       background_work_end(cache);
  }
  
- static void avoid_copy(struct dm_cache_migration *mg)
+ static void mg_success(struct work_struct *ws)
  {
-       atomic_inc(&mg->cache->stats.copies_avoided);
-       migration_success_pre_commit(mg);
+       struct dm_cache_migration *mg = ws_to_mg(ws);
+       mg_complete(mg, mg->k.input == 0);
  }
  
- static void calc_discard_block_range(struct cache *cache, struct bio *bio,
-                                    dm_dblock_t *b, dm_dblock_t *e)
+ static void mg_update_metadata(struct work_struct *ws)
  {
-       sector_t sb = bio->bi_iter.bi_sector;
-       sector_t se = bio_end_sector(bio);
-       *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
-       if (se - sb < cache->discard_block_size)
-               *e = *b;
-       else
-               *e = to_dblock(block_div(se, cache->discard_block_size));
- }
- static void issue_discard(struct dm_cache_migration *mg)
- {
-       dm_dblock_t b, e;
-       struct bio *bio = mg->new_ocell->holder;
-       struct cache *cache = mg->cache;
-       calc_discard_block_range(cache, bio, &b, &e);
-       while (b != e) {
-               set_discard(cache, b);
-               b = to_dblock(from_dblock(b) + 1);
-       }
-       bio_endio(bio);
-       cell_defer(cache, mg->new_ocell, false);
-       free_migration(mg);
-       wake_worker(cache);
- }
- static void issue_copy_or_discard(struct dm_cache_migration *mg)
- {
-       bool avoid;
+       int r;
+       struct dm_cache_migration *mg = ws_to_mg(ws);
        struct cache *cache = mg->cache;
+       struct policy_work *op = mg->op;
  
-       if (mg->discard) {
-               issue_discard(mg);
-               return;
-       }
+       switch (op->op) {
+       case POLICY_PROMOTE:
+               r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
+               if (r) {
+                       DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
+                                   cache_device_name(cache));
+                       metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
  
-       if (mg->writeback || mg->demote)
-               avoid = !is_dirty(cache, mg->cblock) ||
-                       is_discarded_oblock(cache, mg->old_oblock);
-       else {
-               struct bio *bio = mg->new_ocell->holder;
+                       mg_complete(mg, false);
+                       return;
+               }
+               mg_complete(mg, true);
+               break;
  
-               avoid = is_discarded_oblock(cache, mg->new_oblock);
+       case POLICY_DEMOTE:
+               r = dm_cache_remove_mapping(cache->cmd, op->cblock);
+               if (r) {
+                       DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
+                                   cache_device_name(cache));
+                       metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
  
-               if (writeback_mode(&cache->features) &&
-                   !avoid && bio_writes_complete_block(cache, bio)) {
-                       issue_overwrite(mg, bio);
+                       mg_complete(mg, false);
                        return;
                }
-       }
  
-       avoid ? avoid_copy(mg) : issue_copy(mg);
+               /*
+                * It would be nice if we only had to commit when a REQ_FLUSH
+                * comes through.  But there's one scenario that we have to
+                * look out for:
+                *
+                * - vblock x in a cache block
+                * - demotion occurs
+                * - cache block gets reallocated and overwritten
+                * - crash
+                *
+                * When we recover, because there was no commit the cache will
+                * rollback to having the data for vblock x in the cache block.
+                * But the cache block has since been overwritten, so it'll end
+                * up pointing to data that was never in 'x' during the history
+                * of the device.
+                *
+                * To avoid this issue we require a commit as part of the
+                * demotion operation.
+                */
+               init_continuation(&mg->k, mg_success);
+               continue_after_commit(&cache->committer, &mg->k);
+               schedule_commit(&cache->committer);
+               break;
+       case POLICY_WRITEBACK:
+               mg_complete(mg, true);
+               break;
+       }
  }
  
- static void complete_migration(struct dm_cache_migration *mg)
+ static void mg_update_metadata_after_copy(struct work_struct *ws)
  {
-       if (mg->err)
-               migration_failure(mg);
+       struct dm_cache_migration *mg = ws_to_mg(ws);
+       /*
+        * Did the copy succeed?
+        */
+       if (mg->k.input)
+               mg_complete(mg, false);
        else
-               migration_success_pre_commit(mg);
+               mg_update_metadata(ws);
  }
  
- static void process_migrations(struct cache *cache, struct list_head *head,
-                              void (*fn)(struct dm_cache_migration *))
+ static void mg_upgrade_lock(struct work_struct *ws)
  {
-       unsigned long flags;
-       struct list_head list;
-       struct dm_cache_migration *mg, *tmp;
+       int r;
+       struct dm_cache_migration *mg = ws_to_mg(ws);
  
-       INIT_LIST_HEAD(&list);
-       spin_lock_irqsave(&cache->lock, flags);
-       list_splice_init(head, &list);
-       spin_unlock_irqrestore(&cache->lock, flags);
+       /*
+        * Did the copy succeed?
+        */
+       if (mg->k.input)
+               mg_complete(mg, false);
  
-       list_for_each_entry_safe(mg, tmp, &list, list)
-               fn(mg);
- }
+       else {
+               /*
+                * Now we want the lock to prevent both reads and writes.
+                */
+               r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
+                                           READ_WRITE_LOCK_LEVEL);
+               if (r < 0)
+                       mg_complete(mg, false);
  
- static void __queue_quiesced_migration(struct dm_cache_migration *mg)
- {
-       list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+               else if (r)
+                       quiesce(mg, mg_update_metadata);
+               else
+                       mg_update_metadata(ws);
+       }
  }
  
- static void queue_quiesced_migration(struct dm_cache_migration *mg)
+ static void mg_copy(struct work_struct *ws)
  {
-       unsigned long flags;
-       struct cache *cache = mg->cache;
+       int r;
+       struct dm_cache_migration *mg = ws_to_mg(ws);
  
-       spin_lock_irqsave(&cache->lock, flags);
-       __queue_quiesced_migration(mg);
-       spin_unlock_irqrestore(&cache->lock, flags);
+       if (mg->overwrite_bio) {
+               /*
+                * It's safe to do this here, even though it's new data,
+                * because all IO has been locked out of the block.
+                *
+                * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
+                * so we're _not_ using mg_upgrade_lock() as the continuation.
+                */
+               overwrite(mg, mg_update_metadata_after_copy);
  
-       wake_worker(cache);
- }
+       } else {
+               struct cache *cache = mg->cache;
+               struct policy_work *op = mg->op;
+               bool is_policy_promote = (op->op == POLICY_PROMOTE);
  
- static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
- {
-       unsigned long flags;
-       struct dm_cache_migration *mg, *tmp;
+               if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
+                   is_discarded_oblock(cache, op->oblock)) {
+                       mg_upgrade_lock(ws);
+                       return;
+               }
  
-       spin_lock_irqsave(&cache->lock, flags);
-       list_for_each_entry_safe(mg, tmp, work, list)
-               __queue_quiesced_migration(mg);
-       spin_unlock_irqrestore(&cache->lock, flags);
+               init_continuation(&mg->k, mg_upgrade_lock);
  
-       wake_worker(cache);
+               r = copy(mg, is_policy_promote);
+               if (r) {
+                       DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
+                       mg->k.input = -EIO;
+                       mg_complete(mg, false);
+               }
+       }
  }
  
- static void check_for_quiesced_migrations(struct cache *cache,
-                                         struct per_bio_data *pb)
+ static int mg_lock_writes(struct dm_cache_migration *mg)
  {
-       struct list_head work;
-       if (!pb->all_io_entry)
-               return;
-       INIT_LIST_HEAD(&work);
-       dm_deferred_entry_dec(pb->all_io_entry, &work);
+       int r;
+       struct dm_cell_key_v2 key;
+       struct cache *cache = mg->cache;
+       struct dm_bio_prison_cell_v2 *prealloc;
  
-       if (!list_empty(&work))
-               queue_quiesced_migrations(cache, &work);
- }
+       prealloc = alloc_prison_cell(cache);
+       if (!prealloc) {
+               DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
+               mg_complete(mg, false);
+               return -ENOMEM;
+       }
  
- static void quiesce_migration(struct dm_cache_migration *mg)
- {
-       if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
-               queue_quiesced_migration(mg);
- }
+       /*
+        * Prevent writes to the block, but allow reads to continue.
+        * Unless we're using an overwrite bio, in which case we lock
+        * everything.
+        */
+       build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
+       r = dm_cell_lock_v2(cache->prison, &key,
+                           mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
+                           prealloc, &mg->cell);
+       if (r < 0) {
+               free_prison_cell(cache, prealloc);
+               mg_complete(mg, false);
+               return r;
+       }
  
- static void promote(struct cache *cache, struct prealloc *structs,
-                   dm_oblock_t oblock, dm_cblock_t cblock,
-                   struct dm_bio_prison_cell *cell)
- {
-       struct dm_cache_migration *mg = prealloc_get_migration(structs);
+       if (mg->cell != prealloc)
+               free_prison_cell(cache, prealloc);
  
-       mg->err = false;
-       mg->discard = false;
-       mg->writeback = false;
-       mg->demote = false;
-       mg->promote = true;
-       mg->requeue_holder = true;
-       mg->invalidate = false;
-       mg->cache = cache;
-       mg->new_oblock = oblock;
-       mg->cblock = cblock;
-       mg->old_ocell = NULL;
-       mg->new_ocell = cell;
-       mg->start_jiffies = jiffies;
+       if (r == 0)
+               mg_copy(&mg->k.ws);
+       else
+               quiesce(mg, mg_copy);
  
-       inc_io_migrations(cache);
-       quiesce_migration(mg);
+       return 0;
  }
  
- static void writeback(struct cache *cache, struct prealloc *structs,
-                     dm_oblock_t oblock, dm_cblock_t cblock,
-                     struct dm_bio_prison_cell *cell)
+ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
  {
-       struct dm_cache_migration *mg = prealloc_get_migration(structs);
-       mg->err = false;
-       mg->discard = false;
-       mg->writeback = true;
-       mg->demote = false;
-       mg->promote = false;
-       mg->requeue_holder = true;
-       mg->invalidate = false;
-       mg->cache = cache;
-       mg->old_oblock = oblock;
-       mg->cblock = cblock;
-       mg->old_ocell = cell;
-       mg->new_ocell = NULL;
-       mg->start_jiffies = jiffies;
-       inc_io_migrations(cache);
-       quiesce_migration(mg);
- }
- static void demote_then_promote(struct cache *cache, struct prealloc *structs,
-                               dm_oblock_t old_oblock, dm_oblock_t new_oblock,
-                               dm_cblock_t cblock,
-                               struct dm_bio_prison_cell *old_ocell,
-                               struct dm_bio_prison_cell *new_ocell)
- {
-       struct dm_cache_migration *mg = prealloc_get_migration(structs);
-       mg->err = false;
-       mg->discard = false;
-       mg->writeback = false;
-       mg->demote = true;
-       mg->promote = true;
-       mg->requeue_holder = true;
-       mg->invalidate = false;
-       mg->cache = cache;
-       mg->old_oblock = old_oblock;
-       mg->new_oblock = new_oblock;
-       mg->cblock = cblock;
-       mg->old_ocell = old_ocell;
-       mg->new_ocell = new_ocell;
-       mg->start_jiffies = jiffies;
-       inc_io_migrations(cache);
-       quiesce_migration(mg);
- }
+       struct dm_cache_migration *mg;
  
- /*
-  * Invalidate a cache entry.  No writeback occurs; any changes in the cache
-  * block are thrown away.
-  */
- static void invalidate(struct cache *cache, struct prealloc *structs,
-                      dm_oblock_t oblock, dm_cblock_t cblock,
-                      struct dm_bio_prison_cell *cell)
- {
-       struct dm_cache_migration *mg = prealloc_get_migration(structs);
-       mg->err = false;
-       mg->discard = false;
-       mg->writeback = false;
-       mg->demote = true;
-       mg->promote = false;
-       mg->requeue_holder = true;
-       mg->invalidate = true;
-       mg->cache = cache;
-       mg->old_oblock = oblock;
-       mg->cblock = cblock;
-       mg->old_ocell = cell;
-       mg->new_ocell = NULL;
-       mg->start_jiffies = jiffies;
+       if (!background_work_begin(cache)) {
+               policy_complete_background_work(cache->policy, op, false);
+               return -EPERM;
+       }
  
-       inc_io_migrations(cache);
-       quiesce_migration(mg);
- }
+       mg = alloc_migration(cache);
+       if (!mg) {
+               policy_complete_background_work(cache->policy, op, false);
+               background_work_end(cache);
+               return -ENOMEM;
+       }
  
- static void discard(struct cache *cache, struct prealloc *structs,
-                   struct dm_bio_prison_cell *cell)
- {
-       struct dm_cache_migration *mg = prealloc_get_migration(structs);
+       memset(mg, 0, sizeof(*mg));
  
-       mg->err = false;
-       mg->discard = true;
-       mg->writeback = false;
-       mg->demote = false;
-       mg->promote = false;
-       mg->requeue_holder = false;
-       mg->invalidate = false;
        mg->cache = cache;
-       mg->old_ocell = NULL;
-       mg->new_ocell = cell;
-       mg->start_jiffies = jiffies;
+       mg->op = op;
+       mg->overwrite_bio = bio;
+       if (!bio)
+               inc_io_migrations(cache);
  
-       quiesce_migration(mg);
+       return mg_lock_writes(mg);
  }
  
  /*----------------------------------------------------------------
-  * bio processing
+  * invalidation processing
   *--------------------------------------------------------------*/
- static void defer_bio(struct cache *cache, struct bio *bio)
- {
-       unsigned long flags;
-       spin_lock_irqsave(&cache->lock, flags);
-       bio_list_add(&cache->deferred_bios, bio);
-       spin_unlock_irqrestore(&cache->lock, flags);
-       wake_worker(cache);
- }
- static void process_flush_bio(struct cache *cache, struct bio *bio)
- {
-       size_t pb_data_size = get_per_bio_data_size(cache);
-       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
-       BUG_ON(bio->bi_iter.bi_size);
-       if (!pb->req_nr)
-               remap_to_origin(cache, bio);
-       else
-               remap_to_cache(cache, bio, 0);
  
-       /*
-        * REQ_PREFLUSH is not directed at any particular block so we don't
-        * need to inc_ds().  REQ_FUA's are split into a write + REQ_PREFLUSH
-        * by dm-core.
-        */
-       issue(cache, bio);
- }
- static void process_discard_bio(struct cache *cache, struct prealloc *structs,
-                               struct bio *bio)
+ static void invalidate_complete(struct dm_cache_migration *mg, bool success)
  {
-       int r;
-       dm_dblock_t b, e;
-       struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
-       calc_discard_block_range(cache, bio, &b, &e);
-       if (b == e) {
-               bio_endio(bio);
-               return;
-       }
+       struct bio_list bios;
+       struct cache *cache = mg->cache;
  
-       cell_prealloc = prealloc_get_cell(structs);
-       r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
-                            (cell_free_fn) prealloc_put_cell,
-                            structs, &new_ocell);
-       if (r > 0)
-               return;
+       bio_list_init(&bios);
+       if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+               free_prison_cell(cache, mg->cell);
  
-       discard(cache, structs, new_ocell);
- }
+       if (!success && mg->overwrite_bio)
+               bio_io_error(mg->overwrite_bio);
  
- static bool spare_migration_bandwidth(struct cache *cache)
- {
-       sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
-               cache->sectors_per_block;
-       return current_volume < cache->migration_threshold;
- }
+       free_migration(mg);
+       defer_bios(cache, &bios);
  
- static void inc_hit_counter(struct cache *cache, struct bio *bio)
- {
-       atomic_inc(bio_data_dir(bio) == READ ?
-                  &cache->stats.read_hit : &cache->stats.write_hit);
+       background_work_end(cache);
  }
  
- static void inc_miss_counter(struct cache *cache, struct bio *bio)
+ static void invalidate_completed(struct work_struct *ws)
  {
-       atomic_inc(bio_data_dir(bio) == READ ?
-                  &cache->stats.read_miss : &cache->stats.write_miss);
+       struct dm_cache_migration *mg = ws_to_mg(ws);
+       invalidate_complete(mg, !mg->k.input);
  }
  
- /*----------------------------------------------------------------*/
- struct inc_detail {
-       struct cache *cache;
-       struct bio_list bios_for_issue;
-       struct bio_list unhandled_bios;
-       bool any_writes;
- };
- static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+ static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
  {
-       struct bio *bio;
-       struct inc_detail *detail = context;
-       struct cache *cache = detail->cache;
-       inc_ds(cache, cell->holder, cell);
-       if (bio_data_dir(cell->holder) == WRITE)
-               detail->any_writes = true;
-       while ((bio = bio_list_pop(&cell->bios))) {
-               if (discard_or_flush(bio)) {
-                       bio_list_add(&detail->unhandled_bios, bio);
-                       continue;
+       int r = policy_invalidate_mapping(cache->policy, cblock);
+       if (!r) {
+               r = dm_cache_remove_mapping(cache->cmd, cblock);
+               if (r) {
+                       DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
+                                   cache_device_name(cache));
+                       metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
                }
  
-               if (bio_data_dir(bio) == WRITE)
-                       detail->any_writes = true;
+       } else if (r == -ENODATA) {
+               /*
+                * Harmless, already unmapped.
+                */
+               r = 0;
  
-               bio_list_add(&detail->bios_for_issue, bio);
-               inc_ds(cache, bio, cell);
-       }
+       } else
+               DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
+       return r;
  }
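The -ENODATA handling above makes invalidation idempotent: removing a mapping that never existed is reported as success. A tiny standalone sketch of that convention, with remove_mapping() as an invented stand-in rather than a dm call:

#include <errno.h>
#include <stdio.h>

/* Stand-in for the metadata call; pretend even-numbered blocks are unmapped. */
static int remove_mapping(int cblock)
{
	return (cblock & 1) ? 0 : -ENODATA;
}

static int invalidate(int cblock)
{
	int r = remove_mapping(cblock);

	if (r == -ENODATA)
		r = 0;	/* harmless: the block was already unmapped */
	return r;
}

int main(void)
{
	printf("%d %d\n", invalidate(3), invalidate(4));	/* prints "0 0" */
	return 0;
}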
  
- // FIXME: refactor these two
- static void remap_cell_to_origin_clear_discard(struct cache *cache,
-                                              struct dm_bio_prison_cell *cell,
-                                              dm_oblock_t oblock, bool issue_holder)
+ static void invalidate_remove(struct work_struct *ws)
  {
-       struct bio *bio;
-       unsigned long flags;
-       struct inc_detail detail;
-       detail.cache = cache;
-       bio_list_init(&detail.bios_for_issue);
-       bio_list_init(&detail.unhandled_bios);
-       detail.any_writes = false;
-       spin_lock_irqsave(&cache->lock, flags);
-       dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
-       bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
-       spin_unlock_irqrestore(&cache->lock, flags);
-       remap_to_origin(cache, cell->holder);
-       if (issue_holder)
-               issue(cache, cell->holder);
-       else
-               accounted_begin(cache, cell->holder);
-       if (detail.any_writes)
-               clear_discard(cache, oblock_to_dblock(cache, oblock));
+       int r;
+       struct dm_cache_migration *mg = ws_to_mg(ws);
+       struct cache *cache = mg->cache;
  
-       while ((bio = bio_list_pop(&detail.bios_for_issue))) {
-               remap_to_origin(cache, bio);
-               issue(cache, bio);
+       r = invalidate_cblock(cache, mg->invalidate_cblock);
+       if (r) {
+               invalidate_complete(mg, false);
+               return;
        }
  
-       free_prison_cell(cache, cell);
+       init_continuation(&mg->k, invalidate_completed);
+       continue_after_commit(&cache->committer, &mg->k);
+       remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
+       mg->overwrite_bio = NULL;
+       schedule_commit(&cache->committer);
  }
  
- static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
-                                     dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+ static int invalidate_lock(struct dm_cache_migration *mg)
  {
-       struct bio *bio;
-       unsigned long flags;
-       struct inc_detail detail;
-       detail.cache = cache;
-       bio_list_init(&detail.bios_for_issue);
-       bio_list_init(&detail.unhandled_bios);
-       detail.any_writes = false;
-       spin_lock_irqsave(&cache->lock, flags);
-       dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
-       bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
-       spin_unlock_irqrestore(&cache->lock, flags);
-       remap_to_cache(cache, cell->holder, cblock);
-       if (issue_holder)
-               issue(cache, cell->holder);
-       else
-               accounted_begin(cache, cell->holder);
+       int r;
+       struct dm_cell_key_v2 key;
+       struct cache *cache = mg->cache;
+       struct dm_bio_prison_cell_v2 *prealloc;
  
-       if (detail.any_writes) {
-               set_dirty(cache, oblock, cblock);
-               clear_discard(cache, oblock_to_dblock(cache, oblock));
+       prealloc = alloc_prison_cell(cache);
+       if (!prealloc) {
+               invalidate_complete(mg, false);
+               return -ENOMEM;
        }
  
-       while ((bio = bio_list_pop(&detail.bios_for_issue))) {
-               remap_to_cache(cache, bio, cblock);
-               issue(cache, bio);
+       build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
+       r = dm_cell_lock_v2(cache->prison, &key,
+                           READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
+       if (r < 0) {
+               free_prison_cell(cache, prealloc);
+               invalidate_complete(mg, false);
+               return r;
        }
  
-       free_prison_cell(cache, cell);
- }
+       if (mg->cell != prealloc)
+               free_prison_cell(cache, prealloc);
  
- /*----------------------------------------------------------------*/
+       if (r)
+               quiesce(mg, invalidate_remove);
  
- struct old_oblock_lock {
-       struct policy_locker locker;
-       struct cache *cache;
-       struct prealloc *structs;
-       struct dm_bio_prison_cell *cell;
- };
+       else {
+               /*
+                * We can't call invalidate_remove() directly here because we
+                * might still be in request context.
+                */
+               init_continuation(&mg->k, invalidate_remove);
+               queue_work(cache->wq, &mg->k.ws);
+       }
  
- static int null_locker(struct policy_locker *locker, dm_oblock_t b)
- {
-       /* This should never be called */
-       BUG();
        return 0;
  }
  
- static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+ static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
+                           dm_oblock_t oblock, struct bio *bio)
  {
-       struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
-       struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
-       return bio_detain(l->cache, b, NULL, cell_prealloc,
-                         (cell_free_fn) prealloc_put_cell,
-                         l->structs, &l->cell);
- }
- static void process_cell(struct cache *cache, struct prealloc *structs,
-                        struct dm_bio_prison_cell *new_ocell)
- {
-       int r;
-       bool release_cell = true;
-       struct bio *bio = new_ocell->holder;
-       dm_oblock_t block = get_bio_block(cache, bio);
-       struct policy_result lookup_result;
-       bool passthrough = passthrough_mode(&cache->features);
-       bool fast_promotion, can_migrate;
-       struct old_oblock_lock ool;
-       fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
-       can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
-       ool.locker.fn = cell_locker;
-       ool.cache = cache;
-       ool.structs = structs;
-       ool.cell = NULL;
-       r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
-                      bio, &ool.locker, &lookup_result);
-       if (r == -EWOULDBLOCK)
-               /* migration has been denied */
-               lookup_result.op = POLICY_MISS;
-       switch (lookup_result.op) {
-       case POLICY_HIT:
-               if (passthrough) {
-                       inc_miss_counter(cache, bio);
-                       /*
-                        * Passthrough always maps to the origin,
-                        * invalidating any cache blocks that are written
-                        * to.
-                        */
-                       if (bio_data_dir(bio) == WRITE) {
-                               atomic_inc(&cache->stats.demotion);
-                               invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
-                               release_cell = false;
+       struct dm_cache_migration *mg;
  
-                       } else {
-                               /* FIXME: factor out issue_origin() */
-                               remap_to_origin_clear_discard(cache, bio, block);
-                               inc_and_issue(cache, bio, new_ocell);
-                       }
-               } else {
-                       inc_hit_counter(cache, bio);
-                       if (bio_data_dir(bio) == WRITE &&
-                           writethrough_mode(&cache->features) &&
-                           !is_dirty(cache, lookup_result.cblock)) {
-                               remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-                               inc_and_issue(cache, bio, new_ocell);
-                       } else {
-                               remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
-                               release_cell = false;
-                       }
-               }
+       if (!background_work_begin(cache))
+               return -EPERM;
  
-               break;
+       mg = alloc_migration(cache);
+       if (!mg) {
+               background_work_end(cache);
+               return -ENOMEM;
+       }
  
-       case POLICY_MISS:
-               inc_miss_counter(cache, bio);
-               remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
-               release_cell = false;
-               break;
+       memset(mg, 0, sizeof(*mg));
  
-       case POLICY_NEW:
-               atomic_inc(&cache->stats.promotion);
-               promote(cache, structs, block, lookup_result.cblock, new_ocell);
-               release_cell = false;
-               break;
+       mg->cache = cache;
+       mg->overwrite_bio = bio;
+       mg->invalidate_cblock = cblock;
+       mg->invalidate_oblock = oblock;
  
-       case POLICY_REPLACE:
-               atomic_inc(&cache->stats.demotion);
-               atomic_inc(&cache->stats.promotion);
-               demote_then_promote(cache, structs, lookup_result.old_oblock,
-                                   block, lookup_result.cblock,
-                                   ool.cell, new_ocell);
-               release_cell = false;
-               break;
+       return invalidate_lock(mg);
+ }
  
-       default:
-               DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
-                           cache_device_name(cache), __func__,
-                           (unsigned) lookup_result.op);
-               bio_io_error(bio);
-       }
+ /*----------------------------------------------------------------
+  * bio processing
+  *--------------------------------------------------------------*/
  
-       if (release_cell)
-               cell_defer(cache, new_ocell, false);
- }
+ enum busy {
+       IDLE,
+       MODERATE,
+       BUSY
+ };
  
- static void process_bio(struct cache *cache, struct prealloc *structs,
-                       struct bio *bio)
+ static enum busy spare_migration_bandwidth(struct cache *cache)
  {
-       int r;
-       dm_oblock_t block = get_bio_block(cache, bio);
-       struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
-       /*
-        * Check to see if that block is currently migrating.
-        */
-       cell_prealloc = prealloc_get_cell(structs);
-       r = bio_detain(cache, block, bio, cell_prealloc,
-                      (cell_free_fn) prealloc_put_cell,
-                      structs, &new_ocell);
-       if (r > 0)
-               return;
+       bool idle = iot_idle_for(&cache->origin_tracker, HZ);
+       sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
+               cache->sectors_per_block;
  
-       process_cell(cache, structs, new_ocell);
+       if (current_volume <= cache->migration_threshold)
+               return idle ? IDLE : MODERATE;
+       else
+               return idle ? MODERATE : BUSY;
  }
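spare_migration_bandwidth() reduces two inputs, origin idleness and the volume of in-flight migrations, to the three-way enum above. A standalone sketch of the same classification, using hypothetical block-size and threshold values rather than the cache's configuration:

#include <stdbool.h>
#include <stdio.h>

enum busy { IDLE, MODERATE, BUSY };

static enum busy classify(bool origin_idle, unsigned nr_io_migrations,
			  unsigned long sectors_per_block,
			  unsigned long migration_threshold)
{
	unsigned long current_volume =
		(nr_io_migrations + 1) * sectors_per_block;

	if (current_volume <= migration_threshold)
		return origin_idle ? IDLE : MODERATE;
	else
		return origin_idle ? MODERATE : BUSY;
}

int main(void)
{
	static const char *names[] = { "IDLE", "MODERATE", "BUSY" };

	/* 3 in-flight migrations of 128-sector blocks, threshold 2048 */
	printf("%s\n", names[classify(true, 3, 128, 2048)]);	/* IDLE */
	/* 31 in-flight migrations push the volume past the threshold */
	printf("%s\n", names[classify(false, 31, 128, 2048)]);	/* BUSY */
	return 0;
}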
  
- static int need_commit_due_to_time(struct cache *cache)
+ static void inc_hit_counter(struct cache *cache, struct bio *bio)
  {
-       return jiffies < cache->last_commit_jiffies ||
-              jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+       atomic_inc(bio_data_dir(bio) == READ ?
+                  &cache->stats.read_hit : &cache->stats.write_hit);
  }
  
- /*
-  * A non-zero return indicates read_only or fail_io mode.
-  */
- static int commit(struct cache *cache, bool clean_shutdown)
+ static void inc_miss_counter(struct cache *cache, struct bio *bio)
  {
-       int r;
-       if (get_cache_mode(cache) >= CM_READ_ONLY)
-               return -EINVAL;
+       atomic_inc(bio_data_dir(bio) == READ ?
+                  &cache->stats.read_miss : &cache->stats.write_miss);
+ }
  
-       atomic_inc(&cache->stats.commit_count);
-       r = dm_cache_commit(cache->cmd, clean_shutdown);
-       if (r)
-               metadata_operation_failed(cache, "dm_cache_commit", r);
+ /*----------------------------------------------------------------*/
  
-       return r;
+ static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+ {
+       return (bio_data_dir(bio) == WRITE) &&
+               (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
  }
  
- static int commit_if_needed(struct cache *cache)
+ static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
  {
-       int r = 0;
-       if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
-           dm_cache_changed_this_transaction(cache->cmd)) {
-               r = commit(cache, false);
-               cache->commit_requested = false;
-               cache->last_commit_jiffies = jiffies;
-       }
-       return r;
+       return writeback_mode(&cache->features) &&
+               (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
  }
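optimisable_bio() hinges on bio_writes_complete_block(): the bio size is in bytes while the cache block size is in 512-byte sectors, so the test shifts by SECTOR_SHIFT before comparing. A small sketch of that unit conversion, with assumed sizes and no real bio structures:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define SECTOR_SHIFT 9	/* dm block sizes are in 512-byte sectors */

static bool writes_complete_block(bool is_write, uint32_t bio_bytes,
				  uint32_t sectors_per_block)
{
	return is_write &&
	       bio_bytes == (sectors_per_block << SECTOR_SHIFT);
}

int main(void)
{
	/* a 64KiB write exactly covers a 128-sector (64KiB) cache block */
	assert(writes_complete_block(true, 128u << SECTOR_SHIFT, 128));
	/* a 4KiB write into the same block does not */
	assert(!writes_complete_block(true, 4096, 128));
	/* reads never qualify */
	assert(!writes_complete_block(false, 128u << SECTOR_SHIFT, 128));
	return 0;
}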
  
- static void process_deferred_bios(struct cache *cache)
+ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
+                  bool *commit_needed)
  {
-       bool prealloc_used = false;
-       unsigned long flags;
-       struct bio_list bios;
-       struct bio *bio;
-       struct prealloc structs;
-       memset(&structs, 0, sizeof(structs));
-       bio_list_init(&bios);
+       int r, data_dir;
+       bool rb, background_queued;
+       dm_cblock_t cblock;
+       size_t pb_data_size = get_per_bio_data_size(cache);
+       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  
-       spin_lock_irqsave(&cache->lock, flags);
-       bio_list_merge(&bios, &cache->deferred_bios);
-       bio_list_init(&cache->deferred_bios);
-       spin_unlock_irqrestore(&cache->lock, flags);
+       *commit_needed = false;
  
-       while (!bio_list_empty(&bios)) {
+       rb = bio_detain_shared(cache, block, bio);
+       if (!rb) {
                /*
-                * If we've got no free migration structs, and processing
-                * this bio might require one, we pause until there are some
-                * prepared mappings to process.
+                * An exclusive lock is held for this block, so we have to
+                * wait.  We set the commit_needed flag so the current
+                * transaction will be committed asap, allowing this lock
+                * to be dropped.
                 */
-               prealloc_used = true;
-               if (prealloc_data_structs(cache, &structs)) {
-                       spin_lock_irqsave(&cache->lock, flags);
-                       bio_list_merge(&cache->deferred_bios, &bios);
-                       spin_unlock_irqrestore(&cache->lock, flags);
-                       break;
+               *commit_needed = true;
+               return DM_MAPIO_SUBMITTED;
+       }
+       data_dir = bio_data_dir(bio);
+       if (optimisable_bio(cache, bio, block)) {
+               struct policy_work *op = NULL;
+               r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
+               if (unlikely(r && r != -ENOENT)) {
+                       DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
+                                   cache_device_name(cache), r);
+                       bio_io_error(bio);
+                       return DM_MAPIO_SUBMITTED;
                }
  
-               bio = bio_list_pop(&bios);
+               if (r == -ENOENT && op) {
+                       bio_drop_shared_lock(cache, bio);
+                       BUG_ON(op->op != POLICY_PROMOTE);
+                       mg_start(cache, op, bio);
+                       return DM_MAPIO_SUBMITTED;
+               }
+       } else {
+               r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
+               if (unlikely(r && r != -ENOENT)) {
+                       DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
+                                   cache_device_name(cache), r);
+                       bio_io_error(bio);
+                       return DM_MAPIO_SUBMITTED;
+               }
  
-               if (bio->bi_opf & REQ_PREFLUSH)
-                       process_flush_bio(cache, bio);
-               else if (bio_op(bio) == REQ_OP_DISCARD)
-                       process_discard_bio(cache, &structs, bio);
-               else
-                       process_bio(cache, &structs, bio);
+               if (background_queued)
+                       wake_migration_worker(cache);
        }
  
-       if (prealloc_used)
-               prealloc_free_structs(cache, &structs);
- }
- static void process_deferred_cells(struct cache *cache)
- {
-       bool prealloc_used = false;
-       unsigned long flags;
-       struct dm_bio_prison_cell *cell, *tmp;
-       struct list_head cells;
-       struct prealloc structs;
+       if (r == -ENOENT) {
+               /*
+                * Miss.
+                */
+               inc_miss_counter(cache, bio);
+               if (pb->req_nr == 0) {
+                       accounted_begin(cache, bio);
+                       remap_to_origin_clear_discard(cache, bio, block);
  
-       memset(&structs, 0, sizeof(structs));
+               } else {
+                       /*
+                        * This is a duplicate writethrough io that is no
+                        * longer needed because the block has been demoted.
+                        */
+                       bio_endio(bio);
+                       return DM_MAPIO_SUBMITTED;
+               }
+       } else {
+               /*
+                * Hit.
+                */
+               inc_hit_counter(cache, bio);
  
-       INIT_LIST_HEAD(&cells);
+               /*
+                * Passthrough always maps to the origin, invalidating any
+                * cache blocks that are written to.
+                */
+               if (passthrough_mode(&cache->features)) {
+                       if (bio_data_dir(bio) == WRITE) {
+                               bio_drop_shared_lock(cache, bio);
+                               atomic_inc(&cache->stats.demotion);
+                               invalidate_start(cache, cblock, block, bio);
+                       } else
+                               remap_to_origin_clear_discard(cache, bio, block);
  
-       spin_lock_irqsave(&cache->lock, flags);
-       list_splice_init(&cache->deferred_cells, &cells);
-       spin_unlock_irqrestore(&cache->lock, flags);
+               } else {
+                       if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+                           !is_dirty(cache, cblock)) {
+                               remap_to_origin_then_cache(cache, bio, block, cblock);
+                               accounted_begin(cache, bio);
+                       } else
+                               remap_to_cache_dirty(cache, bio, block, cblock);
+               }
+       }
  
-       list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+       /*
+        * dm core turns FUA requests into a separate payload and FLUSH req.
+        */
+       if (bio->bi_opf & REQ_FUA) {
                /*
-                * If we've got no free migration structs, and processing
-                * this bio might require one, we pause until there are some
-                * prepared mappings to process.
+                * issue_after_commit will call accounted_begin a second time.  So
+                * we call accounted_complete() to avoid double accounting.
                 */
-               prealloc_used = true;
-               if (prealloc_data_structs(cache, &structs)) {
-                       spin_lock_irqsave(&cache->lock, flags);
-                       list_splice(&cells, &cache->deferred_cells);
-                       spin_unlock_irqrestore(&cache->lock, flags);
-                       break;
-               }
-               process_cell(cache, &structs, cell);
+               accounted_complete(cache, bio);
+               issue_after_commit(&cache->committer, bio);
+               *commit_needed = true;
+               return DM_MAPIO_SUBMITTED;
        }
  
-       if (prealloc_used)
-               prealloc_free_structs(cache, &structs);
+       return DM_MAPIO_REMAPPED;
  }
  
- static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+ static bool process_bio(struct cache *cache, struct bio *bio)
  {
-       unsigned long flags;
-       struct bio_list bios;
-       struct bio *bio;
-       bio_list_init(&bios);
+       bool commit_needed;
  
-       spin_lock_irqsave(&cache->lock, flags);
-       bio_list_merge(&bios, &cache->deferred_flush_bios);
-       bio_list_init(&cache->deferred_flush_bios);
-       spin_unlock_irqrestore(&cache->lock, flags);
+       if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
+               generic_make_request(bio);
  
-       /*
-        * These bios have already been through inc_ds()
-        */
-       while ((bio = bio_list_pop(&bios)))
-               submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
+       return commit_needed;
  }
  
- static void process_deferred_writethrough_bios(struct cache *cache)
+ /*
+  * A non-zero return indicates read_only or fail_io mode.
+  */
+ static int commit(struct cache *cache, bool clean_shutdown)
  {
-       unsigned long flags;
-       struct bio_list bios;
-       struct bio *bio;
+       int r;
  
-       bio_list_init(&bios);
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return -EINVAL;
  
-       spin_lock_irqsave(&cache->lock, flags);
-       bio_list_merge(&bios, &cache->deferred_writethrough_bios);
-       bio_list_init(&cache->deferred_writethrough_bios);
-       spin_unlock_irqrestore(&cache->lock, flags);
+       atomic_inc(&cache->stats.commit_count);
+       r = dm_cache_commit(cache->cmd, clean_shutdown);
+       if (r)
+               metadata_operation_failed(cache, "dm_cache_commit", r);
  
-       /*
-        * These bios have already been through inc_ds()
-        */
-       while ((bio = bio_list_pop(&bios)))
-               accounted_request(cache, bio);
+       return r;
  }
  
- static void writeback_some_dirty_blocks(struct cache *cache)
+ /*
+  * Used by the batcher.
+  */
+ static int commit_op(void *context)
  {
-       bool prealloc_used = false;
-       dm_oblock_t oblock;
-       dm_cblock_t cblock;
-       struct prealloc structs;
-       struct dm_bio_prison_cell *old_ocell;
-       bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
-       memset(&structs, 0, sizeof(structs));
-       while (spare_migration_bandwidth(cache)) {
-               if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
-                       break; /* no work to do */
-               prealloc_used = true;
-               if (prealloc_data_structs(cache, &structs) ||
-                   get_cell(cache, oblock, &structs, &old_ocell)) {
-                       policy_set_dirty(cache->policy, oblock);
-                       break;
-               }
+       struct cache *cache = context;
  
-               writeback(cache, &structs, oblock, cblock, old_ocell);
-       }
+       if (dm_cache_changed_this_transaction(cache->cmd))
+               return commit(cache, false);
  
-       if (prealloc_used)
-               prealloc_free_structs(cache, &structs);
+       return 0;
  }
  
- /*----------------------------------------------------------------
-  * Invalidations.
-  * Dropping something from the cache *without* writing back.
-  *--------------------------------------------------------------*/
+ /*----------------------------------------------------------------*/
  
- static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
+ static bool process_flush_bio(struct cache *cache, struct bio *bio)
  {
-       int r = 0;
-       uint64_t begin = from_cblock(req->cblocks->begin);
-       uint64_t end = from_cblock(req->cblocks->end);
-       while (begin != end) {
-               r = policy_remove_cblock(cache->policy, to_cblock(begin));
-               if (!r) {
-                       r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
-                       if (r) {
-                               metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
-                               break;
-                       }
-               } else if (r == -ENODATA) {
-                       /* harmless, already unmapped */
-                       r = 0;
-               } else {
-                       DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
-                       break;
-               }
-               begin++;
-         }
-       cache->commit_requested = true;
+       size_t pb_data_size = get_per_bio_data_size(cache);
+       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  
-       req->err = r;
-       atomic_set(&req->complete, 1);
+       if (!pb->req_nr)
+               remap_to_origin(cache, bio);
+       else
+               remap_to_cache(cache, bio, 0);
  
-       wake_up(&req->result_wait);
+       issue_after_commit(&cache->committer, bio);
+       return true;
  }
  
- static void process_invalidation_requests(struct cache *cache)
+ static bool process_discard_bio(struct cache *cache, struct bio *bio)
  {
-       struct list_head list;
-       struct invalidation_request *req, *tmp;
+       dm_dblock_t b, e;
  
-       INIT_LIST_HEAD(&list);
-       spin_lock(&cache->invalidation_lock);
-       list_splice_init(&cache->invalidation_requests, &list);
-       spin_unlock(&cache->invalidation_lock);
+       // FIXME: do we need to lock the region?  Or can we just assume the
+       // user won't be so foolish as to issue discard concurrently with
+       // other IO?
+       calc_discard_block_range(cache, bio, &b, &e);
+       while (b != e) {
+               set_discard(cache, b);
+               b = to_dblock(from_dblock(b) + 1);
+       }
  
-       list_for_each_entry_safe (req, tmp, &list, list)
-               process_invalidation_request(cache, req);
- }
+       bio_endio(bio);
  
- /*----------------------------------------------------------------
-  * Main worker loop
-  *--------------------------------------------------------------*/
- static bool is_quiescing(struct cache *cache)
- {
-       return atomic_read(&cache->quiescing);
+       return false;
  }
  
- static void ack_quiescing(struct cache *cache)
+ static void process_deferred_bios(struct work_struct *ws)
  {
-       if (is_quiescing(cache)) {
-               atomic_inc(&cache->quiescing_ack);
-               wake_up(&cache->quiescing_wait);
-       }
- }
+       struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
  
- static void wait_for_quiescing_ack(struct cache *cache)
- {
-       wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
- }
+       unsigned long flags;
+       bool commit_needed = false;
+       struct bio_list bios;
+       struct bio *bio;
  
- static void start_quiescing(struct cache *cache)
- {
-       atomic_inc(&cache->quiescing);
-       wait_for_quiescing_ack(cache);
- }
+       bio_list_init(&bios);
  
- static void stop_quiescing(struct cache *cache)
- {
-       atomic_set(&cache->quiescing, 0);
-       atomic_set(&cache->quiescing_ack, 0);
- }
+       spin_lock_irqsave(&cache->lock, flags);
+       bio_list_merge(&bios, &cache->deferred_bios);
+       bio_list_init(&cache->deferred_bios);
+       spin_unlock_irqrestore(&cache->lock, flags);
  
- static void wait_for_migrations(struct cache *cache)
- {
-       wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
- }
+       while ((bio = bio_list_pop(&bios))) {
+               if (bio->bi_opf & REQ_PREFLUSH)
+                       commit_needed = process_flush_bio(cache, bio) || commit_needed;
  
- static void stop_worker(struct cache *cache)
- {
-       cancel_delayed_work(&cache->waker);
-       flush_workqueue(cache->wq);
+               else if (bio_op(bio) == REQ_OP_DISCARD)
+                       commit_needed = process_discard_bio(cache, bio) || commit_needed;
+               else
+                       commit_needed = process_bio(cache, bio) || commit_needed;
+       }
+       if (commit_needed)
+               schedule_commit(&cache->committer);
  }
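process_deferred_bios() drains the shared list into a private one while holding the spinlock and only then does the per-bio work unlocked. A userspace sketch of that splice-then-process pattern, using a pthread mutex and an invented node type; it is illustrative only, not part of this patch:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int value;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *deferred;		/* shared list, protected by 'lock' */

static void defer(int value)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return;
	n->value = value;
	pthread_mutex_lock(&lock);
	n->next = deferred;
	deferred = n;
	pthread_mutex_unlock(&lock);
}

static void process_deferred(void)
{
	struct node *local, *n;

	/* Steal the whole list while holding the lock ... */
	pthread_mutex_lock(&lock);
	local = deferred;
	deferred = NULL;
	pthread_mutex_unlock(&lock);

	/* ... then do the real work without the lock held. */
	while ((n = local)) {
		local = n->next;
		printf("processing %d\n", n->value);
		free(n);
	}
}

int main(void)
{
	defer(1);
	defer(2);
	process_deferred();
	return 0;
}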
  
- static void requeue_deferred_cells(struct cache *cache)
+ static void process_deferred_writethrough_bios(struct work_struct *ws)
  {
+       struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
        unsigned long flags;
-       struct list_head cells;
-       struct dm_bio_prison_cell *cell, *tmp;
+       struct bio_list bios;
+       struct bio *bio;
+       bio_list_init(&bios);
  
-       INIT_LIST_HEAD(&cells);
        spin_lock_irqsave(&cache->lock, flags);
-       list_splice_init(&cache->deferred_cells, &cells);
+       bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+       bio_list_init(&cache->deferred_writethrough_bios);
        spin_unlock_irqrestore(&cache->lock, flags);
  
-       list_for_each_entry_safe(cell, tmp, &cells, user_list)
-               cell_requeue(cache, cell);
+       /*
+        * These bios have already been through accounted_begin()
+        */
+       while ((bio = bio_list_pop(&bios)))
+               generic_make_request(bio);
  }
  
+ /*----------------------------------------------------------------
+  * Main worker loop
+  *--------------------------------------------------------------*/
  static void requeue_deferred_bios(struct cache *cache)
  {
        struct bio *bio;
        }
  }
  
- static int more_work(struct cache *cache)
- {
-       if (is_quiescing(cache))
-               return !list_empty(&cache->quiesced_migrations) ||
-                       !list_empty(&cache->completed_migrations) ||
-                       !list_empty(&cache->need_commit_migrations);
-       else
-               return !bio_list_empty(&cache->deferred_bios) ||
-                       !list_empty(&cache->deferred_cells) ||
-                       !bio_list_empty(&cache->deferred_flush_bios) ||
-                       !bio_list_empty(&cache->deferred_writethrough_bios) ||
-                       !list_empty(&cache->quiesced_migrations) ||
-                       !list_empty(&cache->completed_migrations) ||
-                       !list_empty(&cache->need_commit_migrations) ||
-                       cache->invalidate;
- }
- static void do_worker(struct work_struct *ws)
- {
-       struct cache *cache = container_of(ws, struct cache, worker);
-       do {
-               if (!is_quiescing(cache)) {
-                       writeback_some_dirty_blocks(cache);
-                       process_deferred_writethrough_bios(cache);
-                       process_deferred_bios(cache);
-                       process_deferred_cells(cache);
-                       process_invalidation_requests(cache);
-               }
-               process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
-               process_migrations(cache, &cache->completed_migrations, complete_migration);
-               if (commit_if_needed(cache)) {
-                       process_deferred_flush_bios(cache, false);
-                       process_migrations(cache, &cache->need_commit_migrations, migration_failure);
-               } else {
-                       process_deferred_flush_bios(cache, true);
-                       process_migrations(cache, &cache->need_commit_migrations,
-                                          migration_success_post_commit);
-               }
-               ack_quiescing(cache);
-       } while (more_work(cache));
- }
  /*
   * We want to commit periodically so that not too much
   * unwritten metadata builds up.
  static void do_waker(struct work_struct *ws)
  {
        struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
        policy_tick(cache->policy, true);
-       wake_worker(cache);
+       wake_migration_worker(cache);
+       schedule_commit(&cache->committer);
        queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
  }
  
- /*----------------------------------------------------------------*/
- static int is_congested(struct dm_dev *dev, int bdi_bits)
+ static void check_migrations(struct work_struct *ws)
  {
-       struct request_queue *q = bdev_get_queue(dev->bdev);
-       return bdi_congested(q->backing_dev_info, bdi_bits);
- }
+       int r;
+       struct policy_work *op;
+       struct cache *cache = container_of(ws, struct cache, migration_worker);
+       enum busy b;
  
- static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
- {
-       struct cache *cache = container_of(cb, struct cache, callbacks);
+       for (;;) {
+               b = spare_migration_bandwidth(cache);
+               if (b == BUSY)
+                       break;
  
-       return is_congested(cache->origin_dev, bdi_bits) ||
-               is_congested(cache->cache_dev, bdi_bits);
+               r = policy_get_background_work(cache->policy, b == IDLE, &op);
+               if (r == -ENODATA)
+                       break;
+               if (r) {
+                       DMERR_LIMIT("%s: policy_background_work failed",
+                                   cache_device_name(cache));
+                       break;
+               }
+               r = mg_start(cache, op, NULL);
+               if (r)
+                       break;
+       }
  }
  
  /*----------------------------------------------------------------
@@@ -2310,11 -2078,8 +2078,8 @@@ static void destroy(struct cache *cache
  
        mempool_destroy(cache->migration_pool);
  
-       if (cache->all_io_ds)
-               dm_deferred_set_destroy(cache->all_io_ds);
        if (cache->prison)
-               dm_bio_prison_destroy(cache->prison);
+               dm_bio_prison_destroy_v2(cache->prison);
  
        if (cache->wq)
                destroy_workqueue(cache->wq);
@@@ -2707,6 -2472,7 +2472,7 @@@ static int create_cache_policy(struct c
                return PTR_ERR(p);
        }
        cache->policy = p;
+       BUG_ON(!cache->policy);
  
        return 0;
  }
@@@ -2750,6 -2516,20 +2516,20 @@@ static void set_cache_size(struct cach
        cache->cache_size = size;
  }
  
+ static int is_congested(struct dm_dev *dev, int bdi_bits)
+ {
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+       return bdi_congested(q->backing_dev_info, bdi_bits);
+ }
+ static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+ {
+       struct cache *cache = container_of(cb, struct cache, callbacks);
+       return is_congested(cache->origin_dev, bdi_bits) ||
+               is_congested(cache->cache_dev, bdi_bits);
+ }
  #define DEFAULT_MIGRATION_THRESHOLD 2048
  
  static int cache_create(struct cache_args *ca, struct cache **result)
  
        ti->num_discard_bios = 1;
        ti->discards_supported = true;
 -      ti->discard_zeroes_data_unsupported = true;
        ti->split_discard_bios = false;
  
        cache->features = ca->features;
  
        ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
  
-       /* FIXME: factor out this whole section */
        origin_blocks = cache->origin_sectors = ca->origin_sectors;
        origin_blocks = block_div(origin_blocks, ca->block_size);
        cache->origin_blocks = to_oblock(origin_blocks);
                        r = -EINVAL;
                        goto bad;
                }
+               policy_allow_migrations(cache->policy, false);
        }
  
        spin_lock_init(&cache->lock);
        INIT_LIST_HEAD(&cache->deferred_cells);
        bio_list_init(&cache->deferred_bios);
-       bio_list_init(&cache->deferred_flush_bios);
        bio_list_init(&cache->deferred_writethrough_bios);
-       INIT_LIST_HEAD(&cache->quiesced_migrations);
-       INIT_LIST_HEAD(&cache->completed_migrations);
-       INIT_LIST_HEAD(&cache->need_commit_migrations);
        atomic_set(&cache->nr_allocated_migrations, 0);
        atomic_set(&cache->nr_io_migrations, 0);
        init_waitqueue_head(&cache->migration_wait);
  
-       init_waitqueue_head(&cache->quiescing_wait);
-       atomic_set(&cache->quiescing, 0);
-       atomic_set(&cache->quiescing_ack, 0);
        r = -ENOMEM;
        atomic_set(&cache->nr_dirty, 0);
        cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
                goto bad;
        }
  
-       cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+       cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
        if (!cache->wq) {
                *error = "could not create workqueue for metadata object";
                goto bad;
        }
-       INIT_WORK(&cache->worker, do_worker);
+       INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
+       INIT_WORK(&cache->deferred_writethrough_worker,
+                 process_deferred_writethrough_bios);
+       INIT_WORK(&cache->migration_worker, check_migrations);
        INIT_DELAYED_WORK(&cache->waker, do_waker);
-       cache->last_commit_jiffies = jiffies;
  
-       cache->prison = dm_bio_prison_create();
+       cache->prison = dm_bio_prison_create_v2(cache->wq);
        if (!cache->prison) {
                *error = "could not create bio prison";
                goto bad;
        }
  
-       cache->all_io_ds = dm_deferred_set_create();
-       if (!cache->all_io_ds) {
-               *error = "could not create all_io deferred set";
-               goto bad;
-       }
        cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
                                                         migration_cache);
        if (!cache->migration_pool) {
        spin_lock_init(&cache->invalidation_lock);
        INIT_LIST_HEAD(&cache->invalidation_requests);
  
+       batcher_init(&cache->committer, commit_op, cache,
+                    issue_op, cache, cache->wq);
        iot_init(&cache->origin_tracker);
  
+       init_rwsem(&cache->background_work_lock);
+       prevent_background_work(cache);
        *result = cache;
        return 0;
  bad:
        destroy(cache);
        return r;
@@@ -3008,7 -2782,6 +2781,6 @@@ static int cache_ctr(struct dm_target *
        }
  
        ti->private = cache;
  out:
        destroy_cache_args(ca);
        return r;
@@@ -3021,17 -2794,11 +2793,11 @@@ static int cache_map(struct dm_target *
        struct cache *cache = ti->private;
  
        int r;
-       struct dm_bio_prison_cell *cell = NULL;
+       bool commit_needed;
        dm_oblock_t block = get_bio_block(cache, bio);
        size_t pb_data_size = get_per_bio_data_size(cache);
-       bool can_migrate = false;
-       bool fast_promotion;
-       struct policy_result lookup_result;
-       struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
-       struct old_oblock_lock ool;
-       ool.locker.fn = null_locker;
  
+       init_per_bio_data(bio, pb_data_size);
        if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
                /*
                 * This can only occur if the io goes to a partial block at
                return DM_MAPIO_SUBMITTED;
        }
  
-       /*
-        * Check to see if that block is currently migrating.
-        */
-       cell = alloc_prison_cell(cache);
-       if (!cell) {
-               defer_bio(cache, bio);
-               return DM_MAPIO_SUBMITTED;
-       }
-       r = bio_detain(cache, block, bio, cell,
-                      (cell_free_fn) free_prison_cell,
-                      cache, &cell);
-       if (r) {
-               if (r < 0)
-                       defer_bio(cache, bio);
-               return DM_MAPIO_SUBMITTED;
-       }
-       fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
-       r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
-                      bio, &ool.locker, &lookup_result);
-       if (r == -EWOULDBLOCK) {
-               cell_defer(cache, cell, true);
-               return DM_MAPIO_SUBMITTED;
-       } else if (r) {
-               DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
-                           cache_device_name(cache), r);
-               cell_defer(cache, cell, false);
-               bio_io_error(bio);
-               return DM_MAPIO_SUBMITTED;
-       }
-       r = DM_MAPIO_REMAPPED;
-       switch (lookup_result.op) {
-       case POLICY_HIT:
-               if (passthrough_mode(&cache->features)) {
-                       if (bio_data_dir(bio) == WRITE) {
-                               /*
-                                * We need to invalidate this block, so
-                                * defer for the worker thread.
-                                */
-                               cell_defer(cache, cell, true);
-                               r = DM_MAPIO_SUBMITTED;
-                       } else {
-                               inc_miss_counter(cache, bio);
-                               remap_to_origin_clear_discard(cache, bio, block);
-                               accounted_begin(cache, bio);
-                               inc_ds(cache, bio, cell);
-                               // FIXME: we want to remap hits or misses straight
-                               // away rather than passing over to the worker.
-                               cell_defer(cache, cell, false);
-                       }
-               } else {
-                       inc_hit_counter(cache, bio);
-                       if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
-                           !is_dirty(cache, lookup_result.cblock)) {
-                               remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-                               accounted_begin(cache, bio);
-                               inc_ds(cache, bio, cell);
-                               cell_defer(cache, cell, false);
-                       } else
-                               remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
-               }
-               break;
-       case POLICY_MISS:
-               inc_miss_counter(cache, bio);
-               if (pb->req_nr != 0) {
-                       /*
-                        * This is a duplicate writethrough io that is no
-                        * longer needed because the block has been demoted.
-                        */
-                       bio_endio(bio);
-                       // FIXME: remap everything as a miss
-                       cell_defer(cache, cell, false);
-                       r = DM_MAPIO_SUBMITTED;
-               } else
-                       remap_cell_to_origin_clear_discard(cache, cell, block, false);
-               break;
-       default:
-               DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
-                           cache_device_name(cache), __func__,
-                           (unsigned) lookup_result.op);
-               cell_defer(cache, cell, false);
-               bio_io_error(bio);
-               r = DM_MAPIO_SUBMITTED;
-       }
+       r = map_bio(cache, bio, block, &commit_needed);
+       if (commit_needed)
+               schedule_commit(&cache->committer);
  
        return r;
  }
@@@ -3162,7 -2837,7 +2836,7 @@@ static int cache_end_io(struct dm_targe
                spin_unlock_irqrestore(&cache->lock, flags);
        }
  
-       check_for_quiesced_migrations(cache, pb);
+       bio_drop_shared_lock(cache, bio);
        accounted_complete(cache, bio);
  
        return 0;
@@@ -3262,12 -2937,18 +2936,18 @@@ static void cache_postsuspend(struct dm
  {
        struct cache *cache = ti->private;
  
-       start_quiescing(cache);
-       wait_for_migrations(cache);
-       stop_worker(cache);
+       prevent_background_work(cache);
+       BUG_ON(atomic_read(&cache->nr_io_migrations));
+       cancel_delayed_work(&cache->waker);
+       flush_workqueue(cache->wq);
+       WARN_ON(cache->origin_tracker.in_flight);
+       /*
+        * If it's a flush suspend, there won't be any deferred bios, so
+        * this call is harmless.
+        */
        requeue_deferred_bios(cache);
-       requeue_deferred_cells(cache);
-       stop_quiescing(cache);
  
        if (get_cache_mode(cache) == CM_WRITE)
                (void) sync_metadata(cache);
@@@ -3279,15 -2960,16 +2959,16 @@@ static int load_mapping(void *context, 
        int r;
        struct cache *cache = context;
  
-       r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
+       if (dirty) {
+               set_bit(from_cblock(cblock), cache->dirty_bitset);
+               atomic_inc(&cache->nr_dirty);
+       } else
+               clear_bit(from_cblock(cblock), cache->dirty_bitset);
+       r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
        if (r)
                return r;
  
-       if (dirty)
-               set_dirty(cache, oblock, cblock);
-       else
-               clear_dirty(cache, oblock, cblock);
        return 0;
  }
  
@@@ -3486,6 -3168,7 +3167,7 @@@ static void cache_resume(struct dm_targ
        struct cache *cache = ti->private;
  
        cache->need_tick_bio = true;
+       allow_background_work(cache);
        do_waker(&cache->waker.work);
  }
  
        DMEMIT("Error");
  }
  
+ /*
+  * Defines a range of cblocks: begin to (end - 1) are in the range; end is
+  * the one-past-the-end value.
+  */
+ struct cblock_range {
+       dm_cblock_t begin;
+       dm_cblock_t end;
+ };
  /*
   * A cache block range can take two forms:
   *
   * i) A single cblock, eg. '3456'
-  * ii) A begin and end cblock with dots between, eg. 123-234
+  * ii) A begin and end cblock with a dash between, eg. '123-234'
   */
  static int parse_cblock_range(struct cache *cache, const char *str,
                              struct cblock_range *result)
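The two textual forms described above, a single cblock such as '3456' or a dash-separated pair such as '123-234', together with the one-past-the-end convention of struct cblock_range, can be sketched outside the kernel. parse_range() below is an invented stand-in, not the target's parser:

#include <stdio.h>

struct range {
	unsigned long long begin;
	unsigned long long end;		/* one past the last block */
};

/* Accepts "B-E" or a single block "C"; returns 0 on success. */
static int parse_range(const char *str, struct range *r)
{
	char dummy;

	if (sscanf(str, "%llu-%llu%c", &r->begin, &r->end, &dummy) == 2)
		return 0;

	if (sscanf(str, "%llu%c", &r->begin, &dummy) == 1) {
		r->end = r->begin + 1;	/* single block: end is one past it */
		return 0;
	}

	return -1;
}

int main(void)
{
	struct range r;

	if (!parse_range("123-234", &r))
		printf("%llu blocks\n", r.end - r.begin);	/* 111 blocks */

	if (!parse_range("3456", &r))
		printf("%llu blocks\n", r.end - r.begin);	/* 1 block */

	return 0;
}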
@@@ -3689,23 -3381,31 +3380,31 @@@ static int validate_cblock_range(struc
        return 0;
  }
  
+ static inline dm_cblock_t cblock_succ(dm_cblock_t b)
+ {
+       return to_cblock(from_cblock(b) + 1);
+ }
  static int request_invalidation(struct cache *cache, struct cblock_range *range)
  {
-       struct invalidation_request req;
+       int r = 0;
  
-       INIT_LIST_HEAD(&req.list);
-       req.cblocks = range;
-       atomic_set(&req.complete, 0);
-       req.err = 0;
-       init_waitqueue_head(&req.result_wait);
+       /*
+        * We don't need to do any locking here because we know we're in
+        * passthrough mode.  There is potential for a race between an
+        * invalidation triggered by an io and an invalidation message.  This
+        * is harmless; we needn't worry if the policy call fails.
+        */
+       while (range->begin != range->end) {
+               r = invalidate_cblock(cache, range->begin);
+               if (r)
+                       return r;
  
-       spin_lock(&cache->invalidation_lock);
-       list_add(&req.list, &cache->invalidation_requests);
-       spin_unlock(&cache->invalidation_lock);
-       wake_worker(cache);
+               range->begin = cblock_succ(range->begin);
+       }
  
-       wait_event(req.result_wait, atomic_read(&req.complete));
-       return req.err;
+       cache->commit_requested = true;
+       return r;
  }
  
  static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
@@@ -3815,7 -3515,7 +3514,7 @@@ static void cache_io_hints(struct dm_ta
  
  static struct target_type cache_target = {
        .name = "cache",
-       .version = {1, 10, 0},
+       .version = {2, 0, 0},
        .module = THIS_MODULE,
        .ctr = cache_ctr,
        .dtr = cache_dtr,
diff --combined drivers/md/dm-core.h
index fea5bd52ada8fa6c01cf57fc681746854c635569,b92f74d9a9825f4931e396e9c790d60b2ff88f30..97db4d11c05ac169f4f19e23c715139dbaccdebf
@@@ -47,7 -47,7 +47,7 @@@ struct mapped_device 
        struct request_queue *queue;
        int numa_node_id;
  
-       unsigned type;
+       enum dm_queue_mode type;
        /* Protect queue and type against concurrent access. */
        struct mutex type_lock;
  
@@@ -132,7 -132,6 +132,7 @@@ void dm_init_md_queue(struct mapped_dev
  void dm_init_normal_md_queue(struct mapped_device *md);
  int md_in_flight(struct mapped_device *md);
  void disable_write_same(struct mapped_device *md);
 +void disable_write_zeroes(struct mapped_device *md);
  
  static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
  {
diff --combined drivers/md/dm-crypt.c
index ef1d836bd81b61ec8f79a1e305685f272c776580,24f3b9fdeeb6401622213308e8ea497ab1d8c82f..ebf9e72d479b9c46e2316eb121917ce9862af5be
@@@ -1,8 -1,8 +1,8 @@@
  /*
   * Copyright (C) 2003 Jana Saout <jana@saout.de>
   * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
-  * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
-  * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
+  * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
+  * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com>
   *
   * This file is released under the GPL.
   */
@@@ -31,6 -31,9 +31,9 @@@
  #include <crypto/md5.h>
  #include <crypto/algapi.h>
  #include <crypto/skcipher.h>
+ #include <crypto/aead.h>
+ #include <crypto/authenc.h>
+ #include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */
  #include <keys/user-type.h>
  
  #include <linux/device-mapper.h>
@@@ -48,7 -51,11 +51,11 @@@ struct convert_context 
        struct bvec_iter iter_out;
        sector_t cc_sector;
        atomic_t cc_pending;
-       struct skcipher_request *req;
+       union {
+               struct skcipher_request *req;
+               struct aead_request *req_aead;
+       } r;
  };
  
  /*
@@@ -57,6 -64,8 +64,8 @@@
  struct dm_crypt_io {
        struct crypt_config *cc;
        struct bio *base_bio;
+       u8 *integrity_metadata;
+       bool integrity_metadata_from_pool;
        struct work_struct work;
  
        struct convert_context ctx;
@@@ -70,8 -79,8 +79,8 @@@
  
  struct dm_crypt_request {
        struct convert_context *ctx;
-       struct scatterlist sg_in;
-       struct scatterlist sg_out;
+       struct scatterlist sg_in[4];
+       struct scatterlist sg_out[4];
        sector_t iv_sector;
  };
  
@@@ -118,6 -127,11 +127,11 @@@ struct iv_tcw_private 
  enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
             DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
  
+ enum cipher_flags {
+       CRYPT_MODE_INTEGRITY_AEAD,      /* Use authenticated mode for cipher */
+       CRYPT_IV_LARGE_SECTORS,         /* Calculate IV from sector_size, not 512B sectors */
+ };
  /*
   * The fields in here must be read only after initialization.
   */
@@@ -126,11 -140,14 +140,14 @@@ struct crypt_config 
        sector_t start;
  
        /*
-        * pool for per bio private data, crypto requests and
-        * encryption requeusts/buffer pages
+        * pool for per bio private data, crypto requests,
+        * encryption requests/buffer pages and integrity tags
         */
        mempool_t *req_pool;
        mempool_t *page_pool;
+       mempool_t *tag_pool;
+       unsigned tag_pool_max_sectors;
        struct bio_set *bs;
        struct mutex bio_alloc_lock;
  
  
        char *cipher;
        char *cipher_string;
+       char *cipher_auth;
        char *key_string;
  
        const struct crypt_iv_operations *iv_gen_ops;
        } iv_gen_private;
        sector_t iv_offset;
        unsigned int iv_size;
+       unsigned short int sector_size;
+       unsigned char sector_shift;
  
        /* ESSIV: struct crypto_cipher *essiv_tfm */
        void *iv_private;
-       struct crypto_skcipher **tfms;
+       union {
+               struct crypto_skcipher **tfms;
+               struct crypto_aead **tfms_aead;
+       } cipher_tfm;
        unsigned tfms_count;
+       unsigned long cipher_flags;
  
        /*
         * Layout of each crypto request:
        unsigned int key_size;
        unsigned int key_parts;      /* independent parts in key buffer */
        unsigned int key_extra_size; /* additional keys length */
+       unsigned int key_mac_size;   /* MAC key size for authenc(...) */
+       unsigned int integrity_tag_size;
+       unsigned int integrity_iv_size;
+       unsigned int on_disk_tag_size;
+       u8 *authenc_key; /* space for keys in authenc() format (if used) */
        u8 key[0];
  };
  
- #define MIN_IOS        64
+ #define MIN_IOS               64
+ #define MAX_TAG_SIZE  480
+ #define POOL_ENTRY_SIZE       512
  
  static void clone_init(struct dm_crypt_io *, struct bio *);
  static void kcryptd_queue_crypt(struct dm_crypt_io *io);
- static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
+ static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
+                                            struct scatterlist *sg);
  
  /*
-  * Use this to access cipher attributes that are the same for each CPU.
+  * Use this to access cipher attributes that are independent of the key.
   */
  static struct crypto_skcipher *any_tfm(struct crypt_config *cc)
  {
-       return cc->tfms[0];
+       return cc->cipher_tfm.tfms[0];
+ }
+ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
+ {
+       return cc->cipher_tfm.tfms_aead[0];
  }
  
  /*
@@@ -310,10 -349,11 +349,11 @@@ static int crypt_iv_essiv_wipe(struct c
        return err;
  }
  
- /* Set up per cpu cipher state */
- static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
-                                            struct dm_target *ti,
-                                            u8 *salt, unsigned saltsize)
+ /* Allocate the cipher for ESSIV */
+ static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
+                                               struct dm_target *ti,
+                                               const u8 *salt,
+                                               unsigned int saltsize)
  {
        struct crypto_cipher *essiv_tfm;
        int err;
                return essiv_tfm;
        }
  
-       if (crypto_cipher_blocksize(essiv_tfm) !=
-           crypto_skcipher_ivsize(any_tfm(cc))) {
+       if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) {
                ti->error = "Block size of ESSIV cipher does "
                            "not match IV size of block cipher";
                crypto_free_cipher(essiv_tfm);
@@@ -393,8 -432,8 +432,8 @@@ static int crypt_iv_essiv_ctr(struct cr
        cc->iv_gen_private.essiv.salt = salt;
        cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
  
-       essiv_tfm = setup_essiv_cpu(cc, ti, salt,
-                               crypto_ahash_digestsize(hash_tfm));
+       essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
+                                      crypto_ahash_digestsize(hash_tfm));
        if (IS_ERR(essiv_tfm)) {
                crypt_iv_essiv_dtr(cc);
                return PTR_ERR(essiv_tfm);
@@@ -488,6 -527,11 +527,11 @@@ static int crypt_iv_lmk_ctr(struct cryp
  {
        struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
  
+       if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+               ti->error = "Unsupported sector size for LMK";
+               return -EINVAL;
+       }
        lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
        if (IS_ERR(lmk->hash_tfm)) {
                ti->error = "Error initializing LMK hash";
@@@ -585,12 -629,14 +629,14 @@@ static int crypt_iv_lmk_one(struct cryp
  static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
                            struct dm_crypt_request *dmreq)
  {
+       struct scatterlist *sg;
        u8 *src;
        int r = 0;
  
        if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
-               src = kmap_atomic(sg_page(&dmreq->sg_in));
-               r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
+               sg = crypt_get_sg_data(cc, dmreq->sg_in);
+               src = kmap_atomic(sg_page(sg));
+               r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset);
                kunmap_atomic(src);
        } else
                memset(iv, 0, cc->iv_size);
  static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
                             struct dm_crypt_request *dmreq)
  {
+       struct scatterlist *sg;
        u8 *dst;
        int r;
  
        if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
                return 0;
  
-       dst = kmap_atomic(sg_page(&dmreq->sg_out));
-       r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
+       sg = crypt_get_sg_data(cc, dmreq->sg_out);
+       dst = kmap_atomic(sg_page(sg));
+       r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset);
  
        /* Tweak the first block of plaintext sector */
        if (!r)
-               crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
+               crypto_xor(dst + sg->offset, iv, cc->iv_size);
  
        kunmap_atomic(dst);
        return r;
@@@ -637,6 -685,11 +685,11 @@@ static int crypt_iv_tcw_ctr(struct cryp
  {
        struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
  
+       if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+               ti->error = "Unsupported sector size for TCW";
+               return -EINVAL;
+       }
        if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
                ti->error = "Wrong key size for TCW";
                return -EINVAL;
@@@ -724,6 -777,7 +777,7 @@@ out
  static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
                            struct dm_crypt_request *dmreq)
  {
+       struct scatterlist *sg;
        struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
        __le64 sector = cpu_to_le64(dmreq->iv_sector);
        u8 *src;
  
        /* Remove whitening from ciphertext */
        if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
-               src = kmap_atomic(sg_page(&dmreq->sg_in));
-               r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
+               sg = crypt_get_sg_data(cc, dmreq->sg_in);
+               src = kmap_atomic(sg_page(sg));
+               r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
                kunmap_atomic(src);
        }
  
  static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
                             struct dm_crypt_request *dmreq)
  {
+       struct scatterlist *sg;
        u8 *dst;
        int r;
  
                return 0;
  
        /* Apply whitening on ciphertext */
-       dst = kmap_atomic(sg_page(&dmreq->sg_out));
-       r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
+       sg = crypt_get_sg_data(cc, dmreq->sg_out);
+       dst = kmap_atomic(sg_page(sg));
+       r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
        kunmap_atomic(dst);
  
        return r;
  }
  
+ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
+                               struct dm_crypt_request *dmreq)
+ {
+       /* Used only for writes; there must be additional space to store the IV */
+       get_random_bytes(iv, cc->iv_size);
+       return 0;
+ }
  static const struct crypt_iv_operations crypt_iv_plain_ops = {
        .generator = crypt_iv_plain_gen
  };
@@@ -806,6 -871,108 +871,108 @@@ static const struct crypt_iv_operation
        .post      = crypt_iv_tcw_post
  };
  
+ static struct crypt_iv_operations crypt_iv_random_ops = {
+       .generator = crypt_iv_random_gen
+ };
+ /*
+  * Integrity extensions
+  */
+ static bool crypt_integrity_aead(struct crypt_config *cc)
+ {
+       return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
+ }
+ static bool crypt_integrity_hmac(struct crypt_config *cc)
+ {
+       return crypt_integrity_aead(cc) && cc->key_mac_size;
+ }
+ /* Get sg containing data */
+ static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
+                                            struct scatterlist *sg)
+ {
+       if (unlikely(crypt_integrity_aead(cc)))
+               return &sg[2];
+       return sg;
+ }
+ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
+ {
+       struct bio_integrity_payload *bip;
+       unsigned int tag_len;
+       int ret;
+       if (!bio_sectors(bio) || !io->cc->on_disk_tag_size)
+               return 0;
+       bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+       if (IS_ERR(bip))
+               return PTR_ERR(bip);
+       tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
+       bip->bip_iter.bi_size = tag_len;
+       bip->bip_iter.bi_sector = io->cc->start + io->sector;
+       /* We own the metadata; do not let bio_free release it */
+       bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
+       ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
+                                    tag_len, offset_in_page(io->integrity_metadata));
+       if (unlikely(ret != tag_len))
+               return -ENOMEM;
+       return 0;
+ }
+ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
+ {
+ #ifdef CONFIG_BLK_DEV_INTEGRITY
+       struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
+       /* From now on we require an underlying device with our integrity profile */
+       if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
+               ti->error = "Integrity profile not supported.";
+               return -EINVAL;
+       }
+       if (bi->tag_size != cc->on_disk_tag_size ||
+           bi->tuple_size != cc->on_disk_tag_size) {
+               ti->error = "Integrity profile tag size mismatch.";
+               return -EINVAL;
+       }
+       if (1 << bi->interval_exp != cc->sector_size) {
+               ti->error = "Integrity profile sector size mismatch.";
+               return -EINVAL;
+       }
+       if (crypt_integrity_aead(cc)) {
+               cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
+               DMINFO("Integrity AEAD, tag size %u, IV size %u.",
+                      cc->integrity_tag_size, cc->integrity_iv_size);
+               if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
+                       ti->error = "Integrity AEAD auth tag size is not supported.";
+                       return -EINVAL;
+               }
+       } else if (cc->integrity_iv_size)
+               DMINFO("Additional per-sector space %u bytes for IV.",
+                      cc->integrity_iv_size);
+       if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
+               ti->error = "Not enough space for integrity tag in the profile.";
+               return -EINVAL;
+       }
+       return 0;
+ #else
+       ti->error = "Integrity profile not supported.";
+       return -EINVAL;
+ #endif
+ }
  static void crypt_convert_init(struct crypt_config *cc,
                               struct convert_context *ctx,
                               struct bio *bio_out, struct bio *bio_in,
  }
  
  static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc,
-                                            struct skcipher_request *req)
+                                            void *req)
  {
        return (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
  }
  
- static struct skcipher_request *req_of_dmreq(struct crypt_config *cc,
-                                              struct dm_crypt_request *dmreq)
+ static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq)
  {
-       return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start);
+       return (void *)((char *)dmreq - cc->dmreq_start);
  }
  
  static u8 *iv_of_dmreq(struct crypt_config *cc,
                       struct dm_crypt_request *dmreq)
  {
-       return (u8 *)ALIGN((unsigned long)(dmreq + 1),
-               crypto_skcipher_alignmask(any_tfm(cc)) + 1);
+       if (crypt_integrity_aead(cc))
+               return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+                       crypto_aead_alignmask(any_tfm_aead(cc)) + 1);
+       else
+               return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+                       crypto_skcipher_alignmask(any_tfm(cc)) + 1);
  }
  
- static int crypt_convert_block(struct crypt_config *cc,
-                              struct convert_context *ctx,
-                              struct skcipher_request *req)
+ static u8 *org_iv_of_dmreq(struct crypt_config *cc,
+                      struct dm_crypt_request *dmreq)
+ {
+       return iv_of_dmreq(cc, dmreq) + cc->iv_size;
+ }
+ static uint64_t *org_sector_of_dmreq(struct crypt_config *cc,
+                      struct dm_crypt_request *dmreq)
+ {
+       u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
+       return (uint64_t*) ptr;
+ }
+ static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
+                      struct dm_crypt_request *dmreq)
+ {
+       u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size +
+                 cc->iv_size + sizeof(uint64_t);
+       return (unsigned int*)ptr;
+ }
+ static void *tag_from_dmreq(struct crypt_config *cc,
+                               struct dm_crypt_request *dmreq)
+ {
+       struct convert_context *ctx = dmreq->ctx;
+       struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
+       return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) *
+               cc->on_disk_tag_size];
+ }
+ static void *iv_tag_from_dmreq(struct crypt_config *cc,
+                              struct dm_crypt_request *dmreq)
+ {
+       return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size;
+ }
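The accessors above walk a small trailer that sits behind each struct dm_crypt_request: the working IV, the original IV, the original little-endian sector number and the bio tag offset, in that order (the mempool sizing at the end of this patch accounts for exactly these fields). Below is a minimal userspace sketch of that offset arithmetic; the structure size, IV size and alignment mask are made-up example values.

        #include <stdint.h>
        #include <stdio.h>

        /*
         *   | padded IV | original IV | original sector (u64) | tag offset (uint) |
         */
        #define EXAMPLE_IV_SIZE         16u     /* e.g. a 128-bit block cipher IV     */
        #define EXAMPLE_ALIGNMASK       15u     /* assumed cipher alignment mask      */

        #define ALIGN_UP(x, mask)       (((x) + (mask)) & ~(uintptr_t)(mask))

        int main(void)
        {
                uintptr_t dmreq_end = 40;       /* pretend dm_crypt_request ends here */
                uintptr_t iv        = ALIGN_UP(dmreq_end, EXAMPLE_ALIGNMASK);
                uintptr_t org_iv    = iv + EXAMPLE_IV_SIZE;
                uintptr_t org_sec   = org_iv + EXAMPLE_IV_SIZE;
                uintptr_t tag_off   = org_sec + sizeof(uint64_t);

                printf("iv @%zu, org_iv @%zu, org_sector @%zu, tag_offset @%zu\n",
                       (size_t)iv, (size_t)org_iv, (size_t)org_sec, (size_t)tag_off);
                return 0;
        }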
+ static int crypt_convert_block_aead(struct crypt_config *cc,
+                                    struct convert_context *ctx,
+                                    struct aead_request *req,
+                                    unsigned int tag_offset)
  {
        struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
        struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
        struct dm_crypt_request *dmreq;
-       u8 *iv;
-       int r;
+       u8 *iv, *org_iv, *tag_iv, *tag;
+       uint64_t *sector;
+       int r = 0;
+       BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
+       /* Reject unexpected unaligned bio. */
+       if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+               return -EIO;
  
        dmreq = dmreq_of_req(cc, req);
+       dmreq->iv_sector = ctx->cc_sector;
+       if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+               dmreq->iv_sector >>= cc->sector_shift;
+       dmreq->ctx = ctx;
+       *org_tag_of_dmreq(cc, dmreq) = tag_offset;
+       sector = org_sector_of_dmreq(cc, dmreq);
+       *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
        iv = iv_of_dmreq(cc, dmreq);
+       org_iv = org_iv_of_dmreq(cc, dmreq);
+       tag = tag_from_dmreq(cc, dmreq);
+       tag_iv = iv_tag_from_dmreq(cc, dmreq);
+       /* AEAD request:
+        *  |----- AAD -------|------ DATA -------|-- AUTH TAG --|
+        *  | (authenticated) | (auth+encryption) |              |
+        *  | sector_LE |  IV |  sector in/out    |  tag in/out  |
+        */
+       sg_init_table(dmreq->sg_in, 4);
+       sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t));
+       sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size);
+       sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
+       sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size);
+       sg_init_table(dmreq->sg_out, 4);
+       sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t));
+       sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size);
+       sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
+       sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size);
+       if (cc->iv_gen_ops) {
+               /* For READs use IV stored in integrity metadata */
+               if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
+                       memcpy(org_iv, tag_iv, cc->iv_size);
+               } else {
+                       r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
+                       if (r < 0)
+                               return r;
+                       /* Store generated IV in integrity metadata */
+                       if (cc->integrity_iv_size)
+                               memcpy(tag_iv, org_iv, cc->iv_size);
+               }
+               /* Working copy of IV, to be modified in crypto API */
+               memcpy(iv, org_iv, cc->iv_size);
+       }
+       aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size);
+       if (bio_data_dir(ctx->bio_in) == WRITE) {
+               aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
+                                      cc->sector_size, iv);
+               r = crypto_aead_encrypt(req);
+               if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size)
+                       memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0,
+                              cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size));
+       } else {
+               aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
+                                      cc->sector_size + cc->integrity_tag_size, iv);
+               r = crypto_aead_decrypt(req);
+       }
+       if (r == -EBADMSG)
+               DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+                           (unsigned long long)le64_to_cpu(*sector));
+       if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+               r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
+       bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
+       bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
+       return r;
+ }
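As the layout comment in crypt_convert_block_aead() shows, each crypto sector is presented to the AEAD as a 4-entry scatterlist: an 8-byte little-endian sector number plus the IV as AAD, then the sector payload, then the authentication tag. A worked example with assumed sizes follows (e.g. a 16-byte tag and a 12-byte stored IV give a 28-byte per-sector tag area); none of these values are queried from a real transform.

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                unsigned int iv_size          = 12;     /* e.g. a GCM-style IV           */
                unsigned int integrity_tag_sz = 16;     /* auth tag produced by the AEAD */
                unsigned int integrity_iv_sz  = 12;     /* random IV stored per sector   */
                unsigned int sector_size      = 512;    /* default crypto sector size    */
                unsigned int bio_sectors      = 8;      /* a 4 KiB bio                   */

                /* AAD = little-endian sector number + IV, as in sg_in[0] and sg_in[1]. */
                unsigned int aad = (unsigned int)sizeof(uint64_t) + iv_size;

                /* On-disk tag area per sector: auth tag plus the stored IV.            */
                unsigned int on_disk_tag = integrity_tag_sz + integrity_iv_sz;

                printf("AAD %u B, payload %u B, tag area %u B/sector, %u B for the bio\n",
                       aad, sector_size, on_disk_tag, on_disk_tag * bio_sectors);
                return 0;
        }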
+ static int crypt_convert_block_skcipher(struct crypt_config *cc,
+                                       struct convert_context *ctx,
+                                       struct skcipher_request *req,
+                                       unsigned int tag_offset)
+ {
+       struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
+       struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
+       struct scatterlist *sg_in, *sg_out;
+       struct dm_crypt_request *dmreq;
+       u8 *iv, *org_iv, *tag_iv;
+       uint64_t *sector;
+       int r = 0;
  
+       /* Reject unexpected unaligned bio. */
+       if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+               return -EIO;
+       dmreq = dmreq_of_req(cc, req);
        dmreq->iv_sector = ctx->cc_sector;
+       if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+               dmreq->iv_sector >>= cc->sector_shift;
        dmreq->ctx = ctx;
-       sg_init_table(&dmreq->sg_in, 1);
-       sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
-                   bv_in.bv_offset);
  
-       sg_init_table(&dmreq->sg_out, 1);
-       sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
-                   bv_out.bv_offset);
+       *org_tag_of_dmreq(cc, dmreq) = tag_offset;
+       iv = iv_of_dmreq(cc, dmreq);
+       org_iv = org_iv_of_dmreq(cc, dmreq);
+       tag_iv = iv_tag_from_dmreq(cc, dmreq);
+       sector = org_sector_of_dmreq(cc, dmreq);
+       *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
+       /* For skcipher we use only the first sg item */
+       sg_in  = &dmreq->sg_in[0];
+       sg_out = &dmreq->sg_out[0];
  
-       bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
-       bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
+       sg_init_table(sg_in, 1);
+       sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
+       sg_init_table(sg_out, 1);
+       sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
  
        if (cc->iv_gen_ops) {
-               r = cc->iv_gen_ops->generator(cc, iv, dmreq);
-               if (r < 0)
-                       return r;
+               /* For READs use IV stored in integrity metadata */
+               if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
+                       memcpy(org_iv, tag_iv, cc->integrity_iv_size);
+               } else {
+                       r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
+                       if (r < 0)
+                               return r;
+                       /* Store generated IV in integrity metadata */
+                       if (cc->integrity_iv_size)
+                               memcpy(tag_iv, org_iv, cc->integrity_iv_size);
+               }
+               /* Working copy of IV, to be modified in crypto API */
+               memcpy(iv, org_iv, cc->iv_size);
        }
  
-       skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
-                                  1 << SECTOR_SHIFT, iv);
+       skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv);
  
        if (bio_data_dir(ctx->bio_in) == WRITE)
                r = crypto_skcipher_encrypt(req);
                r = crypto_skcipher_decrypt(req);
  
        if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
-               r = cc->iv_gen_ops->post(cc, iv, dmreq);
+               r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
+       bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
+       bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
  
        return r;
  }
  static void kcryptd_async_done(struct crypto_async_request *async_req,
                               int error);
  
- static void crypt_alloc_req(struct crypt_config *cc,
-                           struct convert_context *ctx)
+ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
+                                    struct convert_context *ctx)
  {
        unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
  
-       if (!ctx->req)
-               ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+       if (!ctx->r.req)
+               ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO);
  
-       skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+       skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
  
        /*
         * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
         * requests if driver request queue is full.
         */
-       skcipher_request_set_callback(ctx->req,
+       skcipher_request_set_callback(ctx->r.req,
            CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-           kcryptd_async_done, dmreq_of_req(cc, ctx->req));
+           kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
  }
  
- static void crypt_free_req(struct crypt_config *cc,
-                          struct skcipher_request *req, struct bio *base_bio)
+ static void crypt_alloc_req_aead(struct crypt_config *cc,
+                                struct convert_context *ctx)
+ {
+       if (!ctx->r.req_aead)
+               ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO);
+       aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
+       /*
+        * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+        * requests if driver request queue is full.
+        */
+       aead_request_set_callback(ctx->r.req_aead,
+           CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+           kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
+ }
+ static void crypt_alloc_req(struct crypt_config *cc,
+                           struct convert_context *ctx)
+ {
+       if (crypt_integrity_aead(cc))
+               crypt_alloc_req_aead(cc, ctx);
+       else
+               crypt_alloc_req_skcipher(cc, ctx);
+ }
+ static void crypt_free_req_skcipher(struct crypt_config *cc,
+                                   struct skcipher_request *req, struct bio *base_bio)
  {
        struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
  
                mempool_free(req, cc->req_pool);
  }
  
+ static void crypt_free_req_aead(struct crypt_config *cc,
+                               struct aead_request *req, struct bio *base_bio)
+ {
+       struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+       if ((struct aead_request *)(io + 1) != req)
+               mempool_free(req, cc->req_pool);
+ }
+ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio)
+ {
+       if (crypt_integrity_aead(cc))
+               crypt_free_req_aead(cc, req, base_bio);
+       else
+               crypt_free_req_skcipher(cc, req, base_bio);
+ }
  /*
   * Encrypt / decrypt data from one bio to another one (can be the same one)
   */
  static int crypt_convert(struct crypt_config *cc,
                         struct convert_context *ctx)
  {
+       unsigned int tag_offset = 0;
+       unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
        int r;
  
        atomic_set(&ctx->cc_pending, 1);
        while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
  
                crypt_alloc_req(cc, ctx);
                atomic_inc(&ctx->cc_pending);
  
-               r = crypt_convert_block(cc, ctx, ctx->req);
+               if (crypt_integrity_aead(cc))
+                       r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset);
+               else
+                       r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset);
  
                switch (r) {
                /*
                 * completion function kcryptd_async_done() will be called.
                 */
                case -EINPROGRESS:
-                       ctx->req = NULL;
-                       ctx->cc_sector++;
+                       ctx->r.req = NULL;
+                       ctx->cc_sector += sector_step;
+                       tag_offset++;
                        continue;
                /*
                 * The request was already processed (synchronously).
                 */
                case 0:
                        atomic_dec(&ctx->cc_pending);
-                       ctx->cc_sector++;
+                       ctx->cc_sector += sector_step;
+                       tag_offset++;
                        cond_resched();
                        continue;
-               /* There was an error while processing the request. */
+               /*
+                * There was a data integrity error.
+                */
+               case -EBADMSG:
+                       atomic_dec(&ctx->cc_pending);
+                       return -EILSEQ;
+               /*
+                * There was an error while processing the request.
+                */
                default:
                        atomic_dec(&ctx->cc_pending);
-                       return r;
+                       return -EIO;
                }
        }
  
@@@ -1005,7 -1390,7 +1390,7 @@@ retry
  
        clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
        if (!clone)
-               goto return_clone;
+               goto out;
  
        clone_init(io, clone);
  
                remaining_size -= len;
        }
  
- return_clone:
+       /* Allocate space for integrity tags */
+       if (dm_crypt_integrity_io_alloc(io, clone)) {
+               crypt_free_buffer_pages(cc, clone);
+               bio_put(clone);
+               clone = NULL;
+       }
+ out:
        if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
                mutex_unlock(&cc->bio_alloc_lock);
  
@@@ -1053,7 -1444,9 +1444,9 @@@ static void crypt_io_init(struct dm_cry
        io->base_bio = bio;
        io->sector = sector;
        io->error = 0;
-       io->ctx.req = NULL;
+       io->ctx.r.req = NULL;
+       io->integrity_metadata = NULL;
+       io->integrity_metadata_from_pool = false;
        atomic_set(&io->io_pending, 0);
  }
  
@@@ -1075,8 -1468,13 +1468,13 @@@ static void crypt_dec_pending(struct dm
        if (!atomic_dec_and_test(&io->io_pending))
                return;
  
-       if (io->ctx.req)
-               crypt_free_req(cc, io->ctx.req, base_bio);
+       if (io->ctx.r.req)
+               crypt_free_req(cc, io->ctx.r.req, base_bio);
+       if (unlikely(io->integrity_metadata_from_pool))
+               mempool_free(io->integrity_metadata, io->cc->tag_pool);
+       else
+               kfree(io->integrity_metadata);
  
        base_bio->bi_error = error;
        bio_endio(base_bio);
@@@ -1156,6 -1554,12 +1554,12 @@@ static int kcryptd_io_read(struct dm_cr
        clone_init(io, clone);
        clone->bi_iter.bi_sector = cc->start + io->sector;
  
+       if (dm_crypt_integrity_io_alloc(io, clone)) {
+               crypt_dec_pending(io);
+               bio_put(clone);
+               return 1;
+       }
        generic_make_request(clone);
        return 0;
  }
@@@ -1314,8 -1718,8 +1718,8 @@@ static void kcryptd_crypt_write_convert
  
        crypt_inc_pending(io);
        r = crypt_convert(cc, &io->ctx);
-       if (r)
-               io->error = -EIO;
+       if (r < 0)
+               io->error = r;
        crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
  
        /* Encryption was already finished, submit io now */
@@@ -1345,7 -1749,7 +1749,7 @@@ static void kcryptd_crypt_read_convert(
  
        r = crypt_convert(cc, &io->ctx);
        if (r < 0)
-               io->error = -EIO;
+               io->error = r;
  
        if (atomic_dec_and_test(&io->ctx.cc_pending))
                kcryptd_crypt_read_done(io);
@@@ -1372,9 -1776,13 +1776,13 @@@ static void kcryptd_async_done(struct c
        }
  
        if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
-               error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+               error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
  
-       if (error < 0)
+       if (error == -EBADMSG) {
+               DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+                           (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
+               io->error = -EILSEQ;
+       } else if (error < 0)
                io->error = -EIO;
  
        crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
@@@ -1406,61 -1814,59 +1814,59 @@@ static void kcryptd_queue_crypt(struct 
        queue_work(cc->crypt_queue, &io->work);
  }
  
- /*
-  * Decode key from its hex representation
-  */
- static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
+ static void crypt_free_tfms_aead(struct crypt_config *cc)
  {
-       char buffer[3];
-       unsigned int i;
-       buffer[2] = '\0';
-       for (i = 0; i < size; i++) {
-               buffer[0] = *hex++;
-               buffer[1] = *hex++;
+       if (!cc->cipher_tfm.tfms_aead)
+               return;
  
-               if (kstrtou8(buffer, 16, &key[i]))
-                       return -EINVAL;
+       if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
+               crypto_free_aead(cc->cipher_tfm.tfms_aead[0]);
+               cc->cipher_tfm.tfms_aead[0] = NULL;
        }
  
-       if (*hex != '\0')
-               return -EINVAL;
-       return 0;
+       kfree(cc->cipher_tfm.tfms_aead);
+       cc->cipher_tfm.tfms_aead = NULL;
  }
  
- static void crypt_free_tfms(struct crypt_config *cc)
+ static void crypt_free_tfms_skcipher(struct crypt_config *cc)
  {
        unsigned i;
  
-       if (!cc->tfms)
+       if (!cc->cipher_tfm.tfms)
                return;
  
        for (i = 0; i < cc->tfms_count; i++)
-               if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
-                       crypto_free_skcipher(cc->tfms[i]);
-                       cc->tfms[i] = NULL;
+               if (cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) {
+                       crypto_free_skcipher(cc->cipher_tfm.tfms[i]);
+                       cc->cipher_tfm.tfms[i] = NULL;
                }
  
-       kfree(cc->tfms);
-       cc->tfms = NULL;
+       kfree(cc->cipher_tfm.tfms);
+       cc->cipher_tfm.tfms = NULL;
  }
  
- static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
+ static void crypt_free_tfms(struct crypt_config *cc)
+ {
+       if (crypt_integrity_aead(cc))
+               crypt_free_tfms_aead(cc);
+       else
+               crypt_free_tfms_skcipher(cc);
+ }
+ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
  {
        unsigned i;
        int err;
  
-       cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *),
-                          GFP_KERNEL);
-       if (!cc->tfms)
+       cc->cipher_tfm.tfms = kzalloc(cc->tfms_count *
+                                     sizeof(struct crypto_skcipher *), GFP_KERNEL);
+       if (!cc->cipher_tfm.tfms)
                return -ENOMEM;
  
        for (i = 0; i < cc->tfms_count; i++) {
-               cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
-               if (IS_ERR(cc->tfms[i])) {
-                       err = PTR_ERR(cc->tfms[i]);
+               cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
+               if (IS_ERR(cc->cipher_tfm.tfms[i])) {
+                       err = PTR_ERR(cc->cipher_tfm.tfms[i]);
                        crypt_free_tfms(cc);
                        return err;
                }
        return 0;
  }
  
+ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
+ {
+       int err;
+       cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL);
+       if (!cc->cipher_tfm.tfms)
+               return -ENOMEM;
+       cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0);
+       if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
+               err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]);
+               crypt_free_tfms(cc);
+               return err;
+       }
+       return 0;
+ }
+ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
+ {
+       if (crypt_integrity_aead(cc))
+               return crypt_alloc_tfms_aead(cc, ciphermode);
+       else
+               return crypt_alloc_tfms_skcipher(cc, ciphermode);
+ }
+ static unsigned crypt_subkey_size(struct crypt_config *cc)
+ {
+       return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+ }
+ static unsigned crypt_authenckey_size(struct crypt_config *cc)
+ {
+       return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param));
+ }
+ /*
+  * If AEAD is composed like authenc(hmac(sha256),xts(aes)),
+  * the key has to be passed to the crypto layer in a special packed format.
+  * This function converts cc->key to this special format.
+  */
+ static void crypt_copy_authenckey(char *p, const void *key,
+                                 unsigned enckeylen, unsigned authkeylen)
+ {
+       struct crypto_authenc_key_param *param;
+       struct rtattr *rta;
+       rta = (struct rtattr *)p;
+       param = RTA_DATA(rta);
+       param->enckeylen = cpu_to_be32(enckeylen);
+       rta->rta_len = RTA_LENGTH(sizeof(*param));
+       rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+       p += RTA_SPACE(sizeof(*param));
+       memcpy(p, key + enckeylen, authkeylen);
+       p += authkeylen;
+       memcpy(p, key, enckeylen);
+ }
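crypt_copy_authenckey() packs cc->key into the layout authenc() expects: an rtattr carrying the big-endian encryption-key length, followed by the authentication (MAC) key and then the encryption key. A hedged userspace sketch of that packing is shown below; the rtattr type value and the parameter struct are copied by hand and should be treated as assumptions (the authoritative definitions live in the kernel's crypto/authenc.h), and it assumes a Linux system with the uapi headers installed.

        #define _DEFAULT_SOURCE
        #include <linux/rtnetlink.h>    /* struct rtattr and the RTA_* macros (uapi) */
        #include <arpa/inet.h>          /* htonl(): the length field is big-endian   */
        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        /* Hand-copied mirror of crypto/authenc.h definitions -- treat as assumptions. */
        #define CRYPTO_AUTHENC_KEYA_PARAM 1
        struct authenc_key_param_example {
                uint32_t enckeylen;
        };

        /*
         * Pack a key the way crypt_copy_authenckey() does:
         *   | rtattr header + enckeylen | authentication key | encryption key |
         * Returns the number of bytes written to 'p'.
         */
        static size_t pack_authenc_key(char *p,
                                       const unsigned char *enckey, unsigned int enckeylen,
                                       const unsigned char *authkey, unsigned int authkeylen)
        {
                struct rtattr *rta = (struct rtattr *)p;
                struct authenc_key_param_example *param = RTA_DATA(rta);

                rta->rta_len  = RTA_LENGTH(sizeof(*param));
                rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
                param->enckeylen = htonl(enckeylen);

                memcpy(p + RTA_SPACE(sizeof(*param)), authkey, authkeylen);
                memcpy(p + RTA_SPACE(sizeof(*param)) + authkeylen, enckey, enckeylen);
                return RTA_SPACE(sizeof(*param)) + authkeylen + enckeylen;
        }

        int main(void)
        {
                unsigned char enc[32] = { 0 }, auth[32] = { 0 };
                char buf[128];

                printf("packed %zu bytes\n",
                       pack_authenc_key(buf, enc, sizeof(enc), auth, sizeof(auth)));
                return 0;
        }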
  static int crypt_setkey(struct crypt_config *cc)
  {
        unsigned subkey_size;
        int err = 0, i, r;
  
        /* Ignore extra keys (which are used for IV etc) */
-       subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+       subkey_size = crypt_subkey_size(cc);
  
+       if (crypt_integrity_hmac(cc))
+               crypt_copy_authenckey(cc->authenc_key, cc->key,
+                                     subkey_size - cc->key_mac_size,
+                                     cc->key_mac_size);
        for (i = 0; i < cc->tfms_count; i++) {
-               r = crypto_skcipher_setkey(cc->tfms[i],
-                                          cc->key + (i * subkey_size),
-                                          subkey_size);
+               if (crypt_integrity_hmac(cc))
+                       r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
+                               cc->authenc_key, crypt_authenckey_size(cc));
+               else if (crypt_integrity_aead(cc))
+                       r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
+                                              cc->key + (i * subkey_size),
+                                              subkey_size);
+               else
+                       r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i],
+                                                  cc->key + (i * subkey_size),
+                                                  subkey_size);
                if (r)
                        err = r;
        }
  
+       if (crypt_integrity_hmac(cc))
+               memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc));
        return err;
  }
  
@@@ -1633,7 -2112,8 +2112,8 @@@ static int crypt_set_key(struct crypt_c
        kzfree(cc->key_string);
        cc->key_string = NULL;
  
-       if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
+       /* Decode key from its hex representation. */
+       if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0)
                goto out;
  
        r = crypt_setkey(cc);
  
  static int crypt_wipe_key(struct crypt_config *cc)
  {
+       int r;
        clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
-       memset(&cc->key, 0, cc->key_size * sizeof(u8));
+       get_random_bytes(&cc->key, cc->key_size);
        kzfree(cc->key_string);
        cc->key_string = NULL;
+       r = crypt_setkey(cc);
+       memset(&cc->key, 0, cc->key_size * sizeof(u8));
  
-       return crypt_setkey(cc);
+       return r;
  }
  
  static void crypt_dtr(struct dm_target *ti)
  
        mempool_destroy(cc->page_pool);
        mempool_destroy(cc->req_pool);
+       mempool_destroy(cc->tag_pool);
  
        if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
                cc->iv_gen_ops->dtr(cc);
        kzfree(cc->cipher);
        kzfree(cc->cipher_string);
        kzfree(cc->key_string);
+       kzfree(cc->cipher_auth);
+       kzfree(cc->authenc_key);
  
        /* Must zero key material before freeing */
        kzfree(cc);
  }
  
- static int crypt_ctr_cipher(struct dm_target *ti,
-                           char *cipher_in, char *key)
+ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
+ {
+       struct crypt_config *cc = ti->private;
+       if (crypt_integrity_aead(cc))
+               cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
+       else
+               cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
+       if (cc->iv_size)
+               /* at least a 64 bit sector number should fit in our buffer */
+               cc->iv_size = max(cc->iv_size,
+                                 (unsigned int)(sizeof(u64) / sizeof(u8)));
+       else if (ivmode) {
+               DMWARN("Selected cipher does not support IVs");
+               ivmode = NULL;
+       }
+       /* Choose ivmode, see comments at iv code. */
+       if (ivmode == NULL)
+               cc->iv_gen_ops = NULL;
+       else if (strcmp(ivmode, "plain") == 0)
+               cc->iv_gen_ops = &crypt_iv_plain_ops;
+       else if (strcmp(ivmode, "plain64") == 0)
+               cc->iv_gen_ops = &crypt_iv_plain64_ops;
+       else if (strcmp(ivmode, "essiv") == 0)
+               cc->iv_gen_ops = &crypt_iv_essiv_ops;
+       else if (strcmp(ivmode, "benbi") == 0)
+               cc->iv_gen_ops = &crypt_iv_benbi_ops;
+       else if (strcmp(ivmode, "null") == 0)
+               cc->iv_gen_ops = &crypt_iv_null_ops;
+       else if (strcmp(ivmode, "lmk") == 0) {
+               cc->iv_gen_ops = &crypt_iv_lmk_ops;
+               /*
+                * Versions 2 and 3 are recognised according
+                * to the length of the provided multi-key string.
+                * If present (version 3), last key is used as IV seed.
+                * All keys (including IV seed) are always the same size.
+                */
+               if (cc->key_size % cc->key_parts) {
+                       cc->key_parts++;
+                       cc->key_extra_size = cc->key_size / cc->key_parts;
+               }
+       } else if (strcmp(ivmode, "tcw") == 0) {
+               cc->iv_gen_ops = &crypt_iv_tcw_ops;
+               cc->key_parts += 2; /* IV + whitening */
+               cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
+       } else if (strcmp(ivmode, "random") == 0) {
+               cc->iv_gen_ops = &crypt_iv_random_ops;
+               /* Need storage space in integrity fields. */
+               cc->integrity_iv_size = cc->iv_size;
+       } else {
+               ti->error = "Invalid IV mode";
+               return -EINVAL;
+       }
+       return 0;
+ }
+ /*
+  * Workaround to parse cipher algorithm from crypto API spec.
+  * The cc->cipher is currently used only in ESSIV.
+  * This should probably be done by crypto API calls (once available...)
+  */
+ static int crypt_ctr_blkdev_cipher(struct crypt_config *cc)
+ {
+       const char *alg_name = NULL;
+       char *start, *end;
+       if (crypt_integrity_aead(cc)) {
+               alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc)));
+               if (!alg_name)
+                       return -EINVAL;
+               if (crypt_integrity_hmac(cc)) {
+                       alg_name = strchr(alg_name, ',');
+                       if (!alg_name)
+                               return -EINVAL;
+               }
+               alg_name++;
+       } else {
+               alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc)));
+               if (!alg_name)
+                       return -EINVAL;
+       }
+       start = strchr(alg_name, '(');
+       end = strchr(alg_name, ')');
+       if (!start && !end) {
+               cc->cipher = kstrdup(alg_name, GFP_KERNEL);
+               return cc->cipher ? 0 : -ENOMEM;
+       }
+       if (!start || !end || ++start >= end)
+               return -EINVAL;
+       cc->cipher = kzalloc(end - start + 1, GFP_KERNEL);
+       if (!cc->cipher)
+               return -ENOMEM;
+       strncpy(cc->cipher, start, end - start);
+       return 0;
+ }
+ /*
+  * Workaround to parse HMAC algorithm from AEAD crypto API spec.
+  * The HMAC is needed to calculate tag size (HMAC digest size).
+  * This should probably be done by crypto API calls (once available...)
+  */
+ static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api)
+ {
+       char *start, *end, *mac_alg = NULL;
+       struct crypto_ahash *mac;
+       if (!strstarts(cipher_api, "authenc("))
+               return 0;
+       start = strchr(cipher_api, '(');
+       end = strchr(cipher_api, ',');
+       if (!start || !end || ++start > end)
+               return -EINVAL;
+       mac_alg = kzalloc(end - start + 1, GFP_KERNEL);
+       if (!mac_alg)
+               return -ENOMEM;
+       strncpy(mac_alg, start, end - start);
+       mac = crypto_alloc_ahash(mac_alg, 0, 0);
+       kfree(mac_alg);
+       if (IS_ERR(mac))
+               return PTR_ERR(mac);
+       cc->key_mac_size = crypto_ahash_digestsize(mac);
+       crypto_free_ahash(mac);
+       cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL);
+       if (!cc->authenc_key)
+               return -ENOMEM;
+       return 0;
+ }
+ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key,
+                               char **ivmode, char **ivopts)
+ {
+       struct crypt_config *cc = ti->private;
+       char *tmp, *cipher_api;
+       int ret = -EINVAL;
+       cc->tfms_count = 1;
+       /*
+        * New format (capi: prefix)
+        * capi:cipher_api_spec-iv:ivopts
+        */
+       tmp = &cipher_in[strlen("capi:")];
+       cipher_api = strsep(&tmp, "-");
+       *ivmode = strsep(&tmp, ":");
+       *ivopts = tmp;
+       if (*ivmode && !strcmp(*ivmode, "lmk"))
+               cc->tfms_count = 64;
+       cc->key_parts = cc->tfms_count;
+       /* Allocate cipher */
+       ret = crypt_alloc_tfms(cc, cipher_api);
+       if (ret < 0) {
+               ti->error = "Error allocating crypto tfm";
+               return ret;
+       }
+       /* Alloc AEAD, can be used only in new format. */
+       if (crypt_integrity_aead(cc)) {
+               ret = crypt_ctr_auth_cipher(cc, cipher_api);
+               if (ret < 0) {
+                       ti->error = "Invalid AEAD cipher spec";
+                       return -ENOMEM;
+               }
+               cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
+       } else
+               cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
+       ret = crypt_ctr_blkdev_cipher(cc);
+       if (ret < 0) {
+               ti->error = "Cannot allocate cipher string";
+               return -ENOMEM;
+       }
+       return 0;
+ }
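With the new "capi:" prefix the cipher specification is handed almost verbatim to the crypto API: everything up to the first '-' is the cipher API spec, then the IV mode, then optional IV options after ':'. A small userspace sketch of that split, using an invented example string:

        #define _DEFAULT_SOURCE         /* for strsep() */
        #include <stdio.h>
        #include <string.h>

        int main(void)
        {
                /* Invented example; the format is capi:cipher_api_spec-iv[:ivopts]. */
                char spec[] = "capi:authenc(hmac(sha256),xts(aes))-essiv:sha256";

                char *tmp        = spec + strlen("capi:");
                char *cipher_api = strsep(&tmp, "-");   /* passed straight to the crypto API */
                char *ivmode     = strsep(&tmp, ":");   /* dm-crypt IV generator             */
                char *ivopts     = tmp;                 /* remaining IV options, may be NULL */

                printf("api=%s iv=%s ivopts=%s\n",
                       cipher_api, ivmode, ivopts ? ivopts : "(none)");
                return 0;
        }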
+ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key,
+                               char **ivmode, char **ivopts)
  {
        struct crypt_config *cc = ti->private;
-       char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
+       char *tmp, *cipher, *chainmode, *keycount;
        char *cipher_api = NULL;
        int ret = -EINVAL;
        char dummy;
  
-       /* Convert to crypto api definition? */
-       if (strchr(cipher_in, '(')) {
+       if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) {
                ti->error = "Bad cipher specification";
                return -EINVAL;
        }
  
-       cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
-       if (!cc->cipher_string)
-               goto bad_mem;
        /*
         * Legacy dm-crypt cipher specification
         * cipher[:keycount]-mode-iv:ivopts
                return -EINVAL;
        }
        cc->key_parts = cc->tfms_count;
-       cc->key_extra_size = 0;
  
        cc->cipher = kstrdup(cipher, GFP_KERNEL);
        if (!cc->cipher)
                goto bad_mem;
  
        chainmode = strsep(&tmp, "-");
-       ivopts = strsep(&tmp, "-");
-       ivmode = strsep(&ivopts, ":");
+       *ivopts = strsep(&tmp, "-");
+       *ivmode = strsep(&*ivopts, ":");
  
        if (tmp)
                DMWARN("Ignoring unexpected additional cipher options");
         * For compatibility with the original dm-crypt mapping format, if
         * only the cipher name is supplied, use cbc-plain.
         */
-       if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
+       if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) {
                chainmode = "cbc";
-               ivmode = "plain";
+               *ivmode = "plain";
        }
  
-       if (strcmp(chainmode, "ecb") && !ivmode) {
+       if (strcmp(chainmode, "ecb") && !*ivmode) {
                ti->error = "IV mechanism required";
                return -EINVAL;
        }
        ret = crypt_alloc_tfms(cc, cipher_api);
        if (ret < 0) {
                ti->error = "Error allocating crypto tfm";
-               goto bad;
+               kfree(cipher_api);
+               return ret;
        }
  
-       /* Initialize IV */
-       cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
-       if (cc->iv_size)
-               /* at least a 64 bit sector number should fit in our buffer */
-               cc->iv_size = max(cc->iv_size,
-                                 (unsigned int)(sizeof(u64) / sizeof(u8)));
-       else if (ivmode) {
-               DMWARN("Selected cipher does not support IVs");
-               ivmode = NULL;
-       }
+       return 0;
+ bad_mem:
+       ti->error = "Cannot allocate cipher strings";
+       return -ENOMEM;
+ }
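The legacy path keeps accepting the old cipher[:keycount]-mode-iv:ivopts notation and essentially rebuilds a mode(cipher) crypto API name from it. A minimal userspace sketch of the same splitting, again with a made-up spec string:

        #define _DEFAULT_SOURCE         /* for strsep() */
        #include <stdio.h>
        #include <string.h>

        int main(void)
        {
                /* Invented example in the legacy notation cipher[:keycount]-mode-iv:ivopts. */
                char spec[] = "aes:64-cbc-lmk";
                char api[64];

                char *tmp       = spec;
                char *keycount  = strsep(&tmp, "-");            /* "aes:64"                */
                char *cipher    = strsep(&keycount, ":");       /* "aes", keycount -> "64" */
                char *chainmode = strsep(&tmp, "-");            /* "cbc"                   */
                char *ivopts    = strsep(&tmp, "-");            /* "lmk"                   */
                char *ivmode    = strsep(&ivopts, ":");         /* "lmk", ivopts -> NULL   */

                /* Rebuild the crypto API spec from chain mode and cipher name. */
                snprintf(api, sizeof(api), "%s(%s)", chainmode, cipher);
                printf("api=%s keycount=%s ivmode=%s ivopts=%s\n",
                       api, keycount ? keycount : "1", ivmode,
                       ivopts ? ivopts : "(none)");
                return 0;
        }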
  
-       /* Choose ivmode, see comments at iv code. */
-       if (ivmode == NULL)
-               cc->iv_gen_ops = NULL;
-       else if (strcmp(ivmode, "plain") == 0)
-               cc->iv_gen_ops = &crypt_iv_plain_ops;
-       else if (strcmp(ivmode, "plain64") == 0)
-               cc->iv_gen_ops = &crypt_iv_plain64_ops;
-       else if (strcmp(ivmode, "essiv") == 0)
-               cc->iv_gen_ops = &crypt_iv_essiv_ops;
-       else if (strcmp(ivmode, "benbi") == 0)
-               cc->iv_gen_ops = &crypt_iv_benbi_ops;
-       else if (strcmp(ivmode, "null") == 0)
-               cc->iv_gen_ops = &crypt_iv_null_ops;
-       else if (strcmp(ivmode, "lmk") == 0) {
-               cc->iv_gen_ops = &crypt_iv_lmk_ops;
-               /*
-                * Version 2 and 3 is recognised according
-                * to length of provided multi-key string.
-                * If present (version 3), last key is used as IV seed.
-                * All keys (including IV seed) are always the same size.
-                */
-               if (cc->key_size % cc->key_parts) {
-                       cc->key_parts++;
-                       cc->key_extra_size = cc->key_size / cc->key_parts;
-               }
-       } else if (strcmp(ivmode, "tcw") == 0) {
-               cc->iv_gen_ops = &crypt_iv_tcw_ops;
-               cc->key_parts += 2; /* IV + whitening */
-               cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
-       } else {
-               ret = -EINVAL;
-               ti->error = "Invalid IV mode";
-               goto bad;
+ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key)
+ {
+       struct crypt_config *cc = ti->private;
+       char *ivmode = NULL, *ivopts = NULL;
+       int ret;
+       cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+       if (!cc->cipher_string) {
+               ti->error = "Cannot allocate cipher strings";
+               return -ENOMEM;
        }
  
+       if (strstarts(cipher_in, "capi:"))
+               ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts);
+       else
+               ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts);
+       if (ret)
+               return ret;
+       /* Initialize IV */
+       ret = crypt_ctr_ivmode(ti, ivmode);
+       if (ret < 0)
+               return ret;
        /* Initialize and set key */
        ret = crypt_set_key(cc, key);
        if (ret < 0) {
                ti->error = "Error decoding and setting key";
-               goto bad;
+               return ret;
        }
  
        /* Allocate IV */
                ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
                if (ret < 0) {
                        ti->error = "Error creating IV";
-                       goto bad;
+                       return ret;
                }
        }
  
                ret = cc->iv_gen_ops->init(cc);
                if (ret < 0) {
                        ti->error = "Error initialising IV";
-                       goto bad;
+                       return ret;
                }
        }
  
-       ret = 0;
- bad:
-       kfree(cipher_api);
        return ret;
+ }
  
- bad_mem:
-       ti->error = "Cannot allocate cipher strings";
-       return -ENOMEM;
+ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv)
+ {
+       struct crypt_config *cc = ti->private;
+       struct dm_arg_set as;
+       static struct dm_arg _args[] = {
+               {0, 6, "Invalid number of feature args"},
+       };
+       unsigned int opt_params, val;
+       const char *opt_string, *sval;
+       char dummy;
+       int ret;
+       /* Optional parameters */
+       as.argc = argc;
+       as.argv = argv;
+       ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+       if (ret)
+               return ret;
+       while (opt_params--) {
+               opt_string = dm_shift_arg(&as);
+               if (!opt_string) {
+                       ti->error = "Not enough feature arguments";
+                       return -EINVAL;
+               }
+               if (!strcasecmp(opt_string, "allow_discards"))
+                       ti->num_discard_bios = 1;
+               else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+                       set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+               else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+                       set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+               else if (sscanf(opt_string, "integrity:%u:", &val) == 1) {
+                       if (val == 0 || val > MAX_TAG_SIZE) {
+                               ti->error = "Invalid integrity arguments";
+                               return -EINVAL;
+                       }
+                       cc->on_disk_tag_size = val;
+                       sval = strchr(opt_string + strlen("integrity:"), ':') + 1;
+                       if (!strcasecmp(sval, "aead")) {
+                               set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
+                       } else  if (strcasecmp(sval, "none")) {
+                               ti->error = "Unknown integrity profile";
+                               return -EINVAL;
+                       }
+                       cc->cipher_auth = kstrdup(sval, GFP_KERNEL);
+                       if (!cc->cipher_auth)
+                               return -ENOMEM;
+               } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) {
+                       if (cc->sector_size < (1 << SECTOR_SHIFT) ||
+                           cc->sector_size > 4096 ||
+                           (cc->sector_size & (cc->sector_size - 1))) {
+                               ti->error = "Invalid feature value for sector_size";
+                               return -EINVAL;
+                       }
+                       cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT;
+               } else if (!strcasecmp(opt_string, "iv_large_sectors"))
+                       set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
+               else {
+                       ti->error = "Invalid feature arguments";
+                       return -EINVAL;
+               }
+       }
+       return 0;
  }
  
  /*
@@@ -1865,18 -2589,12 +2589,12 @@@ static int crypt_ctr(struct dm_target *
  {
        struct crypt_config *cc;
        int key_size;
-       unsigned int opt_params;
+       unsigned int align_mask;
        unsigned long long tmpll;
        int ret;
-       size_t iv_size_padding;
-       struct dm_arg_set as;
-       const char *opt_string;
+       size_t iv_size_padding, additional_req_size;
        char dummy;
  
-       static struct dm_arg _args[] = {
-               {0, 3, "Invalid number of feature args"},
-       };
        if (argc < 5) {
                ti->error = "Not enough arguments";
                return -EINVAL;
                return -ENOMEM;
        }
        cc->key_size = key_size;
+       cc->sector_size = (1 << SECTOR_SHIFT);
+       cc->sector_shift = 0;
  
        ti->private = cc;
+       /* Optional parameters need to be read before cipher constructor */
+       if (argc > 5) {
+               ret = crypt_ctr_optional(ti, argc - 5, &argv[5]);
+               if (ret)
+                       goto bad;
+       }
        ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
        if (ret < 0)
                goto bad;
  
-       cc->dmreq_start = sizeof(struct skcipher_request);
-       cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
+       if (crypt_integrity_aead(cc)) {
+               cc->dmreq_start = sizeof(struct aead_request);
+               cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc));
+               align_mask = crypto_aead_alignmask(any_tfm_aead(cc));
+       } else {
+               cc->dmreq_start = sizeof(struct skcipher_request);
+               cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
+               align_mask = crypto_skcipher_alignmask(any_tfm(cc));
+       }
        cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
  
-       if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+       if (align_mask < CRYPTO_MINALIGN) {
                /* Allocate the padding exactly */
                iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
-                               & crypto_skcipher_alignmask(any_tfm(cc));
+                               & align_mask;
        } else {
                /*
                 * If the cipher requires greater alignment than kmalloc
                 * alignment, we don't know the exact position of the
                 * initialization vector. We must assume worst case.
                 */
-               iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc));
+               iv_size_padding = align_mask;
        }
  
        ret = -ENOMEM;
-       cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
-                       sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
+       /*  ...| IV + padding | original IV | original sec. number | bio tag offset | */
+       additional_req_size = sizeof(struct dm_crypt_request) +
+               iv_size_padding + cc->iv_size +
+               cc->iv_size +
+               sizeof(uint64_t) +
+               sizeof(unsigned int);
+       cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size);
        if (!cc->req_pool) {
                ti->error = "Cannot allocate crypt request mempool";
                goto bad;
        }
  
        cc->per_bio_data_size = ti->per_io_data_size =
-               ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start +
-                     sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
+               ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
                      ARCH_KMALLOC_MINALIGN);
  
        cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
        mutex_init(&cc->bio_alloc_lock);
  
        ret = -EINVAL;
-       if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
+       if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) ||
+           (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) {
                ti->error = "Invalid iv_offset sector";
                goto bad;
        }
        }
        cc->start = tmpll;
  
-       argv += 5;
-       argc -= 5;
-       /* Optional parameters */
-       if (argc) {
-               as.argc = argc;
-               as.argv = argv;
-               ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+       if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
+               ret = crypt_integrity_ctr(cc, ti);
                if (ret)
                        goto bad;
  
-               ret = -EINVAL;
-               while (opt_params--) {
-                       opt_string = dm_shift_arg(&as);
-                       if (!opt_string) {
-                               ti->error = "Not enough feature arguments";
-                               goto bad;
-                       }
-                       if (!strcasecmp(opt_string, "allow_discards"))
-                               ti->num_discard_bios = 1;
+               cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size;
+               if (!cc->tag_pool_max_sectors)
+                       cc->tag_pool_max_sectors = 1;
  
-                       else if (!strcasecmp(opt_string, "same_cpu_crypt"))
-                               set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
-                       else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
-                               set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
-                       else {
-                               ti->error = "Invalid feature arguments";
-                               goto bad;
-                       }
+               cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS,
+                       cc->tag_pool_max_sectors * cc->on_disk_tag_size);
+               if (!cc->tag_pool) {
+                       ti->error = "Cannot allocate integrity tags mempool";
+                       goto bad;
                }
+               cc->tag_pool_max_sectors <<= cc->sector_shift;
        }
  
        ret = -ENOMEM;
-       cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
+       cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
        if (!cc->io_queue) {
                ti->error = "Couldn't create kcryptd io queue";
                goto bad;
        }
  
        if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
-               cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+               cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
        else
-               cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+               cc->crypt_queue = alloc_workqueue("kcryptd",
+                                                 WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
                                                  num_online_cpus());
        if (!cc->crypt_queue) {
                ti->error = "Couldn't create kcryptd queue";
        wake_up_process(cc->write_thread);
  
        ti->num_flush_bios = 1;
 -      ti->discard_zeroes_data_unsupported = true;
  
        return 0;
  
@@@ -2061,12 -2788,39 +2787,39 @@@ static int crypt_map(struct dm_target *
         * Check if bio is too large, split as needed.
         */
        if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) &&
-           bio_data_dir(bio) == WRITE)
+           (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size))
                dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT));
  
+       /*
+        * Ensure that bio is a multiple of internal sector encryption size
+        * and is aligned to this size as defined in IO hints.
+        */
+       if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0))
+               return -EIO;
+       if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
+               return -EIO;
        io = dm_per_bio_data(bio, cc->per_bio_data_size);
        crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
-       io->ctx.req = (struct skcipher_request *)(io + 1);
+       if (cc->on_disk_tag_size) {
+               unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
+               if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
+                   unlikely(!(io->integrity_metadata = kmalloc(tag_len,
+                               GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+                       if (bio_sectors(bio) > cc->tag_pool_max_sectors)
+                               dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
+                       io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO);
+                       io->integrity_metadata_from_pool = true;
+               }
+       }
+       if (crypt_integrity_aead(cc))
+               io->ctx.r.req_aead = (struct aead_request *)(io + 1);
+       else
+               io->ctx.r.req = (struct skcipher_request *)(io + 1);
  
        if (bio_data_dir(io->base_bio) == READ) {
                if (kcryptd_io_read(io, GFP_NOWAIT))
@@@ -2107,6 -2861,10 +2860,10 @@@ static void crypt_status(struct dm_targ
                num_feature_args += !!ti->num_discard_bios;
                num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
                num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+               num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
+               num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
+               if (cc->on_disk_tag_size)
+                       num_feature_args++;
                if (num_feature_args) {
                        DMEMIT(" %d", num_feature_args);
                        if (ti->num_discard_bios)
                                DMEMIT(" same_cpu_crypt");
                        if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
                                DMEMIT(" submit_from_crypt_cpus");
+                       if (cc->on_disk_tag_size)
+                               DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth);
+                       if (cc->sector_size != (1 << SECTOR_SHIFT))
+                               DMEMIT(" sector_size:%d", cc->sector_size);
+                       if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+                               DMEMIT(" iv_large_sectors");
                }
  
                break;
@@@ -2204,6 -2968,8 +2967,8 @@@ static int crypt_iterate_devices(struc
  
  static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
  {
+       struct crypt_config *cc = ti->private;
        /*
         * Unfortunate constraint that is required to avoid the potential
         * for exceeding underlying device's max_segments limits -- due to
         * bio that are not as physically contiguous as the original bio.
         */
        limits->max_segment_size = PAGE_SIZE;
+       if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+               limits->logical_block_size = cc->sector_size;
+               limits->physical_block_size = cc->sector_size;
+               blk_limits_io_min(limits, cc->sector_size);
+       }
  }
  
  static struct target_type crypt_target = {
        .name   = "crypt",
-       .version = {1, 15, 0},
+       .version = {1, 17, 0},
        .module = THIS_MODULE,
        .ctr    = crypt_ctr,
        .dtr    = crypt_dtr,
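
For context on the dm-crypt changes above: the new sector_size feature arg is only accepted as a power of two between 512 and 4096, the constructor derives a shift from it, and crypt_map later rejects bios that are not aligned to that size. A minimal user-space sketch of just that arithmetic, not the dm-crypt implementation itself; SECTOR_SHIFT is the usual 512-byte shift of 9 and the helper names are invented for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9                      /* 512-byte hardware sectors */

/* Accept only 512..4096, power of two -- mirrors the check on the
 * "sector_size:" feature argument in the constructor above. */
static bool sector_size_valid(unsigned int sector_size)
{
	return sector_size >= (1u << SECTOR_SHIFT) &&
	       sector_size <= 4096 &&
	       (sector_size & (sector_size - 1)) == 0;
}

/* Shift between the crypto sector size and 512-byte sectors, e.g. 4096 -> 3.
 * __builtin_ctz plays the role of the kernel's __ffs for a nonzero value. */
static unsigned int sector_shift(unsigned int sector_size)
{
	return (unsigned int)__builtin_ctz(sector_size) - SECTOR_SHIFT;
}

/* A bio is usable only if both its start sector and its length are
 * multiples of the crypto sector size, as enforced in the map path. */
static bool bio_aligned(uint64_t bi_sector, unsigned int bi_size,
			unsigned int sector_size)
{
	if (bi_sector & ((sector_size >> SECTOR_SHIFT) - 1))
		return false;
	return (bi_size & (sector_size - 1)) == 0;
}

int main(void)
{
	unsigned int ss = 4096;

	printf("valid=%d shift=%u aligned=%d\n",
	       sector_size_valid(ss), sector_shift(ss),
	       bio_aligned(8, 8192, ss));
	return 0;
}
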
diff --combined drivers/md/dm-linear.c
index e17fd44ceef534352dfd2b9b15d787399bbaf065,ffa0c9c5968a38e08c140bf03c5e8998e0c44d86..a5120961632a7d63d76e4fd8a24a3116394cacff
@@@ -59,7 -59,6 +59,7 @@@ static int linear_ctr(struct dm_target 
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_same_bios = 1;
 +      ti->num_write_zeroes_bios = 1;
        ti->private = lc;
        return 0;
  
@@@ -163,6 -162,7 +163,7 @@@ static long linear_direct_access(struc
  static struct target_type linear_target = {
        .name   = "linear",
        .version = {1, 3, 0},
+       .features = DM_TARGET_PASSES_INTEGRITY,
        .module = THIS_MODULE,
        .ctr    = linear_ctr,
        .dtr    = linear_dtr,
diff --combined drivers/md/dm-mpath.c
index 2950b145443d7e1c26f9831490406869f6ed2323,fd7cdc4ce2a5f8311a3132ae89deb0fb2a37427e..52cd3f1608b35a2e88ee28193e621650f81d6d4a
@@@ -90,7 -90,7 +90,7 @@@ struct multipath 
        atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
        atomic_t pg_init_count;         /* Number of times pg_init called */
  
-       unsigned queue_mode;
+       enum dm_queue_mode queue_mode;
  
        struct mutex work_mutex;
        struct work_struct trigger_event;
@@@ -111,7 -111,8 +111,8 @@@ typedef int (*action_fn) (struct pgpat
  
  static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
  static void trigger_event(struct work_struct *work);
- static void activate_path(struct work_struct *work);
+ static void activate_or_offline_path(struct pgpath *pgpath);
+ static void activate_path_work(struct work_struct *work);
  static void process_queued_bios(struct work_struct *work);
  
  /*-----------------------------------------------
@@@ -136,7 -137,7 +137,7 @@@ static struct pgpath *alloc_pgpath(void
  
        if (pgpath) {
                pgpath->is_active = true;
-               INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
+               INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
        }
  
        return pgpath;
@@@ -297,6 -298,8 +298,8 @@@ static int __pg_init_all_paths(struct m
        struct pgpath *pgpath;
        unsigned long pg_init_delay = 0;
  
+       lockdep_assert_held(&m->lock);
        if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
                return 0;
  
        return atomic_read(&m->pg_init_in_progress);
  }
  
- static void pg_init_all_paths(struct multipath *m)
+ static int pg_init_all_paths(struct multipath *m)
  {
+       int ret;
        unsigned long flags;
  
        spin_lock_irqsave(&m->lock, flags);
-       __pg_init_all_paths(m);
+       ret = __pg_init_all_paths(m);
        spin_unlock_irqrestore(&m->lock, flags);
+       return ret;
  }
  
  static void __switch_pg(struct multipath *m, struct priority_group *pg)
@@@ -436,45 -442,21 +442,21 @@@ failed
  }
  
  /*
-  * Check whether bios must be queued in the device-mapper core rather
-  * than here in the target.
-  *
-  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
-  * same value then we are not between multipath_presuspend()
-  * and multipath_resume() calls and we have no need to check
-  * for the DMF_NOFLUSH_SUSPENDING flag.
+  * dm_report_EIO() is a macro instead of a function to make pr_debug()
+  * report the function name and line number of the function from which
+  * it has been invoked.
   */
- static bool __must_push_back(struct multipath *m)
- {
-       return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
-                test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
-               dm_noflush_suspending(m->ti));
- }
- static bool must_push_back_rq(struct multipath *m)
- {
-       bool r;
-       unsigned long flags;
-       spin_lock_irqsave(&m->lock, flags);
-       r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
-            __must_push_back(m));
-       spin_unlock_irqrestore(&m->lock, flags);
-       return r;
- }
- static bool must_push_back_bio(struct multipath *m)
- {
-       bool r;
-       unsigned long flags;
-       spin_lock_irqsave(&m->lock, flags);
-       r = __must_push_back(m);
-       spin_unlock_irqrestore(&m->lock, flags);
-       return r;
- }
+ #define dm_report_EIO(m)                                              \
+ ({                                                                    \
+       struct mapped_device *md = dm_table_get_md((m)->ti->table);     \
+                                                                       \
+       pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
+                dm_device_name(md),                                    \
+                test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),        \
+                test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags),  \
+                dm_noflush_suspending((m)->ti));                       \
+       -EIO;                                                           \
+ })
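+
+ The comment above explains why dm_report_EIO() is a macro: pr_debug() then expands at the call site, so the reported function name and line number belong to the caller. A small user-space sketch of that effect, using the same GNU C statement-expression form (gcc/clang); fprintf stands in for pr_debug, -5 stands in for -EIO, and the function names are invented for the example:
+
+ #include <stdio.h>
+
+ /* Macro version: __func__ and __LINE__ expand where the macro is used,
+  * so the message points at the caller. */
+ #define report_EIO()                                                    \
+ ({                                                                      \
+ 	fprintf(stderr, "%s:%d: returning EIO\n", __func__, __LINE__);  \
+ 	-5; /* stand-in for -EIO */                                      \
+ })
+
+ /* Function version: would always report its own location instead. */
+ static int report_eio_fn(void)
+ {
+ 	fprintf(stderr, "%s:%d: returning EIO\n", __func__, __LINE__);
+ 	return -5;
+ }
+
+ static int map_request(void)
+ {
+ 	return report_EIO();    /* prints "map_request: ..." */
+ }
+
+ int main(void)
+ {
+ 	map_request();
+ 	report_eio_fn();        /* prints "report_eio_fn: ..." */
+ 	return 0;
+ }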
  
  /*
   * Map cloned requests (request-based multipath)
@@@ -484,11 -466,11 +466,11 @@@ static int multipath_clone_and_map(stru
                                   struct request **__clone)
  {
        struct multipath *m = ti->private;
-       int r = DM_MAPIO_REQUEUE;
        size_t nr_bytes = blk_rq_bytes(rq);
        struct pgpath *pgpath;
        struct block_device *bdev;
        struct dm_mpath_io *mpio = get_mpio(map_context);
+       struct request_queue *q;
        struct request *clone;
  
        /* Do we need to select a new pgpath? */
                pgpath = choose_pgpath(m, nr_bytes);
  
        if (!pgpath) {
-               if (must_push_back_rq(m))
+               if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
                        return DM_MAPIO_DELAY_REQUEUE;
-               return -EIO;    /* Failed */
+               return dm_report_EIO(m);        /* Failed */
        } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
                   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
-               pg_init_all_paths(m);
-               return r;
+               if (pg_init_all_paths(m))
+                       return DM_MAPIO_DELAY_REQUEUE;
+               return DM_MAPIO_REQUEUE;
        }
  
        memset(mpio, 0, sizeof(*mpio));
        mpio->nr_bytes = nr_bytes;
  
        bdev = pgpath->path.dev->bdev;
-       clone = blk_get_request(bdev_get_queue(bdev),
-                       rq->cmd_flags | REQ_NOMERGE,
-                       GFP_ATOMIC);
+       q = bdev_get_queue(bdev);
+       clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
        if (IS_ERR(clone)) {
                /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
-               return r;
+               bool queue_dying = blk_queue_dying(q);
+               DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
+                           PTR_ERR(clone), queue_dying ? " (path offline)" : "");
+               if (queue_dying) {
+                       atomic_inc(&m->pg_init_in_progress);
+                       activate_or_offline_path(pgpath);
+                       return DM_MAPIO_REQUEUE;
+               }
+               return DM_MAPIO_DELAY_REQUEUE;
        }
        clone->bio = clone->biotail = NULL;
        clone->rq_disk = bdev->bd_disk;
@@@ -567,9 -556,9 +556,9 @@@ static int __multipath_map_bio(struct m
        }
  
        if (!pgpath) {
-               if (!must_push_back_bio(m))
-                       return -EIO;
-               return DM_MAPIO_REQUEUE;
+               if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+                       return DM_MAPIO_REQUEUE;
+               return dm_report_EIO(m);
        }
  
        mpio->pgpath = pgpath;
@@@ -640,6 -629,14 +629,14 @@@ static void process_queued_bios(struct 
        blk_finish_plug(&plug);
  }
  
+ static void assign_bit(bool value, long nr, unsigned long *addr)
+ {
+       if (value)
+               set_bit(nr, addr);
+       else
+               clear_bit(nr, addr);
+ }
  /*
   * If we run out of usable paths, should we queue I/O or error it?
   */
@@@ -649,23 -646,11 +646,11 @@@ static int queue_if_no_path(struct mult
        unsigned long flags;
  
        spin_lock_irqsave(&m->lock, flags);
-       if (save_old_value) {
-               if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
-                       set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
-               else
-                       clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
-       } else {
-               if (queue_if_no_path)
-                       set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
-               else
-                       clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
-       }
-       if (queue_if_no_path)
-               set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
-       else
-               clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+       assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
+                  (!save_old_value && queue_if_no_path),
+                  MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+       assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti),
+                  MPATHF_QUEUE_IF_NO_PATH, &m->flags);
        spin_unlock_irqrestore(&m->lock, flags);
  
        if (!queue_if_no_path) {
@@@ -1103,7 -1088,6 +1088,7 @@@ static int multipath_ctr(struct dm_targ
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_same_bios = 1;
 +      ti->num_write_zeroes_bios = 1;
        if (m->queue_mode == DM_TYPE_BIO_BASED)
                ti->per_io_data_size = multipath_per_bio_data_size();
        else
@@@ -1438,10 -1422,8 +1423,8 @@@ out
        spin_unlock_irqrestore(&m->lock, flags);
  }
  
- static void activate_path(struct work_struct *work)
+ static void activate_or_offline_path(struct pgpath *pgpath)
  {
-       struct pgpath *pgpath =
-               container_of(work, struct pgpath, activate_path.work);
        struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
  
        if (pgpath->is_active && !blk_queue_dying(q))
                pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
  }
  
+ static void activate_path_work(struct work_struct *work)
+ {
+       struct pgpath *pgpath =
+               container_of(work, struct pgpath, activate_path.work);
+       activate_or_offline_path(pgpath);
+ }
  static int noretry_error(int error)
  {
        switch (error) {
@@@ -1492,7 -1482,7 +1483,7 @@@ static int do_end_io(struct multipath *
         */
        int r = DM_ENDIO_REQUEUE;
  
 -      if (!error && !clone->errors)
 +      if (!error)
                return 0;       /* I/O complete */
  
        if (noretry_error(error))
        if (mpio->pgpath)
                fail_path(mpio->pgpath);
  
-       if (!atomic_read(&m->nr_valid_paths)) {
-               if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-                       if (!must_push_back_rq(m))
-                               r = -EIO;
-               }
-       }
+       if (atomic_read(&m->nr_valid_paths) == 0 &&
+           !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+               r = dm_report_EIO(m);
  
        return r;
  }
@@@ -1547,13 -1534,9 +1535,9 @@@ static int do_end_io_bio(struct multipa
        if (mpio->pgpath)
                fail_path(mpio->pgpath);
  
-       if (!atomic_read(&m->nr_valid_paths)) {
-               if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-                       if (!must_push_back_bio(m))
-                               return -EIO;
-                       return DM_ENDIO_REQUEUE;
-               }
-       }
+       if (atomic_read(&m->nr_valid_paths) == 0 &&
+           !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+               return dm_report_EIO(m);
  
        /* Queue for the daemon to resubmit */
        dm_bio_restore(get_bio_details_from_bio(clone), clone);
@@@ -1619,10 -1602,8 +1603,8 @@@ static void multipath_resume(struct dm_
        unsigned long flags;
  
        spin_lock_irqsave(&m->lock, flags);
-       if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
-               set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
-       else
-               clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+       assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
+                  MPATHF_QUEUE_IF_NO_PATH, &m->flags);
        spin_unlock_irqrestore(&m->lock, flags);
  }
  
@@@ -1682,6 -1663,9 +1664,9 @@@ static void multipath_status(struct dm_
                        case DM_TYPE_MQ_REQUEST_BASED:
                                DMEMIT("queue_mode mq ");
                                break;
+                       default:
+                               WARN_ON_ONCE(true);
+                               break;
                        }
                }
        }
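
A note on the assign_bit() helper introduced in the dm-mpath changes above: it collapses the repeated set_bit/clear_bit branches in queue_if_no_path() and multipath_resume() into single conditional assignments. A rough user-space analog of that pattern, using plain bit operations instead of the kernel's atomic bitops; the flag values and function signature here are illustrative only:

#include <stdbool.h>
#include <stdio.h>

#define MPATHF_QUEUE_IF_NO_PATH        0
#define MPATHF_SAVED_QUEUE_IF_NO_PATH  1

/* Non-atomic stand-in for the helper: write 'value' into bit 'nr'. */
static void assign_bit(bool value, unsigned int nr, unsigned long *addr)
{
	if (value)
		*addr |= 1UL << nr;
	else
		*addr &= ~(1UL << nr);
}

static bool test_bit(unsigned int nr, const unsigned long *addr)
{
	return (*addr >> nr) & 1UL;
}

/* Mirrors the consolidated logic in queue_if_no_path() above. */
static void queue_if_no_path(unsigned long *flags, bool queue_if_no_path,
			     bool save_old_value, bool noflush_suspending)
{
	assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, flags)) ||
		   (!save_old_value && queue_if_no_path),
		   MPATHF_SAVED_QUEUE_IF_NO_PATH, flags);
	assign_bit(queue_if_no_path || noflush_suspending,
		   MPATHF_QUEUE_IF_NO_PATH, flags);
}

int main(void)
{
	unsigned long flags = 0;

	queue_if_no_path(&flags, true, false, false);
	printf("flags=%#lx\n", flags);   /* 0x3: both bits set */
	return 0;
}
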
diff --combined drivers/md/dm-raid.c
index 468f1380de1d352ab6ea4fff66e65402dccdaf15,0f61bb659b737468bf3a36eb26eb72af90482e9c..3a67073d9aa1827158cb113e8df162d6bc5d2cfb
@@@ -1,6 -1,6 +1,6 @@@
  /*
   * Copyright (C) 2010-2011 Neil Brown
-  * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
+  * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
   *
   * This file is released under the GPL.
   */
@@@ -79,7 -79,10 +79,10 @@@ struct raid_dev 
  #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
  
  /* New for v1.10.0 */
- #define __CTR_FLAG_JOURNAL_DEV                15 /* 2 */ /* Only with raid4/5/6! */
+ #define __CTR_FLAG_JOURNAL_DEV                15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
+ /* New for v1.11.1 */
+ #define __CTR_FLAG_JOURNAL_MODE               16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
  
  /*
   * Flags for rs->ctr_flags field.
  #define CTR_FLAG_DATA_OFFSET          (1 << __CTR_FLAG_DATA_OFFSET)
  #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
  #define CTR_FLAG_JOURNAL_DEV          (1 << __CTR_FLAG_JOURNAL_DEV)
+ #define CTR_FLAG_JOURNAL_MODE         (1 << __CTR_FLAG_JOURNAL_MODE)
  
  #define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
  
                                 CTR_FLAG_REGION_SIZE | \
                                 CTR_FLAG_DELTA_DISKS | \
                                 CTR_FLAG_DATA_OFFSET | \
-                                CTR_FLAG_JOURNAL_DEV)
+                                CTR_FLAG_JOURNAL_DEV | \
+                                CTR_FLAG_JOURNAL_MODE)
  
  #define RAID6_VALID_FLAGS     (CTR_FLAG_SYNC | \
                                 CTR_FLAG_REBUILD | \
                                 CTR_FLAG_REGION_SIZE | \
                                 CTR_FLAG_DELTA_DISKS | \
                                 CTR_FLAG_DATA_OFFSET | \
-                                CTR_FLAG_JOURNAL_DEV)
+                                CTR_FLAG_JOURNAL_DEV | \
+                                CTR_FLAG_JOURNAL_MODE)
  /* ...valid options definitions per raid level */
  
  /*
@@@ -239,6 -245,7 +245,7 @@@ struct raid_set 
        struct journal_dev {
                struct dm_dev *dev;
                struct md_rdev rdev;
+               int mode;
        } journal_dev;
  
        struct raid_dev dev[0];
@@@ -326,6 -333,7 +333,7 @@@ static struct arg_name_flag 
        { CTR_FLAG_DELTA_DISKS, "delta_disks"},
        { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
        { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
+       { CTR_FLAG_JOURNAL_MODE, "journal_mode" },
  };
  
  /* Return argument name string for given @flag */
@@@ -344,6 -352,39 +352,39 @@@ static const char *dm_raid_arg_name_by_
        return NULL;
  }
  
+ /* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
+ static struct {
+       const int mode;
+       const char *param;
+ } _raid456_journal_mode[] = {
+       { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" },
+       { R5C_JOURNAL_MODE_WRITE_BACK    , "writeback" }
+ };
+ /* Return MD raid4/5/6 journal mode for dm @journal_mode one */
+ static int dm_raid_journal_mode_to_md(const char *mode)
+ {
+       int m = ARRAY_SIZE(_raid456_journal_mode);
+       while (m--)
+               if (!strcasecmp(mode, _raid456_journal_mode[m].param))
+                       return _raid456_journal_mode[m].mode;
+       return -EINVAL;
+ }
+ /* Return dm-raid raid4/5/6 journal mode string for @mode */
+ static const char *md_journal_mode_to_dm_raid(const int mode)
+ {
+       int m = ARRAY_SIZE(_raid456_journal_mode);
+       while (m--)
+               if (mode == _raid456_journal_mode[m].mode)
+                       return _raid456_journal_mode[m].param;
+       return "unknown";
+ }
  /*
   * Bool helpers to test for various raid levels of a raid set.
   * It's level as reported by the superblock rather than
@@@ -1183,7 -1224,7 +1224,7 @@@ static int parse_raid_params(struct rai
                        continue;
                }
  
-               /* "journal_dev dev" */
+               /* "journal_dev <dev>" */
                if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
                        int r;
                        struct md_rdev *jdev;
                                rs->ti->error = "No space for raid4/5/6 journal";
                                return -ENOSPC;
                        }
+                       rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
                        set_bit(Journal, &jdev->flags);
                        continue;
                }
  
+               /* "journal_mode <mode>" ("journal_dev" mandatory!) */
+               if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
+                       int r;
+                       if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+                               rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
+                               return -EINVAL;
+                       }
+                       if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+                               rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
+                               return -EINVAL;
+                       }
+                       r = dm_raid_journal_mode_to_md(arg);
+                       if (r < 0) {
+                               rs->ti->error = "Invalid 'journal_mode' argument";
+                               return r;
+                       }
+                       rs->journal_dev.mode = r;
+                       continue;
+               }
                /*
                 * Parameters with number values from here on.
                 */
@@@ -2813,9 -2876,7 +2876,9 @@@ static void configure_discard_support(s
        /* Assume discards not supported until after checks below. */
        ti->discards_supported = false;
  
 -      /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
 +      /*
 +       * XXX: RAID level 4,5,6 require zeroing for safety.
 +       */
        raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
  
        for (i = 0; i < rs->raid_disks; i++) {
                        return;
  
                if (raid456) {
 -                      if (!q->limits.discard_zeroes_data)
 -                              return;
                        if (!devices_handle_discard_safely) {
                                DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
                                DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
@@@ -3076,6 -3139,16 +3139,16 @@@ static int raid_ctr(struct dm_target *t
        rs->callbacks.congested_fn = raid_is_congested;
        dm_table_add_target_callbacks(ti->table, &rs->callbacks);
  
+       /* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
+       if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+               r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
+               if (r) {
+                       ti->error = "Failed to set raid4/5/6 journal mode";
+                       mddev_unlock(&rs->md);
+                       goto bad_journal_mode_set;
+               }
+       }
        mddev_suspend(&rs->md);
  
        /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
        mddev_unlock(&rs->md);
        return 0;
  
+ bad_journal_mode_set:
  bad_stripe_cache:
  bad_check_reshape:
        md_stop(&rs->md);
@@@ -3180,18 -3254,18 +3254,18 @@@ static const char *decipher_sync_action
   * Status characters:
   *
   *  'D' = Dead/Failed raid set component or raid4/5/6 journal device
-  *  'a' = Alive but not in-sync
-  *  'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
+  *  'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
+  *  'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
   *  '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
   */
- static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
+ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
  {
        if (!rdev->bdev)
                return "-";
        else if (test_bit(Faulty, &rdev->flags))
                return "D";
        else if (test_bit(Journal, &rdev->flags))
-               return "A";
+               return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
        else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
                return "a";
        else
@@@ -3315,7 -3389,7 +3389,7 @@@ static void raid_status(struct dm_targe
  
                /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
                for (i = 0; i < rs->raid_disks; i++)
-                       DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
+                       DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
  
                /*
                 * In-sync/Reshape ratio:
                 * v1.10.0+:
                 */
                DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
-                             __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
+                             __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
                break;
  
        case STATUSTYPE_TABLE:
                                  write_mostly_params +
                                  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
                                  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
-                                 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
+                                 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
+                                 (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
                /* Emit table line */
+               /* This has to be in the documented order for userspace! */
                DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
-               if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
-                       DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
-                                        raid10_md_layout_to_format(mddev->layout));
-               if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
-                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
-                                        raid10_md_layout_to_copies(mddev->layout));
-               if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
-                       DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
                if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
                        DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
-               if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
-                       DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
-                                          (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
-               if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
-                       DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
-                                          (unsigned long long) rs->data_offset);
-               if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
-                       DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
-                                         mddev->bitmap_info.daemon_sleep);
-               if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
-                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
-                                        max(rs->delta_disks, mddev->delta_disks));
-               if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
-                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
-                                        max_nr_stripes);
+               if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
+                       DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
                if (rebuild_disks)
                        for (i = 0; i < rs->raid_disks; i++)
                                if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
                                        DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
                                                         rs->dev[i].rdev.raid_disk);
+               if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
+                       DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
+                                         mddev->bitmap_info.daemon_sleep);
+               if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
+                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
+                                        mddev->sync_speed_min);
+               if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
+                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
+                                        mddev->sync_speed_max);
                if (write_mostly_params)
                        for (i = 0; i < rs->raid_disks; i++)
                                if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
                if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
                        DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
                                          mddev->bitmap_info.max_write_behind);
-               if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
-                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
-                                        mddev->sync_speed_max);
-               if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
-                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
-                                        mddev->sync_speed_min);
+               if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
+                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
+                                        max_nr_stripes);
+               if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
+                       DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
+                                          (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
+               if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
+                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
+                                        raid10_md_layout_to_copies(mddev->layout));
+               if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
+                       DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
+                                        raid10_md_layout_to_format(mddev->layout));
+               if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
+                       DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
+                                        max(rs->delta_disks, mddev->delta_disks));
+               if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
+                       DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
+                                          (unsigned long long) rs->data_offset);
                if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
                        DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
                                        __get_dev_name(rs->journal_dev.dev));
+               if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
+                       DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
+                                        md_journal_mode_to_dm_raid(rs->journal_dev.mode));
                DMEMIT(" %d", rs->raid_disks);
                for (i = 0; i < rs->raid_disks; i++)
                        DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@@ -3791,7 -3871,7 +3871,7 @@@ static void raid_resume(struct dm_targe
  
  static struct target_type raid_target = {
        .name = "raid",
-       .version = {1, 10, 1},
+       .version = {1, 11, 1},
        .module = THIS_MODULE,
        .ctr = raid_ctr,
        .dtr = raid_dtr,
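
The journal_mode handling added to dm-raid above reduces to a string/constant mapping table walked in both directions (ctr parsing and status reporting). The same table-lookup pattern as a self-contained user-space sketch; the R5C_* values are placeholders here, since the real constants come from the MD raid5-cache code:

#include <stdio.h>
#include <string.h>
#include <strings.h>

/* Placeholder values for the MD journal cache modes. */
enum { R5C_JOURNAL_MODE_WRITE_THROUGH = 0, R5C_JOURNAL_MODE_WRITE_BACK = 1 };

static const struct {
	int mode;
	const char *param;
} _raid456_journal_mode[] = {
	{ R5C_JOURNAL_MODE_WRITE_THROUGH, "writethrough" },
	{ R5C_JOURNAL_MODE_WRITE_BACK,    "writeback" },
};

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Table parameter -> MD constant; failure is -1 here instead of -EINVAL. */
static int journal_mode_to_md(const char *mode)
{
	size_t m = ARRAY_SIZE(_raid456_journal_mode);

	while (m--)
		if (!strcasecmp(mode, _raid456_journal_mode[m].param))
			return _raid456_journal_mode[m].mode;
	return -1;
}

/* MD constant -> string, as emitted in the STATUSTYPE_TABLE output. */
static const char *md_to_journal_mode(int mode)
{
	size_t m = ARRAY_SIZE(_raid456_journal_mode);

	while (m--)
		if (mode == _raid456_journal_mode[m].mode)
			return _raid456_journal_mode[m].param;
	return "unknown";
}

int main(void)
{
	int mode = journal_mode_to_md("WriteBack");

	printf("%d -> %s\n", mode, md_to_journal_mode(mode));
	return 0;
}
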
diff --combined drivers/md/dm-rq.c
index bff7e3bdb4ed1d6bf8ff367e11fa13749dfd6776,a6e8da9da7a4dd65550bd1753a7563eb487d6863..d445b712970b268293aa4a34cbbbb6ad3549131d
@@@ -280,7 -280,7 +280,7 @@@ static void dm_requeue_original_request
        if (!rq->q->mq_ops)
                dm_old_requeue_request(rq);
        else
-               dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0);
+               dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0);
  
        rq_completed(md, rw, false);
  }
@@@ -298,14 -298,9 +298,14 @@@ static void dm_done(struct request *clo
                        r = rq_end_io(tio->ti, clone, error, &tio->info);
        }
  
 -      if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
 -                   !clone->q->limits.max_write_same_sectors))
 -              disable_write_same(tio->md);
 +      if (unlikely(r == -EREMOTEIO)) {
 +              if (req_op(clone) == REQ_OP_WRITE_SAME &&
 +                  !clone->q->limits.max_write_same_sectors)
 +                      disable_write_same(tio->md);
 +              if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
 +                  !clone->q->limits.max_write_zeroes_sectors)
 +                      disable_write_zeroes(tio->md);
 +      }
  
        if (r <= 0)
                /* The target wants to complete the I/O */
@@@ -363,7 -358,7 +363,7 @@@ static void dm_complete_request(struct 
        if (!rq->q->mq_ops)
                blk_complete_request(rq);
        else
 -              blk_mq_complete_request(rq, error);
 +              blk_mq_complete_request(rq);
  }
  
  /*
@@@ -760,14 -755,13 +760,14 @@@ static int dm_mq_queue_rq(struct blk_mq
                /* Undo dm_start_request() before requeuing */
                rq_end_stats(md, rq);
                rq_completed(md, rq_data_dir(rq), false);
 +              blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
                return BLK_MQ_RQ_QUEUE_BUSY;
        }
  
        return BLK_MQ_RQ_QUEUE_OK;
  }
  
 -static struct blk_mq_ops dm_mq_ops = {
 +static const struct blk_mq_ops dm_mq_ops = {
        .queue_rq = dm_mq_queue_rq,
        .complete = dm_softirq_done,
        .init_request = dm_mq_init_request,
@@@ -815,10 -809,14 +815,14 @@@ int dm_mq_init_request_queue(struct map
        dm_init_md_queue(md);
  
        /* backfill 'mq' sysfs registration normally done in blk_register_queue */
-       blk_mq_register_dev(disk_to_dev(md->disk), q);
+       err = blk_mq_register_dev(disk_to_dev(md->disk), q);
+       if (err)
+               goto out_cleanup_queue;
  
        return 0;
  
+ out_cleanup_queue:
+       blk_cleanup_queue(q);
  out_tag_set:
        blk_mq_free_tag_set(md->tag_set);
  out_kfree_tag_set:
diff --combined drivers/md/dm-stripe.c
index 5ef49c121d9955dfafaf7b4ba30c50f3449c1df0,d7e1b86e7570b16759224c7ca06345883fcdbf8e..4b50ae115c6d34370cde6780178c1c49aabeba67
@@@ -169,7 -169,6 +169,7 @@@ static int stripe_ctr(struct dm_target 
        ti->num_flush_bios = stripes;
        ti->num_discard_bios = stripes;
        ti->num_write_same_bios = stripes;
 +      ti->num_write_zeroes_bios = stripes;
  
        sc->chunk_size = chunk_size;
        if (chunk_size & (chunk_size - 1))
@@@ -294,7 -293,6 +294,7 @@@ static int stripe_map(struct dm_target 
                return DM_MAPIO_REMAPPED;
        }
        if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
 +          unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
            unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
                target_bio_nr = dm_bio_get_target_bio_nr(bio);
                BUG_ON(target_bio_nr >= sc->stripes);
@@@ -442,6 -440,7 +442,7 @@@ static void stripe_io_hints(struct dm_t
  static struct target_type stripe_target = {
        .name   = "striped",
        .version = {1, 6, 0},
+       .features = DM_TARGET_PASSES_INTEGRITY,
        .module = THIS_MODULE,
        .ctr    = stripe_ctr,
        .dtr    = stripe_dtr,
diff --combined drivers/md/dm-table.c
index 958275aca008454a460b0fbe22f543bad2f4172a,a02a0482915666b1416b16c0f8853d2d68cb3468..5f5eae41f8041ee3a70540f1fd78e01444abf247
@@@ -30,7 -30,7 +30,7 @@@
  
  struct dm_table {
        struct mapped_device *md;
-       unsigned type;
+       enum dm_queue_mode type;
  
        /* btree table */
        unsigned int depth;
@@@ -47,6 -47,7 +47,7 @@@
        bool integrity_supported:1;
        bool singleton:1;
        bool all_blk_mq:1;
+       unsigned integrity_added:1;
  
        /*
         * Indicates the rw permissions for the new logical
@@@ -372,7 -373,7 +373,7 @@@ static int upgrade_mode(struct dm_dev_i
   */
  dev_t dm_get_dev_t(const char *path)
  {
-       dev_t uninitialized_var(dev);
+       dev_t dev;
        struct block_device *bdev;
  
        bdev = lookup_bdev(path);
@@@ -626,13 -627,13 +627,13 @@@ static int validate_hardware_logical_bl
  
        struct dm_target *uninitialized_var(ti);
        struct queue_limits ti_limits;
-       unsigned i = 0;
+       unsigned i;
  
        /*
         * Check each entry in the table in turn.
         */
-       while (i < dm_table_get_num_targets(table)) {
-               ti = dm_table_get_target(table, i++);
+       for (i = 0; i < dm_table_get_num_targets(table); i++) {
+               ti = dm_table_get_target(table, i);
  
                blk_set_stacking_limits(&ti_limits);
  
@@@ -725,6 -726,9 +726,9 @@@ int dm_table_add_target(struct dm_tabl
                t->immutable_target_type = tgt->type;
        }
  
+       if (dm_target_has_integrity(tgt->type))
+               t->integrity_added = 1;
        tgt->table = t;
        tgt->begin = start;
        tgt->len = len;
@@@ -821,19 -825,19 +825,19 @@@ void dm_consume_args(struct dm_arg_set 
  }
  EXPORT_SYMBOL(dm_consume_args);
  
- static bool __table_type_bio_based(unsigned table_type)
+ static bool __table_type_bio_based(enum dm_queue_mode table_type)
  {
        return (table_type == DM_TYPE_BIO_BASED ||
                table_type == DM_TYPE_DAX_BIO_BASED);
  }
  
- static bool __table_type_request_based(unsigned table_type)
+ static bool __table_type_request_based(enum dm_queue_mode table_type)
  {
        return (table_type == DM_TYPE_REQUEST_BASED ||
                table_type == DM_TYPE_MQ_REQUEST_BASED);
  }
  
- void dm_table_set_type(struct dm_table *t, unsigned type)
+ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
  {
        t->type = type;
  }
@@@ -850,11 -854,11 +854,11 @@@ static int device_supports_dax(struct d
  static bool dm_table_supports_dax(struct dm_table *t)
  {
        struct dm_target *ti;
-       unsigned i = 0;
+       unsigned i;
  
        /* Ensure that all targets support DAX. */
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               ti = dm_table_get_target(t, i);
  
                if (!ti->type->direct_access)
                        return false;
@@@ -875,7 -879,7 +879,7 @@@ static int dm_table_determine_type(stru
        struct dm_target *tgt;
        struct dm_dev_internal *dd;
        struct list_head *devices = dm_table_get_devices(t);
-       unsigned live_md_type = dm_get_md_type(t->md);
+       enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
  
        if (t->type != DM_TYPE_NONE) {
                /* target already set the table's type */
@@@ -984,7 -988,7 +988,7 @@@ verify_rq_based
        return 0;
  }
  
- unsigned dm_table_get_type(struct dm_table *t)
+ enum dm_queue_mode dm_table_get_type(struct dm_table *t)
  {
        return t->type;
  }
@@@ -1006,11 -1010,11 +1010,11 @@@ struct dm_target *dm_table_get_immutabl
  
  struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
  {
-       struct dm_target *uninitialized_var(ti);
-       unsigned i = 0;
+       struct dm_target *ti;
+       unsigned i;
  
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               ti = dm_table_get_target(t, i);
                if (dm_target_is_wildcard(ti->type))
                        return ti;
        }
@@@ -1035,7 -1039,7 +1039,7 @@@ bool dm_table_all_blk_mq_devices(struc
  
  static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
  {
-       unsigned type = dm_table_get_type(t);
+       enum dm_queue_mode type = dm_table_get_type(t);
        unsigned per_io_data_size = 0;
        struct dm_target *tgt;
        unsigned i;
@@@ -1131,6 -1135,13 +1135,13 @@@ static struct gendisk * dm_table_get_in
        struct list_head *devices = dm_table_get_devices(t);
        struct dm_dev_internal *dd = NULL;
        struct gendisk *prev_disk = NULL, *template_disk = NULL;
+       unsigned i;
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               struct dm_target *ti = dm_table_get_target(t, i);
+               if (!dm_target_passes_integrity(ti->type))
+                       goto no_integrity;
+       }
  
        list_for_each_entry(dd, devices, list) {
                template_disk = dd->dm_dev->bdev->bd_disk;
@@@ -1168,6 -1179,10 +1179,10 @@@ static int dm_table_register_integrity(
        struct mapped_device *md = t->md;
        struct gendisk *template_disk = NULL;
  
+       /* If target handles integrity itself do not register it here. */
+       if (t->integrity_added)
+               return 0;
        template_disk = dm_table_get_integrity_disk(t);
        if (!template_disk)
                return 0;
@@@ -1313,15 -1328,16 +1328,16 @@@ static int count_device(struct dm_targe
   */
  bool dm_table_has_no_data_devices(struct dm_table *table)
  {
-       struct dm_target *uninitialized_var(ti);
-       unsigned i = 0, num_devices = 0;
+       struct dm_target *ti;
+       unsigned i, num_devices;
  
-       while (i < dm_table_get_num_targets(table)) {
-               ti = dm_table_get_target(table, i++);
+       for (i = 0; i < dm_table_get_num_targets(table); i++) {
+               ti = dm_table_get_target(table, i);
  
                if (!ti->type->iterate_devices)
                        return false;
  
+               num_devices = 0;
                ti->type->iterate_devices(ti, count_device, &num_devices);
                if (num_devices)
                        return false;
  int dm_calculate_queue_limits(struct dm_table *table,
                              struct queue_limits *limits)
  {
-       struct dm_target *uninitialized_var(ti);
+       struct dm_target *ti;
        struct queue_limits ti_limits;
-       unsigned i = 0;
+       unsigned i;
  
        blk_set_stacking_limits(limits);
  
-       while (i < dm_table_get_num_targets(table)) {
+       for (i = 0; i < dm_table_get_num_targets(table); i++) {
                blk_set_stacking_limits(&ti_limits);
  
-               ti = dm_table_get_target(table, i++);
+               ti = dm_table_get_target(table, i);
  
                if (!ti->type->iterate_devices)
                        goto combine_limits;
@@@ -1394,6 -1410,9 +1410,9 @@@ static void dm_table_verify_integrity(s
  {
        struct gendisk *template_disk = NULL;
  
+       if (t->integrity_added)
+               return;
        if (t->integrity_supported) {
                /*
                 * Verify that the original integrity profile
@@@ -1424,7 -1443,7 +1443,7 @@@ static int device_flush_capable(struct 
  static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
  {
        struct dm_target *ti;
-       unsigned i = 0;
+       unsigned i;
  
        /*
         * Require at least one underlying device to support flushes.
         * so we need to use iterate_devices here, which targets
         * supporting flushes must provide.
         */
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               ti = dm_table_get_target(t, i);
  
                if (!ti->num_flush_bios)
                        continue;
        return false;
  }
  
 -static bool dm_table_discard_zeroes_data(struct dm_table *t)
 -{
 -      struct dm_target *ti;
 -      unsigned i = 0;
 -
 -      /* Ensure that all targets supports discard_zeroes_data. */
 -      while (i < dm_table_get_num_targets(t)) {
 -              ti = dm_table_get_target(t, i++);
 -
 -              if (ti->discard_zeroes_data_unsupported)
 -                      return false;
 -      }
 -
 -      return true;
 -}
 -
  static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
                            sector_t start, sector_t len, void *data)
  {
@@@ -1477,10 -1512,10 +1496,10 @@@ static bool dm_table_all_devices_attrib
                                           iterate_devices_callout_fn func)
  {
        struct dm_target *ti;
-       unsigned i = 0;
+       unsigned i;
  
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               ti = dm_table_get_target(t, i);
  
                if (!ti->type->iterate_devices ||
                    !ti->type->iterate_devices(ti, func, NULL))
@@@ -1501,10 -1536,10 +1520,10 @@@ static int device_not_write_same_capabl
  static bool dm_table_supports_write_same(struct dm_table *t)
  {
        struct dm_target *ti;
-       unsigned i = 0;
+       unsigned i;
  
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               ti = dm_table_get_target(t, i);
  
                if (!ti->num_write_same_bios)
                        return false;
        return true;
  }
  
 +static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
 +                                         sector_t start, sector_t len, void *data)
 +{
 +      struct request_queue *q = bdev_get_queue(dev->bdev);
 +
 +      return q && !q->limits.max_write_zeroes_sectors;
 +}
 +
 +static bool dm_table_supports_write_zeroes(struct dm_table *t)
 +{
 +      struct dm_target *ti;
 +      unsigned i = 0;
 +
 +      while (i < dm_table_get_num_targets(t)) {
 +              ti = dm_table_get_target(t, i++);
 +
 +              if (!ti->num_write_zeroes_bios)
 +                      return false;
 +
 +              if (!ti->type->iterate_devices ||
 +                  ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
 +                      return false;
 +      }
 +
 +      return true;
 +}
 +
 +
  static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
                                  sector_t start, sector_t len, void *data)
  {
  static bool dm_table_supports_discards(struct dm_table *t)
  {
        struct dm_target *ti;
-       unsigned i = 0;
+       unsigned i;
  
        /*
         * Unless any target used by the table set discards_supported,
         * require at least one underlying device to support discards.
         * t->devices includes internal dm devices such as mirror logs
         * so we need to use iterate_devices here, which targets
         * supporting discard selectively must provide.
         */
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               ti = dm_table_get_target(t, i);
  
                if (!ti->num_discard_bios)
                        continue;
@@@ -1604,6 -1611,9 +1623,6 @@@ void dm_table_set_restrictions(struct d
        }
        blk_queue_write_cache(q, wc, fua);
  
 -      if (!dm_table_discard_zeroes_data(t))
 -              q->limits.discard_zeroes_data = 0;
 -
        /* Ensure that all underlying devices are non-rotational. */
        if (dm_table_all_devices_attribute(t, device_is_nonrot))
                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
  
        if (!dm_table_supports_write_same(t))
                q->limits.max_write_same_sectors = 0;
 +      if (!dm_table_supports_write_zeroes(t))
 +              q->limits.max_write_zeroes_sectors = 0;
  
        if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
                queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
@@@ -1672,6 -1680,8 +1691,8 @@@ static void suspend_targets(struct dm_t
        int i = t->num_targets;
        struct dm_target *ti = t->targets;
  
+       lockdep_assert_held(&t->md->suspend_lock);
+
        while (i--) {
                switch (mode) {
                case PRESUSPEND:
@@@ -1719,6 -1729,8 +1740,8 @@@ int dm_table_resume_targets(struct dm_t
  {
        int i, r = 0;
  
+       lockdep_assert_held(&t->md->suspend_lock);
+
        for (i = 0; i < t->num_targets; i++) {
                struct dm_target *ti = t->targets + i;
  
diff --combined drivers/md/dm-thin.c
index a5f1916f621a9972449dbbfa2408397484a5d7f3,f90bcbf24ebccee462ae3afef7cf6ae77644a7c3..17ad50daed08ef5022b8648ef2e8701208c85a9d
@@@ -5,7 -5,7 +5,7 @@@
   */
  
  #include "dm-thin-metadata.h"
- #include "dm-bio-prison.h"
+ #include "dm-bio-prison-v1.h"
  #include "dm.h"
  
  #include <linux/device-mapper.h>
@@@ -1069,6 -1069,7 +1069,7 @@@ static void passdown_endio(struct bio *
         * to unmap (we ignore err).
         */
        queue_passdown_pt2(bio->bi_private);
+       bio_put(bio);
  }
  
  static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
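
The bio_put() added to passdown_endio() closes a reference leak: the bio completed here was allocated earlier in the discard passdown path, so this end_io is the natural place to drop the last reference once pt2 has been queued. The general alloc/end_io pairing, with placeholder names:

static void example_endio(struct bio *bio)
{
	struct completion *done = bio->bi_private;

	complete(done);
	bio_put(bio);	/* drop the reference taken when the bio was allocated */
}
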
@@@ -3263,6 -3264,7 +3264,6 @@@ static int pool_ctr(struct dm_target *t
         * them down to the data device.  The thin device's discard
         * processing will cause mappings to be removed from the btree.
         */
 -      ti->discard_zeroes_data_unsupported = true;
        if (pf.discard_enabled && pf.discard_passdown) {
                ti->num_discard_bios = 1;
  
@@@ -4118,6 -4120,7 +4119,6 @@@ static int thin_ctr(struct dm_target *t
        ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
  
        /* In case the pool supports discards, pass them on. */
 -      ti->discard_zeroes_data_unsupported = true;
        if (tc->pool->pf.discard_enabled) {
                ti->discards_supported = true;
                ti->num_discard_bios = 1;
diff --combined drivers/md/dm.c
index 8bf397729bbd28a964f58fe3bba3ca885bfc08ab,dbfaf6dde657ae1265341cc53f7b853404477bf7..268edf402bbb058189ed4707395a0a6013ac983d
@@@ -810,6 -810,7 +810,6 @@@ static void dec_pending(struct dm_io *i
                        queue_io(md, bio);
                } else {
                        /* done with normal IO or empty flush */
 -                      trace_block_bio_complete(md->queue, bio, io_error);
                        bio->bi_error = io_error;
                        bio_endio(bio);
                }
@@@ -824,14 -825,6 +824,14 @@@ void disable_write_same(struct mapped_d
        limits->max_write_same_sectors = 0;
  }
  
 +void disable_write_zeroes(struct mapped_device *md)
 +{
 +      struct queue_limits *limits = dm_get_queue_limits(md);
 +
 +      /* device doesn't really support WRITE ZEROES, disable it */
 +      limits->max_write_zeroes_sectors = 0;
 +}
 +
  static void clone_endio(struct bio *bio)
  {
        int error = bio->bi_error;
                }
        }
  
 -      if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
 -                   !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
 -              disable_write_same(md);
 +      if (unlikely(r == -EREMOTEIO)) {
 +              if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 +                  !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 +                      disable_write_same(md);
 +              if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 +                  !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
 +                      disable_write_zeroes(md);
 +      }
  
        free_tio(tio);
        dec_pending(io, error);
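
disable_write_zeroes() follows the existing disable_write_same() convention: the first -EREMOTEIO completion for an op that the underlying queue has already stopped advertising zeroes the mapped device's own limit, so no further bios of that type are issued. The condition being tested reads naturally as a predicate (hypothetical helper, same logic as the inline check above):

static bool example_write_zeroes_unsupported(struct bio *bio, int error)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);

	return error == -EREMOTEIO &&
	       bio_op(bio) == REQ_OP_WRITE_ZEROES &&
	       q && !q->limits.max_write_zeroes_sectors;
}
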
@@@ -1001,29 -989,26 +1001,29 @@@ static void flush_current_bio_list(stru
        struct dm_offload *o = container_of(cb, struct dm_offload, cb);
        struct bio_list list;
        struct bio *bio;
 +      int i;
  
        INIT_LIST_HEAD(&o->cb.list);
  
        if (unlikely(!current->bio_list))
                return;
  
 -      list = *current->bio_list;
 -      bio_list_init(current->bio_list);
 -
 -      while ((bio = bio_list_pop(&list))) {
 -              struct bio_set *bs = bio->bi_pool;
 -              if (unlikely(!bs) || bs == fs_bio_set) {
 -                      bio_list_add(current->bio_list, bio);
 -                      continue;
 +      for (i = 0; i < 2; i++) {
 +              list = current->bio_list[i];
 +              bio_list_init(&current->bio_list[i]);
 +
 +              while ((bio = bio_list_pop(&list))) {
 +                      struct bio_set *bs = bio->bi_pool;
 +                      if (unlikely(!bs) || bs == fs_bio_set) {
 +                              bio_list_add(&current->bio_list[i], bio);
 +                              continue;
 +                      }
 +
 +                      spin_lock(&bs->rescue_lock);
 +                      bio_list_add(&bs->rescue_list, bio);
 +                      queue_work(bs->rescue_workqueue, &bs->rescue_work);
 +                      spin_unlock(&bs->rescue_lock);
                }
 -
 -              spin_lock(&bs->rescue_lock);
 -              bio_list_add(&bs->rescue_list, bio);
 -              queue_work(bs->rescue_workqueue, &bs->rescue_work);
 -              spin_unlock(&bs->rescue_lock);
        }
  }
  
@@@ -1104,8 -1089,18 +1104,18 @@@ static int clone_bio(struct dm_target_i
  
        __bio_clone_fast(clone, bio);
  
-       if (bio_integrity(bio)) {
-               int r = bio_integrity_clone(clone, bio, GFP_NOIO);
+       if (unlikely(bio_integrity(bio) != NULL)) {
+               int r;
+               if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
+                            !dm_target_passes_integrity(tio->ti->type))) {
+                       DMWARN("%s: the target %s doesn't support integrity data.",
+                               dm_device_name(tio->io->md),
+                               tio->ti->type->name);
+                       return -EIO;
+               }
+               r = bio_integrity_clone(clone, bio, GFP_NOIO);
                if (r < 0)
                        return r;
        }
        bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
        clone->bi_iter.bi_size = to_bytes(len);
  
-       if (bio_integrity(bio))
+       if (unlikely(bio_integrity(bio) != NULL))
                bio_integrity_trim(clone, 0, len);
  
        return 0;
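
clone_bio() now refuses to clone an integrity payload onto a target that neither implements its own integrity handling nor declares that it passes integrity data through; the two feature flags it tests are added to include/linux/device-mapper.h later in this diff. A hypothetical pass-through target would declare the capability in its target_type, roughly (everything except the .features flag is a placeholder):

static struct target_type example_target = {
	.name     = "example",
	.version  = {1, 0, 0},
	.features = DM_TARGET_PASSES_INTEGRITY,	/* integrity bios may be cloned onto us */
	.module   = THIS_MODULE,
	.ctr      = example_ctr,
	.dtr      = example_dtr,
	.map      = example_map,
};
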
@@@ -1214,11 -1209,6 +1224,11 @@@ static unsigned get_num_write_same_bios
        return ti->num_write_same_bios;
  }
  
 +static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
 +{
 +      return ti->num_write_zeroes_bios;
 +}
 +
  typedef bool (*is_split_required_fn)(struct dm_target *ti);
  
  static bool is_split_required_for_discard(struct dm_target *ti)
@@@ -1273,11 -1263,6 +1283,11 @@@ static int __send_write_same(struct clo
        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
  }
  
 +static int __send_write_zeroes(struct clone_info *ci)
 +{
 +      return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
 +}
 +
  /*
   * Select the correct strategy for processing a non-flush bio.
   */
@@@ -1292,8 -1277,6 +1302,8 @@@ static int __split_and_process_non_flus
                return __send_discard(ci);
        else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
                return __send_write_same(ci);
 +      else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
 +              return __send_write_zeroes(ci);
  
        ti = dm_table_find_target(ci->map, ci->sector);
        if (!dm_target_is_valid(ti))
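
__send_write_zeroes() reuses __send_changing_extent_only(), so a REQ_OP_WRITE_ZEROES bio reaches each target's map method like any other remapped bio and carries no data pages. A simple pass-through target needs nothing op-specific on the map side; a sketch with placeholder helpers:

static int example_map(struct dm_target *ti, struct bio *bio)
{
	bio->bi_bdev = example_dev(ti)->bdev;		/* example_dev() is a placeholder */
	if (bio_sectors(bio))
		bio->bi_iter.bi_sector = example_map_sector(ti, bio->bi_iter.bi_sector);

	return DM_MAPIO_REMAPPED;	/* dm core resubmits the bio to the device below */
}
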
@@@ -1715,6 -1698,8 +1725,8 @@@ static void event_callback(void *contex
   */
  static void __set_size(struct mapped_device *md, sector_t size)
  {
+       lockdep_assert_held(&md->suspend_lock);
+
        set_capacity(md->disk, size);
  
        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
@@@ -1822,13 -1807,13 +1834,13 @@@ void dm_unlock_md_type(struct mapped_de
        mutex_unlock(&md->type_lock);
  }
  
- void dm_set_md_type(struct mapped_device *md, unsigned type)
+ void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
  {
        BUG_ON(!mutex_is_locked(&md->type_lock));
        md->type = type;
  }
  
- unsigned dm_get_md_type(struct mapped_device *md)
+ enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
  {
        return md->type;
  }
@@@ -1855,7 -1840,7 +1867,7 @@@ EXPORT_SYMBOL_GPL(dm_get_queue_limits)
  int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
  {
        int r;
-       unsigned type = dm_get_md_type(md);
+       enum dm_queue_mode type = dm_get_md_type(md);
  
        switch (type) {
        case DM_TYPE_REQUEST_BASED:
                if (type == DM_TYPE_DAX_BIO_BASED)
                        queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
                break;
+       case DM_TYPE_NONE:
+               WARN_ON_ONCE(true);
+               break;
        }
  
        return 0;
@@@ -2164,8 -2152,6 +2179,6 @@@ static void unlock_fs(struct mapped_dev
   * If __dm_suspend returns 0, the device is completely quiescent
   * now. There is no request-processing activity. All new requests
   * are being added to md->deferred list.
-  *
-  * Caller must hold md->suspend_lock
   */
  static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
                        unsigned suspend_flags, long task_state,
         */
        if (noflush)
                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+       else
+               pr_debug("%s: suspending with flush\n", dm_device_name(md));
  
        /*
         * This gets reverted if there's an error later and the targets
@@@ -2381,6 -2369,8 +2396,8 @@@ static void __dm_internal_suspend(struc
  {
        struct dm_table *map = NULL;
  
+       lockdep_assert_held(&md->suspend_lock);
+
        if (md->internal_suspend_count++)
                return; /* nested internal suspend */
  
@@@ -2571,7 -2561,7 +2588,7 @@@ int dm_noflush_suspending(struct dm_tar
  }
  EXPORT_SYMBOL_GPL(dm_noflush_suspending);
  
- struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
                                            unsigned integrity, unsigned per_io_data_size)
  {
        struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
diff --combined include/linux/device-mapper.h
index c7ea33e38fb9e705baf2921f6cefe167574f9483,1ce4036224eb76f62b52f7d8204425d130173ccf..925b63cdef527bf055a87b9c3c53bb1e28f26568
@@@ -22,11 -22,13 +22,13 @@@ struct bio_vec
  /*
   * Type of table, mapped_device's mempool and request_queue
   */
- #define DM_TYPE_NONE                  0
- #define DM_TYPE_BIO_BASED             1
- #define DM_TYPE_REQUEST_BASED         2
- #define DM_TYPE_MQ_REQUEST_BASED      3
- #define DM_TYPE_DAX_BIO_BASED         4
+ enum dm_queue_mode {
+       DM_TYPE_NONE             = 0,
+       DM_TYPE_BIO_BASED        = 1,
+       DM_TYPE_REQUEST_BASED    = 2,
+       DM_TYPE_MQ_REQUEST_BASED = 3,
+       DM_TYPE_DAX_BIO_BASED    = 4,
+ };
  
  typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
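
Converting the DM_TYPE_* constants into enum dm_queue_mode is primarily a type-safety cleanup: interfaces such as dm_set_md_type(), dm_get_md_type() and dm_alloc_md_mempools() now state what they accept, and a switch over the type can be checked for exhaustiveness by the compiler, which is presumably why dm_setup_md_queue() gained an explicit DM_TYPE_NONE arm earlier in this diff. A sketch of the resulting pattern (function name is a placeholder):

static int example_setup_queue(struct mapped_device *md, enum dm_queue_mode type)
{
	switch (type) {
	case DM_TYPE_REQUEST_BASED:
	case DM_TYPE_MQ_REQUEST_BASED:
		/* request-based setup */
		break;
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		/* bio-based setup */
		break;
	case DM_TYPE_NONE:
		WARN_ON_ONCE(true);	/* handling every enumerator keeps -Wswitch quiet */
		break;
	}

	return 0;
}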
  
@@@ -221,6 -223,18 +223,18 @@@ struct target_type 
   */
  typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
  
+ /*
+  * A target implements own bio data integrity.
+  */
+ #define DM_TARGET_INTEGRITY           0x00000010
+ #define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY)
+
+ /*
+  * A target passes integrity data to the lower device.
+  */
+ #define DM_TARGET_PASSES_INTEGRITY    0x00000020
+ #define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY)
+
  struct dm_target {
        struct dm_table *table;
        struct target_type *type;
         */
        unsigned num_write_same_bios;
  
 +      /*
 +       * The number of WRITE ZEROES bios that will be submitted to the target.
 +       * The bio number can be accessed with dm_bio_get_target_bio_nr.
 +       */
 +      unsigned num_write_zeroes_bios;
 +
        /*
         * The minimum number of extra bytes allocated in each io for the
         * target to use.
         * on max_io_len boundary.
         */
        bool split_discard_bios:1;
 -
 -      /*
 -       * Set if this target does not return zeroes on discarded blocks.
 -       */
 -      bool discard_zeroes_data_unsupported:1;
  };
  
  /* Each target can link one of these into the table */
@@@ -465,7 -478,7 +479,7 @@@ void dm_table_add_target_callbacks(stru
   * Useful for "hybrid" target (supports both bio-based
   * and request-based).
   */
- void dm_table_set_type(struct dm_table *t, unsigned type);
+ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type);
  
  /*
   * Finally call this to make the table ready for use.