*/
#include "dm.h"
- #include "dm-bio-prison.h"
+ #include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
+ #include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
/*----------------------------------------------------------------*/
- #define IOT_RESOLUTION 4
+ /*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+ /*----------------------------------------------------------------*/
struct io_tracker {
spinlock_t lock;
/*----------------------------------------------------------------*/
/*
- * Glossary:
- *
- * oblock: index of an origin block
- * cblock: index of a cache block
- * promotion: movement of a block from origin to cache
- * demotion: movement of a block from cache to origin
- * migration: movement of a block between the origin and cache device,
- * either direction
+ * Represents a chunk of future work. 'input' allows continuations to pass
+ * values between themselves, typically error values.
*/
+ struct continuation {
+ struct work_struct ws;
+ int input;
+ };
+
+ static inline void init_continuation(struct continuation *k,
+ void (*fn)(struct work_struct *))
+ {
+ INIT_WORK(&k->ws, fn);
+ k->input = 0;
+ }
+
+ static inline void queue_continuation(struct workqueue_struct *wq,
+ struct continuation *k)
+ {
+ queue_work(wq, &k->ws);
+ }
/*----------------------------------------------------------------*/
+ /*
+ * The batcher collects together pieces of work that need a particular
+ * operation to occur before they can proceed (typically a commit).
+ */
+ struct batcher {
+ /*
+ * The operation that everyone is waiting for.
+ */
+ int (*commit_op)(void *context);
+ void *commit_context;
+
+ /*
+ * This is how bios should be issued once the commit op is complete
+ * (accounted_request).
+ */
+ void (*issue_op)(struct bio *bio, void *context);
+ void *issue_context;
+
+ /*
+ * Queued work gets put on here after commit.
+ */
+ struct workqueue_struct *wq;
+
+ spinlock_t lock;
+ struct list_head work_items;
+ struct bio_list bios;
+ struct work_struct commit_work;
+
+ bool commit_scheduled;
+ };
+
+ static void __commit(struct work_struct *_ws)
+ {
+ struct batcher *b = container_of(_ws, struct batcher, commit_work);
+
+ int r;
+ unsigned long flags;
+ struct list_head work_items;
+ struct work_struct *ws, *tmp;
+ struct continuation *k;
+ struct bio *bio;
+ struct bio_list bios;
+
+ INIT_LIST_HEAD(&work_items);
+ bio_list_init(&bios);
+
+ /*
+ * We have to grab these before the commit_op to avoid a race
+ * condition.
+ */
+ spin_lock_irqsave(&b->lock, flags);
+ list_splice_init(&b->work_items, &work_items);
+ bio_list_merge(&bios, &b->bios);
+ bio_list_init(&b->bios);
+ b->commit_scheduled = false;
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ r = b->commit_op(b->commit_context);
+
+ list_for_each_entry_safe(ws, tmp, &work_items, entry) {
+ k = container_of(ws, struct continuation, ws);
+ k->input = r;
+ INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
+ queue_work(b->wq, ws);
+ }
+
+ while ((bio = bio_list_pop(&bios))) {
+ if (r) {
+ bio->bi_error = r;
+ bio_endio(bio);
+ } else
+ b->issue_op(bio, b->issue_context);
+ }
+ }
+
+ static void batcher_init(struct batcher *b,
+ int (*commit_op)(void *),
+ void *commit_context,
+ void (*issue_op)(struct bio *bio, void *),
+ void *issue_context,
+ struct workqueue_struct *wq)
+ {
+ b->commit_op = commit_op;
+ b->commit_context = commit_context;
+ b->issue_op = issue_op;
+ b->issue_context = issue_context;
+ b->wq = wq;
+
+ spin_lock_init(&b->lock);
+ INIT_LIST_HEAD(&b->work_items);
+ bio_list_init(&b->bios);
+ INIT_WORK(&b->commit_work, __commit);
+ b->commit_scheduled = false;
+ }
+
+ static void async_commit(struct batcher *b)
+ {
+ queue_work(b->wq, &b->commit_work);
+ }
+
+ static void continue_after_commit(struct batcher *b, struct continuation *k)
+ {
+ unsigned long flags;
+ bool commit_scheduled;
+
+ spin_lock_irqsave(&b->lock, flags);
+ commit_scheduled = b->commit_scheduled;
+ list_add_tail(&k->ws.entry, &b->work_items);
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ if (commit_scheduled)
+ async_commit(b);
+ }
+
+ /*
+ * Bios are errored if commit failed.
+ */
+ static void issue_after_commit(struct batcher *b, struct bio *bio)
+ {
+ unsigned long flags;
+ bool commit_scheduled;
+
+ spin_lock_irqsave(&b->lock, flags);
+ commit_scheduled = b->commit_scheduled;
+ bio_list_add(&b->bios, bio);
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ if (commit_scheduled)
+ async_commit(b);
+ }
+
+ /*
+ * Call this if some urgent work is waiting for the commit to complete.
+ */
+ static void schedule_commit(struct batcher *b)
+ {
+ bool immediate;
+ unsigned long flags;
+
+ spin_lock_irqsave(&b->lock, flags);
+ immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
+ b->commit_scheduled = true;
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ if (immediate)
+ async_commit(b);
+ }
+
/*
* There are a couple of places where we let a bio run, but want to do some
* work before calling its endio function. We do this by temporarily
atomic_t write_miss;
atomic_t demotion;
atomic_t promotion;
+ atomic_t writeback;
atomic_t copies_avoided;
atomic_t cache_cell_clash;
atomic_t commit_count;
atomic_t discard_count;
};
- /*
- * Defines a range of cblocks, begin to (end - 1) are in the range. end is
- * the one-past-the-end value.
- */
- struct cblock_range {
- dm_cblock_t begin;
- dm_cblock_t end;
- };
-
- struct invalidation_request {
- struct list_head list;
- struct cblock_range *cblocks;
-
- atomic_t complete;
- int err;
-
- wait_queue_head_t result_wait;
- };
-
struct cache {
struct dm_target *ti;
struct dm_target_callbacks callbacks;
spinlock_t lock;
struct list_head deferred_cells;
struct bio_list deferred_bios;
- struct bio_list deferred_flush_bios;
struct bio_list deferred_writethrough_bios;
- struct list_head quiesced_migrations;
- struct list_head completed_migrations;
- struct list_head need_commit_migrations;
sector_t migration_threshold;
wait_queue_head_t migration_wait;
atomic_t nr_allocated_migrations;
*/
atomic_t nr_io_migrations;
- wait_queue_head_t quiescing_wait;
- atomic_t quiescing;
- atomic_t quiescing_ack;
+ struct rw_semaphore quiesce_lock;
/*
* cache_size entries, dirty if set
struct dm_kcopyd_client *copier;
struct workqueue_struct *wq;
- struct work_struct worker;
-
+ struct work_struct deferred_bio_worker;
+ struct work_struct deferred_writethrough_worker;
+ struct work_struct migration_worker;
struct delayed_work waker;
- unsigned long last_commit_jiffies;
-
- struct dm_bio_prison *prison;
- struct dm_deferred_set *all_io_ds;
+ struct dm_bio_prison_v2 *prison;
mempool_t *migration_pool;
struct list_head invalidation_requests;
struct io_tracker origin_tracker;
+
+ struct work_struct commit_ws;
+ struct batcher committer;
+
+ struct rw_semaphore background_work_lock;
};
struct per_bio_data {
bool tick:1;
unsigned req_nr:2;
- struct dm_deferred_entry *all_io_entry;
+ struct dm_bio_prison_cell_v2 *cell;
struct dm_hook_info hook_info;
sector_t len;
};
struct dm_cache_migration {
- struct list_head list;
+ struct continuation k;
struct cache *cache;
- unsigned long start_jiffies;
- dm_oblock_t old_oblock;
- dm_oblock_t new_oblock;
- dm_cblock_t cblock;
-
- bool err:1;
- bool discard:1;
- bool writeback:1;
- bool demote:1;
- bool promote:1;
- bool requeue_holder:1;
- bool invalidate:1;
+ struct policy_work *op;
+ struct bio *overwrite_bio;
+ struct dm_bio_prison_cell_v2 *cell;
- struct dm_bio_prison_cell *old_ocell;
- struct dm_bio_prison_cell *new_ocell;
+ dm_cblock_t invalidate_cblock;
+ dm_oblock_t invalidate_oblock;
};
- /*
- * Processing a bio in the worker thread may require these memory
- * allocations. We prealloc to avoid deadlocks (the same worker thread
- * frees them back to the mempool).
- */
- struct prealloc {
- struct dm_cache_migration *mg;
- struct dm_bio_prison_cell *cell1;
- struct dm_bio_prison_cell *cell2;
- };
+ /*----------------------------------------------------------------*/
+
+ static bool writethrough_mode(struct cache_features *f)
+ {
+ return f->io_mode == CM_IO_WRITETHROUGH;
+ }
+
+ static bool writeback_mode(struct cache_features *f)
+ {
+ return f->io_mode == CM_IO_WRITEBACK;
+ }
+
+ static inline bool passthrough_mode(struct cache_features *f)
+ {
+ return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
+ }
+
+ /*----------------------------------------------------------------*/
+
+ static void wake_deferred_bio_worker(struct cache *cache)
+ {
+ queue_work(cache->wq, &cache->deferred_bio_worker);
+ }
- static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+ static void wake_deferred_writethrough_worker(struct cache *cache)
+ {
+ queue_work(cache->wq, &cache->deferred_writethrough_worker);
+ }
- static void wake_worker(struct cache *cache)
+ static void wake_migration_worker(struct cache *cache)
{
- queue_work(cache->wq, &cache->worker);
+ if (passthrough_mode(&cache->features))
+ return;
+
+ queue_work(cache->wq, &cache->migration_worker);
}
/*----------------------------------------------------------------*/
- static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+ static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
{
- /* FIXME: change to use a local slab. */
- return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+ return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
}
- static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+ static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
{
- dm_bio_prison_free_cell(cache->prison, cell);
+ dm_bio_prison_free_cell_v2(cache->prison, cell);
}
static struct dm_cache_migration *alloc_migration(struct cache *cache)
mempool_free(mg, cache->migration_pool);
}
- static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
- {
- if (!p->mg) {
- p->mg = alloc_migration(cache);
- if (!p->mg)
- return -ENOMEM;
- }
-
- if (!p->cell1) {
- p->cell1 = alloc_prison_cell(cache);
- if (!p->cell1)
- return -ENOMEM;
- }
-
- if (!p->cell2) {
- p->cell2 = alloc_prison_cell(cache);
- if (!p->cell2)
- return -ENOMEM;
- }
-
- return 0;
- }
+ /*----------------------------------------------------------------*/
- static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+ static inline dm_oblock_t oblock_succ(dm_oblock_t b)
{
- if (p->cell2)
- free_prison_cell(cache, p->cell2);
-
- if (p->cell1)
- free_prison_cell(cache, p->cell1);
-
- if (p->mg)
- free_migration(p->mg);
+ return to_oblock(from_oblock(b) + 1ull);
}
- static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+ static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
{
- struct dm_cache_migration *mg = p->mg;
-
- BUG_ON(!mg);
- p->mg = NULL;
-
- return mg;
+ key->virtual = 0;
+ key->dev = 0;
+ key->block_begin = from_oblock(begin);
+ key->block_end = from_oblock(end);
}
/*
- * You must have a cell within the prealloc struct to return. If not this
- * function will BUG() rather than returning NULL.
+ * We have two lock levels. Level 0, which is used to prevent WRITEs, and
+ * level 1 which prevents *both* READs and WRITEs.
*/
- static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+ #define WRITE_LOCK_LEVEL 0
+ #define READ_WRITE_LOCK_LEVEL 1
+
+ static unsigned lock_level(struct bio *bio)
{
- struct dm_bio_prison_cell *r = NULL;
+ return bio_data_dir(bio) == WRITE ?
+ WRITE_LOCK_LEVEL :
+ READ_WRITE_LOCK_LEVEL;
+ }
- if (p->cell1) {
- r = p->cell1;
- p->cell1 = NULL;
+ /*----------------------------------------------------------------
+ * Per bio data
+ *--------------------------------------------------------------*/
- } else if (p->cell2) {
- r = p->cell2;
- p->cell2 = NULL;
- } else
- BUG();
+ /*
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
+ */
+ #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+ #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
- return r;
+ static size_t get_per_bio_data_size(struct cache *cache)
+ {
+ return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}
- /*
- * You can't have more than two cells in a prealloc struct. BUG() will be
- * called if you try and overfill.
- */
- static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+ static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
- if (!p->cell2)
- p->cell2 = cell;
+ struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
+ BUG_ON(!pb);
+ return pb;
+ }
- else if (!p->cell1)
- p->cell1 = cell;
+ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+ {
+ struct per_bio_data *pb = get_per_bio_data(bio, data_size);
- else
- BUG();
+ pb->tick = false;
+ pb->req_nr = dm_bio_get_target_bio_nr(bio);
+ pb->cell = NULL;
+ pb->len = 0;
+
+ return pb;
}
/*----------------------------------------------------------------*/
- static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
+ static void defer_bio(struct cache *cache, struct bio *bio)
{
- key->virtual = 0;
- key->dev = 0;
- key->block_begin = from_oblock(begin);
- key->block_end = from_oblock(end);
- }
+ unsigned long flags;
- /*
- * The caller hands in a preallocated cell, and a free function for it.
- * The cell will be freed if there's an error, or if it wasn't used because
- * a cell with that key already exists.
- */
- typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_deferred_bio_worker(cache);
+ }
- static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
- cell_free_fn free_fn, void *free_context,
- struct dm_bio_prison_cell **cell_result)
+ static void defer_bios(struct cache *cache, struct bio_list *bios)
{
- int r;
- struct dm_cell_key key;
+ unsigned long flags;
- build_key(oblock_begin, oblock_end, &key);
- r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
- if (r)
- free_fn(free_context, cell_prealloc);
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&cache->deferred_bios, bios);
+ bio_list_init(bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
- return r;
+ wake_deferred_bio_worker(cache);
}
- static int bio_detain(struct cache *cache, dm_oblock_t oblock,
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
- cell_free_fn free_fn, void *free_context,
- struct dm_bio_prison_cell **cell_result)
+ /*----------------------------------------------------------------*/
+
+ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
{
+ bool r;
+ size_t pb_size;
+ struct per_bio_data *pb;
+ struct dm_cell_key_v2 key;
dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
- return bio_detain_range(cache, oblock, end, bio,
- cell_prealloc, free_fn, free_context, cell_result);
- }
+ struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
- static int get_cell(struct cache *cache,
- dm_oblock_t oblock,
- struct prealloc *structs,
- struct dm_bio_prison_cell **cell_result)
- {
- int r;
- struct dm_cell_key key;
- struct dm_bio_prison_cell *cell_prealloc;
+ cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
+ if (!cell_prealloc) {
+ defer_bio(cache, bio);
+ return false;
+ }
+
+ build_key(oblock, end, &key);
+ r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
+ if (!r) {
+ /*
+ * Failed to get the lock.
+ */
+ free_prison_cell(cache, cell_prealloc);
+ return r;
+ }
- cell_prealloc = prealloc_get_cell(structs);
+ if (cell != cell_prealloc)
+ free_prison_cell(cache, cell_prealloc);
- build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
- r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
- if (r)
- prealloc_put_cell(structs, cell_prealloc);
+ pb_size = get_per_bio_data_size(cache);
+ pb = get_per_bio_data(bio, pb_size);
+ pb->cell = cell;
return r;
}
return test_bit(from_cblock(b), cache->dirty_bitset);
}
- static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+ static void set_dirty(struct cache *cache, dm_cblock_t cblock)
{
if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
atomic_inc(&cache->nr_dirty);
- policy_set_dirty(cache->policy, oblock);
+ policy_set_dirty(cache->policy, cblock);
}
}
- static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+ /*
+ * These two are called when setting after migrations to force the policy
+ * and dirty bitset to be in sync.
+ */
+ static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
+ {
+ if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
+ atomic_inc(&cache->nr_dirty);
+ policy_set_dirty(cache->policy, cblock);
+ }
+
+ static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
{
if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
- policy_clear_dirty(cache->policy, oblock);
if (atomic_dec_return(&cache->nr_dirty) == 0)
dm_table_event(cache->ti->table);
}
+
+ policy_clear_dirty(cache->policy, cblock);
}
/*----------------------------------------------------------------*/
oblocks_per_dblock(cache)));
}
- static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
- {
- return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
- }
-
static void set_discard(struct cache *cache, dm_dblock_t b)
{
unsigned long flags;
return r;
}
- /*----------------------------------------------------------------*/
-
- static void load_stats(struct cache *cache)
+ /*----------------------------------------------------------------
+ * Remapping
+ *--------------------------------------------------------------*/
+ static void remap_to_origin(struct cache *cache, struct bio *bio)
{
- struct dm_cache_statistics stats;
-
- dm_cache_metadata_get_stats(cache->cmd, &stats);
- atomic_set(&cache->stats.read_hit, stats.read_hits);
- atomic_set(&cache->stats.read_miss, stats.read_misses);
- atomic_set(&cache->stats.write_hit, stats.write_hits);
- atomic_set(&cache->stats.write_miss, stats.write_misses);
- }
-
- static void save_stats(struct cache *cache)
- {
- struct dm_cache_statistics stats;
-
- if (get_cache_mode(cache) >= CM_READ_ONLY)
- return;
-
- stats.read_hits = atomic_read(&cache->stats.read_hit);
- stats.read_misses = atomic_read(&cache->stats.read_miss);
- stats.write_hits = atomic_read(&cache->stats.write_hit);
- stats.write_misses = atomic_read(&cache->stats.write_miss);
-
- dm_cache_metadata_set_stats(cache->cmd, &stats);
- }
-
- /*----------------------------------------------------------------
- * Per bio data
- *--------------------------------------------------------------*/
-
- /*
- * If using writeback, leave out struct per_bio_data's writethrough fields.
- */
- #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
- #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
-
- static bool writethrough_mode(struct cache_features *f)
- {
- return f->io_mode == CM_IO_WRITETHROUGH;
- }
-
- static bool writeback_mode(struct cache_features *f)
- {
- return f->io_mode == CM_IO_WRITEBACK;
- }
-
- static bool passthrough_mode(struct cache_features *f)
- {
- return f->io_mode == CM_IO_PASSTHROUGH;
- }
-
- static size_t get_per_bio_data_size(struct cache *cache)
- {
- return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
- }
-
- static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
- {
- struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
- BUG_ON(!pb);
- return pb;
- }
-
- static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
- {
- struct per_bio_data *pb = get_per_bio_data(bio, data_size);
-
- pb->tick = false;
- pb->req_nr = dm_bio_get_target_bio_nr(bio);
- pb->all_io_entry = NULL;
- pb->len = 0;
-
- return pb;
- }
-
- /*----------------------------------------------------------------
- * Remapping
- *--------------------------------------------------------------*/
- static void remap_to_origin(struct cache *cache, struct bio *bio)
- {
- bio->bi_bdev = cache->origin_dev->bdev;
+ bio->bi_bdev = cache->origin_dev->bdev;
}
static void remap_to_cache(struct cache *cache, struct bio *bio,
}
static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock)
+ dm_oblock_t oblock)
{
+ // FIXME: this is called way too much.
check_if_tick_bio_needed(cache, bio);
remap_to_origin(cache, bio);
if (bio_data_dir(bio) == WRITE)
check_if_tick_bio_needed(cache, bio);
remap_to_cache(cache, bio, cblock);
if (bio_data_dir(bio) == WRITE) {
- set_dirty(cache, oblock, cblock);
+ set_dirty(cache, cblock);
clear_discard(cache, oblock_to_dblock(cache, oblock));
}
}
return to_oblock(block_nr);
}
- /*
- * You must increment the deferred set whilst the prison cell is held. To
- * encourage this, we ask for 'cell' to be passed in.
- */
- static void inc_ds(struct cache *cache, struct bio *bio,
- struct dm_bio_prison_cell *cell)
- {
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
-
- BUG_ON(!cell);
- BUG_ON(pb->all_io_entry);
-
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
- }
-
static bool accountable_bio(struct cache *cache, struct bio *bio)
{
return ((bio->bi_bdev == cache->origin_dev->bdev) &&
generic_make_request(bio);
}
- static void issue(struct cache *cache, struct bio *bio)
- {
- unsigned long flags;
-
- if (!op_is_flush(bio->bi_opf)) {
- accounted_request(cache, bio);
- return;
- }
-
- /*
- * Batch together any bios that trigger commits and then issue a
- * single commit for them in do_worker().
- */
- spin_lock_irqsave(&cache->lock, flags);
- cache->commit_requested = true;
- bio_list_add(&cache->deferred_flush_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
- }
-
- static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+ static void issue_op(struct bio *bio, void *context)
{
- inc_ds(cache, bio, cell);
- issue(cache, bio);
+ struct cache *cache = context;
+ accounted_request(cache, bio);
}
static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
bio_list_add(&cache->deferred_writethrough_bios, bio);
spin_unlock_irqrestore(&cache->lock, flags);
- wake_worker(cache);
+ wake_deferred_writethrough_worker(cache);
}
static void writethrough_endio(struct bio *bio)
}
/*
+ * FIXME: send in parallel, huge latency as is.
* When running in writethrough mode we need to send writes to clean blocks
* to both the cache and origin devices. In future we'd like to clone the
* bio and send them in parallel, but for now we're doing them in
set_cache_mode(cache, CM_READ_ONLY);
}
+ /*----------------------------------------------------------------*/
+
+ static void load_stats(struct cache *cache)
+ {
+ struct dm_cache_statistics stats;
+
+ dm_cache_metadata_get_stats(cache->cmd, &stats);
+ atomic_set(&cache->stats.read_hit, stats.read_hits);
+ atomic_set(&cache->stats.read_miss, stats.read_misses);
+ atomic_set(&cache->stats.write_hit, stats.write_hits);
+ atomic_set(&cache->stats.write_miss, stats.write_misses);
+ }
+
+ static void save_stats(struct cache *cache)
+ {
+ struct dm_cache_statistics stats;
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return;
+
+ stats.read_hits = atomic_read(&cache->stats.read_hit);
+ stats.read_misses = atomic_read(&cache->stats.read_miss);
+ stats.write_hits = atomic_read(&cache->stats.write_hit);
+ stats.write_misses = atomic_read(&cache->stats.write_miss);
+
+ dm_cache_metadata_set_stats(cache->cmd, &stats);
+ }
+
+ static void update_stats(struct cache_stats *stats, enum policy_operation op)
+ {
+ switch (op) {
+ case POLICY_PROMOTE:
+ atomic_inc(&stats->promotion);
+ break;
+
+ case POLICY_DEMOTE:
+ atomic_inc(&stats->demotion);
+ break;
+
+ case POLICY_WRITEBACK:
+ atomic_inc(&stats->writeback);
+ break;
+ }
+ }
+
/*----------------------------------------------------------------
* Migration processing
*
* Migration covers moving data from the origin device to the cache, or
* vice versa.
*--------------------------------------------------------------*/
+
static void inc_io_migrations(struct cache *cache)
{
atomic_inc(&cache->nr_io_migrations);
return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
}
- static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
- {
- if (discard_or_flush(cell->holder)) {
- /*
- * We have to handle these bios individually.
- */
- dm_cell_release(cache->prison, cell, &cache->deferred_bios);
- free_prison_cell(cache, cell);
- } else
- list_add_tail(&cell->user_list, &cache->deferred_cells);
- }
-
- static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
+ static void calc_discard_block_range(struct cache *cache, struct bio *bio,
+ dm_dblock_t *b, dm_dblock_t *e)
{
- unsigned long flags;
-
- if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
- /*
- * There was no prisoner to promote to holder, the
- * cell has been released.
- */
- free_prison_cell(cache, cell);
- return;
- }
+ sector_t sb = bio->bi_iter.bi_sector;
+ sector_t se = bio_end_sector(bio);
- spin_lock_irqsave(&cache->lock, flags);
- __cell_defer(cache, cell);
- spin_unlock_irqrestore(&cache->lock, flags);
+ *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
- wake_worker(cache);
+ if (se - sb < cache->discard_block_size)
+ *e = *b;
+ else
+ *e = to_dblock(block_div(se, cache->discard_block_size));
}
- static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
- {
- dm_cell_error(cache->prison, cell, err);
- free_prison_cell(cache, cell);
- }
+ /*----------------------------------------------------------------*/
- static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+ static void prevent_background_work(struct cache *cache)
{
- cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+ lockdep_off();
+ down_write(&cache->background_work_lock);
+ lockdep_on();
}
- static void free_io_migration(struct dm_cache_migration *mg)
+ static void allow_background_work(struct cache *cache)
{
- struct cache *cache = mg->cache;
-
- dec_io_migrations(cache);
- free_migration(mg);
- wake_worker(cache);
+ lockdep_off();
+ up_write(&cache->background_work_lock);
+ lockdep_on();
}
- static void migration_failure(struct dm_cache_migration *mg)
+ static bool background_work_begin(struct cache *cache)
{
- struct cache *cache = mg->cache;
- const char *dev_name = cache_device_name(cache);
-
- if (mg->writeback) {
- DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
- set_dirty(cache, mg->old_oblock, mg->cblock);
- cell_defer(cache, mg->old_ocell, false);
-
- } else if (mg->demote) {
- DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
- policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+ bool r;
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
- if (mg->promote)
- cell_defer(cache, mg->new_ocell, true);
- } else {
- DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
- policy_remove_mapping(cache->policy, mg->new_oblock);
- cell_defer(cache, mg->new_ocell, true);
- }
+ lockdep_off();
+ r = down_read_trylock(&cache->background_work_lock);
+ lockdep_on();
- free_io_migration(mg);
+ return r;
}
- static void migration_success_pre_commit(struct dm_cache_migration *mg)
+ static void background_work_end(struct cache *cache)
{
- int r;
- unsigned long flags;
- struct cache *cache = mg->cache;
-
- if (mg->writeback) {
- clear_dirty(cache, mg->old_oblock, mg->cblock);
- cell_defer(cache, mg->old_ocell, false);
- free_io_migration(mg);
- return;
+ lockdep_off();
+ up_read(&cache->background_work_lock);
+ lockdep_on();
+ }
- } else if (mg->demote) {
- r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
- if (r) {
- DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
- cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- policy_force_mapping(cache->policy, mg->new_oblock,
- mg->old_oblock);
- if (mg->promote)
- cell_defer(cache, mg->new_ocell, true);
- free_io_migration(mg);
- return;
- }
- } else {
- r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
- if (r) {
- DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
- cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
- policy_remove_mapping(cache->policy, mg->new_oblock);
- free_io_migration(mg);
- return;
- }
- }
+ /*----------------------------------------------------------------*/
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->need_commit_migrations);
- cache->commit_requested = true;
- spin_unlock_irqrestore(&cache->lock, flags);
+ static void quiesce(struct dm_cache_migration *mg,
+ void (*continuation)(struct work_struct *))
+ {
+ init_continuation(&mg->k, continuation);
+ dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
}
- static void migration_success_post_commit(struct dm_cache_migration *mg)
+ static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
{
- unsigned long flags;
- struct cache *cache = mg->cache;
-
- if (mg->writeback) {
- DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
- cache_device_name(cache));
- return;
-
- } else if (mg->demote) {
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
-
- if (mg->promote) {
- mg->demote = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->quiesced_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- } else {
- if (mg->invalidate)
- policy_remove_mapping(cache->policy, mg->old_oblock);
- free_io_migration(mg);
- }
-
- } else {
- if (mg->requeue_holder) {
- clear_dirty(cache, mg->new_oblock, mg->cblock);
- cell_defer(cache, mg->new_ocell, true);
- } else {
- /*
- * The block was promoted via an overwrite, so it's dirty.
- */
- set_dirty(cache, mg->new_oblock, mg->cblock);
- bio_endio(mg->new_ocell->holder);
- cell_defer(cache, mg->new_ocell, false);
- }
- free_io_migration(mg);
- }
+ struct continuation *k = container_of(ws, struct continuation, ws);
+ return container_of(k, struct dm_cache_migration, k);
}
static void copy_complete(int read_err, unsigned long write_err, void *context)
{
- unsigned long flags;
- struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
- struct cache *cache = mg->cache;
+ struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
if (read_err || write_err)
- mg->err = true;
-
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->completed_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
+ mg->k.input = -EIO;
- wake_worker(cache);
+ queue_continuation(mg->cache->wq, &mg->k);
}
- static void issue_copy(struct dm_cache_migration *mg)
+ static int copy(struct dm_cache_migration *mg, bool promote)
{
int r;
struct dm_io_region o_region, c_region;
struct cache *cache = mg->cache;
- sector_t cblock = from_cblock(mg->cblock);
o_region.bdev = cache->origin_dev->bdev;
+ o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
o_region.count = cache->sectors_per_block;
c_region.bdev = cache->cache_dev->bdev;
- c_region.sector = cblock * cache->sectors_per_block;
+ c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
c_region.count = cache->sectors_per_block;
- if (mg->writeback || mg->demote) {
- /* demote */
- o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
- r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
- } else {
- /* promote */
- o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
- r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
- }
+ if (promote)
+ r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
+ else
+ r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
- if (r < 0) {
- DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
- migration_failure(mg);
- }
+ return r;
+ }
+
+ static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
+ {
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
+ free_prison_cell(cache, pb->cell);
+ pb->cell = NULL;
}
static void overwrite_endio(struct bio *bio)
struct cache *cache = mg->cache;
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- unsigned long flags;
dm_unhook_bio(&pb->hook_info, bio);
if (bio->bi_error)
- mg->err = true;
-
- mg->requeue_holder = false;
+ mg->k.input = bio->bi_error;
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->completed_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- wake_worker(cache);
+ queue_continuation(mg->cache->wq, &mg->k);
}
- static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+ static void overwrite(struct dm_cache_migration *mg,
+ void (*continuation)(struct work_struct *))
{
+ struct bio *bio = mg->overwrite_bio;
size_t pb_data_size = get_per_bio_data_size(mg->cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
- remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
/*
- * No need to inc_ds() here, since the cell will be held for the
- * duration of the io.
+ * The overwrite bio is part of the copy operation, as such it does
+ * not set/clear discard or dirty flags.
*/
+ if (mg->op->op == POLICY_PROMOTE)
+ remap_to_cache(mg->cache, bio, mg->op->cblock);
+ else
+ remap_to_origin(mg->cache, bio);
+
+ init_continuation(&mg->k, continuation);
accounted_request(mg->cache, bio);
}
- static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+ /*
+ * Migration steps:
+ *
+ * 1) exclusive lock preventing WRITEs
+ * 2) quiesce
+ * 3) copy or issue overwrite bio
+ * 4) upgrade to exclusive lock preventing READs and WRITEs
+ * 5) quiesce
+ * 6) update metadata and commit
+ * 7) unlock
+ */
+ static void mg_complete(struct dm_cache_migration *mg, bool success)
{
- return (bio_data_dir(bio) == WRITE) &&
- (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+ struct bio_list bios;
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
+ dm_cblock_t cblock = op->cblock;
+
+ if (success)
+ update_stats(&cache->stats, op->op);
+
+ switch (op->op) {
+ case POLICY_PROMOTE:
+ clear_discard(cache, oblock_to_dblock(cache, op->oblock));
+ policy_complete_background_work(cache->policy, op, success);
+
+ if (mg->overwrite_bio) {
+ if (success)
+ force_set_dirty(cache, cblock);
+ else
+ mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
+ bio_endio(mg->overwrite_bio);
+ } else {
+ if (success)
+ force_clear_dirty(cache, cblock);
+ dec_io_migrations(cache);
+ }
+ break;
+
+ case POLICY_DEMOTE:
+ /*
+ * We clear dirty here to update the nr_dirty counter.
+ */
+ if (success)
+ force_clear_dirty(cache, cblock);
+ policy_complete_background_work(cache->policy, op, success);
+ dec_io_migrations(cache);
+ break;
+
+ case POLICY_WRITEBACK:
+ if (success)
+ force_clear_dirty(cache, cblock);
+ policy_complete_background_work(cache->policy, op, success);
+ dec_io_migrations(cache);
+ break;
+ }
+
+ bio_list_init(&bios);
+ if (mg->cell) {
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+ free_prison_cell(cache, mg->cell);
+ }
+
+ free_migration(mg);
+ defer_bios(cache, &bios);
+ wake_migration_worker(cache);
+
+ background_work_end(cache);
}
- static void avoid_copy(struct dm_cache_migration *mg)
+ static void mg_success(struct work_struct *ws)
{
- atomic_inc(&mg->cache->stats.copies_avoided);
- migration_success_pre_commit(mg);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ mg_complete(mg, mg->k.input == 0);
}
- static void calc_discard_block_range(struct cache *cache, struct bio *bio,
- dm_dblock_t *b, dm_dblock_t *e)
+ static void mg_update_metadata(struct work_struct *ws)
{
- sector_t sb = bio->bi_iter.bi_sector;
- sector_t se = bio_end_sector(bio);
-
- *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
-
- if (se - sb < cache->discard_block_size)
- *e = *b;
- else
- *e = to_dblock(block_div(se, cache->discard_block_size));
- }
-
- static void issue_discard(struct dm_cache_migration *mg)
- {
- dm_dblock_t b, e;
- struct bio *bio = mg->new_ocell->holder;
- struct cache *cache = mg->cache;
-
- calc_discard_block_range(cache, bio, &b, &e);
- while (b != e) {
- set_discard(cache, b);
- b = to_dblock(from_dblock(b) + 1);
- }
-
- bio_endio(bio);
- cell_defer(cache, mg->new_ocell, false);
- free_migration(mg);
- wake_worker(cache);
- }
-
- static void issue_copy_or_discard(struct dm_cache_migration *mg)
- {
- bool avoid;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
- if (mg->discard) {
- issue_discard(mg);
- return;
- }
+ switch (op->op) {
+ case POLICY_PROMOTE:
+ r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
+ if (r) {
+ DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
- if (mg->writeback || mg->demote)
- avoid = !is_dirty(cache, mg->cblock) ||
- is_discarded_oblock(cache, mg->old_oblock);
- else {
- struct bio *bio = mg->new_ocell->holder;
+ mg_complete(mg, false);
+ return;
+ }
+ mg_complete(mg, true);
+ break;
- avoid = is_discarded_oblock(cache, mg->new_oblock);
+ case POLICY_DEMOTE:
+ r = dm_cache_remove_mapping(cache->cmd, op->cblock);
+ if (r) {
+ DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- if (writeback_mode(&cache->features) &&
- !avoid && bio_writes_complete_block(cache, bio)) {
- issue_overwrite(mg, bio);
+ mg_complete(mg, false);
return;
}
- }
- avoid ? avoid_copy(mg) : issue_copy(mg);
+ /*
+ * It would be nice if we only had to commit when a REQ_FLUSH
+ * comes through. But there's one scenario that we have to
+ * look out for:
+ *
+ * - vblock x in a cache block
+ * - domotion occurs
+ * - cache block gets reallocated and over written
+ * - crash
+ *
+ * When we recover, because there was no commit the cache will
+ * rollback to having the data for vblock x in the cache block.
+ * But the cache block has since been overwritten, so it'll end
+ * up pointing to data that was never in 'x' during the history
+ * of the device.
+ *
+ * To avoid this issue we require a commit as part of the
+ * demotion operation.
+ */
+ init_continuation(&mg->k, mg_success);
+ continue_after_commit(&cache->committer, &mg->k);
+ schedule_commit(&cache->committer);
+ break;
+
+ case POLICY_WRITEBACK:
+ mg_complete(mg, true);
+ break;
+ }
}
- static void complete_migration(struct dm_cache_migration *mg)
+ static void mg_update_metadata_after_copy(struct work_struct *ws)
{
- if (mg->err)
- migration_failure(mg);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+
+ /*
+ * Did the copy succeed?
+ */
+ if (mg->k.input)
+ mg_complete(mg, false);
else
- migration_success_pre_commit(mg);
+ mg_update_metadata(ws);
}
- static void process_migrations(struct cache *cache, struct list_head *head,
- void (*fn)(struct dm_cache_migration *))
+ static void mg_upgrade_lock(struct work_struct *ws)
{
- unsigned long flags;
- struct list_head list;
- struct dm_cache_migration *mg, *tmp;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
- INIT_LIST_HEAD(&list);
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(head, &list);
- spin_unlock_irqrestore(&cache->lock, flags);
+ /*
+ * Did the copy succeed?
+ */
+ if (mg->k.input)
+ mg_complete(mg, false);
- list_for_each_entry_safe(mg, tmp, &list, list)
- fn(mg);
- }
+ else {
+ /*
+ * Now we want the lock to prevent both reads and writes.
+ */
+ r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
+ READ_WRITE_LOCK_LEVEL);
+ if (r < 0)
+ mg_complete(mg, false);
- static void __queue_quiesced_migration(struct dm_cache_migration *mg)
- {
- list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+ else if (r)
+ quiesce(mg, mg_update_metadata);
+
+ else
+ mg_update_metadata(ws);
+ }
}
- static void queue_quiesced_migration(struct dm_cache_migration *mg)
+ static void mg_copy(struct work_struct *ws)
{
- unsigned long flags;
- struct cache *cache = mg->cache;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
- spin_lock_irqsave(&cache->lock, flags);
- __queue_quiesced_migration(mg);
- spin_unlock_irqrestore(&cache->lock, flags);
+ if (mg->overwrite_bio) {
+ /*
+ * It's safe to do this here, even though it's new data
+ * because all IO has been locked out of the block.
+ *
+ * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
+ * so _not_ using mg_upgrade_lock() as continutation.
+ */
+ overwrite(mg, mg_update_metadata_after_copy);
- wake_worker(cache);
- }
+ } else {
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
+ bool is_policy_promote = (op->op == POLICY_PROMOTE);
- static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
- {
- unsigned long flags;
- struct dm_cache_migration *mg, *tmp;
+ if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
+ is_discarded_oblock(cache, op->oblock)) {
+ mg_upgrade_lock(ws);
+ return;
+ }
- spin_lock_irqsave(&cache->lock, flags);
- list_for_each_entry_safe(mg, tmp, work, list)
- __queue_quiesced_migration(mg);
- spin_unlock_irqrestore(&cache->lock, flags);
+ init_continuation(&mg->k, mg_upgrade_lock);
- wake_worker(cache);
+ r = copy(mg, is_policy_promote);
+ if (r) {
+ DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
+ mg->k.input = -EIO;
+ mg_complete(mg, false);
+ }
+ }
}
- static void check_for_quiesced_migrations(struct cache *cache,
- struct per_bio_data *pb)
+ static int mg_lock_writes(struct dm_cache_migration *mg)
{
- struct list_head work;
-
- if (!pb->all_io_entry)
- return;
-
- INIT_LIST_HEAD(&work);
- dm_deferred_entry_dec(pb->all_io_entry, &work);
+ int r;
+ struct dm_cell_key_v2 key;
+ struct cache *cache = mg->cache;
+ struct dm_bio_prison_cell_v2 *prealloc;
- if (!list_empty(&work))
- queue_quiesced_migrations(cache, &work);
- }
+ prealloc = alloc_prison_cell(cache);
+ if (!prealloc) {
+ DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
+ mg_complete(mg, false);
+ return -ENOMEM;
+ }
- static void quiesce_migration(struct dm_cache_migration *mg)
- {
- if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
- queue_quiesced_migration(mg);
- }
+ /*
+ * Prevent writes to the block, but allow reads to continue.
+ * Unless we're using an overwrite bio, in which case we lock
+ * everything.
+ */
+ build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
+ r = dm_cell_lock_v2(cache->prison, &key,
+ mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
+ prealloc, &mg->cell);
+ if (r < 0) {
+ free_prison_cell(cache, prealloc);
+ mg_complete(mg, false);
+ return r;
+ }
- static void promote(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
- {
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ if (mg->cell != prealloc)
+ free_prison_cell(cache, prealloc);
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = false;
- mg->promote = true;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->new_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = NULL;
- mg->new_ocell = cell;
- mg->start_jiffies = jiffies;
+ if (r == 0)
+ mg_copy(&mg->k.ws);
+ else
+ quiesce(mg, mg_copy);
- inc_io_migrations(cache);
- quiesce_migration(mg);
+ return 0;
}
- static void writeback(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
+ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
-
- mg->err = false;
- mg->discard = false;
- mg->writeback = true;
- mg->demote = false;
- mg->promote = false;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = cell;
- mg->new_ocell = NULL;
- mg->start_jiffies = jiffies;
-
- inc_io_migrations(cache);
- quiesce_migration(mg);
- }
-
- static void demote_then_promote(struct cache *cache, struct prealloc *structs,
- dm_oblock_t old_oblock, dm_oblock_t new_oblock,
- dm_cblock_t cblock,
- struct dm_bio_prison_cell *old_ocell,
- struct dm_bio_prison_cell *new_ocell)
- {
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
-
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = true;
- mg->promote = true;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_oblock = old_oblock;
- mg->new_oblock = new_oblock;
- mg->cblock = cblock;
- mg->old_ocell = old_ocell;
- mg->new_ocell = new_ocell;
- mg->start_jiffies = jiffies;
-
- inc_io_migrations(cache);
- quiesce_migration(mg);
- }
+ struct dm_cache_migration *mg;
- /*
- * Invalidate a cache entry. No writeback occurs; any changes in the cache
- * block are thrown away.
- */
- static void invalidate(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
- {
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
-
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = true;
- mg->promote = false;
- mg->requeue_holder = true;
- mg->invalidate = true;
- mg->cache = cache;
- mg->old_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = cell;
- mg->new_ocell = NULL;
- mg->start_jiffies = jiffies;
+ if (!background_work_begin(cache)) {
+ policy_complete_background_work(cache->policy, op, false);
+ return -EPERM;
+ }
- inc_io_migrations(cache);
- quiesce_migration(mg);
- }
+ mg = alloc_migration(cache);
+ if (!mg) {
+ policy_complete_background_work(cache->policy, op, false);
+ background_work_end(cache);
+ return -ENOMEM;
+ }
- static void discard(struct cache *cache, struct prealloc *structs,
- struct dm_bio_prison_cell *cell)
- {
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ memset(mg, 0, sizeof(*mg));
- mg->err = false;
- mg->discard = true;
- mg->writeback = false;
- mg->demote = false;
- mg->promote = false;
- mg->requeue_holder = false;
- mg->invalidate = false;
mg->cache = cache;
- mg->old_ocell = NULL;
- mg->new_ocell = cell;
- mg->start_jiffies = jiffies;
+ mg->op = op;
+ mg->overwrite_bio = bio;
+
+ if (!bio)
+ inc_io_migrations(cache);
- quiesce_migration(mg);
+ return mg_lock_writes(mg);
}
/*----------------------------------------------------------------
- * bio processing
+ * invalidation processing
*--------------------------------------------------------------*/
- static void defer_bio(struct cache *cache, struct bio *bio)
- {
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_add(&cache->deferred_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- wake_worker(cache);
- }
-
- static void process_flush_bio(struct cache *cache, struct bio *bio)
- {
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
-
- BUG_ON(bio->bi_iter.bi_size);
- if (!pb->req_nr)
- remap_to_origin(cache, bio);
- else
- remap_to_cache(cache, bio, 0);
- /*
- * REQ_PREFLUSH is not directed at any particular block so we don't
- * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH
- * by dm-core.
- */
- issue(cache, bio);
- }
-
- static void process_discard_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
+ static void invalidate_complete(struct dm_cache_migration *mg, bool success)
{
- int r;
- dm_dblock_t b, e;
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
-
- calc_discard_block_range(cache, bio, &b, &e);
- if (b == e) {
- bio_endio(bio);
- return;
- }
+ struct bio_list bios;
+ struct cache *cache = mg->cache;
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
+ bio_list_init(&bios);
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+ free_prison_cell(cache, mg->cell);
- discard(cache, structs, new_ocell);
- }
+ if (!success && mg->overwrite_bio)
+ bio_io_error(mg->overwrite_bio);
- static bool spare_migration_bandwidth(struct cache *cache)
- {
- sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
- cache->sectors_per_block;
- return current_volume < cache->migration_threshold;
- }
+ free_migration(mg);
+ defer_bios(cache, &bios);
- static void inc_hit_counter(struct cache *cache, struct bio *bio)
- {
- atomic_inc(bio_data_dir(bio) == READ ?
- &cache->stats.read_hit : &cache->stats.write_hit);
+ background_work_end(cache);
}
- static void inc_miss_counter(struct cache *cache, struct bio *bio)
+ static void invalidate_completed(struct work_struct *ws)
{
- atomic_inc(bio_data_dir(bio) == READ ?
- &cache->stats.read_miss : &cache->stats.write_miss);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ invalidate_complete(mg, !mg->k.input);
}
- /*----------------------------------------------------------------*/
-
- struct inc_detail {
- struct cache *cache;
- struct bio_list bios_for_issue;
- struct bio_list unhandled_bios;
- bool any_writes;
- };
-
- static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+ static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
{
- struct bio *bio;
- struct inc_detail *detail = context;
- struct cache *cache = detail->cache;
-
- inc_ds(cache, cell->holder, cell);
- if (bio_data_dir(cell->holder) == WRITE)
- detail->any_writes = true;
-
- while ((bio = bio_list_pop(&cell->bios))) {
- if (discard_or_flush(bio)) {
- bio_list_add(&detail->unhandled_bios, bio);
- continue;
+ int r = policy_invalidate_mapping(cache->policy, cblock);
+ if (!r) {
+ r = dm_cache_remove_mapping(cache->cmd, cblock);
+ if (r) {
+ DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
}
- if (bio_data_dir(bio) == WRITE)
- detail->any_writes = true;
+ } else if (r == -ENODATA) {
+ /*
+ * Harmless, already unmapped.
+ */
+ r = 0;
- bio_list_add(&detail->bios_for_issue, bio);
- inc_ds(cache, bio, cell);
- }
+ } else
+ DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
+
+ return r;
}
- // FIXME: refactor these two
- static void remap_cell_to_origin_clear_discard(struct cache *cache,
- struct dm_bio_prison_cell *cell,
- dm_oblock_t oblock, bool issue_holder)
+ static void invalidate_remove(struct work_struct *ws)
{
- struct bio *bio;
- unsigned long flags;
- struct inc_detail detail;
-
- detail.cache = cache;
- bio_list_init(&detail.bios_for_issue);
- bio_list_init(&detail.unhandled_bios);
- detail.any_writes = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- remap_to_origin(cache, cell->holder);
- if (issue_holder)
- issue(cache, cell->holder);
- else
- accounted_begin(cache, cell->holder);
-
- if (detail.any_writes)
- clear_discard(cache, oblock_to_dblock(cache, oblock));
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ struct cache *cache = mg->cache;
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
- remap_to_origin(cache, bio);
- issue(cache, bio);
+ r = invalidate_cblock(cache, mg->invalidate_cblock);
+ if (r) {
+ invalidate_complete(mg, false);
+ return;
}
- free_prison_cell(cache, cell);
+ init_continuation(&mg->k, invalidate_completed);
+ continue_after_commit(&cache->committer, &mg->k);
+ remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
+ mg->overwrite_bio = NULL;
+ schedule_commit(&cache->committer);
}
- static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
- dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+ static int invalidate_lock(struct dm_cache_migration *mg)
{
- struct bio *bio;
- unsigned long flags;
- struct inc_detail detail;
-
- detail.cache = cache;
- bio_list_init(&detail.bios_for_issue);
- bio_list_init(&detail.unhandled_bios);
- detail.any_writes = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- remap_to_cache(cache, cell->holder, cblock);
- if (issue_holder)
- issue(cache, cell->holder);
- else
- accounted_begin(cache, cell->holder);
+ int r;
+ struct dm_cell_key_v2 key;
+ struct cache *cache = mg->cache;
+ struct dm_bio_prison_cell_v2 *prealloc;
- if (detail.any_writes) {
- set_dirty(cache, oblock, cblock);
- clear_discard(cache, oblock_to_dblock(cache, oblock));
+ prealloc = alloc_prison_cell(cache);
+ if (!prealloc) {
+ invalidate_complete(mg, false);
+ return -ENOMEM;
}
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
- remap_to_cache(cache, bio, cblock);
- issue(cache, bio);
+ build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
+ r = dm_cell_lock_v2(cache->prison, &key,
+ READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
+ if (r < 0) {
+ free_prison_cell(cache, prealloc);
+ invalidate_complete(mg, false);
+ return r;
}
- free_prison_cell(cache, cell);
- }
+ if (mg->cell != prealloc)
+ free_prison_cell(cache, prealloc);
- /*----------------------------------------------------------------*/
+ if (r)
+ quiesce(mg, invalidate_remove);
- struct old_oblock_lock {
- struct policy_locker locker;
- struct cache *cache;
- struct prealloc *structs;
- struct dm_bio_prison_cell *cell;
- };
+ else {
+ /*
+ * We can't call invalidate_remove() directly here because we
+ * might still be in request context.
+ */
+ init_continuation(&mg->k, invalidate_remove);
+ queue_work(cache->wq, &mg->k.ws);
+ }
- static int null_locker(struct policy_locker *locker, dm_oblock_t b)
- {
- /* This should never be called */
- BUG();
return 0;
}
- static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+ static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
+ dm_oblock_t oblock, struct bio *bio)
{
- struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
- struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
-
- return bio_detain(l->cache, b, NULL, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- l->structs, &l->cell);
- }
-
- static void process_cell(struct cache *cache, struct prealloc *structs,
- struct dm_bio_prison_cell *new_ocell)
- {
- int r;
- bool release_cell = true;
- struct bio *bio = new_ocell->holder;
- dm_oblock_t block = get_bio_block(cache, bio);
- struct policy_result lookup_result;
- bool passthrough = passthrough_mode(&cache->features);
- bool fast_promotion, can_migrate;
- struct old_oblock_lock ool;
-
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
- can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
-
- ool.locker.fn = cell_locker;
- ool.cache = cache;
- ool.structs = structs;
- ool.cell = NULL;
- r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
- bio, &ool.locker, &lookup_result);
-
- if (r == -EWOULDBLOCK)
- /* migration has been denied */
- lookup_result.op = POLICY_MISS;
-
- switch (lookup_result.op) {
- case POLICY_HIT:
- if (passthrough) {
- inc_miss_counter(cache, bio);
-
- /*
- * Passthrough always maps to the origin,
- * invalidating any cache blocks that are written
- * to.
- */
-
- if (bio_data_dir(bio) == WRITE) {
- atomic_inc(&cache->stats.demotion);
- invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
- release_cell = false;
+ struct dm_cache_migration *mg;
- } else {
- /* FIXME: factor out issue_origin() */
- remap_to_origin_clear_discard(cache, bio, block);
- inc_and_issue(cache, bio, new_ocell);
- }
- } else {
- inc_hit_counter(cache, bio);
-
- if (bio_data_dir(bio) == WRITE &&
- writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock)) {
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- inc_and_issue(cache, bio, new_ocell);
-
- } else {
- remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
- release_cell = false;
- }
- }
+ if (!background_work_begin(cache))
+ return -EPERM;
- break;
+ mg = alloc_migration(cache);
+ if (!mg) {
+ background_work_end(cache);
+ return -ENOMEM;
+ }
- case POLICY_MISS:
- inc_miss_counter(cache, bio);
- remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
- release_cell = false;
- break;
+ memset(mg, 0, sizeof(*mg));
- case POLICY_NEW:
- atomic_inc(&cache->stats.promotion);
- promote(cache, structs, block, lookup_result.cblock, new_ocell);
- release_cell = false;
- break;
+ mg->cache = cache;
+ mg->overwrite_bio = bio;
+ mg->invalidate_cblock = cblock;
+ mg->invalidate_oblock = oblock;
- case POLICY_REPLACE:
- atomic_inc(&cache->stats.demotion);
- atomic_inc(&cache->stats.promotion);
- demote_then_promote(cache, structs, lookup_result.old_oblock,
- block, lookup_result.cblock,
- ool.cell, new_ocell);
- release_cell = false;
- break;
+ return invalidate_lock(mg);
+ }
- default:
- DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
- cache_device_name(cache), __func__,
- (unsigned) lookup_result.op);
- bio_io_error(bio);
- }
+ /*----------------------------------------------------------------
+ * bio processing
+ *--------------------------------------------------------------*/
- if (release_cell)
- cell_defer(cache, new_ocell, false);
- }
+ enum busy {
+ IDLE,
+ MODERATE,
+ BUSY
+ };
- static void process_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
+ static enum busy spare_migration_bandwidth(struct cache *cache)
{
- int r;
- dm_oblock_t block = get_bio_block(cache, bio);
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
-
- /*
- * Check to see if that block is currently migrating.
- */
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain(cache, block, bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
+ bool idle = iot_idle_for(&cache->origin_tracker, HZ);
+ sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
+ cache->sectors_per_block;
- process_cell(cache, structs, new_ocell);
+ if (current_volume <= cache->migration_threshold)
+ return idle ? IDLE : MODERATE;
+ else
+ return idle ? MODERATE : BUSY;
}
- static int need_commit_due_to_time(struct cache *cache)
+ static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
- return jiffies < cache->last_commit_jiffies ||
- jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+ atomic_inc(bio_data_dir(bio) == READ ?
+ &cache->stats.read_hit : &cache->stats.write_hit);
}
- /*
- * A non-zero return indicates read_only or fail_io mode.
- */
- static int commit(struct cache *cache, bool clean_shutdown)
+ static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
- int r;
-
- if (get_cache_mode(cache) >= CM_READ_ONLY)
- return -EINVAL;
+ atomic_inc(bio_data_dir(bio) == READ ?
+ &cache->stats.read_miss : &cache->stats.write_miss);
+ }
- atomic_inc(&cache->stats.commit_count);
- r = dm_cache_commit(cache->cmd, clean_shutdown);
- if (r)
- metadata_operation_failed(cache, "dm_cache_commit", r);
+ /*----------------------------------------------------------------*/
- return r;
+ static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+ {
+ return (bio_data_dir(bio) == WRITE) &&
+ (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}
- static int commit_if_needed(struct cache *cache)
+ static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
{
- int r = 0;
-
- if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
- dm_cache_changed_this_transaction(cache->cmd)) {
- r = commit(cache, false);
- cache->commit_requested = false;
- cache->last_commit_jiffies = jiffies;
- }
-
- return r;
+ return writeback_mode(&cache->features) &&
+ (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
}
- static void process_deferred_bios(struct cache *cache)
+ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
+ bool *commit_needed)
{
- bool prealloc_used = false;
- unsigned long flags;
- struct bio_list bios;
- struct bio *bio;
- struct prealloc structs;
-
- memset(&structs, 0, sizeof(structs));
- bio_list_init(&bios);
+ int r, data_dir;
+ bool rb, background_queued;
+ dm_cblock_t cblock;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_bios);
- bio_list_init(&cache->deferred_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ *commit_needed = false;
- while (!bio_list_empty(&bios)) {
+ rb = bio_detain_shared(cache, block, bio);
+ if (!rb) {
/*
- * If we've got no free migration structs, and processing
- * this bio might require one, we pause until there are some
- * prepared mappings to process.
+ * An exclusive lock is held for this block, so we have to
+ * wait. We set the commit_needed flag so the current
+ * transaction will be committed asap, allowing this lock
+ * to be dropped.
*/
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs)) {
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&cache->deferred_bios, &bios);
- spin_unlock_irqrestore(&cache->lock, flags);
- break;
+ *commit_needed = true;
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ data_dir = bio_data_dir(bio);
+
+ if (optimisable_bio(cache, bio, block)) {
+ struct policy_work *op = NULL;
+
+ r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
+ if (unlikely(r && r != -ENOENT)) {
+ DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
+ cache_device_name(cache), r);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
}
- bio = bio_list_pop(&bios);
+ if (r == -ENOENT && op) {
+ bio_drop_shared_lock(cache, bio);
+ BUG_ON(op->op != POLICY_PROMOTE);
+ mg_start(cache, op, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ } else {
+ r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
+ if (unlikely(r && r != -ENOENT)) {
+ DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
+ cache_device_name(cache), r);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
- if (bio->bi_opf & REQ_PREFLUSH)
- process_flush_bio(cache, bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
- process_discard_bio(cache, &structs, bio);
- else
- process_bio(cache, &structs, bio);
+ if (background_queued)
+ wake_migration_worker(cache);
}
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
- }
-
- static void process_deferred_cells(struct cache *cache)
- {
- bool prealloc_used = false;
- unsigned long flags;
- struct dm_bio_prison_cell *cell, *tmp;
- struct list_head cells;
- struct prealloc structs;
+ if (r == -ENOENT) {
+ /*
+ * Miss.
+ */
+ inc_miss_counter(cache, bio);
+ if (pb->req_nr == 0) {
+ accounted_begin(cache, bio);
+ remap_to_origin_clear_discard(cache, bio, block);
- memset(&structs, 0, sizeof(structs));
+ } else {
+ /*
+ * This is a duplicate writethrough io that is no
+ * longer needed because the block has been demoted.
+ */
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ } else {
+ /*
+ * Hit.
+ */
+ inc_hit_counter(cache, bio);
- INIT_LIST_HEAD(&cells);
+ /*
+ * Passthrough always maps to the origin, invalidating any
+ * cache blocks that are written to.
+ */
+ if (passthrough_mode(&cache->features)) {
+ if (bio_data_dir(bio) == WRITE) {
+ bio_drop_shared_lock(cache, bio);
+ atomic_inc(&cache->stats.demotion);
+ invalidate_start(cache, cblock, block, bio);
+ } else
+ remap_to_origin_clear_discard(cache, bio, block);
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(&cache->deferred_cells, &cells);
- spin_unlock_irqrestore(&cache->lock, flags);
+ } else {
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+ !is_dirty(cache, cblock)) {
+ remap_to_origin_then_cache(cache, bio, block, cblock);
+ accounted_begin(cache, bio);
+ } else
+ remap_to_cache_dirty(cache, bio, block, cblock);
+ }
+ }
- list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+ /*
+ * dm core turns FUA requests into a separate payload and FLUSH req.
+ */
+ if (bio->bi_opf & REQ_FUA) {
/*
- * If we've got no free migration structs, and processing
- * this bio might require one, we pause until there are some
- * prepared mappings to process.
+ * issue_after_commit will call accounted_begin a second time. So
+ * we call accounted_complete() to avoid double accounting.
*/
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs)) {
- spin_lock_irqsave(&cache->lock, flags);
- list_splice(&cells, &cache->deferred_cells);
- spin_unlock_irqrestore(&cache->lock, flags);
- break;
- }
-
- process_cell(cache, &structs, cell);
+ accounted_complete(cache, bio);
+ issue_after_commit(&cache->committer, bio);
+ *commit_needed = true;
+ return DM_MAPIO_SUBMITTED;
}
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
+ return DM_MAPIO_REMAPPED;
}
- static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+ static bool process_bio(struct cache *cache, struct bio *bio)
{
- unsigned long flags;
- struct bio_list bios;
- struct bio *bio;
-
- bio_list_init(&bios);
+ bool commit_needed;
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_flush_bios);
- bio_list_init(&cache->deferred_flush_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
+ generic_make_request(bio);
- /*
- * These bios have already been through inc_ds()
- */
- while ((bio = bio_list_pop(&bios)))
- submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
+ return commit_needed;
}
- static void process_deferred_writethrough_bios(struct cache *cache)
+ /*
+ * A non-zero return indicates read_only or fail_io mode.
+ */
+ static int commit(struct cache *cache, bool clean_shutdown)
{
- unsigned long flags;
- struct bio_list bios;
- struct bio *bio;
+ int r;
- bio_list_init(&bios);
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_writethrough_bios);
- bio_list_init(&cache->deferred_writethrough_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ atomic_inc(&cache->stats.commit_count);
+ r = dm_cache_commit(cache->cmd, clean_shutdown);
+ if (r)
+ metadata_operation_failed(cache, "dm_cache_commit", r);
- /*
- * These bios have already been through inc_ds()
- */
- while ((bio = bio_list_pop(&bios)))
- accounted_request(cache, bio);
+ return r;
}
- static void writeback_some_dirty_blocks(struct cache *cache)
+ /*
+ * Used by the batcher.
+ */
+ static int commit_op(void *context)
{
- bool prealloc_used = false;
- dm_oblock_t oblock;
- dm_cblock_t cblock;
- struct prealloc structs;
- struct dm_bio_prison_cell *old_ocell;
- bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
-
- memset(&structs, 0, sizeof(structs));
-
- while (spare_migration_bandwidth(cache)) {
- if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
- break; /* no work to do */
-
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs) ||
- get_cell(cache, oblock, &structs, &old_ocell)) {
- policy_set_dirty(cache->policy, oblock);
- break;
- }
+ struct cache *cache = context;
- writeback(cache, &structs, oblock, cblock, old_ocell);
- }
+ if (dm_cache_changed_this_transaction(cache->cmd))
+ return commit(cache, false);
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
+ return 0;
}
- /*----------------------------------------------------------------
- * Invalidations.
- * Dropping something from the cache *without* writing back.
- *--------------------------------------------------------------*/
+ /*----------------------------------------------------------------*/
- static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
+ static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
- int r = 0;
- uint64_t begin = from_cblock(req->cblocks->begin);
- uint64_t end = from_cblock(req->cblocks->end);
-
- while (begin != end) {
- r = policy_remove_cblock(cache->policy, to_cblock(begin));
- if (!r) {
- r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
- if (r) {
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- break;
- }
-
- } else if (r == -ENODATA) {
- /* harmless, already unmapped */
- r = 0;
-
- } else {
- DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
- break;
- }
-
- begin++;
- }
-
- cache->commit_requested = true;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- req->err = r;
- atomic_set(&req->complete, 1);
+ if (!pb->req_nr)
+ remap_to_origin(cache, bio);
+ else
+ remap_to_cache(cache, bio, 0);
- wake_up(&req->result_wait);
+ issue_after_commit(&cache->committer, bio);
+ return true;
}
- static void process_invalidation_requests(struct cache *cache)
+ static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
- struct list_head list;
- struct invalidation_request *req, *tmp;
+ dm_dblock_t b, e;
- INIT_LIST_HEAD(&list);
- spin_lock(&cache->invalidation_lock);
- list_splice_init(&cache->invalidation_requests, &list);
- spin_unlock(&cache->invalidation_lock);
+ // FIXME: do we need to lock the region? Or can we just assume the
+ // user wont be so foolish as to issue discard concurrently with
+ // other IO?
+ calc_discard_block_range(cache, bio, &b, &e);
+ while (b != e) {
+ set_discard(cache, b);
+ b = to_dblock(from_dblock(b) + 1);
+ }
- list_for_each_entry_safe (req, tmp, &list, list)
- process_invalidation_request(cache, req);
- }
+ bio_endio(bio);
- /*----------------------------------------------------------------
- * Main worker loop
- *--------------------------------------------------------------*/
- static bool is_quiescing(struct cache *cache)
- {
- return atomic_read(&cache->quiescing);
+ return false;
}
- static void ack_quiescing(struct cache *cache)
+ static void process_deferred_bios(struct work_struct *ws)
{
- if (is_quiescing(cache)) {
- atomic_inc(&cache->quiescing_ack);
- wake_up(&cache->quiescing_wait);
- }
- }
+ struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
- static void wait_for_quiescing_ack(struct cache *cache)
- {
- wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
- }
+ unsigned long flags;
+ bool commit_needed = false;
+ struct bio_list bios;
+ struct bio *bio;
- static void start_quiescing(struct cache *cache)
- {
- atomic_inc(&cache->quiescing);
- wait_for_quiescing_ack(cache);
- }
+ bio_list_init(&bios);
- static void stop_quiescing(struct cache *cache)
- {
- atomic_set(&cache->quiescing, 0);
- atomic_set(&cache->quiescing_ack, 0);
- }
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_bios);
+ bio_list_init(&cache->deferred_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
- static void wait_for_migrations(struct cache *cache)
- {
- wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
- }
+ while ((bio = bio_list_pop(&bios))) {
+ if (bio->bi_opf & REQ_PREFLUSH)
+ commit_needed = process_flush_bio(cache, bio) || commit_needed;
- static void stop_worker(struct cache *cache)
- {
- cancel_delayed_work(&cache->waker);
- flush_workqueue(cache->wq);
+ else if (bio_op(bio) == REQ_OP_DISCARD)
+ commit_needed = process_discard_bio(cache, bio) || commit_needed;
+
+ else
+ commit_needed = process_bio(cache, bio) || commit_needed;
+ }
+
+ if (commit_needed)
+ schedule_commit(&cache->committer);
}
- static void requeue_deferred_cells(struct cache *cache)
+ static void process_deferred_writethrough_bios(struct work_struct *ws)
{
+ struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
+
unsigned long flags;
- struct list_head cells;
- struct dm_bio_prison_cell *cell, *tmp;
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
- INIT_LIST_HEAD(&cells);
spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(&cache->deferred_cells, &cells);
+ bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+ bio_list_init(&cache->deferred_writethrough_bios);
spin_unlock_irqrestore(&cache->lock, flags);
- list_for_each_entry_safe(cell, tmp, &cells, user_list)
- cell_requeue(cache, cell);
+ /*
+ * These bios have already been through accounted_begin()
+ */
+ while ((bio = bio_list_pop(&bios)))
+ generic_make_request(bio);
}
+ /*----------------------------------------------------------------
+ * Main worker loop
+ *--------------------------------------------------------------*/
+
static void requeue_deferred_bios(struct cache *cache)
{
struct bio *bio;
}
}
- static int more_work(struct cache *cache)
- {
- if (is_quiescing(cache))
- return !list_empty(&cache->quiesced_migrations) ||
- !list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations);
- else
- return !bio_list_empty(&cache->deferred_bios) ||
- !list_empty(&cache->deferred_cells) ||
- !bio_list_empty(&cache->deferred_flush_bios) ||
- !bio_list_empty(&cache->deferred_writethrough_bios) ||
- !list_empty(&cache->quiesced_migrations) ||
- !list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations) ||
- cache->invalidate;
- }
-
- static void do_worker(struct work_struct *ws)
- {
- struct cache *cache = container_of(ws, struct cache, worker);
-
- do {
- if (!is_quiescing(cache)) {
- writeback_some_dirty_blocks(cache);
- process_deferred_writethrough_bios(cache);
- process_deferred_bios(cache);
- process_deferred_cells(cache);
- process_invalidation_requests(cache);
- }
-
- process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
- process_migrations(cache, &cache->completed_migrations, complete_migration);
-
- if (commit_if_needed(cache)) {
- process_deferred_flush_bios(cache, false);
- process_migrations(cache, &cache->need_commit_migrations, migration_failure);
- } else {
- process_deferred_flush_bios(cache, true);
- process_migrations(cache, &cache->need_commit_migrations,
- migration_success_post_commit);
- }
-
- ack_quiescing(cache);
-
- } while (more_work(cache));
- }
-
/*
* We want to commit periodically so that not too much
* unwritten metadata builds up.
static void do_waker(struct work_struct *ws)
{
struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
+
policy_tick(cache->policy, true);
- wake_worker(cache);
+ wake_migration_worker(cache);
+ schedule_commit(&cache->committer);
queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}
- /*----------------------------------------------------------------*/
-
- static int is_congested(struct dm_dev *dev, int bdi_bits)
+ static void check_migrations(struct work_struct *ws)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- return bdi_congested(q->backing_dev_info, bdi_bits);
- }
+ int r;
+ struct policy_work *op;
+ struct cache *cache = container_of(ws, struct cache, migration_worker);
+ enum busy b;
- static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
- {
- struct cache *cache = container_of(cb, struct cache, callbacks);
+ for (;;) {
+ b = spare_migration_bandwidth(cache);
+ if (b == BUSY)
+ break;
- return is_congested(cache->origin_dev, bdi_bits) ||
- is_congested(cache->cache_dev, bdi_bits);
+ r = policy_get_background_work(cache->policy, b == IDLE, &op);
+ if (r == -ENODATA)
+ break;
+
+ if (r) {
+ DMERR_LIMIT("%s: policy_background_work failed",
+ cache_device_name(cache));
+ break;
+ }
+
+ r = mg_start(cache, op, NULL);
+ if (r)
+ break;
+ }
}
/*----------------------------------------------------------------
mempool_destroy(cache->migration_pool);
- if (cache->all_io_ds)
- dm_deferred_set_destroy(cache->all_io_ds);
-
if (cache->prison)
- dm_bio_prison_destroy(cache->prison);
+ dm_bio_prison_destroy_v2(cache->prison);
if (cache->wq)
destroy_workqueue(cache->wq);
return PTR_ERR(p);
}
cache->policy = p;
+ BUG_ON(!cache->policy);
return 0;
}
cache->cache_size = size;
}
+ static int is_congested(struct dm_dev *dev, int bdi_bits)
+ {
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ return bdi_congested(q->backing_dev_info, bdi_bits);
+ }
+
+ static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+ {
+ struct cache *cache = container_of(cb, struct cache, callbacks);
+
+ return is_congested(cache->origin_dev, bdi_bits) ||
+ is_congested(cache->cache_dev, bdi_bits);
+ }
+
#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
- ti->discard_zeroes_data_unsupported = true;
ti->split_discard_bios = false;
cache->features = ca->features;
ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
- /* FIXME: factor out this whole section */
origin_blocks = cache->origin_sectors = ca->origin_sectors;
origin_blocks = block_div(origin_blocks, ca->block_size);
cache->origin_blocks = to_oblock(origin_blocks);
r = -EINVAL;
goto bad;
}
+
+ policy_allow_migrations(cache->policy, false);
}
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->deferred_cells);
bio_list_init(&cache->deferred_bios);
- bio_list_init(&cache->deferred_flush_bios);
bio_list_init(&cache->deferred_writethrough_bios);
- INIT_LIST_HEAD(&cache->quiesced_migrations);
- INIT_LIST_HEAD(&cache->completed_migrations);
- INIT_LIST_HEAD(&cache->need_commit_migrations);
atomic_set(&cache->nr_allocated_migrations, 0);
atomic_set(&cache->nr_io_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
- init_waitqueue_head(&cache->quiescing_wait);
- atomic_set(&cache->quiescing, 0);
- atomic_set(&cache->quiescing_ack, 0);
-
r = -ENOMEM;
atomic_set(&cache->nr_dirty, 0);
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
goto bad;
}
- cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
if (!cache->wq) {
*error = "could not create workqueue for metadata object";
goto bad;
}
- INIT_WORK(&cache->worker, do_worker);
+ INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
+ INIT_WORK(&cache->deferred_writethrough_worker,
+ process_deferred_writethrough_bios);
+ INIT_WORK(&cache->migration_worker, check_migrations);
INIT_DELAYED_WORK(&cache->waker, do_waker);
- cache->last_commit_jiffies = jiffies;
- cache->prison = dm_bio_prison_create();
+ cache->prison = dm_bio_prison_create_v2(cache->wq);
if (!cache->prison) {
*error = "could not create bio prison";
goto bad;
}
- cache->all_io_ds = dm_deferred_set_create();
- if (!cache->all_io_ds) {
- *error = "could not create all_io deferred set";
- goto bad;
- }
-
cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
migration_cache);
if (!cache->migration_pool) {
spin_lock_init(&cache->invalidation_lock);
INIT_LIST_HEAD(&cache->invalidation_requests);
+ batcher_init(&cache->committer, commit_op, cache,
+ issue_op, cache, cache->wq);
iot_init(&cache->origin_tracker);
+ init_rwsem(&cache->background_work_lock);
+ prevent_background_work(cache);
+
*result = cache;
return 0;
-
bad:
destroy(cache);
return r;
}
ti->private = cache;
-
out:
destroy_cache_args(ca);
return r;
struct cache *cache = ti->private;
int r;
- struct dm_bio_prison_cell *cell = NULL;
+ bool commit_needed;
dm_oblock_t block = get_bio_block(cache, bio);
size_t pb_data_size = get_per_bio_data_size(cache);
- bool can_migrate = false;
- bool fast_promotion;
- struct policy_result lookup_result;
- struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
- struct old_oblock_lock ool;
-
- ool.locker.fn = null_locker;
+ init_per_bio_data(bio, pb_data_size);
if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
/*
* This can only occur if the io goes to a partial block at
return DM_MAPIO_SUBMITTED;
}
- /*
- * Check to see if that block is currently migrating.
- */
- cell = alloc_prison_cell(cache);
- if (!cell) {
- defer_bio(cache, bio);
- return DM_MAPIO_SUBMITTED;
- }
-
- r = bio_detain(cache, block, bio, cell,
- (cell_free_fn) free_prison_cell,
- cache, &cell);
- if (r) {
- if (r < 0)
- defer_bio(cache, bio);
-
- return DM_MAPIO_SUBMITTED;
- }
-
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
-
- r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
- bio, &ool.locker, &lookup_result);
- if (r == -EWOULDBLOCK) {
- cell_defer(cache, cell, true);
- return DM_MAPIO_SUBMITTED;
-
- } else if (r) {
- DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
- cache_device_name(cache), r);
- cell_defer(cache, cell, false);
- bio_io_error(bio);
- return DM_MAPIO_SUBMITTED;
- }
-
- r = DM_MAPIO_REMAPPED;
- switch (lookup_result.op) {
- case POLICY_HIT:
- if (passthrough_mode(&cache->features)) {
- if (bio_data_dir(bio) == WRITE) {
- /*
- * We need to invalidate this block, so
- * defer for the worker thread.
- */
- cell_defer(cache, cell, true);
- r = DM_MAPIO_SUBMITTED;
-
- } else {
- inc_miss_counter(cache, bio);
- remap_to_origin_clear_discard(cache, bio, block);
- accounted_begin(cache, bio);
- inc_ds(cache, bio, cell);
- // FIXME: we want to remap hits or misses straight
- // away rather than passing over to the worker.
- cell_defer(cache, cell, false);
- }
-
- } else {
- inc_hit_counter(cache, bio);
- if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock)) {
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- accounted_begin(cache, bio);
- inc_ds(cache, bio, cell);
- cell_defer(cache, cell, false);
-
- } else
- remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
- }
- break;
-
- case POLICY_MISS:
- inc_miss_counter(cache, bio);
- if (pb->req_nr != 0) {
- /*
- * This is a duplicate writethrough io that is no
- * longer needed because the block has been demoted.
- */
- bio_endio(bio);
- // FIXME: remap everything as a miss
- cell_defer(cache, cell, false);
- r = DM_MAPIO_SUBMITTED;
-
- } else
- remap_cell_to_origin_clear_discard(cache, cell, block, false);
- break;
-
- default:
- DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
- cache_device_name(cache), __func__,
- (unsigned) lookup_result.op);
- cell_defer(cache, cell, false);
- bio_io_error(bio);
- r = DM_MAPIO_SUBMITTED;
- }
+ r = map_bio(cache, bio, block, &commit_needed);
+ if (commit_needed)
+ schedule_commit(&cache->committer);
return r;
}
spin_unlock_irqrestore(&cache->lock, flags);
}
- check_for_quiesced_migrations(cache, pb);
+ bio_drop_shared_lock(cache, bio);
accounted_complete(cache, bio);
return 0;
{
struct cache *cache = ti->private;
- start_quiescing(cache);
- wait_for_migrations(cache);
- stop_worker(cache);
+ prevent_background_work(cache);
+ BUG_ON(atomic_read(&cache->nr_io_migrations));
+
+ cancel_delayed_work(&cache->waker);
+ flush_workqueue(cache->wq);
+ WARN_ON(cache->origin_tracker.in_flight);
+
+ /*
+ * If it's a flush suspend there won't be any deferred bios, so this
+ * call is harmless.
+ */
requeue_deferred_bios(cache);
- requeue_deferred_cells(cache);
- stop_quiescing(cache);
if (get_cache_mode(cache) == CM_WRITE)
(void) sync_metadata(cache);
int r;
struct cache *cache = context;
- r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
+ if (dirty) {
+ set_bit(from_cblock(cblock), cache->dirty_bitset);
+ atomic_inc(&cache->nr_dirty);
+ } else
+ clear_bit(from_cblock(cblock), cache->dirty_bitset);
+
+ r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
if (r)
return r;
- if (dirty)
- set_dirty(cache, oblock, cblock);
- else
- clear_dirty(cache, oblock, cblock);
-
return 0;
}
struct cache *cache = ti->private;
cache->need_tick_bio = true;
+ allow_background_work(cache);
do_waker(&cache->waker.work);
}
DMEMIT("Error");
}
+ /*
+ * Defines a range of cblocks, begin to (end - 1) are in the range. end is
+ * the one-past-the-end value.
+ */
+ struct cblock_range {
+ dm_cblock_t begin;
+ dm_cblock_t end;
+ };
+
/*
* A cache block range can take two forms:
*
* i) A single cblock, eg. '3456'
- * ii) A begin and end cblock with dots between, eg. 123-234
+ * ii) A begin and end cblock with a dash between, eg. 123-234
*/
static int parse_cblock_range(struct cache *cache, const char *str,
struct cblock_range *result)
return 0;
}
+ static inline dm_cblock_t cblock_succ(dm_cblock_t b)
+ {
+ return to_cblock(from_cblock(b) + 1);
+ }
+
static int request_invalidation(struct cache *cache, struct cblock_range *range)
{
- struct invalidation_request req;
+ int r = 0;
- INIT_LIST_HEAD(&req.list);
- req.cblocks = range;
- atomic_set(&req.complete, 0);
- req.err = 0;
- init_waitqueue_head(&req.result_wait);
+ /*
+ * We don't need to do any locking here because we know we're in
+ * passthrough mode. There's is potential for a race between an
+ * invalidation triggered by an io and an invalidation message. This
+ * is harmless, we must not worry if the policy call fails.
+ */
+ while (range->begin != range->end) {
+ r = invalidate_cblock(cache, range->begin);
+ if (r)
+ return r;
- spin_lock(&cache->invalidation_lock);
- list_add(&req.list, &cache->invalidation_requests);
- spin_unlock(&cache->invalidation_lock);
- wake_worker(cache);
+ range->begin = cblock_succ(range->begin);
+ }
- wait_event(req.result_wait, atomic_read(&req.complete));
- return req.err;
+ cache->commit_requested = true;
+ return r;
}
static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 10, 0},
+ .version = {2, 0, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
struct request_queue *queue;
int numa_node_id;
- unsigned type;
+ enum dm_queue_mode type;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
void dm_init_normal_md_queue(struct mapped_device *md);
int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
+void disable_write_zeroes(struct mapped_device *md);
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
/*
* Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
- * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
+ * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
*/
#include <crypto/md5.h>
#include <crypto/algapi.h>
#include <crypto/skcipher.h>
+ #include <crypto/aead.h>
+ #include <crypto/authenc.h>
+ #include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */
#include <keys/user-type.h>
#include <linux/device-mapper.h>
struct bvec_iter iter_out;
sector_t cc_sector;
atomic_t cc_pending;
- struct skcipher_request *req;
+ union {
+ struct skcipher_request *req;
+ struct aead_request *req_aead;
+ } r;
+
};
/*
struct dm_crypt_io {
struct crypt_config *cc;
struct bio *base_bio;
+ u8 *integrity_metadata;
+ bool integrity_metadata_from_pool;
struct work_struct work;
struct convert_context ctx;
struct dm_crypt_request {
struct convert_context *ctx;
- struct scatterlist sg_in;
- struct scatterlist sg_out;
+ struct scatterlist sg_in[4];
+ struct scatterlist sg_out[4];
sector_t iv_sector;
};
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+ enum cipher_flags {
+ CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */
+ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
+ };
+
/*
* The fields in here must be read only after initialization.
*/
sector_t start;
/*
- * pool for per bio private data, crypto requests and
- * encryption requeusts/buffer pages
+ * pool for per bio private data, crypto requests,
+ * encryption requeusts/buffer pages and integrity tags
*/
mempool_t *req_pool;
mempool_t *page_pool;
+ mempool_t *tag_pool;
+ unsigned tag_pool_max_sectors;
+
struct bio_set *bs;
struct mutex bio_alloc_lock;
char *cipher;
char *cipher_string;
+ char *cipher_auth;
char *key_string;
const struct crypt_iv_operations *iv_gen_ops;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
+ unsigned short int sector_size;
+ unsigned char sector_shift;
/* ESSIV: struct crypto_cipher *essiv_tfm */
void *iv_private;
- struct crypto_skcipher **tfms;
+ union {
+ struct crypto_skcipher **tfms;
+ struct crypto_aead **tfms_aead;
+ } cipher_tfm;
unsigned tfms_count;
+ unsigned long cipher_flags;
/*
* Layout of each crypto request:
unsigned int key_size;
unsigned int key_parts; /* independent parts in key buffer */
unsigned int key_extra_size; /* additional keys length */
+ unsigned int key_mac_size; /* MAC key size for authenc(...) */
+
+ unsigned int integrity_tag_size;
+ unsigned int integrity_iv_size;
+ unsigned int on_disk_tag_size;
+
+ u8 *authenc_key; /* space for keys in authenc() format (if used) */
u8 key[0];
};
- #define MIN_IOS 64
+ #define MIN_IOS 64
+ #define MAX_TAG_SIZE 480
+ #define POOL_ENTRY_SIZE 512
static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
- static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
+ static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
+ struct scatterlist *sg);
/*
- * Use this to access cipher attributes that are the same for each CPU.
+ * Use this to access cipher attributes that are independent of the key.
*/
static struct crypto_skcipher *any_tfm(struct crypt_config *cc)
{
- return cc->tfms[0];
+ return cc->cipher_tfm.tfms[0];
+ }
+
+ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
+ {
+ return cc->cipher_tfm.tfms_aead[0];
}
/*
return err;
}
- /* Set up per cpu cipher state */
- static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
- struct dm_target *ti,
- u8 *salt, unsigned saltsize)
+ /* Allocate the cipher for ESSIV */
+ static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
+ struct dm_target *ti,
+ const u8 *salt,
+ unsigned int saltsize)
{
struct crypto_cipher *essiv_tfm;
int err;
return essiv_tfm;
}
- if (crypto_cipher_blocksize(essiv_tfm) !=
- crypto_skcipher_ivsize(any_tfm(cc))) {
+ if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) {
ti->error = "Block size of ESSIV cipher does "
"not match IV size of block cipher";
crypto_free_cipher(essiv_tfm);
cc->iv_gen_private.essiv.salt = salt;
cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
- essiv_tfm = setup_essiv_cpu(cc, ti, salt,
- crypto_ahash_digestsize(hash_tfm));
+ essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
+ crypto_ahash_digestsize(hash_tfm));
if (IS_ERR(essiv_tfm)) {
crypt_iv_essiv_dtr(cc);
return PTR_ERR(essiv_tfm);
{
struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ ti->error = "Unsupported sector size for LMK";
+ return -EINVAL;
+ }
+
lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
if (IS_ERR(lmk->hash_tfm)) {
ti->error = "Error initializing LMK hash";
static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
u8 *src;
int r = 0;
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
- src = kmap_atomic(sg_page(&dmreq->sg_in));
- r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_in);
+ src = kmap_atomic(sg_page(sg));
+ r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset);
kunmap_atomic(src);
} else
memset(iv, 0, cc->iv_size);
static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
u8 *dst;
int r;
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
return 0;
- dst = kmap_atomic(sg_page(&dmreq->sg_out));
- r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_out);
+ dst = kmap_atomic(sg_page(sg));
+ r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset);
/* Tweak the first block of plaintext sector */
if (!r)
- crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
+ crypto_xor(dst + sg->offset, iv, cc->iv_size);
kunmap_atomic(dst);
return r;
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ ti->error = "Unsupported sector size for TCW";
+ return -EINVAL;
+ }
+
if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
ti->error = "Wrong key size for TCW";
return -EINVAL;
static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 *src;
/* Remove whitening from ciphertext */
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
- src = kmap_atomic(sg_page(&dmreq->sg_in));
- r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_in);
+ src = kmap_atomic(sg_page(sg));
+ r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
kunmap_atomic(src);
}
static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
u8 *dst;
int r;
return 0;
/* Apply whitening on ciphertext */
- dst = kmap_atomic(sg_page(&dmreq->sg_out));
- r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_out);
+ dst = kmap_atomic(sg_page(sg));
+ r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
kunmap_atomic(dst);
return r;
}
+ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+ {
+ /* Used only for writes, there must be an additional space to store IV */
+ get_random_bytes(iv, cc->iv_size);
+ return 0;
+ }
+
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
.post = crypt_iv_tcw_post
};
+ static struct crypt_iv_operations crypt_iv_random_ops = {
+ .generator = crypt_iv_random_gen
+ };
+
+ /*
+ * Integrity extensions
+ */
+ static bool crypt_integrity_aead(struct crypt_config *cc)
+ {
+ return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
+ }
+
+ static bool crypt_integrity_hmac(struct crypt_config *cc)
+ {
+ return crypt_integrity_aead(cc) && cc->key_mac_size;
+ }
+
+ /* Get sg containing data */
+ static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
+ struct scatterlist *sg)
+ {
+ if (unlikely(crypt_integrity_aead(cc)))
+ return &sg[2];
+
+ return sg;
+ }
+
+ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
+ {
+ struct bio_integrity_payload *bip;
+ unsigned int tag_len;
+ int ret;
+
+ if (!bio_sectors(bio) || !io->cc->on_disk_tag_size)
+ return 0;
+
+ bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
+
+ tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
+
+ bip->bip_iter.bi_size = tag_len;
+ bip->bip_iter.bi_sector = io->cc->start + io->sector;
+
+ /* We own the metadata, do not let bio_free to release it */
+ bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
+
+ ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
+ tag_len, offset_in_page(io->integrity_metadata));
+ if (unlikely(ret != tag_len))
+ return -ENOMEM;
+
+ return 0;
+ }
+
+ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
+ {
+ #ifdef CONFIG_BLK_DEV_INTEGRITY
+ struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
+
+ /* From now we require underlying device with our integrity profile */
+ if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
+ ti->error = "Integrity profile not supported.";
+ return -EINVAL;
+ }
+
+ if (bi->tag_size != cc->on_disk_tag_size ||
+ bi->tuple_size != cc->on_disk_tag_size) {
+ ti->error = "Integrity profile tag size mismatch.";
+ return -EINVAL;
+ }
+ if (1 << bi->interval_exp != cc->sector_size) {
+ ti->error = "Integrity profile sector size mismatch.";
+ return -EINVAL;
+ }
+
+ if (crypt_integrity_aead(cc)) {
+ cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
+ DMINFO("Integrity AEAD, tag size %u, IV size %u.",
+ cc->integrity_tag_size, cc->integrity_iv_size);
+
+ if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
+ ti->error = "Integrity AEAD auth tag size is not supported.";
+ return -EINVAL;
+ }
+ } else if (cc->integrity_iv_size)
+ DMINFO("Additional per-sector space %u bytes for IV.",
+ cc->integrity_iv_size);
+
+ if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
+ ti->error = "Not enough space for integrity tag in the profile.";
+ return -EINVAL;
+ }
+
+ return 0;
+ #else
+ ti->error = "Integrity profile not supported.";
+ return -EINVAL;
+ #endif
+ }
+
static void crypt_convert_init(struct crypt_config *cc,
struct convert_context *ctx,
struct bio *bio_out, struct bio *bio_in,
}
static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc,
- struct skcipher_request *req)
+ void *req)
{
return (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
}
- static struct skcipher_request *req_of_dmreq(struct crypt_config *cc,
- struct dm_crypt_request *dmreq)
+ static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq)
{
- return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start);
+ return (void *)((char *)dmreq - cc->dmreq_start);
}
static u8 *iv_of_dmreq(struct crypt_config *cc,
struct dm_crypt_request *dmreq)
{
- return (u8 *)ALIGN((unsigned long)(dmreq + 1),
- crypto_skcipher_alignmask(any_tfm(cc)) + 1);
+ if (crypt_integrity_aead(cc))
+ return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+ crypto_aead_alignmask(any_tfm_aead(cc)) + 1);
+ else
+ return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+ crypto_skcipher_alignmask(any_tfm(cc)) + 1);
}
- static int crypt_convert_block(struct crypt_config *cc,
- struct convert_context *ctx,
- struct skcipher_request *req)
+ static u8 *org_iv_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+ {
+ return iv_of_dmreq(cc, dmreq) + cc->iv_size;
+ }
+
+ static uint64_t *org_sector_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+ {
+ u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
+ return (uint64_t*) ptr;
+ }
+
+ static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+ {
+ u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size +
+ cc->iv_size + sizeof(uint64_t);
+ return (unsigned int*)ptr;
+ }
+
+ static void *tag_from_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+ {
+ struct convert_context *ctx = dmreq->ctx;
+ struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
+
+ return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) *
+ cc->on_disk_tag_size];
+ }
+
+ static void *iv_tag_from_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+ {
+ return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size;
+ }
+
+ static int crypt_convert_block_aead(struct crypt_config *cc,
+ struct convert_context *ctx,
+ struct aead_request *req,
+ unsigned int tag_offset)
{
struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
struct dm_crypt_request *dmreq;
- u8 *iv;
- int r;
+ u8 *iv, *org_iv, *tag_iv, *tag;
+ uint64_t *sector;
+ int r = 0;
+
+ BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
+
+ /* Reject unexpected unaligned bio. */
+ if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+ return -EIO;
dmreq = dmreq_of_req(cc, req);
+ dmreq->iv_sector = ctx->cc_sector;
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ dmreq->iv_sector >>= cc->sector_shift;
+ dmreq->ctx = ctx;
+
+ *org_tag_of_dmreq(cc, dmreq) = tag_offset;
+
+ sector = org_sector_of_dmreq(cc, dmreq);
+ *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
+
iv = iv_of_dmreq(cc, dmreq);
+ org_iv = org_iv_of_dmreq(cc, dmreq);
+ tag = tag_from_dmreq(cc, dmreq);
+ tag_iv = iv_tag_from_dmreq(cc, dmreq);
+
+ /* AEAD request:
+ * |----- AAD -------|------ DATA -------|-- AUTH TAG --|
+ * | (authenticated) | (auth+encryption) | |
+ * | sector_LE | IV | sector in/out | tag in/out |
+ */
+ sg_init_table(dmreq->sg_in, 4);
+ sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t));
+ sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size);
+ sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
+ sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size);
+
+ sg_init_table(dmreq->sg_out, 4);
+ sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t));
+ sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size);
+ sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
+ sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size);
+
+ if (cc->iv_gen_ops) {
+ /* For READs use IV stored in integrity metadata */
+ if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
+ memcpy(org_iv, tag_iv, cc->iv_size);
+ } else {
+ r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
+ if (r < 0)
+ return r;
+ /* Store generated IV in integrity metadata */
+ if (cc->integrity_iv_size)
+ memcpy(tag_iv, org_iv, cc->iv_size);
+ }
+ /* Working copy of IV, to be modified in crypto API */
+ memcpy(iv, org_iv, cc->iv_size);
+ }
+
+ aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size);
+ if (bio_data_dir(ctx->bio_in) == WRITE) {
+ aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
+ cc->sector_size, iv);
+ r = crypto_aead_encrypt(req);
+ if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size)
+ memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0,
+ cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size));
+ } else {
+ aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
+ cc->sector_size + cc->integrity_tag_size, iv);
+ r = crypto_aead_decrypt(req);
+ }
+
+ if (r == -EBADMSG)
+ DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ (unsigned long long)le64_to_cpu(*sector));
+
+ if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
+
+ bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
+ bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
+
+ return r;
+ }
+
+ static int crypt_convert_block_skcipher(struct crypt_config *cc,
+ struct convert_context *ctx,
+ struct skcipher_request *req,
+ unsigned int tag_offset)
+ {
+ struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
+ struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
+ struct scatterlist *sg_in, *sg_out;
+ struct dm_crypt_request *dmreq;
+ u8 *iv, *org_iv, *tag_iv;
+ uint64_t *sector;
+ int r = 0;
+ /* Reject unexpected unaligned bio. */
+ if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+ return -EIO;
+
+ dmreq = dmreq_of_req(cc, req);
dmreq->iv_sector = ctx->cc_sector;
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ dmreq->iv_sector >>= cc->sector_shift;
dmreq->ctx = ctx;
- sg_init_table(&dmreq->sg_in, 1);
- sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
- bv_in.bv_offset);
- sg_init_table(&dmreq->sg_out, 1);
- sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
- bv_out.bv_offset);
+ *org_tag_of_dmreq(cc, dmreq) = tag_offset;
+
+ iv = iv_of_dmreq(cc, dmreq);
+ org_iv = org_iv_of_dmreq(cc, dmreq);
+ tag_iv = iv_tag_from_dmreq(cc, dmreq);
+
+ sector = org_sector_of_dmreq(cc, dmreq);
+ *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
+
+ /* For skcipher we use only the first sg item */
+ sg_in = &dmreq->sg_in[0];
+ sg_out = &dmreq->sg_out[0];
- bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
- bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
+ sg_init_table(sg_in, 1);
+ sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
+
+ sg_init_table(sg_out, 1);
+ sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
if (cc->iv_gen_ops) {
- r = cc->iv_gen_ops->generator(cc, iv, dmreq);
- if (r < 0)
- return r;
+ /* For READs use IV stored in integrity metadata */
+ if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
+ memcpy(org_iv, tag_iv, cc->integrity_iv_size);
+ } else {
+ r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
+ if (r < 0)
+ return r;
+ /* Store generated IV in integrity metadata */
+ if (cc->integrity_iv_size)
+ memcpy(tag_iv, org_iv, cc->integrity_iv_size);
+ }
+ /* Working copy of IV, to be modified in crypto API */
+ memcpy(iv, org_iv, cc->iv_size);
}
- skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
- 1 << SECTOR_SHIFT, iv);
+ skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv);
if (bio_data_dir(ctx->bio_in) == WRITE)
r = crypto_skcipher_encrypt(req);
r = crypto_skcipher_decrypt(req);
if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
- r = cc->iv_gen_ops->post(cc, iv, dmreq);
+ r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
+
+ bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
+ bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
return r;
}
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error);
- static void crypt_alloc_req(struct crypt_config *cc,
- struct convert_context *ctx)
+ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
+ struct convert_context *ctx)
{
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
- if (!ctx->req)
- ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ if (!ctx->r.req)
+ ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO);
- skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+ skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
/*
* Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
* requests if driver request queue is full.
*/
- skcipher_request_set_callback(ctx->req,
+ skcipher_request_set_callback(ctx->r.req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done, dmreq_of_req(cc, ctx->req));
+ kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
}
- static void crypt_free_req(struct crypt_config *cc,
- struct skcipher_request *req, struct bio *base_bio)
+ static void crypt_alloc_req_aead(struct crypt_config *cc,
+ struct convert_context *ctx)
+ {
+ if (!ctx->r.req_aead)
+ ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO);
+
+ aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
+
+ /*
+ * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+ * requests if driver request queue is full.
+ */
+ aead_request_set_callback(ctx->r.req_aead,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
+ }
+
+ static void crypt_alloc_req(struct crypt_config *cc,
+ struct convert_context *ctx)
+ {
+ if (crypt_integrity_aead(cc))
+ crypt_alloc_req_aead(cc, ctx);
+ else
+ crypt_alloc_req_skcipher(cc, ctx);
+ }
+
+ static void crypt_free_req_skcipher(struct crypt_config *cc,
+ struct skcipher_request *req, struct bio *base_bio)
{
struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
mempool_free(req, cc->req_pool);
}
+ static void crypt_free_req_aead(struct crypt_config *cc,
+ struct aead_request *req, struct bio *base_bio)
+ {
+ struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+
+ if ((struct aead_request *)(io + 1) != req)
+ mempool_free(req, cc->req_pool);
+ }
+
+ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio)
+ {
+ if (crypt_integrity_aead(cc))
+ crypt_free_req_aead(cc, req, base_bio);
+ else
+ crypt_free_req_skcipher(cc, req, base_bio);
+ }
+
/*
* Encrypt / decrypt data from one bio to another one (can be the same one)
*/
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
+ unsigned int tag_offset = 0;
+ unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
int r;
atomic_set(&ctx->cc_pending, 1);
while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
crypt_alloc_req(cc, ctx);
-
atomic_inc(&ctx->cc_pending);
- r = crypt_convert_block(cc, ctx, ctx->req);
+ if (crypt_integrity_aead(cc))
+ r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset);
+ else
+ r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset);
switch (r) {
/*
* completion function kcryptd_async_done() will be called.
*/
case -EINPROGRESS:
- ctx->req = NULL;
- ctx->cc_sector++;
+ ctx->r.req = NULL;
+ ctx->cc_sector += sector_step;
+ tag_offset++;
continue;
/*
* The request was already processed (synchronously).
*/
case 0:
atomic_dec(&ctx->cc_pending);
- ctx->cc_sector++;
+ ctx->cc_sector += sector_step;
+ tag_offset++;
cond_resched();
continue;
-
- /* There was an error while processing the request. */
+ /*
+ * There was a data integrity error.
+ */
+ case -EBADMSG:
+ atomic_dec(&ctx->cc_pending);
+ return -EILSEQ;
+ /*
+ * There was an error while processing the request.
+ */
default:
atomic_dec(&ctx->cc_pending);
- return r;
+ return -EIO;
}
}
clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
- goto return_clone;
+ goto out;
clone_init(io, clone);
remaining_size -= len;
}
- return_clone:
+ /* Allocate space for integrity tags */
+ if (dm_crypt_integrity_io_alloc(io, clone)) {
+ crypt_free_buffer_pages(cc, clone);
+ bio_put(clone);
+ clone = NULL;
+ }
+ out:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_unlock(&cc->bio_alloc_lock);
io->base_bio = bio;
io->sector = sector;
io->error = 0;
- io->ctx.req = NULL;
+ io->ctx.r.req = NULL;
+ io->integrity_metadata = NULL;
+ io->integrity_metadata_from_pool = false;
atomic_set(&io->io_pending, 0);
}
if (!atomic_dec_and_test(&io->io_pending))
return;
- if (io->ctx.req)
- crypt_free_req(cc, io->ctx.req, base_bio);
+ if (io->ctx.r.req)
+ crypt_free_req(cc, io->ctx.r.req, base_bio);
+
+ if (unlikely(io->integrity_metadata_from_pool))
+ mempool_free(io->integrity_metadata, io->cc->tag_pool);
+ else
+ kfree(io->integrity_metadata);
base_bio->bi_error = error;
bio_endio(base_bio);
clone_init(io, clone);
clone->bi_iter.bi_sector = cc->start + io->sector;
+ if (dm_crypt_integrity_io_alloc(io, clone)) {
+ crypt_dec_pending(io);
+ bio_put(clone);
+ return 1;
+ }
+
generic_make_request(clone);
return 0;
}
crypt_inc_pending(io);
r = crypt_convert(cc, &io->ctx);
- if (r)
- io->error = -EIO;
+ if (r < 0)
+ io->error = r;
crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
/* Encryption was already finished, submit io now */
r = crypt_convert(cc, &io->ctx);
if (r < 0)
- io->error = -EIO;
+ io->error = r;
if (atomic_dec_and_test(&io->ctx.cc_pending))
kcryptd_crypt_read_done(io);
}
if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
- error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+ error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
- if (error < 0)
+ if (error == -EBADMSG) {
+ DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
+ io->error = -EILSEQ;
+ } else if (error < 0)
io->error = -EIO;
crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
queue_work(cc->crypt_queue, &io->work);
}
- /*
- * Decode key from its hex representation
- */
- static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
+ static void crypt_free_tfms_aead(struct crypt_config *cc)
{
- char buffer[3];
- unsigned int i;
-
- buffer[2] = '\0';
-
- for (i = 0; i < size; i++) {
- buffer[0] = *hex++;
- buffer[1] = *hex++;
+ if (!cc->cipher_tfm.tfms_aead)
+ return;
- if (kstrtou8(buffer, 16, &key[i]))
- return -EINVAL;
+ if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
+ crypto_free_aead(cc->cipher_tfm.tfms_aead[0]);
+ cc->cipher_tfm.tfms_aead[0] = NULL;
}
- if (*hex != '\0')
- return -EINVAL;
-
- return 0;
+ kfree(cc->cipher_tfm.tfms_aead);
+ cc->cipher_tfm.tfms_aead = NULL;
}
- static void crypt_free_tfms(struct crypt_config *cc)
+ static void crypt_free_tfms_skcipher(struct crypt_config *cc)
{
unsigned i;
- if (!cc->tfms)
+ if (!cc->cipher_tfm.tfms)
return;
for (i = 0; i < cc->tfms_count; i++)
- if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
- crypto_free_skcipher(cc->tfms[i]);
- cc->tfms[i] = NULL;
+ if (cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) {
+ crypto_free_skcipher(cc->cipher_tfm.tfms[i]);
+ cc->cipher_tfm.tfms[i] = NULL;
}
- kfree(cc->tfms);
- cc->tfms = NULL;
+ kfree(cc->cipher_tfm.tfms);
+ cc->cipher_tfm.tfms = NULL;
}
- static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
+ static void crypt_free_tfms(struct crypt_config *cc)
+ {
+ if (crypt_integrity_aead(cc))
+ crypt_free_tfms_aead(cc);
+ else
+ crypt_free_tfms_skcipher(cc);
+ }
+
+ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
{
unsigned i;
int err;
- cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *),
- GFP_KERNEL);
- if (!cc->tfms)
+ cc->cipher_tfm.tfms = kzalloc(cc->tfms_count *
+ sizeof(struct crypto_skcipher *), GFP_KERNEL);
+ if (!cc->cipher_tfm.tfms)
return -ENOMEM;
for (i = 0; i < cc->tfms_count; i++) {
- cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
- if (IS_ERR(cc->tfms[i])) {
- err = PTR_ERR(cc->tfms[i]);
+ cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
+ if (IS_ERR(cc->cipher_tfm.tfms[i])) {
+ err = PTR_ERR(cc->cipher_tfm.tfms[i]);
crypt_free_tfms(cc);
return err;
}
return 0;
}
+ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
+ {
+ int err;
+
+ cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL);
+ if (!cc->cipher_tfm.tfms)
+ return -ENOMEM;
+
+ cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0);
+ if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
+ err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]);
+ crypt_free_tfms(cc);
+ return err;
+ }
+
+ return 0;
+ }
+
+ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
+ {
+ if (crypt_integrity_aead(cc))
+ return crypt_alloc_tfms_aead(cc, ciphermode);
+ else
+ return crypt_alloc_tfms_skcipher(cc, ciphermode);
+ }
+
+ static unsigned crypt_subkey_size(struct crypt_config *cc)
+ {
+ return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+ }
+
+ static unsigned crypt_authenckey_size(struct crypt_config *cc)
+ {
+ return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param));
+ }
+
+ /*
+ * If AEAD is composed like authenc(hmac(sha256),xts(aes)),
+ * the key must be for some reason in special format.
+ * This funcion converts cc->key to this special format.
+ */
+ static void crypt_copy_authenckey(char *p, const void *key,
+ unsigned enckeylen, unsigned authkeylen)
+ {
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+
+ rta = (struct rtattr *)p;
+ param = RTA_DATA(rta);
+ param->enckeylen = cpu_to_be32(enckeylen);
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ p += RTA_SPACE(sizeof(*param));
+ memcpy(p, key + enckeylen, authkeylen);
+ p += authkeylen;
+ memcpy(p, key, enckeylen);
+ }
+
static int crypt_setkey(struct crypt_config *cc)
{
unsigned subkey_size;
int err = 0, i, r;
/* Ignore extra keys (which are used for IV etc) */
- subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+ subkey_size = crypt_subkey_size(cc);
+ if (crypt_integrity_hmac(cc))
+ crypt_copy_authenckey(cc->authenc_key, cc->key,
+ subkey_size - cc->key_mac_size,
+ cc->key_mac_size);
for (i = 0; i < cc->tfms_count; i++) {
- r = crypto_skcipher_setkey(cc->tfms[i],
- cc->key + (i * subkey_size),
- subkey_size);
+ if (crypt_integrity_hmac(cc))
+ r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
+ cc->authenc_key, crypt_authenckey_size(cc));
+ else if (crypt_integrity_aead(cc))
+ r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
+ cc->key + (i * subkey_size),
+ subkey_size);
+ else
+ r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i],
+ cc->key + (i * subkey_size),
+ subkey_size);
if (r)
err = r;
}
+ if (crypt_integrity_hmac(cc))
+ memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc));
+
return err;
}
kzfree(cc->key_string);
cc->key_string = NULL;
- if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
+ /* Decode key from its hex representation. */
+ if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0)
goto out;
r = crypt_setkey(cc);
static int crypt_wipe_key(struct crypt_config *cc)
{
+ int r;
+
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- memset(&cc->key, 0, cc->key_size * sizeof(u8));
+ get_random_bytes(&cc->key, cc->key_size);
kzfree(cc->key_string);
cc->key_string = NULL;
+ r = crypt_setkey(cc);
+ memset(&cc->key, 0, cc->key_size * sizeof(u8));
- return crypt_setkey(cc);
+ return r;
}
static void crypt_dtr(struct dm_target *ti)
mempool_destroy(cc->page_pool);
mempool_destroy(cc->req_pool);
+ mempool_destroy(cc->tag_pool);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
kzfree(cc->cipher);
kzfree(cc->cipher_string);
kzfree(cc->key_string);
+ kzfree(cc->cipher_auth);
+ kzfree(cc->authenc_key);
/* Must zero key material before freeing */
kzfree(cc);
}
- static int crypt_ctr_cipher(struct dm_target *ti,
- char *cipher_in, char *key)
+ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
+ {
+ struct crypt_config *cc = ti->private;
+
+ if (crypt_integrity_aead(cc))
+ cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
+ else
+ cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
+
+ if (cc->iv_size)
+ /* at least a 64 bit sector number should fit in our buffer */
+ cc->iv_size = max(cc->iv_size,
+ (unsigned int)(sizeof(u64) / sizeof(u8)));
+ else if (ivmode) {
+ DMWARN("Selected cipher does not support IVs");
+ ivmode = NULL;
+ }
+
+ /* Choose ivmode, see comments at iv code. */
+ if (ivmode == NULL)
+ cc->iv_gen_ops = NULL;
+ else if (strcmp(ivmode, "plain") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain_ops;
+ else if (strcmp(ivmode, "plain64") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain64_ops;
+ else if (strcmp(ivmode, "essiv") == 0)
+ cc->iv_gen_ops = &crypt_iv_essiv_ops;
+ else if (strcmp(ivmode, "benbi") == 0)
+ cc->iv_gen_ops = &crypt_iv_benbi_ops;
+ else if (strcmp(ivmode, "null") == 0)
+ cc->iv_gen_ops = &crypt_iv_null_ops;
+ else if (strcmp(ivmode, "lmk") == 0) {
+ cc->iv_gen_ops = &crypt_iv_lmk_ops;
+ /*
+ * Version 2 and 3 is recognised according
+ * to length of provided multi-key string.
+ * If present (version 3), last key is used as IV seed.
+ * All keys (including IV seed) are always the same size.
+ */
+ if (cc->key_size % cc->key_parts) {
+ cc->key_parts++;
+ cc->key_extra_size = cc->key_size / cc->key_parts;
+ }
+ } else if (strcmp(ivmode, "tcw") == 0) {
+ cc->iv_gen_ops = &crypt_iv_tcw_ops;
+ cc->key_parts += 2; /* IV + whitening */
+ cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
+ } else if (strcmp(ivmode, "random") == 0) {
+ cc->iv_gen_ops = &crypt_iv_random_ops;
+ /* Need storage space in integrity fields. */
+ cc->integrity_iv_size = cc->iv_size;
+ } else {
+ ti->error = "Invalid IV mode";
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ /*
+ * Workaround to parse cipher algorithm from crypto API spec.
+ * The cc->cipher is currently used only in ESSIV.
+ * This should be probably done by crypto-api calls (once available...)
+ */
+ static int crypt_ctr_blkdev_cipher(struct crypt_config *cc)
+ {
+ const char *alg_name = NULL;
+ char *start, *end;
+
+ if (crypt_integrity_aead(cc)) {
+ alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc)));
+ if (!alg_name)
+ return -EINVAL;
+ if (crypt_integrity_hmac(cc)) {
+ alg_name = strchr(alg_name, ',');
+ if (!alg_name)
+ return -EINVAL;
+ }
+ alg_name++;
+ } else {
+ alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc)));
+ if (!alg_name)
+ return -EINVAL;
+ }
+
+ start = strchr(alg_name, '(');
+ end = strchr(alg_name, ')');
+
+ if (!start && !end) {
+ cc->cipher = kstrdup(alg_name, GFP_KERNEL);
+ return cc->cipher ? 0 : -ENOMEM;
+ }
+
+ if (!start || !end || ++start >= end)
+ return -EINVAL;
+
+ cc->cipher = kzalloc(end - start + 1, GFP_KERNEL);
+ if (!cc->cipher)
+ return -ENOMEM;
+
+ strncpy(cc->cipher, start, end - start);
+
+ return 0;
+ }
+
+ /*
+ * Workaround to parse HMAC algorithm from AEAD crypto API spec.
+ * The HMAC is needed to calculate tag size (HMAC digest size).
+ * This should be probably done by crypto-api calls (once available...)
+ */
+ static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api)
+ {
+ char *start, *end, *mac_alg = NULL;
+ struct crypto_ahash *mac;
+
+ if (!strstarts(cipher_api, "authenc("))
+ return 0;
+
+ start = strchr(cipher_api, '(');
+ end = strchr(cipher_api, ',');
+ if (!start || !end || ++start > end)
+ return -EINVAL;
+
+ mac_alg = kzalloc(end - start + 1, GFP_KERNEL);
+ if (!mac_alg)
+ return -ENOMEM;
+ strncpy(mac_alg, start, end - start);
+
+ mac = crypto_alloc_ahash(mac_alg, 0, 0);
+ kfree(mac_alg);
+
+ if (IS_ERR(mac))
+ return PTR_ERR(mac);
+
+ cc->key_mac_size = crypto_ahash_digestsize(mac);
+ crypto_free_ahash(mac);
+
+ cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL);
+ if (!cc->authenc_key)
+ return -ENOMEM;
+
+ return 0;
+ }
+
+ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key,
+ char **ivmode, char **ivopts)
+ {
+ struct crypt_config *cc = ti->private;
+ char *tmp, *cipher_api;
+ int ret = -EINVAL;
+
+ cc->tfms_count = 1;
+
+ /*
+ * New format (capi: prefix)
+ * capi:cipher_api_spec-iv:ivopts
+ */
+ tmp = &cipher_in[strlen("capi:")];
+ cipher_api = strsep(&tmp, "-");
+ *ivmode = strsep(&tmp, ":");
+ *ivopts = tmp;
+
+ if (*ivmode && !strcmp(*ivmode, "lmk"))
+ cc->tfms_count = 64;
+
+ cc->key_parts = cc->tfms_count;
+
+ /* Allocate cipher */
+ ret = crypt_alloc_tfms(cc, cipher_api);
+ if (ret < 0) {
+ ti->error = "Error allocating crypto tfm";
+ return ret;
+ }
+
+ /* Alloc AEAD, can be used only in new format. */
+ if (crypt_integrity_aead(cc)) {
+ ret = crypt_ctr_auth_cipher(cc, cipher_api);
+ if (ret < 0) {
+ ti->error = "Invalid AEAD cipher spec";
+ return -ENOMEM;
+ }
+ cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
+ } else
+ cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
+
+ ret = crypt_ctr_blkdev_cipher(cc);
+ if (ret < 0) {
+ ti->error = "Cannot allocate cipher string";
+ return -ENOMEM;
+ }
+
+ return 0;
+ }
+
+ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key,
+ char **ivmode, char **ivopts)
{
struct crypt_config *cc = ti->private;
- char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
+ char *tmp, *cipher, *chainmode, *keycount;
char *cipher_api = NULL;
int ret = -EINVAL;
char dummy;
- /* Convert to crypto api definition? */
- if (strchr(cipher_in, '(')) {
+ if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) {
ti->error = "Bad cipher specification";
return -EINVAL;
}
- cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
- if (!cc->cipher_string)
- goto bad_mem;
-
/*
* Legacy dm-crypt cipher specification
* cipher[:keycount]-mode-iv:ivopts
return -EINVAL;
}
cc->key_parts = cc->tfms_count;
- cc->key_extra_size = 0;
cc->cipher = kstrdup(cipher, GFP_KERNEL);
if (!cc->cipher)
goto bad_mem;
chainmode = strsep(&tmp, "-");
- ivopts = strsep(&tmp, "-");
- ivmode = strsep(&ivopts, ":");
+ *ivopts = strsep(&tmp, "-");
+ *ivmode = strsep(&*ivopts, ":");
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
* For compatibility with the original dm-crypt mapping format, if
* only the cipher name is supplied, use cbc-plain.
*/
- if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
+ if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) {
chainmode = "cbc";
- ivmode = "plain";
+ *ivmode = "plain";
}
- if (strcmp(chainmode, "ecb") && !ivmode) {
+ if (strcmp(chainmode, "ecb") && !*ivmode) {
ti->error = "IV mechanism required";
return -EINVAL;
}
ret = crypt_alloc_tfms(cc, cipher_api);
if (ret < 0) {
ti->error = "Error allocating crypto tfm";
- goto bad;
+ kfree(cipher_api);
+ return ret;
}
- /* Initialize IV */
- cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
- if (cc->iv_size)
- /* at least a 64 bit sector number should fit in our buffer */
- cc->iv_size = max(cc->iv_size,
- (unsigned int)(sizeof(u64) / sizeof(u8)));
- else if (ivmode) {
- DMWARN("Selected cipher does not support IVs");
- ivmode = NULL;
- }
+ return 0;
+ bad_mem:
+ ti->error = "Cannot allocate cipher strings";
+ return -ENOMEM;
+ }
- /* Choose ivmode, see comments at iv code. */
- if (ivmode == NULL)
- cc->iv_gen_ops = NULL;
- else if (strcmp(ivmode, "plain") == 0)
- cc->iv_gen_ops = &crypt_iv_plain_ops;
- else if (strcmp(ivmode, "plain64") == 0)
- cc->iv_gen_ops = &crypt_iv_plain64_ops;
- else if (strcmp(ivmode, "essiv") == 0)
- cc->iv_gen_ops = &crypt_iv_essiv_ops;
- else if (strcmp(ivmode, "benbi") == 0)
- cc->iv_gen_ops = &crypt_iv_benbi_ops;
- else if (strcmp(ivmode, "null") == 0)
- cc->iv_gen_ops = &crypt_iv_null_ops;
- else if (strcmp(ivmode, "lmk") == 0) {
- cc->iv_gen_ops = &crypt_iv_lmk_ops;
- /*
- * Version 2 and 3 is recognised according
- * to length of provided multi-key string.
- * If present (version 3), last key is used as IV seed.
- * All keys (including IV seed) are always the same size.
- */
- if (cc->key_size % cc->key_parts) {
- cc->key_parts++;
- cc->key_extra_size = cc->key_size / cc->key_parts;
- }
- } else if (strcmp(ivmode, "tcw") == 0) {
- cc->iv_gen_ops = &crypt_iv_tcw_ops;
- cc->key_parts += 2; /* IV + whitening */
- cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
- } else {
- ret = -EINVAL;
- ti->error = "Invalid IV mode";
- goto bad;
+ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key)
+ {
+ struct crypt_config *cc = ti->private;
+ char *ivmode = NULL, *ivopts = NULL;
+ int ret;
+
+ cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+ if (!cc->cipher_string) {
+ ti->error = "Cannot allocate cipher strings";
+ return -ENOMEM;
}
+ if (strstarts(cipher_in, "capi:"))
+ ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts);
+ else
+ ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts);
+ if (ret)
+ return ret;
+
+ /* Initialize IV */
+ ret = crypt_ctr_ivmode(ti, ivmode);
+ if (ret < 0)
+ return ret;
+
/* Initialize and set key */
ret = crypt_set_key(cc, key);
if (ret < 0) {
ti->error = "Error decoding and setting key";
- goto bad;
+ return ret;
}
/* Allocate IV */
ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
if (ret < 0) {
ti->error = "Error creating IV";
- goto bad;
+ return ret;
}
}
ret = cc->iv_gen_ops->init(cc);
if (ret < 0) {
ti->error = "Error initialising IV";
- goto bad;
+ return ret;
}
}
- ret = 0;
- bad:
- kfree(cipher_api);
return ret;
+ }
- bad_mem:
- ti->error = "Cannot allocate cipher strings";
- return -ENOMEM;
+ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv)
+ {
+ struct crypt_config *cc = ti->private;
+ struct dm_arg_set as;
+ static struct dm_arg _args[] = {
+ {0, 6, "Invalid number of feature args"},
+ };
+ unsigned int opt_params, val;
+ const char *opt_string, *sval;
+ char dummy;
+ int ret;
+
+ /* Optional parameters */
+ as.argc = argc;
+ as.argv = argv;
+
+ ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+ if (ret)
+ return ret;
+
+ while (opt_params--) {
+ opt_string = dm_shift_arg(&as);
+ if (!opt_string) {
+ ti->error = "Not enough feature arguments";
+ return -EINVAL;
+ }
+
+ if (!strcasecmp(opt_string, "allow_discards"))
+ ti->num_discard_bios = 1;
+
+ else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+ set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+
+ else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+ set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ else if (sscanf(opt_string, "integrity:%u:", &val) == 1) {
+ if (val == 0 || val > MAX_TAG_SIZE) {
+ ti->error = "Invalid integrity arguments";
+ return -EINVAL;
+ }
+ cc->on_disk_tag_size = val;
+ sval = strchr(opt_string + strlen("integrity:"), ':') + 1;
+ if (!strcasecmp(sval, "aead")) {
+ set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
+ } else if (strcasecmp(sval, "none")) {
+ ti->error = "Unknown integrity profile";
+ return -EINVAL;
+ }
+
+ cc->cipher_auth = kstrdup(sval, GFP_KERNEL);
+ if (!cc->cipher_auth)
+ return -ENOMEM;
+ } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) {
+ if (cc->sector_size < (1 << SECTOR_SHIFT) ||
+ cc->sector_size > 4096 ||
+ (cc->sector_size & (cc->sector_size - 1))) {
+ ti->error = "Invalid feature value for sector_size";
+ return -EINVAL;
+ }
+ cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT;
+ } else if (!strcasecmp(opt_string, "iv_large_sectors"))
+ set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
+ else {
+ ti->error = "Invalid feature arguments";
+ return -EINVAL;
+ }
+ }
+
+ return 0;
}
/*
{
struct crypt_config *cc;
int key_size;
- unsigned int opt_params;
+ unsigned int align_mask;
unsigned long long tmpll;
int ret;
- size_t iv_size_padding;
- struct dm_arg_set as;
- const char *opt_string;
+ size_t iv_size_padding, additional_req_size;
char dummy;
- static struct dm_arg _args[] = {
- {0, 3, "Invalid number of feature args"},
- };
-
if (argc < 5) {
ti->error = "Not enough arguments";
return -EINVAL;
return -ENOMEM;
}
cc->key_size = key_size;
+ cc->sector_size = (1 << SECTOR_SHIFT);
+ cc->sector_shift = 0;
ti->private = cc;
+
+ /* Optional parameters need to be read before cipher constructor */
+ if (argc > 5) {
+ ret = crypt_ctr_optional(ti, argc - 5, &argv[5]);
+ if (ret)
+ goto bad;
+ }
+
ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
if (ret < 0)
goto bad;
- cc->dmreq_start = sizeof(struct skcipher_request);
- cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
+ if (crypt_integrity_aead(cc)) {
+ cc->dmreq_start = sizeof(struct aead_request);
+ cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc));
+ align_mask = crypto_aead_alignmask(any_tfm_aead(cc));
+ } else {
+ cc->dmreq_start = sizeof(struct skcipher_request);
+ cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
+ align_mask = crypto_skcipher_alignmask(any_tfm(cc));
+ }
cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
- if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+ if (align_mask < CRYPTO_MINALIGN) {
/* Allocate the padding exactly */
iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
- & crypto_skcipher_alignmask(any_tfm(cc));
+ & align_mask;
} else {
/*
* If the cipher requires greater alignment than kmalloc
* alignment, we don't know the exact position of the
* initialization vector. We must assume worst case.
*/
- iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc));
+ iv_size_padding = align_mask;
}
ret = -ENOMEM;
- cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
- sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
+
+ /* ...| IV + padding | original IV | original sec. number | bio tag offset | */
+ additional_req_size = sizeof(struct dm_crypt_request) +
+ iv_size_padding + cc->iv_size +
+ cc->iv_size +
+ sizeof(uint64_t) +
+ sizeof(unsigned int);
+
+ cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
cc->per_bio_data_size = ti->per_io_data_size =
- ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start +
- sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
+ ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
ARCH_KMALLOC_MINALIGN);
cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
mutex_init(&cc->bio_alloc_lock);
ret = -EINVAL;
- if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
+ if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) ||
+ (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) {
ti->error = "Invalid iv_offset sector";
goto bad;
}
}
cc->start = tmpll;
- argv += 5;
- argc -= 5;
-
- /* Optional parameters */
- if (argc) {
- as.argc = argc;
- as.argv = argv;
-
- ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+ if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
+ ret = crypt_integrity_ctr(cc, ti);
if (ret)
goto bad;
- ret = -EINVAL;
- while (opt_params--) {
- opt_string = dm_shift_arg(&as);
- if (!opt_string) {
- ti->error = "Not enough feature arguments";
- goto bad;
- }
-
- if (!strcasecmp(opt_string, "allow_discards"))
- ti->num_discard_bios = 1;
+ cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size;
+ if (!cc->tag_pool_max_sectors)
+ cc->tag_pool_max_sectors = 1;
- else if (!strcasecmp(opt_string, "same_cpu_crypt"))
- set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
-
- else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
- set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
-
- else {
- ti->error = "Invalid feature arguments";
- goto bad;
- }
+ cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS,
+ cc->tag_pool_max_sectors * cc->on_disk_tag_size);
+ if (!cc->tag_pool) {
+ ti->error = "Cannot allocate integrity tags mempool";
+ goto bad;
}
+
+ cc->tag_pool_max_sectors <<= cc->sector_shift;
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
+ cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
else
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
num_online_cpus());
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
wake_up_process(cc->write_thread);
ti->num_flush_bios = 1;
- ti->discard_zeroes_data_unsupported = true;
return 0;
* Check if bio is too large, split as needed.
*/
if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) &&
- bio_data_dir(bio) == WRITE)
+ (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size))
dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT));
+ /*
+ * Ensure that bio is a multiple of internal sector encryption size
+ * and is aligned to this size as defined in IO hints.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0))
+ return -EIO;
+
+ if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
+ return -EIO;
+
io = dm_per_bio_data(bio, cc->per_bio_data_size);
crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
- io->ctx.req = (struct skcipher_request *)(io + 1);
+
+ if (cc->on_disk_tag_size) {
+ unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
+
+ if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
+ unlikely(!(io->integrity_metadata = kmalloc(tag_len,
+ GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+ if (bio_sectors(bio) > cc->tag_pool_max_sectors)
+ dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
+ io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO);
+ io->integrity_metadata_from_pool = true;
+ }
+ }
+
+ if (crypt_integrity_aead(cc))
+ io->ctx.r.req_aead = (struct aead_request *)(io + 1);
+ else
+ io->ctx.r.req = (struct skcipher_request *)(io + 1);
if (bio_data_dir(io->base_bio) == READ) {
if (kcryptd_io_read(io, GFP_NOWAIT))
num_feature_args += !!ti->num_discard_bios;
num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
+ num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
+ if (cc->on_disk_tag_size)
+ num_feature_args++;
if (num_feature_args) {
DMEMIT(" %d", num_feature_args);
if (ti->num_discard_bios)
DMEMIT(" same_cpu_crypt");
if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
DMEMIT(" submit_from_crypt_cpus");
+ if (cc->on_disk_tag_size)
+ DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth);
+ if (cc->sector_size != (1 << SECTOR_SHIFT))
+ DMEMIT(" sector_size:%d", cc->sector_size);
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ DMEMIT(" iv_large_sectors");
}
break;
static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
+ struct crypt_config *cc = ti->private;
+
/*
* Unfortunate constraint that is required to avoid the potential
* for exceeding underlying device's max_segments limits -- due to
* bio that are not as physically contiguous as the original bio.
*/
limits->max_segment_size = PAGE_SIZE;
+
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ limits->logical_block_size = cc->sector_size;
+ limits->physical_block_size = cc->sector_size;
+ blk_limits_io_min(limits, cc->sector_size);
+ }
}
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 15, 0},
+ .version = {1, 17, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
ti->private = lc;
return 0;
static struct target_type linear_target = {
.name = "linear",
.version = {1, 3, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
atomic_t pg_init_count; /* Number of times pg_init called */
- unsigned queue_mode;
+ enum dm_queue_mode queue_mode;
struct mutex work_mutex;
struct work_struct trigger_event;
static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
- static void activate_path(struct work_struct *work);
+ static void activate_or_offline_path(struct pgpath *pgpath);
+ static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
/*-----------------------------------------------
if (pgpath) {
pgpath->is_active = true;
- INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
+ INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
}
return pgpath;
struct pgpath *pgpath;
unsigned long pg_init_delay = 0;
+ lockdep_assert_held(&m->lock);
+
if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
return 0;
return atomic_read(&m->pg_init_in_progress);
}
- static void pg_init_all_paths(struct multipath *m)
+ static int pg_init_all_paths(struct multipath *m)
{
+ int ret;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- __pg_init_all_paths(m);
+ ret = __pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
+
+ return ret;
}
static void __switch_pg(struct multipath *m, struct priority_group *pg)
}
/*
- * Check whether bios must be queued in the device-mapper core rather
- * than here in the target.
- *
- * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
- * same value then we are not between multipath_presuspend()
- * and multipath_resume() calls and we have no need to check
- * for the DMF_NOFLUSH_SUSPENDING flag.
+ * dm_report_EIO() is a macro instead of a function to make pr_debug()
+ * report the function name and line number of the function from which
+ * it has been invoked.
*/
- static bool __must_push_back(struct multipath *m)
- {
- return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
- test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
- dm_noflush_suspending(m->ti));
- }
-
- static bool must_push_back_rq(struct multipath *m)
- {
- bool r;
- unsigned long flags;
-
- spin_lock_irqsave(&m->lock, flags);
- r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
- __must_push_back(m));
- spin_unlock_irqrestore(&m->lock, flags);
-
- return r;
- }
-
- static bool must_push_back_bio(struct multipath *m)
- {
- bool r;
- unsigned long flags;
-
- spin_lock_irqsave(&m->lock, flags);
- r = __must_push_back(m);
- spin_unlock_irqrestore(&m->lock, flags);
-
- return r;
- }
+ #define dm_report_EIO(m) \
+ ({ \
+ struct mapped_device *md = dm_table_get_md((m)->ti->table); \
+ \
+ pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
+ dm_device_name(md), \
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
+ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
+ dm_noflush_suspending((m)->ti)); \
+ -EIO; \
+ })
/*
* Map cloned requests (request-based multipath)
struct request **__clone)
{
struct multipath *m = ti->private;
- int r = DM_MAPIO_REQUEUE;
size_t nr_bytes = blk_rq_bytes(rq);
struct pgpath *pgpath;
struct block_device *bdev;
struct dm_mpath_io *mpio = get_mpio(map_context);
+ struct request_queue *q;
struct request *clone;
/* Do we need to select a new pgpath? */
pgpath = choose_pgpath(m, nr_bytes);
if (!pgpath) {
- if (must_push_back_rq(m))
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
return DM_MAPIO_DELAY_REQUEUE;
- return -EIO; /* Failed */
+ return dm_report_EIO(m); /* Failed */
} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
- pg_init_all_paths(m);
- return r;
+ if (pg_init_all_paths(m))
+ return DM_MAPIO_DELAY_REQUEUE;
+ return DM_MAPIO_REQUEUE;
}
memset(mpio, 0, sizeof(*mpio));
mpio->nr_bytes = nr_bytes;
bdev = pgpath->path.dev->bdev;
-
- clone = blk_get_request(bdev_get_queue(bdev),
- rq->cmd_flags | REQ_NOMERGE,
- GFP_ATOMIC);
+ q = bdev_get_queue(bdev);
+ clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
if (IS_ERR(clone)) {
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
- return r;
+ bool queue_dying = blk_queue_dying(q);
+ DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
+ PTR_ERR(clone), queue_dying ? " (path offline)" : "");
+ if (queue_dying) {
+ atomic_inc(&m->pg_init_in_progress);
+ activate_or_offline_path(pgpath);
+ return DM_MAPIO_REQUEUE;
+ }
+ return DM_MAPIO_DELAY_REQUEUE;
}
clone->bio = clone->biotail = NULL;
clone->rq_disk = bdev->bd_disk;
}
if (!pgpath) {
- if (!must_push_back_bio(m))
- return -EIO;
- return DM_MAPIO_REQUEUE;
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+ return DM_MAPIO_REQUEUE;
+ return dm_report_EIO(m);
}
mpio->pgpath = pgpath;
blk_finish_plug(&plug);
}
+ static void assign_bit(bool value, long nr, unsigned long *addr)
+ {
+ if (value)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+ }
+
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
-
- if (save_old_value) {
- if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
- set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- } else {
- if (queue_if_no_path)
- set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- }
- if (queue_if_no_path)
- set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
-
+ assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
+ (!save_old_value && queue_if_no_path),
+ MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+ assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti),
+ MPATHF_QUEUE_IF_NO_PATH, &m->flags);
spin_unlock_irqrestore(&m->lock, flags);
if (!queue_if_no_path) {
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
if (m->queue_mode == DM_TYPE_BIO_BASED)
ti->per_io_data_size = multipath_per_bio_data_size();
else
spin_unlock_irqrestore(&m->lock, flags);
}
- static void activate_path(struct work_struct *work)
+ static void activate_or_offline_path(struct pgpath *pgpath)
{
- struct pgpath *pgpath =
- container_of(work, struct pgpath, activate_path.work);
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
if (pgpath->is_active && !blk_queue_dying(q))
pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}
+ static void activate_path_work(struct work_struct *work)
+ {
+ struct pgpath *pgpath =
+ container_of(work, struct pgpath, activate_path.work);
+
+ activate_or_offline_path(pgpath);
+ }
+
static int noretry_error(int error)
{
switch (error) {
*/
int r = DM_ENDIO_REQUEUE;
- if (!error && !clone->errors)
+ if (!error)
return 0; /* I/O complete */
if (noretry_error(error))
if (mpio->pgpath)
fail_path(mpio->pgpath);
- if (!atomic_read(&m->nr_valid_paths)) {
- if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- if (!must_push_back_rq(m))
- r = -EIO;
- }
- }
+ if (atomic_read(&m->nr_valid_paths) == 0 &&
+ !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+ r = dm_report_EIO(m);
return r;
}
if (mpio->pgpath)
fail_path(mpio->pgpath);
- if (!atomic_read(&m->nr_valid_paths)) {
- if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- if (!must_push_back_bio(m))
- return -EIO;
- return DM_ENDIO_REQUEUE;
- }
- }
+ if (atomic_read(&m->nr_valid_paths) == 0 &&
+ !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+ return dm_report_EIO(m);
/* Queue for the daemon to resubmit */
dm_bio_restore(get_bio_details_from_bio(clone), clone);
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
- set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+ assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
+ MPATHF_QUEUE_IF_NO_PATH, &m->flags);
spin_unlock_irqrestore(&m->lock, flags);
}
case DM_TYPE_MQ_REQUEST_BASED:
DMEMIT("queue_mode mq ");
break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
}
}
}
/*
* Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
/* New for v1.10.0 */
- #define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */
+ #define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
+
+ /* New for v1.11.1 */
+ #define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
/*
* Flags for rs->ctr_flags field.
#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
+ #define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET | \
- CTR_FLAG_JOURNAL_DEV)
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
CTR_FLAG_REBUILD | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET | \
- CTR_FLAG_JOURNAL_DEV)
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
/* ...valid options definitions per raid level */
/*
struct journal_dev {
struct dm_dev *dev;
struct md_rdev rdev;
+ int mode;
} journal_dev;
struct raid_dev dev[0];
{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
{ CTR_FLAG_JOURNAL_DEV, "journal_dev" },
+ { CTR_FLAG_JOURNAL_MODE, "journal_mode" },
};
/* Return argument name string for given @flag */
return NULL;
}
+ /* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
+ static struct {
+ const int mode;
+ const char *param;
+ } _raid456_journal_mode[] = {
+ { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" },
+ { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" }
+ };
+
+ /* Return MD raid4/5/6 journal mode for dm @journal_mode one */
+ static int dm_raid_journal_mode_to_md(const char *mode)
+ {
+ int m = ARRAY_SIZE(_raid456_journal_mode);
+
+ while (m--)
+ if (!strcasecmp(mode, _raid456_journal_mode[m].param))
+ return _raid456_journal_mode[m].mode;
+
+ return -EINVAL;
+ }
+
+ /* Return dm-raid raid4/5/6 journal mode string for @mode */
+ static const char *md_journal_mode_to_dm_raid(const int mode)
+ {
+ int m = ARRAY_SIZE(_raid456_journal_mode);
+
+ while (m--)
+ if (mode == _raid456_journal_mode[m].mode)
+ return _raid456_journal_mode[m].param;
+
+ return "unknown";
+ }
+
/*
* Bool helpers to test for various raid levels of a raid set.
* It's level as reported by the superblock rather than
continue;
}
- /* "journal_dev dev" */
+ /* "journal_dev <dev>" */
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
int r;
struct md_rdev *jdev;
rs->ti->error = "No space for raid4/5/6 journal";
return -ENOSPC;
}
+ rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
set_bit(Journal, &jdev->flags);
continue;
}
+ /* "journal_mode <mode>" ("journal_dev" mandatory!) */
+ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
+ int r;
+
+ if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
+ return -EINVAL;
+ }
+ if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+ rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
+ return -EINVAL;
+ }
+ r = dm_raid_journal_mode_to_md(arg);
+ if (r < 0) {
+ rs->ti->error = "Invalid 'journal_mode' argument";
+ return r;
+ }
+ rs->journal_dev.mode = r;
+ continue;
+ }
+
/*
* Parameters with number values from here on.
*/
/* Assume discards not supported until after checks below. */
ti->discards_supported = false;
- /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+ /*
+ * XXX: RAID level 4,5,6 require zeroing for safety.
+ */
raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
for (i = 0; i < rs->raid_disks; i++) {
return;
if (raid456) {
- if (!q->limits.discard_zeroes_data)
- return;
if (!devices_handle_discard_safely) {
DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+ /* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */
+ if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+ r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
+ if (r) {
+ ti->error = "Failed to set raid4/5/6 journal mode";
+ mddev_unlock(&rs->md);
+ goto bad_journal_mode_set;
+ }
+ }
+
mddev_suspend(&rs->md);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
mddev_unlock(&rs->md);
return 0;
+ bad_journal_mode_set:
bad_stripe_cache:
bad_check_reshape:
md_stop(&rs->md);
* Status characters:
*
* 'D' = Dead/Failed raid set component or raid4/5/6 journal device
- * 'a' = Alive but not in-sync
- * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
+ * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
+ * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
* '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
*/
- static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
+ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
{
if (!rdev->bdev)
return "-";
else if (test_bit(Faulty, &rdev->flags))
return "D";
else if (test_bit(Journal, &rdev->flags))
- return "A";
+ return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
return "a";
else
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++)
- DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
+ DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
/*
* In-sync/Reshape ratio:
* v1.10.0+:
*/
DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
- __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
+ __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
break;
case STATUSTYPE_TABLE:
write_mostly_params +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
- (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
+ (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
+ (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
+
/* Emit table line */
+ /* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
- if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
- DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
- raid10_md_layout_to_format(mddev->layout));
- if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
- raid10_md_layout_to_copies(mddev->layout));
- if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
- DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
- if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
- DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
- (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
- if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
- DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
- (unsigned long long) rs->data_offset);
- if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
- DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
- mddev->bitmap_info.daemon_sleep);
- if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
- max(rs->delta_disks, mddev->delta_disks));
- if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
- max_nr_stripes);
+ if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
+ DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (rebuild_disks)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
rs->dev[i].rdev.raid_disk);
+ if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
+ DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
+ mddev->bitmap_info.daemon_sleep);
+ if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
+ mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
+ mddev->sync_speed_max);
if (write_mostly_params)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
mddev->bitmap_info.max_write_behind);
- if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
- mddev->sync_speed_max);
- if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
- mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
+ max_nr_stripes);
+ if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
+ DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
+ (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
+ if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
+ raid10_md_layout_to_copies(mddev->layout));
+ if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
+ raid10_md_layout_to_format(mddev->layout));
+ if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
+ max(rs->delta_disks, mddev->delta_disks));
+ if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
+ DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
+ (unsigned long long) rs->data_offset);
if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
__get_dev_name(rs->journal_dev.dev));
+ if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
+ md_journal_mode_to_dm_raid(rs->journal_dev.mode));
DMEMIT(" %d", rs->raid_disks);
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 10, 1},
+ .version = {1, 11, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
if (!rq->q->mq_ops)
dm_old_requeue_request(rq);
else
- dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0);
+ dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0);
rq_completed(md, rw, false);
}
r = rq_end_io(tio->ti, clone, error, &tio->info);
}
- if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
- !clone->q->limits.max_write_same_sectors))
- disable_write_same(tio->md);
+ if (unlikely(r == -EREMOTEIO)) {
+ if (req_op(clone) == REQ_OP_WRITE_SAME &&
+ !clone->q->limits.max_write_same_sectors)
+ disable_write_same(tio->md);
+ if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+ !clone->q->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(tio->md);
+ }
if (r <= 0)
/* The target wants to complete the I/O */
if (!rq->q->mq_ops)
blk_complete_request(rq);
else
- blk_mq_complete_request(rq, error);
+ blk_mq_complete_request(rq);
}
/*
/* Undo dm_start_request() before requeuing */
rq_end_stats(md, rq);
rq_completed(md, rq_data_dir(rq), false);
+ blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
return BLK_MQ_RQ_QUEUE_BUSY;
}
return BLK_MQ_RQ_QUEUE_OK;
}
-static struct blk_mq_ops dm_mq_ops = {
+static const struct blk_mq_ops dm_mq_ops = {
.queue_rq = dm_mq_queue_rq,
.complete = dm_softirq_done,
.init_request = dm_mq_init_request,
dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
- blk_mq_register_dev(disk_to_dev(md->disk), q);
+ err = blk_mq_register_dev(disk_to_dev(md->disk), q);
+ if (err)
+ goto out_cleanup_queue;
return 0;
+ out_cleanup_queue:
+ blk_cleanup_queue(q);
out_tag_set:
blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:
ti->num_flush_bios = stripes;
ti->num_discard_bios = stripes;
ti->num_write_same_bios = stripes;
+ ti->num_write_zeroes_bios = stripes;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
return DM_MAPIO_REMAPPED;
}
if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
+ unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
target_bio_nr = dm_bio_get_target_bio_nr(bio);
BUG_ON(target_bio_nr >= sc->stripes);
static struct target_type stripe_target = {
.name = "striped",
.version = {1, 6, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
struct dm_table {
struct mapped_device *md;
- unsigned type;
+ enum dm_queue_mode type;
/* btree table */
unsigned int depth;
bool integrity_supported:1;
bool singleton:1;
bool all_blk_mq:1;
+ unsigned integrity_added:1;
/*
* Indicates the rw permissions for the new logical
*/
dev_t dm_get_dev_t(const char *path)
{
- dev_t uninitialized_var(dev);
+ dev_t dev;
struct block_device *bdev;
bdev = lookup_bdev(path);
struct dm_target *uninitialized_var(ti);
struct queue_limits ti_limits;
- unsigned i = 0;
+ unsigned i;
/*
* Check each entry in the table in turn.
*/
- while (i < dm_table_get_num_targets(table)) {
- ti = dm_table_get_target(table, i++);
+ for (i = 0; i < dm_table_get_num_targets(table); i++) {
+ ti = dm_table_get_target(table, i);
blk_set_stacking_limits(&ti_limits);
t->immutable_target_type = tgt->type;
}
+ if (dm_target_has_integrity(tgt->type))
+ t->integrity_added = 1;
+
tgt->table = t;
tgt->begin = start;
tgt->len = len;
}
EXPORT_SYMBOL(dm_consume_args);
- static bool __table_type_bio_based(unsigned table_type)
+ static bool __table_type_bio_based(enum dm_queue_mode table_type)
{
return (table_type == DM_TYPE_BIO_BASED ||
table_type == DM_TYPE_DAX_BIO_BASED);
}
- static bool __table_type_request_based(unsigned table_type)
+ static bool __table_type_request_based(enum dm_queue_mode table_type)
{
return (table_type == DM_TYPE_REQUEST_BASED ||
table_type == DM_TYPE_MQ_REQUEST_BASED);
}
- void dm_table_set_type(struct dm_table *t, unsigned type)
+ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
{
t->type = type;
}
static bool dm_table_supports_dax(struct dm_table *t)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
/* Ensure that all targets support DAX. */
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->type->direct_access)
return false;
struct dm_target *tgt;
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
- unsigned live_md_type = dm_get_md_type(t->md);
+ enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
if (t->type != DM_TYPE_NONE) {
/* target already set the table's type */
return 0;
}
- unsigned dm_table_get_type(struct dm_table *t)
+ enum dm_queue_mode dm_table_get_type(struct dm_table *t)
{
return t->type;
}
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
{
- struct dm_target *uninitialized_var(ti);
- unsigned i = 0;
+ struct dm_target *ti;
+ unsigned i;
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (dm_target_is_wildcard(ti->type))
return ti;
}
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
- unsigned type = dm_table_get_type(t);
+ enum dm_queue_mode type = dm_table_get_type(t);
unsigned per_io_data_size = 0;
struct dm_target *tgt;
unsigned i;
struct list_head *devices = dm_table_get_devices(t);
struct dm_dev_internal *dd = NULL;
struct gendisk *prev_disk = NULL, *template_disk = NULL;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+ if (!dm_target_passes_integrity(ti->type))
+ goto no_integrity;
+ }
list_for_each_entry(dd, devices, list) {
template_disk = dd->dm_dev->bdev->bd_disk;
struct mapped_device *md = t->md;
struct gendisk *template_disk = NULL;
+ /* If target handles integrity itself do not register it here. */
+ if (t->integrity_added)
+ return 0;
+
template_disk = dm_table_get_integrity_disk(t);
if (!template_disk)
return 0;
*/
bool dm_table_has_no_data_devices(struct dm_table *table)
{
- struct dm_target *uninitialized_var(ti);
- unsigned i = 0, num_devices = 0;
+ struct dm_target *ti;
+ unsigned i, num_devices;
- while (i < dm_table_get_num_targets(table)) {
- ti = dm_table_get_target(table, i++);
+ for (i = 0; i < dm_table_get_num_targets(table); i++) {
+ ti = dm_table_get_target(table, i);
if (!ti->type->iterate_devices)
return false;
+ num_devices = 0;
ti->type->iterate_devices(ti, count_device, &num_devices);
if (num_devices)
return false;
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits)
{
- struct dm_target *uninitialized_var(ti);
+ struct dm_target *ti;
struct queue_limits ti_limits;
- unsigned i = 0;
+ unsigned i;
blk_set_stacking_limits(limits);
- while (i < dm_table_get_num_targets(table)) {
+ for (i = 0; i < dm_table_get_num_targets(table); i++) {
blk_set_stacking_limits(&ti_limits);
- ti = dm_table_get_target(table, i++);
+ ti = dm_table_get_target(table, i);
if (!ti->type->iterate_devices)
goto combine_limits;
{
struct gendisk *template_disk = NULL;
+ if (t->integrity_added)
+ return;
+
if (t->integrity_supported) {
/*
* Verify that the original integrity profile
static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
/*
* Require at least one underlying device to support flushes.
* so we need to use iterate_devices here, which targets
* supporting flushes must provide.
*/
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->num_flush_bios)
continue;
return false;
}
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
- struct dm_target *ti;
- unsigned i = 0;
-
- /* Ensure that all targets supports discard_zeroes_data. */
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
-
- if (ti->discard_zeroes_data_unsupported)
- return false;
- }
-
- return true;
-}
-
static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
iterate_devices_callout_fn func)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->type->iterate_devices ||
!ti->type->iterate_devices(ti, func, NULL))
static bool dm_table_supports_write_same(struct dm_table *t)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->num_write_same_bios)
return false;
return true;
}
+static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !q->limits.max_write_zeroes_sectors;
+}
+
+static bool dm_table_supports_write_zeroes(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_write_zeroes_bios)
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
+ return false;
+ }
+
+ return true;
+}
+
+
static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
static bool dm_table_supports_discards(struct dm_table *t)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
/*
* Unless any target used by the table set discards_supported,
* so we need to use iterate_devices here, which targets
* supporting discard selectively must provide.
*/
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->num_discard_bios)
continue;
}
blk_queue_write_cache(q, wc, fua);
- if (!dm_table_discard_zeroes_data(t))
- q->limits.discard_zeroes_data = 0;
-
/* Ensure that all underlying devices are non-rotational. */
if (dm_table_all_devices_attribute(t, device_is_nonrot))
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
if (!dm_table_supports_write_same(t))
q->limits.max_write_same_sectors = 0;
+ if (!dm_table_supports_write_zeroes(t))
+ q->limits.max_write_zeroes_sectors = 0;
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
int i = t->num_targets;
struct dm_target *ti = t->targets;
+ lockdep_assert_held(&t->md->suspend_lock);
+
while (i--) {
switch (mode) {
case PRESUSPEND:
{
int i, r = 0;
+ lockdep_assert_held(&t->md->suspend_lock);
+
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = t->targets + i;
*/
#include "dm-thin-metadata.h"
- #include "dm-bio-prison.h"
+ #include "dm-bio-prison-v1.h"
#include "dm.h"
#include <linux/device-mapper.h>
* to unmap (we ignore err).
*/
queue_passdown_pt2(bio->bi_private);
+ bio_put(bio);
}
static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
* them down to the data device. The thin device's discard
* processing will cause mappings to be removed from the btree.
*/
- ti->discard_zeroes_data_unsupported = true;
if (pf.discard_enabled && pf.discard_passdown) {
ti->num_discard_bios = 1;
ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
/* In case the pool supports discards, pass them on. */
- ti->discard_zeroes_data_unsupported = true;
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
- trace_block_bio_complete(md->queue, bio, io_error);
bio->bi_error = io_error;
bio_endio(bio);
}
limits->max_write_same_sectors = 0;
}
+void disable_write_zeroes(struct mapped_device *md)
+{
+ struct queue_limits *limits = dm_get_queue_limits(md);
+
+ /* device doesn't really support WRITE ZEROES, disable it */
+ limits->max_write_zeroes_sectors = 0;
+}
+
static void clone_endio(struct bio *bio)
{
int error = bio->bi_error;
}
}
- if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
- disable_write_same(md);
+ if (unlikely(r == -EREMOTEIO)) {
+ if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+ disable_write_same(md);
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(md);
+ }
free_tio(tio);
dec_pending(io, error);
struct dm_offload *o = container_of(cb, struct dm_offload, cb);
struct bio_list list;
struct bio *bio;
+ int i;
INIT_LIST_HEAD(&o->cb.list);
if (unlikely(!current->bio_list))
return;
- list = *current->bio_list;
- bio_list_init(current->bio_list);
-
- while ((bio = bio_list_pop(&list))) {
- struct bio_set *bs = bio->bi_pool;
- if (unlikely(!bs) || bs == fs_bio_set) {
- bio_list_add(current->bio_list, bio);
- continue;
+ for (i = 0; i < 2; i++) {
+ list = current->bio_list[i];
+ bio_list_init(¤t->bio_list[i]);
+
+ while ((bio = bio_list_pop(&list))) {
+ struct bio_set *bs = bio->bi_pool;
+ if (unlikely(!bs) || bs == fs_bio_set) {
+ bio_list_add(¤t->bio_list[i], bio);
+ continue;
+ }
+
+ spin_lock(&bs->rescue_lock);
+ bio_list_add(&bs->rescue_list, bio);
+ queue_work(bs->rescue_workqueue, &bs->rescue_work);
+ spin_unlock(&bs->rescue_lock);
}
-
- spin_lock(&bs->rescue_lock);
- bio_list_add(&bs->rescue_list, bio);
- queue_work(bs->rescue_workqueue, &bs->rescue_work);
- spin_unlock(&bs->rescue_lock);
}
}
__bio_clone_fast(clone, bio);
- if (bio_integrity(bio)) {
- int r = bio_integrity_clone(clone, bio, GFP_NOIO);
+ if (unlikely(bio_integrity(bio) != NULL)) {
+ int r;
+
+ if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
+ !dm_target_passes_integrity(tio->ti->type))) {
+ DMWARN("%s: the target %s doesn't support integrity data.",
+ dm_device_name(tio->io->md),
+ tio->ti->type->name);
+ return -EIO;
+ }
+
+ r = bio_integrity_clone(clone, bio, GFP_NOIO);
if (r < 0)
return r;
}
bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
clone->bi_iter.bi_size = to_bytes(len);
- if (bio_integrity(bio))
+ if (unlikely(bio_integrity(bio) != NULL))
bio_integrity_trim(clone, 0, len);
return 0;
return ti->num_write_same_bios;
}
+static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
+{
+ return ti->num_write_zeroes_bios;
+}
+
typedef bool (*is_split_required_fn)(struct dm_target *ti);
static bool is_split_required_for_discard(struct dm_target *ti)
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
+static int __send_write_zeroes(struct clone_info *ci)
+{
+ return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+}
+
/*
* Select the correct strategy for processing a non-flush bio.
*/
return __send_discard(ci);
else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
return __send_write_same(ci);
+ else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+ return __send_write_zeroes(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
*/
static void __set_size(struct mapped_device *md, sector_t size)
{
+ lockdep_assert_held(&md->suspend_lock);
+
set_capacity(md->disk, size);
i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
mutex_unlock(&md->type_lock);
}
- void dm_set_md_type(struct mapped_device *md, unsigned type)
+ void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
BUG_ON(!mutex_is_locked(&md->type_lock));
md->type = type;
}
- unsigned dm_get_md_type(struct mapped_device *md)
+ enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
return md->type;
}
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
int r;
- unsigned type = dm_get_md_type(md);
+ enum dm_queue_mode type = dm_get_md_type(md);
switch (type) {
case DM_TYPE_REQUEST_BASED:
if (type == DM_TYPE_DAX_BIO_BASED)
queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
break;
+ case DM_TYPE_NONE:
+ WARN_ON_ONCE(true);
+ break;
}
return 0;
* If __dm_suspend returns 0, the device is completely quiescent
* now. There is no request-processing activity. All new requests
* are being added to md->deferred list.
- *
- * Caller must hold md->suspend_lock
*/
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
unsigned suspend_flags, long task_state,
*/
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ else
+ pr_debug("%s: suspending with flush\n", dm_device_name(md));
/*
* This gets reverted if there's an error later and the targets
{
struct dm_table *map = NULL;
+ lockdep_assert_held(&md->suspend_lock);
+
if (md->internal_suspend_count++)
return; /* nested internal suspend */
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
- struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
unsigned integrity, unsigned per_io_data_size)
{
struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
/*
* Type of table, mapped_device's mempool and request_queue
*/
- #define DM_TYPE_NONE 0
- #define DM_TYPE_BIO_BASED 1
- #define DM_TYPE_REQUEST_BASED 2
- #define DM_TYPE_MQ_REQUEST_BASED 3
- #define DM_TYPE_DAX_BIO_BASED 4
+ enum dm_queue_mode {
+ DM_TYPE_NONE = 0,
+ DM_TYPE_BIO_BASED = 1,
+ DM_TYPE_REQUEST_BASED = 2,
+ DM_TYPE_MQ_REQUEST_BASED = 3,
+ DM_TYPE_DAX_BIO_BASED = 4,
+ };
typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
*/
typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
+ /*
+ * A target implements own bio data integrity.
+ */
+ #define DM_TARGET_INTEGRITY 0x00000010
+ #define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY)
+
+ /*
+ * A target passes integrity data to the lower device.
+ */
+ #define DM_TARGET_PASSES_INTEGRITY 0x00000020
+ #define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY)
+
struct dm_target {
struct dm_table *table;
struct target_type *type;
*/
unsigned num_write_same_bios;
+ /*
+ * The number of WRITE ZEROES bios that will be submitted to the target.
+ * The bio number can be accessed with dm_bio_get_target_bio_nr.
+ */
+ unsigned num_write_zeroes_bios;
+
/*
* The minimum number of extra bytes allocated in each io for the
* target to use.
* on max_io_len boundary.
*/
bool split_discard_bios:1;
-
- /*
- * Set if this target does not return zeroes on discarded blocks.
- */
- bool discard_zeroes_data_unsupported:1;
};
/* Each target can link one of these into the table */
* Useful for "hybrid" target (supports both bio-based
* and request-based).
*/
- void dm_table_set_type(struct dm_table *t, unsigned type);
+ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type);
/*
* Finally call this to make the table ready for use.