drivers/md/dm-thin.c
/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
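
/*
 * (Worked example, assuming the usual 512-byte sector, i.e. SECTOR_SHIFT == 9:
 * these limits evaluate to 128 and 2097152 sectors respectively.)
 */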

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing. I suspect
 * data blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}
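
/*
 * Note (added for clarity): the virtual flag keeps the two key spaces
 * apart - virtual keys lock blocks in a thin device's address space,
 * data keys lock physical blocks in the pool - so the same block number
 * can be held in both spaces without colliding.
 */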

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 3 modes. Ordered in order of increasing degradation
 * so that the modes can be compared.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};
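
/*
 * Note (added for clarity): the ordering above matters -
 * bind_control_target() compares the old and new modes and keeps
 * whichever is more degraded, so a table reload never silently
 * upgrades a pool (e.g. from PM_READ_ONLY back to PM_WRITE).
 */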

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	unsigned low_water_triggered:1;	/* A dm event has been sent */
	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;

	struct bio_list retry_on_resume_list;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void set_pool_mode(struct pool *pool, enum pool_mode mode);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
};

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, master);
	bio_list_init(master);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		if (h->tc == tc)
			bio_endio(bio, DM_ENDIO_REQUEUE);
		else
			bio_list_add(master, bio);
	}
}

static void requeue_io(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	__requeue_bio_list(tc, &pool->deferred_bios);
	__requeue_bio_list(tc, &pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

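/*
 * Worked example for the two helpers below (illustrative numbers only):
 * with sectors_per_block = 128, a power of two with shift 7, a bio at
 * sector 300 belongs to virtual block 300 >> 7 = 2 at offset
 * 300 & 127 = 44.  If that virtual block maps to data block 5, remap()
 * rewrites bi_sector to (5 << 7) | 44 = 684 on the pool's data device.
 * Non-power-of-two block sizes take the sector_div() path instead.
 */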
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t block_nr = bio->bi_sector;

	if (block_size_is_power_of_two(pool))
		block_nr >>= pool->sectors_per_block_shift;
	else
		(void) sector_div(block_nr, pool->sectors_per_block);

	return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (block_size_is_power_of_two(pool))
		bio->bi_sector = (block << pool->sectors_per_block_shift) |
				(bi_sector & (pool->sectors_per_block - 1));
	else
		bio->bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

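/*
 * Note (added for clarity): discard bios are deliberately skipped by
 * inc_all_io_entry(); the discard path accounts for them itself via
 * dm_deferred_set_add_work() in process_discard().
 */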
static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio->bi_rw & REQ_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed, e.g. due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	unsigned quiesced:1;
	unsigned prepared:1;
	unsigned pass_discard:1;

	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;
	int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (m->quiesced && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_thin_new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct dm_thin_new_mapping *m = h->overwrite_mapping;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	dm_cell_release(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&tc->pool->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer except it omits the original holder of the cell.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	dm_cell_release_no_holder(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	if (m->bio)
		m->bio->bi_end_io = m->saved_bi_end_io;
	dm_cell_error(m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio)
		bio->bi_end_io = m->saved_bi_end_io;

	if (m->err) {
		dm_cell_error(m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		DMERR_LIMIT("dm_thin_insert_block() failed");
		dm_cell_error(m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_no_holder(tc, m->cell);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell);

out:
	list_del(&m->list);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	bio_io_error(m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	inc_all_io_entry(tc->pool, m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);

	if (m->pass_discard)
		remap_and_issue(tc, m->bio, m->data_block);
	else
		bio_endio(m->bio, 0);

	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR_LIMIT("dm_thin_remove_block() failed");

	process_prepared_discard_passdown(m);
}

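/*
 * Note (added for clarity): do_worker() calls this for both
 * &pool->prepared_mappings and &pool->prepared_discards, passing the
 * pool's process_prepared_mapping or process_prepared_discard callback
 * respectively.
 */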
static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

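/*
 * Note (added for clarity): get_next_mapping() consumes the mapping
 * reserved by a prior ensure_next_mapping() call - process_deferred_bios()
 * guarantees this before handing a bio to the process_* functions, hence
 * the BUG_ON() below.
 */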
static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *r = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	pool->next_mapping = NULL;

	return r;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 0;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		m->quiesced = 1;

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		inc_all_io_entry(pool, bio);
		remap_and_issue(tc, bio, data_dest);
	} else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR_LIMIT("dm_kcopyd_copy() failed");
			dm_cell_error(cell);
		}
	}
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->origin_dev,
		      virt_block, data_dest, cell, bio);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 1;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->pf.zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		inc_all_io_entry(pool, bio);
		remap_and_issue(tc, bio, data_block);
	} else {
		int r;
		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR_LIMIT("dm_kcopyd_zero() failed");
			dm_cell_error(cell);
		}
	}
}

static int commit(struct pool *pool)
{
	int r;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		DMERR_LIMIT("commit failed: error = %d", r);

	return r;
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit_or_fallback(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) != PM_WRITE)
		return -EINVAL;

	r = commit(pool);
	if (r)
		set_pool_mode(pool, PM_READ_ONLY);

	return r;
}

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	unsigned long flags;
	struct pool *pool = tc->pool;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r)
		return r;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark, sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = 1;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}

	if (!free_blocks) {
		if (pool->no_free_space)
			return -ENOSPC;
		else {
			/*
			 * Try to commit to see if that will free up some
			 * more space.
			 */
			(void) commit_or_fallback(pool);

			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
			if (r)
				return r;

			/*
			 * If we still have no space we set a flag to avoid
			 * doing all this checking and return -ENOSPC.
			 */
			if (!free_blocks) {
				DMWARN("%s: no free space available.",
				       dm_device_name(pool->pool_md));
				spin_lock_irqsave(&pool->lock, flags);
				pool->no_free_space = 1;
				spin_unlock_irqrestore(&pool->lock, flags);
				return -ENOSPC;
			}
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r)
		return r;

	return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct thin_c *tc = h->tc;
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->retry_on_resume_list, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void no_space(struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	dm_cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		retry_on_resume(bio);
}

static void process_discard(struct thin_c *tc, struct bio *bio)
{
	int r;
	unsigned long flags;
	struct pool *pool = tc->pool;
	struct dm_bio_prison_cell *cell, *cell2;
	struct dm_cell_key key, key2;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;
	struct dm_thin_new_mapping *m;

	build_virtual_key(tc->td, block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * Check nobody is fiddling with this pool block. This can
		 * happen if someone's in the process of breaking sharing
		 * on this block.
		 */
		build_data_key(tc->td, lookup_result.block, &key2);
		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
			cell_defer_no_holder(tc, cell);
			break;
		}

		if (io_overlaps_block(pool, bio)) {
			/*
			 * IO may still be going to the destination block. We must
			 * quiesce before we can do the removal.
			 */
			m = get_next_mapping(pool);
			m->tc = tc;
			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
			m->virt_block = block;
			m->data_block = lookup_result.block;
			m->cell = cell;
			m->cell2 = cell2;
			m->err = 0;
			m->bio = bio;

			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
				spin_lock_irqsave(&pool->lock, flags);
				list_add(&m->list, &pool->prepared_discards);
				spin_unlock_irqrestore(&pool->lock, flags);
				wake_worker(pool);
			}
		} else {
			inc_all_io_entry(pool, bio);
			cell_defer_no_holder(tc, cell);
			cell_defer_no_holder(tc, cell2);

			/*
			 * The DM core makes sure that the discard doesn't span
			 * a block boundary. So we submit the discard of a
			 * partial block appropriately.
			 */
			if ((!lookup_result.shared) && pool->pf.discard_passdown)
				remap_and_issue(tc, bio, lookup_result.block);
			else
				bio_endio(bio, 0);
		}
		break;

	case -ENODATA:
		/*
		 * It isn't provisioned, just forget it.
		 */
		cell_defer_no_holder(tc, cell);
		bio_endio(bio, 0);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		cell_defer_no_holder(tc, cell);
		bio_io_error(bio);
		break;
	}
}

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
			  struct dm_cell_key *key,
			  struct dm_thin_lookup_result *lookup_result,
			  struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		schedule_internal_copy(tc, block, lookup_result->block,
				       data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
			    __func__, r);
		dm_cell_error(cell);
		break;
	}
}

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
			       dm_block_t block,
			       struct dm_thin_lookup_result *lookup_result)
{
	struct dm_bio_prison_cell *cell;
	struct pool *pool = tc->pool;
	struct dm_cell_key key;

	/*
	 * If cell is already occupied, then sharing is already in the process
	 * of being broken so we have nothing further to do here.
	 */
	build_data_key(tc->td, lookup_result->block, &key);
	if (dm_bio_detain(pool->prison, &key, bio, &cell))
		return;

	if (bio_data_dir(bio) == WRITE && bio->bi_size)
		break_sharing(tc, bio, block, &key, lookup_result, cell);
	else {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
		inc_all_io_entry(pool, bio);
		cell_defer_no_holder(tc, cell);

		remap_and_issue(tc, bio, lookup_result->block);
	}
}

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
			    struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	/*
	 * Remap empty bios (flushes) immediately, without provisioning.
	 */
	if (!bio->bi_size) {
		inc_all_io_entry(tc->pool, bio);
		cell_defer_no_holder(tc, cell);

		remap_and_issue(tc, bio, 0);
		return;
	}

	/*
	 * Fill read bios with zeroes and complete them immediately.
	 */
	if (bio_data_dir(bio) == READ) {
		zero_fill_bio(bio);
		cell_defer_no_holder(tc, cell);
		bio_endio(bio, 0);
		return;
	}

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		if (tc->origin_dev)
			schedule_external_copy(tc, block, data_block, cell, bio);
		else
			schedule_zero(tc, block, data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
			    __func__, r);
		set_pool_mode(tc->pool, PM_READ_ONLY);
		dm_cell_error(cell);
		break;
	}
}

static void process_bio(struct thin_c *tc, struct bio *bio)
{
	int r;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_bio_prison_cell *cell;
	struct dm_cell_key key;
	struct dm_thin_lookup_result lookup_result;

	/*
	 * If cell is already occupied, then the block is already
	 * being provisioned so we have nothing further to do here.
	 */
	build_virtual_key(tc->td, block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		if (lookup_result.shared) {
			process_shared_bio(tc, bio, block, &lookup_result);
			cell_defer_no_holder(tc, cell);
		} else {
			inc_all_io_entry(tc->pool, bio);
			cell_defer_no_holder(tc, cell);

			remap_and_issue(tc, bio, lookup_result.block);
		}
		break;

	case -ENODATA:
		if (bio_data_dir(bio) == READ && tc->origin_dev) {
			inc_all_io_entry(tc->pool, bio);
			cell_defer_no_holder(tc, cell);

			remap_to_origin_and_issue(tc, bio);
		} else
			provision_block(tc, bio, block, cell);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		cell_defer_no_holder(tc, cell);
		bio_io_error(bio);
		break;
	}
}

static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
{
	int r;
	int rw = bio_data_dir(bio);
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
			bio_io_error(bio);
		else {
			inc_all_io_entry(tc->pool, bio);
			remap_and_issue(tc, bio, lookup_result.block);
		}
		break;

	case -ENODATA:
		if (rw != READ) {
			bio_io_error(bio);
			break;
		}

		if (tc->origin_dev) {
			inc_all_io_entry(tc->pool, bio);
			remap_to_origin_and_issue(tc, bio);
			break;
		}

		zero_fill_bio(bio);
		bio_endio(bio, 0);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		bio_io_error(bio);
		break;
	}
}

static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
	bio_io_error(bio);
}

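/*
 * Returns non-zero when roughly COMMIT_PERIOD has passed since the last
 * commit; the first comparison also forces a commit if jiffies appears
 * to have wrapped past last_commit_jiffies.
 */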
static int need_commit_due_to_time(struct pool *pool)
{
	return jiffies < pool->last_commit_jiffies ||
	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
}

static void process_deferred_bios(struct pool *pool)
{
	unsigned long flags;
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_bios);
	bio_list_init(&pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
		struct thin_c *tc = h->tc;

		/*
		 * If we've got no free new_mapping structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (ensure_next_mapping(pool)) {
			spin_lock_irqsave(&pool->lock, flags);
			bio_list_merge(&pool->deferred_bios, &bios);
			spin_unlock_irqrestore(&pool->lock, flags);

			break;
		}

		if (bio->bi_rw & REQ_DISCARD)
			pool->process_discard(tc, bio);
		else
			pool->process_bio(tc, bio);
	}

	/*
	 * If there are any deferred flush bios, we must commit
	 * the metadata before issuing them.
	 */
	bio_list_init(&bios);
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_flush_bios);
	bio_list_init(&pool->deferred_flush_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
		return;

	if (commit_or_fallback(pool)) {
		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);
		return;
	}
	pool->last_commit_jiffies = jiffies;

	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void do_worker(struct work_struct *ws)
{
	struct pool *pool = container_of(ws, struct pool, worker);

	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
	process_deferred_bios(pool);
}

/*
 * We want to commit periodically so that not too much
 * unwritten data builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
	wake_worker(pool);
	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static enum pool_mode get_pool_mode(struct pool *pool)
{
	return pool->pf.mode;
}

static void set_pool_mode(struct pool *pool, enum pool_mode mode)
{
	int r;

	pool->pf.mode = mode;

	switch (mode) {
	case PM_FAIL:
		DMERR("switching pool to failure mode");
		pool->process_bio = process_bio_fail;
		pool->process_discard = process_bio_fail;
		pool->process_prepared_mapping = process_prepared_mapping_fail;
		pool->process_prepared_discard = process_prepared_discard_fail;
		break;

	case PM_READ_ONLY:
		DMERR("switching pool to read-only mode");
		r = dm_pool_abort_metadata(pool->pmd);
		if (r) {
			DMERR("aborting transaction failed");
			set_pool_mode(pool, PM_FAIL);
		} else {
			dm_pool_metadata_read_only(pool->pmd);
			pool->process_bio = process_bio_read_only;
			pool->process_discard = process_discard;
			pool->process_prepared_mapping = process_prepared_mapping_fail;
			pool->process_prepared_discard = process_prepared_discard_passdown;
		}
		break;

	case PM_WRITE:
		pool->process_bio = process_bio;
		pool->process_discard = process_discard;
		pool->process_prepared_mapping = process_prepared_mapping;
		pool->process_prepared_discard = process_prepared_discard;
		break;
	}
}

/*----------------------------------------------------------------*/

/*
 * Mapping functions.
 */

/*
 * Called only while mapping a thin bio to hand it over to the workqueue.
 */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
	unsigned long flags;
	struct pool *pool = tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

	h->tc = tc;
	h->shared_read_entry = NULL;
	h->all_io_entry = NULL;
	h->overwrite_mapping = NULL;
}

/*
 * Non-blocking function called from the thin target's map function.
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio)
{
	int r;
	struct thin_c *tc = ti->private;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_device *td = tc->td;
	struct dm_thin_lookup_result result;
	struct dm_bio_prison_cell *cell1, *cell2;
	struct dm_cell_key key;

	thin_hook_bio(tc, bio);

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
	switch (r) {
	case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap. You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant. The
			 * shared flag will be set in their case.
			 */
			thin_defer_bio(tc, bio);
			return DM_MAPIO_SUBMITTED;
		}

		build_virtual_key(tc->td, block, &key);
		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
			return DM_MAPIO_SUBMITTED;

		build_data_key(tc->td, result.block, &key);
		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
			cell_defer_no_holder(tc, cell1);
			return DM_MAPIO_SUBMITTED;
		}

		inc_all_io_entry(tc->pool, bio);
		cell_defer_no_holder(tc, cell2);
		cell_defer_no_holder(tc, cell1);

		remap(tc, bio, result.block);
		return DM_MAPIO_REMAPPED;

	case -ENODATA:
		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
			/*
			 * This block isn't provisioned, and we have no way
			 * of doing so. Just error it.
			 */
			bio_io_error(bio);
			return DM_MAPIO_SUBMITTED;
		}
		/* fall through */

	case -EWOULDBLOCK:
		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 */
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;

	default:
		/*
		 * Must always call bio_io_error on failure.
		 * dm_thin_find_block can fail with -EINVAL if the
		 * pool is switched to fail-io mode.
		 */
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}
}

static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	int r;
	unsigned long flags;
	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);

	spin_lock_irqsave(&pt->pool->lock, flags);
	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pt->pool->lock, flags);

	if (!r) {
		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
		r = bdi_congested(&q->backing_dev_info, bdi_bits);
	}

	return r;
}

static void __requeue_bios(struct pool *pool)
{
	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
	bio_list_init(&pool->retry_on_resume_list);
}

/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
static bool data_dev_supports_discard(struct pool_c *pt)
{
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	return q && blk_queue_discard(q);
}

/*
 * If discard_passdown was enabled, verify that the data device
 * supports discards. Disable discard_passdown if not.
 */
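/*
 * (Illustrative numbers, not taken from the code: with a 1MiB pool block,
 * a data device advertising a 256KiB discard_granularity and
 * max_discard_sectors >= 2048 keeps passdown enabled, whereas a 2MiB
 * granularity - larger than the block - would disable it.)
 */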
0424caa1 1516static void disable_passdown_if_not_supported(struct pool_c *pt)
9bc142dd 1517{
0424caa1
MS
1518 struct pool *pool = pt->pool;
1519 struct block_device *data_bdev = pt->data_dev->bdev;
1520 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1521 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1522 const char *reason = NULL;
9bc142dd
MS
1523 char buf[BDEVNAME_SIZE];
1524
0424caa1 1525 if (!pt->adjusted_pf.discard_passdown)
9bc142dd
MS
1526 return;
1527
0424caa1
MS
1528 if (!data_dev_supports_discard(pt))
1529 reason = "discard unsupported";
1530
1531 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1532 reason = "max discard sectors smaller than a block";
9bc142dd 1533
0424caa1
MS
1534 else if (data_limits->discard_granularity > block_size)
1535 reason = "discard granularity larger than a block";
1536
1537 else if (block_size & (data_limits->discard_granularity - 1))
1538 reason = "discard granularity not a factor of block size";
1539
1540 if (reason) {
1541 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1542 pt->adjusted_pf.discard_passdown = false;
1543 }
9bc142dd
MS
1544}
1545
991d9fa0
JT
1546static int bind_control_target(struct pool *pool, struct dm_target *ti)
1547{
1548 struct pool_c *pt = ti->private;
1549
e49e5829
JT
1550 /*
1551 * We want to make sure that degraded pools are never upgraded.
1552 */
1553 enum pool_mode old_mode = pool->pf.mode;
0424caa1 1554 enum pool_mode new_mode = pt->adjusted_pf.mode;
e49e5829
JT
1555
1556 if (old_mode > new_mode)
1557 new_mode = old_mode;
1558
991d9fa0
JT
1559 pool->ti = ti;
1560 pool->low_water_blocks = pt->low_water_blocks;
0424caa1 1561 pool->pf = pt->adjusted_pf;
991d9fa0 1562
9bc142dd 1563 set_pool_mode(pool, new_mode);
f402693d 1564
991d9fa0
JT
1565 return 0;
1566}
1567
1568static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1569{
1570 if (pool->ti == ti)
1571 pool->ti = NULL;
1572}
1573
1574/*----------------------------------------------------------------
1575 * Pool creation
1576 *--------------------------------------------------------------*/
67e2e2b2
JT
1577/* Initialize pool features. */
1578static void pool_features_init(struct pool_features *pf)
1579{
e49e5829 1580 pf->mode = PM_WRITE;
9bc142dd
MS
1581 pf->zero_new_blocks = true;
1582 pf->discard_enabled = true;
1583 pf->discard_passdown = true;
67e2e2b2
JT
1584}
1585
991d9fa0
JT
1586static void __pool_destroy(struct pool *pool)
1587{
1588 __pool_table_remove(pool);
1589
1590 if (dm_pool_metadata_close(pool->pmd) < 0)
1591 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1592
44feb387 1593 dm_bio_prison_destroy(pool->prison);
991d9fa0
JT
1594 dm_kcopyd_client_destroy(pool->copier);
1595
1596 if (pool->wq)
1597 destroy_workqueue(pool->wq);
1598
1599 if (pool->next_mapping)
1600 mempool_free(pool->next_mapping, pool->mapping_pool);
1601 mempool_destroy(pool->mapping_pool);
44feb387
MS
1602 dm_deferred_set_destroy(pool->shared_read_ds);
1603 dm_deferred_set_destroy(pool->all_io_ds);
991d9fa0
JT
1604 kfree(pool);
1605}
1606
a24c2569 1607static struct kmem_cache *_new_mapping_cache;
a24c2569 1608
991d9fa0
JT
1609static struct pool *pool_create(struct mapped_device *pool_md,
1610 struct block_device *metadata_dev,
e49e5829
JT
1611 unsigned long block_size,
1612 int read_only, char **error)
991d9fa0
JT
1613{
1614 int r;
1615 void *err_p;
1616 struct pool *pool;
1617 struct dm_pool_metadata *pmd;
e49e5829 1618 bool format_device = read_only ? false : true;
991d9fa0 1619
e49e5829 1620 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
991d9fa0
JT
1621 if (IS_ERR(pmd)) {
1622 *error = "Error creating metadata object";
1623 return (struct pool *)pmd;
1624 }
1625
1626 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1627 if (!pool) {
1628 *error = "Error allocating memory for pool";
1629 err_p = ERR_PTR(-ENOMEM);
1630 goto bad_pool;
1631 }
1632
1633 pool->pmd = pmd;
1634 pool->sectors_per_block = block_size;
f9a8e0cd
MP
1635 if (block_size & (block_size - 1))
1636 pool->sectors_per_block_shift = -1;
1637 else
1638 pool->sectors_per_block_shift = __ffs(block_size);
991d9fa0 1639 pool->low_water_blocks = 0;
67e2e2b2 1640 pool_features_init(&pool->pf);
44feb387 1641 pool->prison = dm_bio_prison_create(PRISON_CELLS);
991d9fa0
JT
1642 if (!pool->prison) {
1643 *error = "Error creating pool's bio prison";
1644 err_p = ERR_PTR(-ENOMEM);
1645 goto bad_prison;
1646 }
1647
df5d2e90 1648 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
991d9fa0
JT
1649 if (IS_ERR(pool->copier)) {
1650 r = PTR_ERR(pool->copier);
1651 *error = "Error creating pool's kcopyd client";
1652 err_p = ERR_PTR(r);
1653 goto bad_kcopyd_client;
1654 }
1655
1656 /*
1657 * Create singlethreaded workqueue that will service all devices
1658 * that use this metadata.
1659 */
1660 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1661 if (!pool->wq) {
1662 *error = "Error creating pool's workqueue";
1663 err_p = ERR_PTR(-ENOMEM);
1664 goto bad_wq;
1665 }
1666
1667 INIT_WORK(&pool->worker, do_worker);
905e51b3 1668 INIT_DELAYED_WORK(&pool->waker, do_waker);
991d9fa0
JT
1669 spin_lock_init(&pool->lock);
1670 bio_list_init(&pool->deferred_bios);
1671 bio_list_init(&pool->deferred_flush_bios);
1672 INIT_LIST_HEAD(&pool->prepared_mappings);
104655fd 1673 INIT_LIST_HEAD(&pool->prepared_discards);
991d9fa0
JT
1674 pool->low_water_triggered = 0;
1675 pool->no_free_space = 0;
1676 bio_list_init(&pool->retry_on_resume_list);
44feb387
MS
1677
1678 pool->shared_read_ds = dm_deferred_set_create();
1679 if (!pool->shared_read_ds) {
1680 *error = "Error creating pool's shared read deferred set";
1681 err_p = ERR_PTR(-ENOMEM);
1682 goto bad_shared_read_ds;
1683 }
1684
1685 pool->all_io_ds = dm_deferred_set_create();
1686 if (!pool->all_io_ds) {
1687 *error = "Error creating pool's all io deferred set";
1688 err_p = ERR_PTR(-ENOMEM);
1689 goto bad_all_io_ds;
1690 }
991d9fa0
JT
1691
1692 pool->next_mapping = NULL;
a24c2569
MS
1693 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1694 _new_mapping_cache);
991d9fa0
JT
1695 if (!pool->mapping_pool) {
1696 *error = "Error creating pool's mapping mempool";
1697 err_p = ERR_PTR(-ENOMEM);
1698 goto bad_mapping_pool;
1699 }
1700
991d9fa0 1701 pool->ref_count = 1;
905e51b3 1702 pool->last_commit_jiffies = jiffies;
991d9fa0
JT
1703 pool->pool_md = pool_md;
1704 pool->md_dev = metadata_dev;
1705 __pool_table_insert(pool);
1706
1707 return pool;
1708
991d9fa0 1709bad_mapping_pool:
44feb387
MS
1710 dm_deferred_set_destroy(pool->all_io_ds);
1711bad_all_io_ds:
1712 dm_deferred_set_destroy(pool->shared_read_ds);
1713bad_shared_read_ds:
991d9fa0
JT
1714 destroy_workqueue(pool->wq);
1715bad_wq:
1716 dm_kcopyd_client_destroy(pool->copier);
1717bad_kcopyd_client:
44feb387 1718 dm_bio_prison_destroy(pool->prison);
991d9fa0
JT
1719bad_prison:
1720 kfree(pool);
1721bad_pool:
1722 if (dm_pool_metadata_close(pmd))
1723 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1724
1725 return err_p;
1726}
1727
1728static void __pool_inc(struct pool *pool)
1729{
1730 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1731 pool->ref_count++;
1732}
1733
1734static void __pool_dec(struct pool *pool)
1735{
1736 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1737 BUG_ON(!pool->ref_count);
1738 if (!--pool->ref_count)
1739 __pool_destroy(pool);
1740}
1741
1742static struct pool *__pool_find(struct mapped_device *pool_md,
1743 struct block_device *metadata_dev,
e49e5829
JT
1744 unsigned long block_size, int read_only,
1745 char **error, int *created)
991d9fa0
JT
1746{
1747 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1748
1749 if (pool) {
f09996c9
MS
1750 if (pool->pool_md != pool_md) {
1751 *error = "metadata device already in use by a pool";
991d9fa0 1752 return ERR_PTR(-EBUSY);
f09996c9 1753 }
991d9fa0
JT
1754 __pool_inc(pool);
1755
1756 } else {
1757 pool = __pool_table_lookup(pool_md);
1758 if (pool) {
f09996c9
MS
1759 if (pool->md_dev != metadata_dev) {
1760 *error = "different pool cannot replace a pool";
991d9fa0 1761 return ERR_PTR(-EINVAL);
f09996c9 1762 }
991d9fa0
JT
1763 __pool_inc(pool);
1764
67e2e2b2 1765 } else {
e49e5829 1766 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
67e2e2b2
JT
1767 *created = 1;
1768 }
991d9fa0
JT
1769 }
1770
1771 return pool;
1772}
1773
1774/*----------------------------------------------------------------
1775 * Pool target methods
1776 *--------------------------------------------------------------*/
1777static void pool_dtr(struct dm_target *ti)
1778{
1779 struct pool_c *pt = ti->private;
1780
1781 mutex_lock(&dm_thin_pool_table.mutex);
1782
1783 unbind_control_target(pt->pool, ti);
1784 __pool_dec(pt->pool);
1785 dm_put_device(ti, pt->metadata_dev);
1786 dm_put_device(ti, pt->data_dev);
1787 kfree(pt);
1788
1789 mutex_unlock(&dm_thin_pool_table.mutex);
1790}
1791
991d9fa0
JT
1792static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1793 struct dm_target *ti)
1794{
1795 int r;
1796 unsigned argc;
1797 const char *arg_name;
1798
1799 static struct dm_arg _args[] = {
67e2e2b2 1800 {0, 3, "Invalid number of pool feature arguments"},
991d9fa0
JT
1801 };
1802
1803 /*
1804 * No feature arguments supplied.
1805 */
1806 if (!as->argc)
1807 return 0;
1808
1809 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1810 if (r)
1811 return -EINVAL;
1812
1813 while (argc && !r) {
1814 arg_name = dm_shift_arg(as);
1815 argc--;
1816
e49e5829 1817 if (!strcasecmp(arg_name, "skip_block_zeroing"))
9bc142dd 1818 pf->zero_new_blocks = false;
e49e5829
JT
1819
1820 else if (!strcasecmp(arg_name, "ignore_discard"))
9bc142dd 1821 pf->discard_enabled = false;
e49e5829
JT
1822
1823 else if (!strcasecmp(arg_name, "no_discard_passdown"))
9bc142dd 1824 pf->discard_passdown = false;
991d9fa0 1825
e49e5829
JT
1826 else if (!strcasecmp(arg_name, "read_only"))
1827 pf->mode = PM_READ_ONLY;
1828
1829 else {
1830 ti->error = "Unrecognised pool feature requested";
1831 r = -EINVAL;
1832 break;
1833 }
991d9fa0
JT
1834 }
1835
1836 return r;
1837}
1838
1839/*
1840 * thin-pool <metadata dev> <data dev>
1841 * <data block size (sectors)>
1842 * <low water mark (blocks)>
1843 * [<#feature args> [<arg>]*]
1844 *
1845 * Optional feature arguments are:
1846 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1847 * ignore_discard: disable discard
1848 * no_discard_passdown: don't pass discards down to the data device
1849 */
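/*
 * Illustrative table line built from the arguments documented above
 * (hypothetical device names and sizes, not from this file): a 20GiB pool
 * with 64KiB (128-sector) blocks, a low water mark of 32768 blocks and one
 * feature argument might be loaded with:
 *
 *   dmsetup create pool --table \
 *     "0 41943040 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing"
 */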
1850static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1851{
67e2e2b2 1852 int r, pool_created = 0;
1853 struct pool_c *pt;
1854 struct pool *pool;
1855 struct pool_features pf;
1856 struct dm_arg_set as;
1857 struct dm_dev *data_dev;
1858 unsigned long block_size;
1859 dm_block_t low_water_blocks;
1860 struct dm_dev *metadata_dev;
1861 sector_t metadata_dev_size;
c4a69ecd 1862 char b[BDEVNAME_SIZE];
1863
1864 /*
1865 * FIXME Remove validation from scope of lock.
1866 */
1867 mutex_lock(&dm_thin_pool_table.mutex);
1868
1869 if (argc < 4) {
1870 ti->error = "Invalid argument count";
1871 r = -EINVAL;
1872 goto out_unlock;
1873 }
1874 as.argc = argc;
1875 as.argv = argv;
1876
1877 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1878 if (r) {
1879 ti->error = "Error opening metadata block device";
1880 goto out_unlock;
1881 }
1882
1883 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1884 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1885 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1886 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1887
1888 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1889 if (r) {
1890 ti->error = "Error getting data device";
1891 goto out_metadata;
1892 }
1893
1894 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1895 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1896 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
55f2b8bd 1897 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1898 ti->error = "Invalid block size";
1899 r = -EINVAL;
1900 goto out;
1901 }
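/*
 * As a rough guide to the check above: block sizes must lie between
 * DATA_DEV_BLOCK_SIZE_MIN_SECTORS and DATA_DEV_BLOCK_SIZE_MAX_SECTORS and
 * be a multiple of the minimum, so 128, 256 or 1024 sectors are accepted
 * while 0, 100 or 192 sectors are rejected as "Invalid block size".
 */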
1902
1903 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1904 ti->error = "Invalid low water mark";
1905 r = -EINVAL;
1906 goto out;
1907 }
1908
1909 /*
1910 * Set default pool features.
1911 */
67e2e2b2 1912 pool_features_init(&pf);
1913
1914 dm_consume_args(&as, 4);
1915 r = parse_pool_features(&as, &pf, ti);
1916 if (r)
1917 goto out;
1918
1919 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1920 if (!pt) {
1921 r = -ENOMEM;
1922 goto out;
1923 }
1924
1925 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
e49e5829 1926 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1927 if (IS_ERR(pool)) {
1928 r = PTR_ERR(pool);
1929 goto out_free_pt;
1930 }
1931
1932 /*
1933 * 'pool_created' reflects whether this is the first table load.
1934 * Top level discard support is not allowed to be changed after
1935 * initial load. This would require a pool reload to trigger thin
1936 * device changes.
1937 */
1938 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1939 ti->error = "Discard support cannot be disabled once enabled";
1940 r = -EINVAL;
1941 goto out_flags_changed;
1942 }
1943
1944 pt->pool = pool;
1945 pt->ti = ti;
1946 pt->metadata_dev = metadata_dev;
1947 pt->data_dev = data_dev;
1948 pt->low_water_blocks = low_water_blocks;
0424caa1 1949 pt->adjusted_pf = pt->requested_pf = pf;
55a62eef 1950 ti->num_flush_bios = 1;
9bc142dd 1951
1952 /*
1953 * Only need to enable discards if the pool should pass
1954 * them down to the data device. The thin device's discard
1955 * processing will cause mappings to be removed from the btree.
1956 */
1957 if (pf.discard_enabled && pf.discard_passdown) {
55a62eef 1958 ti->num_discard_bios = 1;
9bc142dd 1959
1960 /*
1961 * Setting 'discards_supported' circumvents the normal
1962 * stacking of discard limits (this keeps the pool and
1963 * thin devices' discard limits consistent).
1964 */
0ac55489 1965 ti->discards_supported = true;
307615a2 1966 ti->discard_zeroes_data_unsupported = true;
67e2e2b2 1967 }
1968 ti->private = pt;
1969
1970 pt->callbacks.congested_fn = pool_is_congested;
1971 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1972
1973 mutex_unlock(&dm_thin_pool_table.mutex);
1974
1975 return 0;
1976
1977out_flags_changed:
1978 __pool_dec(pool);
1979out_free_pt:
1980 kfree(pt);
1981out:
1982 dm_put_device(ti, data_dev);
1983out_metadata:
1984 dm_put_device(ti, metadata_dev);
1985out_unlock:
1986 mutex_unlock(&dm_thin_pool_table.mutex);
1987
1988 return r;
1989}
1990
7de3ee57 1991static int pool_map(struct dm_target *ti, struct bio *bio)
1992{
1993 int r;
1994 struct pool_c *pt = ti->private;
1995 struct pool *pool = pt->pool;
1996 unsigned long flags;
1997
1998 /*
1999 * As this is a singleton target, ti->begin is always zero.
2000 */
2001 spin_lock_irqsave(&pool->lock, flags);
2002 bio->bi_bdev = pt->data_dev->bdev;
2003 r = DM_MAPIO_REMAPPED;
2004 spin_unlock_irqrestore(&pool->lock, flags);
2005
2006 return r;
2007}
2008
2009/*
 2010 * Retrieves the number of blocks of the data device recorded in the
 2011 * superblock, compares it to the actual device size, and resizes the
 2012 * data device if it has grown.
 2013 *
 2014 * This copes both with a ctr that opens a preallocated data device
 2015 * and is then followed by a resume,
 2016 * and with
 2017 * a resume issued on its own after userspace has grown the data
 2018 * device in reaction to a table event.
 2019 */
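/*
 * Worked example with made-up numbers: if ti->len is 41943040 sectors and
 * sectors_per_block is 128, data_size below becomes 327680 blocks.  If the
 * superblock records fewer blocks, the data device is resized up to
 * 327680; if it records more, preresume fails with -EINVAL.
 */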
2020static int pool_preresume(struct dm_target *ti)
2021{
2022 int r;
2023 struct pool_c *pt = ti->private;
2024 struct pool *pool = pt->pool;
2025 sector_t data_size = ti->len;
2026 dm_block_t sb_data_size;
2027
2028 /*
2029 * Take control of the pool object.
2030 */
2031 r = bind_control_target(pool, ti);
2032 if (r)
2033 return r;
2034
2035 (void) sector_div(data_size, pool->sectors_per_block);
2036
2037 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2038 if (r) {
2039 DMERR("failed to retrieve data device size");
2040 return r;
2041 }
2042
2043 if (data_size < sb_data_size) {
2044 DMERR("pool target too small, is %llu blocks (expected %llu)",
55f2b8bd 2045 (unsigned long long)data_size, (unsigned long long)sb_data_size);
2046 return -EINVAL;
2047
2048 } else if (data_size > sb_data_size) {
2049 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2050 if (r) {
2051 DMERR("failed to resize data device");
2052 /* FIXME Stricter than necessary: Rollback transaction instead here */
2053 set_pool_mode(pool, PM_READ_ONLY);
2054 return r;
2055 }
2056
e49e5829 2057 (void) commit_or_fallback(pool);
2058 }
2059
2060 return 0;
2061}
2062
2063static void pool_resume(struct dm_target *ti)
2064{
2065 struct pool_c *pt = ti->private;
2066 struct pool *pool = pt->pool;
2067 unsigned long flags;
2068
2069 spin_lock_irqsave(&pool->lock, flags);
2070 pool->low_water_triggered = 0;
2071 pool->no_free_space = 0;
2072 __requeue_bios(pool);
2073 spin_unlock_irqrestore(&pool->lock, flags);
2074
905e51b3 2075 do_waker(&pool->waker.work);
2076}
2077
2078static void pool_postsuspend(struct dm_target *ti)
2079{
2080 struct pool_c *pt = ti->private;
2081 struct pool *pool = pt->pool;
2082
905e51b3 2083 cancel_delayed_work(&pool->waker);
991d9fa0 2084 flush_workqueue(pool->wq);
e49e5829 2085 (void) commit_or_fallback(pool);
2086}
2087
2088static int check_arg_count(unsigned argc, unsigned args_required)
2089{
2090 if (argc != args_required) {
2091 DMWARN("Message received with %u arguments instead of %u.",
2092 argc, args_required);
2093 return -EINVAL;
2094 }
2095
2096 return 0;
2097}
2098
2099static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2100{
2101 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2102 *dev_id <= MAX_DEV_ID)
2103 return 0;
2104
2105 if (warning)
2106 DMWARN("Message received with invalid device id: %s", arg);
2107
2108 return -EINVAL;
2109}
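/*
 * For illustration: ids are parsed as decimal and must not exceed
 * MAX_DEV_ID (2^24 - 1 = 16777215), so a message naming device 16777216
 * is rejected with -EINVAL before it reaches the metadata code.
 */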
2110
2111static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2112{
2113 dm_thin_id dev_id;
2114 int r;
2115
2116 r = check_arg_count(argc, 2);
2117 if (r)
2118 return r;
2119
2120 r = read_dev_id(argv[1], &dev_id, 1);
2121 if (r)
2122 return r;
2123
2124 r = dm_pool_create_thin(pool->pmd, dev_id);
2125 if (r) {
2126 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2127 argv[1]);
2128 return r;
2129 }
2130
2131 return 0;
2132}
2133
2134static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2135{
2136 dm_thin_id dev_id;
2137 dm_thin_id origin_dev_id;
2138 int r;
2139
2140 r = check_arg_count(argc, 3);
2141 if (r)
2142 return r;
2143
2144 r = read_dev_id(argv[1], &dev_id, 1);
2145 if (r)
2146 return r;
2147
2148 r = read_dev_id(argv[2], &origin_dev_id, 1);
2149 if (r)
2150 return r;
2151
2152 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2153 if (r) {
2154 DMWARN("Creation of new snapshot %s of device %s failed.",
2155 argv[1], argv[2]);
2156 return r;
2157 }
2158
2159 return 0;
2160}
2161
2162static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2163{
2164 dm_thin_id dev_id;
2165 int r;
2166
2167 r = check_arg_count(argc, 2);
2168 if (r)
2169 return r;
2170
2171 r = read_dev_id(argv[1], &dev_id, 1);
2172 if (r)
2173 return r;
2174
2175 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2176 if (r)
2177 DMWARN("Deletion of thin device %s failed.", argv[1]);
2178
2179 return r;
2180}
2181
2182static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2183{
2184 dm_thin_id old_id, new_id;
2185 int r;
2186
2187 r = check_arg_count(argc, 3);
2188 if (r)
2189 return r;
2190
2191 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2192 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2193 return -EINVAL;
2194 }
2195
2196 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2197 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2198 return -EINVAL;
2199 }
2200
2201 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2202 if (r) {
2203 DMWARN("Failed to change transaction id from %s to %s.",
2204 argv[1], argv[2]);
2205 return r;
2206 }
2207
2208 return 0;
2209}
2210
2211static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2212{
2213 int r;
2214
2215 r = check_arg_count(argc, 1);
2216 if (r)
2217 return r;
2218
e49e5829 2219 (void) commit_or_fallback(pool);
0d200aef 2220
2221 r = dm_pool_reserve_metadata_snap(pool->pmd);
2222 if (r)
2223 DMWARN("reserve_metadata_snap message failed.");
2224
2225 return r;
2226}
2227
2228static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2229{
2230 int r;
2231
2232 r = check_arg_count(argc, 1);
2233 if (r)
2234 return r;
2235
2236 r = dm_pool_release_metadata_snap(pool->pmd);
2237 if (r)
2238 DMWARN("release_metadata_snap message failed.");
2239
2240 return r;
2241}
2242
2243/*
2244 * Messages supported:
2245 * create_thin <dev_id>
2246 * create_snap <dev_id> <origin_id>
2247 * delete <dev_id>
2249 * set_transaction_id <current_trans_id> <new_trans_id>
2250 * reserve_metadata_snap
2251 * release_metadata_snap
2252 */
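/*
 * Illustrative usage (hypothetical pool device name): messages are sent to
 * the pool target with dmsetup, e.g.
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 */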
2253static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2254{
2255 int r = -EINVAL;
2256 struct pool_c *pt = ti->private;
2257 struct pool *pool = pt->pool;
2258
2259 if (!strcasecmp(argv[0], "create_thin"))
2260 r = process_create_thin_mesg(argc, argv, pool);
2261
2262 else if (!strcasecmp(argv[0], "create_snap"))
2263 r = process_create_snap_mesg(argc, argv, pool);
2264
2265 else if (!strcasecmp(argv[0], "delete"))
2266 r = process_delete_mesg(argc, argv, pool);
2267
2268 else if (!strcasecmp(argv[0], "set_transaction_id"))
2269 r = process_set_transaction_id_mesg(argc, argv, pool);
2270
2271 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2272 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2273
2274 else if (!strcasecmp(argv[0], "release_metadata_snap"))
2275 r = process_release_metadata_snap_mesg(argc, argv, pool);
2276
2277 else
2278 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2279
2280 if (!r)
2281 (void) commit_or_fallback(pool);
2282
2283 return r;
2284}
2285
2286static void emit_flags(struct pool_features *pf, char *result,
2287 unsigned sz, unsigned maxlen)
2288{
2289 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2290 !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2291 DMEMIT("%u ", count);
2292
2293 if (!pf->zero_new_blocks)
2294 DMEMIT("skip_block_zeroing ");
2295
2296 if (!pf->discard_enabled)
2297 DMEMIT("ignore_discard ");
2298
2299 if (!pf->discard_passdown)
2300 DMEMIT("no_discard_passdown ");
2301
2302 if (pf->mode == PM_READ_ONLY)
2303 DMEMIT("read_only ");
2304}
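/*
 * Example of the output above: with all features at their defaults the
 * function emits just "0 "; with block zeroing skipped and the pool in
 * read-only mode it emits "2 skip_block_zeroing read_only ".
 */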
2305
2306/*
2307 * Status line is:
 2308 * <transaction id> <used metadata blocks>/<total metadata blocks>
 2309 * <used data blocks>/<total data blocks> <held metadata root>
2310 */
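/*
 * Illustrative STATUSTYPE_INFO line (made-up numbers):
 *
 *   0 141/4161600 9460/1638400 - rw discard_passdown
 *
 * i.e. transaction id 0, 141 of 4161600 metadata blocks used, 9460 of
 * 1638400 data blocks used, no held metadata root, read-write pool with
 * discard passdown enabled.
 */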
2311static void pool_status(struct dm_target *ti, status_type_t type,
2312 unsigned status_flags, char *result, unsigned maxlen)
991d9fa0 2313{
e49e5829 2314 int r;
2315 unsigned sz = 0;
2316 uint64_t transaction_id;
2317 dm_block_t nr_free_blocks_data;
2318 dm_block_t nr_free_blocks_metadata;
2319 dm_block_t nr_blocks_data;
2320 dm_block_t nr_blocks_metadata;
2321 dm_block_t held_root;
2322 char buf[BDEVNAME_SIZE];
2323 char buf2[BDEVNAME_SIZE];
2324 struct pool_c *pt = ti->private;
2325 struct pool *pool = pt->pool;
2326
2327 switch (type) {
2328 case STATUSTYPE_INFO:
2329 if (get_pool_mode(pool) == PM_FAIL) {
2330 DMEMIT("Fail");
2331 break;
2332 }
2333
2334 /* Commit to ensure statistics aren't out-of-date */
2335 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2336 (void) commit_or_fallback(pool);
2337
2338 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2339 if (r) {
2340 DMERR("dm_pool_get_metadata_transaction_id returned %d", r);
2341 goto err;
2342 }
991d9fa0 2343
2344 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2345 if (r) {
2346 DMERR("dm_pool_get_free_metadata_block_count returned %d", r);
2347 goto err;
2348 }
2349
2350 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2351 if (r) {
2352 DMERR("dm_pool_get_metadata_dev_size returned %d", r);
2353 goto err;
2354 }
991d9fa0 2355
2356 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2357 if (r) {
2358 DMERR("dm_pool_get_free_block_count returned %d", r);
2359 goto err;
2360 }
2361
2362 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2363 if (r) {
2364 DMERR("dm_pool_get_data_dev_size returned %d", r);
2365 goto err;
2366 }
991d9fa0 2367
cc8394d8 2368 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2369 if (r) {
2370 DMERR("dm_pool_get_metadata_snap returned %d", r);
2371 goto err;
2372 }
2373
2374 DMEMIT("%llu %llu/%llu %llu/%llu ",
2375 (unsigned long long)transaction_id,
2376 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2377 (unsigned long long)nr_blocks_metadata,
2378 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2379 (unsigned long long)nr_blocks_data);
2380
2381 if (held_root)
2382 DMEMIT("%llu ", held_root);
2383 else
2384 DMEMIT("- ");
2385
2386 if (pool->pf.mode == PM_READ_ONLY)
2387 DMEMIT("ro ");
991d9fa0 2388 else
2389 DMEMIT("rw ");
2390
2391 if (!pool->pf.discard_enabled)
2392 DMEMIT("ignore_discard");
2393 else if (pool->pf.discard_passdown)
2394 DMEMIT("discard_passdown");
2395 else
2396 DMEMIT("no_discard_passdown");
2397
2398 break;
2399
2400 case STATUSTYPE_TABLE:
2401 DMEMIT("%s %s %lu %llu ",
2402 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2403 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2404 (unsigned long)pool->sectors_per_block,
2405 (unsigned long long)pt->low_water_blocks);
0424caa1 2406 emit_flags(&pt->requested_pf, result, sz, maxlen);
2407 break;
2408 }
fd7c092e 2409 return;
991d9fa0 2410
2411err:
2412 DMEMIT("Error");
2413}
2414
2415static int pool_iterate_devices(struct dm_target *ti,
2416 iterate_devices_callout_fn fn, void *data)
2417{
2418 struct pool_c *pt = ti->private;
2419
2420 return fn(ti, pt->data_dev, 0, ti->len, data);
2421}
2422
2423static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2424 struct bio_vec *biovec, int max_size)
2425{
2426 struct pool_c *pt = ti->private;
2427 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2428
2429 if (!q->merge_bvec_fn)
2430 return max_size;
2431
2432 bvm->bi_bdev = pt->data_dev->bdev;
2433
2434 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2435}
2436
0424caa1 2437static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
104655fd 2438{
2439 struct pool *pool = pt->pool;
2440 struct queue_limits *data_limits;
2441
2442 limits->max_discard_sectors = pool->sectors_per_block;
2443
2444 /*
0424caa1 2445 * discard_granularity is just a hint, and not enforced.
104655fd 2446 */
2447 if (pt->adjusted_pf.discard_passdown) {
2448 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2449 limits->discard_granularity = data_limits->discard_granularity;
f13945d7 2450 } else
0424caa1 2451 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2452}
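/*
 * Example of the limits set above: with 128-sector (64KiB) data blocks and
 * no discard passdown, max_discard_sectors is 128 and discard_granularity
 * is 128 << SECTOR_SHIFT = 65536 bytes.
 */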
2453
2454static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2455{
2456 struct pool_c *pt = ti->private;
2457 struct pool *pool = pt->pool;
2458
2459 blk_limits_io_min(limits, 0);
2460 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2461
2462 /*
2463 * pt->adjusted_pf is a staging area for the actual features to use.
2464 * They get transferred to the live pool in bind_control_target()
2465 * called from pool_preresume().
2466 */
2467 if (!pt->adjusted_pf.discard_enabled)
2468 return;
2469
2470 disable_passdown_if_not_supported(pt);
2471
2472 set_discard_limits(pt, limits);
2473}
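/*
 * Example of the hints above: with 128-sector blocks the optimal I/O size
 * advertised to the block layer is 128 << SECTOR_SHIFT = 64KiB, while
 * io_min is set to 0.
 */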
2474
2475static struct target_type pool_target = {
2476 .name = "thin-pool",
2477 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2478 DM_TARGET_IMMUTABLE,
fd7c092e 2479 .version = {1, 6, 1},
2480 .module = THIS_MODULE,
2481 .ctr = pool_ctr,
2482 .dtr = pool_dtr,
2483 .map = pool_map,
2484 .postsuspend = pool_postsuspend,
2485 .preresume = pool_preresume,
2486 .resume = pool_resume,
2487 .message = pool_message,
2488 .status = pool_status,
2489 .merge = pool_merge,
2490 .iterate_devices = pool_iterate_devices,
2491 .io_hints = pool_io_hints,
2492};
2493
2494/*----------------------------------------------------------------
2495 * Thin target methods
2496 *--------------------------------------------------------------*/
2497static void thin_dtr(struct dm_target *ti)
2498{
2499 struct thin_c *tc = ti->private;
2500
2501 mutex_lock(&dm_thin_pool_table.mutex);
2502
2503 __pool_dec(tc->pool);
2504 dm_pool_close_thin_device(tc->td);
2505 dm_put_device(ti, tc->pool_dev);
2506 if (tc->origin_dev)
2507 dm_put_device(ti, tc->origin_dev);
2508 kfree(tc);
2509
2510 mutex_unlock(&dm_thin_pool_table.mutex);
2511}
2512
2513/*
2514 * Thin target parameters:
2515 *
2dd9c257 2516 * <pool_dev> <dev_id> [origin_dev]
2517 *
2518 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2519 * dev_id: the internal device identifier
2dd9c257 2520 * origin_dev: a device external to the pool that should act as the origin
2521 *
2522 * If the pool device has discards disabled, they get disabled for the thin
2523 * device as well.
2524 */
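/*
 * Illustrative table lines (hypothetical names and sizes, not from this
 * file): a 1GiB thin volume using device id 0 of the pool, and a variant
 * with an external read-only origin:
 *
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *   dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1 /dev/mapper/origin"
 */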
2525static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2526{
2527 int r;
2528 struct thin_c *tc;
2dd9c257 2529 struct dm_dev *pool_dev, *origin_dev;
2530 struct mapped_device *pool_md;
2531
2532 mutex_lock(&dm_thin_pool_table.mutex);
2533
2dd9c257 2534 if (argc != 2 && argc != 3) {
2535 ti->error = "Invalid argument count";
2536 r = -EINVAL;
2537 goto out_unlock;
2538 }
2539
2540 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2541 if (!tc) {
2542 ti->error = "Out of memory";
2543 r = -ENOMEM;
2544 goto out_unlock;
2545 }
2546
2547 if (argc == 3) {
2548 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2549 if (r) {
2550 ti->error = "Error opening origin device";
2551 goto bad_origin_dev;
2552 }
2553 tc->origin_dev = origin_dev;
2554 }
2555
2556 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2557 if (r) {
2558 ti->error = "Error opening pool device";
2559 goto bad_pool_dev;
2560 }
2561 tc->pool_dev = pool_dev;
2562
2563 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2564 ti->error = "Invalid device id";
2565 r = -EINVAL;
2566 goto bad_common;
2567 }
2568
2569 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2570 if (!pool_md) {
2571 ti->error = "Couldn't get pool mapped device";
2572 r = -EINVAL;
2573 goto bad_common;
2574 }
2575
2576 tc->pool = __pool_table_lookup(pool_md);
2577 if (!tc->pool) {
2578 ti->error = "Couldn't find pool object";
2579 r = -EINVAL;
2580 goto bad_pool_lookup;
2581 }
2582 __pool_inc(tc->pool);
2583
2584 if (get_pool_mode(tc->pool) == PM_FAIL) {
 2585 ti->error = "Couldn't open thin device, pool is in fail mode";
 r = -EINVAL; /* without this the constructor could return 0 despite failing */
 2586 goto bad_thin_open;
2587 }
2588
2589 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2590 if (r) {
2591 ti->error = "Couldn't open thin internal device";
2592 goto bad_thin_open;
2593 }
2594
2595 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2596 if (r)
2597 goto bad_thin_open;
2598
55a62eef 2599 ti->num_flush_bios = 1;
16ad3d10 2600 ti->flush_supported = true;
59c3d2c6 2601 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
2602
2603 /* In case the pool supports discards, pass them on. */
2604 if (tc->pool->pf.discard_enabled) {
0ac55489 2605 ti->discards_supported = true;
55a62eef 2606 ti->num_discard_bios = 1;
0ac55489 2607 ti->discard_zeroes_data_unsupported = true;
2608 /* Discard bios must be split on a block boundary */
2609 ti->split_discard_bios = true;
67e2e2b2 2610 }
2611
2612 dm_put(pool_md);
2613
2614 mutex_unlock(&dm_thin_pool_table.mutex);
2615
2616 return 0;
2617
2618bad_thin_open:
2619 __pool_dec(tc->pool);
2620bad_pool_lookup:
2621 dm_put(pool_md);
2622bad_common:
2623 dm_put_device(ti, tc->pool_dev);
2624bad_pool_dev:
2625 if (tc->origin_dev)
2626 dm_put_device(ti, tc->origin_dev);
2627bad_origin_dev:
2628 kfree(tc);
2629out_unlock:
2630 mutex_unlock(&dm_thin_pool_table.mutex);
2631
2632 return r;
2633}
2634
7de3ee57 2635static int thin_map(struct dm_target *ti, struct bio *bio)
991d9fa0 2636{
6efd6e83 2637 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
991d9fa0 2638
7de3ee57 2639 return thin_bio_map(ti, bio);
2640}
2641
7de3ee57 2642static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2643{
2644 unsigned long flags;
59c3d2c6 2645 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d 2646 struct list_head work;
a24c2569 2647 struct dm_thin_new_mapping *m, *tmp;
2648 struct pool *pool = h->tc->pool;
2649
2650 if (h->shared_read_entry) {
2651 INIT_LIST_HEAD(&work);
44feb387 2652 dm_deferred_entry_dec(h->shared_read_entry, &work);
2653
2654 spin_lock_irqsave(&pool->lock, flags);
2655 list_for_each_entry_safe(m, tmp, &work, list) {
2656 list_del(&m->list);
2657 m->quiesced = 1;
2658 __maybe_add_mapping(m);
2659 }
2660 spin_unlock_irqrestore(&pool->lock, flags);
2661 }
2662
2663 if (h->all_io_entry) {
2664 INIT_LIST_HEAD(&work);
44feb387 2665 dm_deferred_entry_dec(h->all_io_entry, &work);
2666 if (!list_empty(&work)) {
2667 spin_lock_irqsave(&pool->lock, flags);
2668 list_for_each_entry_safe(m, tmp, &work, list)
2669 list_add(&m->list, &pool->prepared_discards);
2670 spin_unlock_irqrestore(&pool->lock, flags);
2671 wake_worker(pool);
2672 }
2673 }
2674
2675 return 0;
2676}
2677
2678static void thin_postsuspend(struct dm_target *ti)
2679{
2680 if (dm_noflush_suspending(ti))
2681 requeue_io((struct thin_c *)ti->private);
2682}
2683
2684/*
2685 * <nr mapped sectors> <highest mapped sector>
2686 */
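/*
 * Worked example with made-up numbers: a thin device with 1024 mapped
 * blocks of 128 sectors whose highest mapped block is 4095 reports
 * "131072 524287".
 */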
2687static void thin_status(struct dm_target *ti, status_type_t type,
2688 unsigned status_flags, char *result, unsigned maxlen)
2689{
2690 int r;
2691 ssize_t sz = 0;
2692 dm_block_t mapped, highest;
2693 char buf[BDEVNAME_SIZE];
2694 struct thin_c *tc = ti->private;
2695
2696 if (get_pool_mode(tc->pool) == PM_FAIL) {
2697 DMEMIT("Fail");
fd7c092e 2698 return;
2699 }
2700
2701 if (!tc->td)
2702 DMEMIT("-");
2703 else {
2704 switch (type) {
2705 case STATUSTYPE_INFO:
2706 r = dm_thin_get_mapped_count(tc->td, &mapped);
2707 if (r) {
2708 DMERR("dm_thin_get_mapped_count returned %d", r);
2709 goto err;
2710 }
2711
2712 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2713 if (r < 0) {
2714 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
2715 goto err;
2716 }
2717
2718 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2719 if (r)
2720 DMEMIT("%llu", ((highest + 1) *
2721 tc->pool->sectors_per_block) - 1);
2722 else
2723 DMEMIT("-");
2724 break;
2725
2726 case STATUSTYPE_TABLE:
2727 DMEMIT("%s %lu",
2728 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2729 (unsigned long) tc->dev_id);
2730 if (tc->origin_dev)
2731 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2732 break;
2733 }
2734 }
2735
2736 return;
2737
2738err:
2739 DMEMIT("Error");
2740}
2741
2742static int thin_iterate_devices(struct dm_target *ti,
2743 iterate_devices_callout_fn fn, void *data)
2744{
55f2b8bd 2745 sector_t blocks;
991d9fa0 2746 struct thin_c *tc = ti->private;
55f2b8bd 2747 struct pool *pool = tc->pool;
2748
2749 /*
2750 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2751 * we follow a more convoluted path through to the pool's target.
2752 */
55f2b8bd 2753 if (!pool->ti)
2754 return 0; /* nothing is bound */
2755
2756 blocks = pool->ti->len;
2757 (void) sector_div(blocks, pool->sectors_per_block);
991d9fa0 2758 if (blocks)
55f2b8bd 2759 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2760
2761 return 0;
2762}
2763
2764static struct target_type thin_target = {
2765 .name = "thin",
fd7c092e 2766 .version = {1, 7, 1},
2767 .module = THIS_MODULE,
2768 .ctr = thin_ctr,
2769 .dtr = thin_dtr,
2770 .map = thin_map,
eb2aa48d 2771 .end_io = thin_endio,
2772 .postsuspend = thin_postsuspend,
2773 .status = thin_status,
2774 .iterate_devices = thin_iterate_devices,
2775};
2776
2777/*----------------------------------------------------------------*/
2778
2779static int __init dm_thin_init(void)
2780{
2781 int r;
2782
2783 pool_table_init();
2784
2785 r = dm_register_target(&thin_target);
2786 if (r)
2787 return r;
2788
2789 r = dm_register_target(&pool_target);
2790 if (r)
2791 goto bad_pool_target;
2792
2793 r = -ENOMEM;
2794
2795 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2796 if (!_new_mapping_cache)
2797 goto bad_new_mapping_cache;
2798
2799 return 0;
2800
a24c2569 2801bad_new_mapping_cache:
2802 dm_unregister_target(&pool_target);
2803bad_pool_target:
2804 dm_unregister_target(&thin_target);
2805
2806 return r;
2807}
2808
2809static void dm_thin_exit(void)
2810{
2811 dm_unregister_target(&thin_target);
2812 dm_unregister_target(&pool_target);
a24c2569 2813
a24c2569 2814 kmem_cache_destroy(_new_mapping_cache);
2815}
2816
2817module_init(dm_thin_init);
2818module_exit(dm_thin_exit);
2819
7cab8bf1 2820MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2821MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2822MODULE_LICENSE("GPL");