linux-2.6-block.git: drivers/md/dm-thin.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison-v1.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60

static unsigned int no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
enum lock_space {
	VIRTUAL,
	PHYSICAL
};

static bool build_key(struct dm_thin_device *td, enum lock_space ls,
		      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
{
	key->virtual = (ls == VIRTUAL);
	key->dev = dm_thin_dev_id(td);
	key->block_begin = b;
	key->block_end = e;

	return dm_cell_key_has_valid_range(key);
}

static void build_data_key(struct dm_thin_device *td, dm_block_t b,
			   struct dm_cell_key *key)
{
	(void) build_key(td, PHYSICAL, b, b + 1llu, key);
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	(void) build_key(td, VIRTUAL, b, b + 1llu, key);
}

/*----------------------------------------------------------------*/

#define THROTTLE_THRESHOLD (1 * HZ)

struct throttle {
	struct rw_semaphore lock;
	unsigned long threshold;
	bool throttle_applied;
};

static void throttle_init(struct throttle *t)
{
	init_rwsem(&t->lock);
	t->throttle_applied = false;
}

static void throttle_work_start(struct throttle *t)
{
	t->threshold = jiffies + THROTTLE_THRESHOLD;
}

static void throttle_work_update(struct throttle *t)
{
	if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) {
		down_write(&t->lock);
		t->throttle_applied = true;
	}
}

static void throttle_work_complete(struct throttle *t)
{
	if (t->throttle_applied) {
		t->throttle_applied = false;
		up_write(&t->lock);
	}
}

static void throttle_lock(struct throttle *t)
{
	down_read(&t->lock);
}

static void throttle_unlock(struct throttle *t)
{
	up_read(&t->lock);
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in various modes. Ordered in increasing order of
 * degradation so that modes can be compared.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */

	/*
	 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
	 */
	PM_OUT_OF_METADATA_SPACE,
	PM_READ_ONLY,		/* metadata may not be changed */

	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
	bool error_if_no_space:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

#define CELL_SORT_ARRAY_SIZE 8192

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *data_dev;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	bool low_water_triggered:1;	/* A dm event has been sent */
	bool suspended:1;
	bool out_of_data_space:1;

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct work_struct worker;
	struct workqueue_struct *wq;
	struct throttle throttle;
	struct delayed_work waker;
	struct delayed_work no_space_timeout;

	unsigned long last_commit_jiffies;
	unsigned int ref_count;

	spinlock_t lock;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_flush_completions;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;
	struct list_head prepared_discards_pt2;
	struct list_head active_thins;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_cell_fn process_cell;
	process_cell_fn process_discard_cell;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
	process_mapping_fn process_prepared_discard_pt2;

	struct dm_bio_prison_cell **cell_sort_array;

	mempool_t mapping_pool;
};

static void metadata_operation_failed(struct pool *pool, const char *op, int r);

static enum pool_mode get_pool_mode(struct pool *pool)
{
	return pool->pf.mode;
}

static void notify_of_pool_mode_change(struct pool *pool)
{
	static const char *descs[] = {
		"write",
		"out-of-data-space",
		"read-only",
		"read-only",
		"fail"
	};
	const char *extra_desc = NULL;
	enum pool_mode mode = get_pool_mode(pool);

	if (mode == PM_OUT_OF_DATA_SPACE) {
		if (!pool->pf.error_if_no_space)
			extra_desc = " (queue IO)";
		else
			extra_desc = " (error IO)";
	}

	dm_table_event(pool->ti->table);
	DMINFO("%s: switching pool to %s%s mode",
	       dm_device_name(pool->pool_md),
	       descs[(int)mode], extra_desc ? : "");
}

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct list_head list;
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	sector_t origin_size;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
	struct mapped_device *thin_md;

	bool requeue_mode:1;
	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bio_list;
	struct bio_list retry_on_resume_list;
	struct rb_root sort_bio_list; /* sorted list of deferred bios */

	/*
	 * Ensures the thin is not destroyed until the worker has finished
	 * iterating the active_thins list.
	 */
	refcount_t refcount;
	struct completion can_destroy;
};

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
{
	return block_size_is_power_of_two(pool) ?
		(b << pool->sectors_per_block_shift) :
		(b * pool->sectors_per_block);
}

/*----------------------------------------------------------------*/

struct discard_op {
	struct thin_c *tc;
	struct blk_plug plug;
	struct bio *parent_bio;
	struct bio *bio;
};

static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
{
	BUG_ON(!parent);

	op->tc = tc;
	blk_start_plug(&op->plug);
	op->parent_bio = parent;
	op->bio = NULL;
}

static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
{
	struct thin_c *tc = op->tc;
	sector_t s = block_to_sectors(tc->pool, data_b);
	sector_t len = block_to_sectors(tc->pool, data_e - data_b);

	return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT,
				      &op->bio);
}

static void end_discard(struct discard_op *op, int r)
{
	if (op->bio) {
		/*
		 * Even if one of the calls to issue_discard failed, we
		 * need to wait for the chain to complete.
		 */
		bio_chain(op->bio, op->parent_bio);
		op->bio->bi_opf = REQ_OP_DISCARD;
		submit_bio(op->bio);
	}

	blk_finish_plug(&op->plug);

	/*
	 * Even if r is set, there could be sub discards in flight that we
	 * need to wait for.
	 */
	if (r && !op->parent_bio->bi_status)
		op->parent_bio->bi_status = errno_to_blk_status(r);
	bio_endio(op->parent_bio);
}

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_bio_prison_cell *cell_prealloc;

	/*
	 * Allocate a cell from the prison's mempool.
	 * This might block but it can't fail.
	 */
	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
	if (r)
		/*
		 * We reused an old cell; we can get rid of
		 * the new one.
		 */
		dm_bio_prison_free_cell(pool->prison, cell_prealloc);

	return r;
}

static void cell_release(struct pool *pool,
			 struct dm_bio_prison_cell *cell,
			 struct bio_list *bios)
{
	dm_cell_release(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_visit_release(struct pool *pool,
			       void (*fn)(void *, struct dm_bio_prison_cell *),
			       void *context,
			       struct dm_bio_prison_cell *cell)
{
	dm_cell_visit_release(pool->prison, fn, context, cell);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
				   struct dm_bio_prison_cell *cell,
				   struct bio_list *bios)
{
	dm_cell_release_no_holder(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_error_with_code(struct pool *pool,
		struct dm_bio_prison_cell *cell, blk_status_t error_code)
{
	dm_cell_error(pool->prison, cell, error_code);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static blk_status_t get_pool_io_error_code(struct pool *pool)
{
	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
}

static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
}

static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, 0);
}

static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
}

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void pool_table_exit(void)
{
	mutex_destroy(&dm_thin_pool_table.mutex);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
	struct rb_node rb_node;
	struct dm_bio_prison_cell *cell;
};

static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
{
	bio_list_merge(bios, master);
	bio_list_init(master);
}

static void error_bio_list(struct bio_list *bios, blk_status_t error)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bios))) {
		bio->bi_status = error;
		bio_endio(bio);
	}
}

static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
		blk_status_t error)
{
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irq(&tc->lock);
	__merge_bio_list(&bios, master);
	spin_unlock_irq(&tc->lock);

	error_bio_list(&bios, error);
}

static void requeue_deferred_cells(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	struct list_head cells;
	struct dm_bio_prison_cell *cell, *tmp;

	INIT_LIST_HEAD(&cells);

	spin_lock_irq(&tc->lock);
	list_splice_init(&tc->deferred_cells, &cells);
	spin_unlock_irq(&tc->lock);

	list_for_each_entry_safe(cell, tmp, &cells, user_list)
		cell_requeue(pool, cell);
}

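/*
 * Push all deferred and retry-on-resume bios back to the issuer with
 * BLK_STS_DM_REQUEUE and requeue any deferred cells, so the upper
 * layers resubmit them later.
 */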
static void requeue_io(struct thin_c *tc)
{
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irq(&tc->lock);
	__merge_bio_list(&bios, &tc->deferred_bio_list);
	__merge_bio_list(&bios, &tc->retry_on_resume_list);
	spin_unlock_irq(&tc->lock);

	error_bio_list(&bios, BLK_STS_DM_REQUEUE);
	requeue_deferred_cells(tc);
}

static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
{
	struct thin_c *tc;

	rcu_read_lock();
	list_for_each_entry_rcu(tc, &pool->active_thins, list)
		error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
	rcu_read_unlock();
}

static void error_retry_list(struct pool *pool)
{
	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

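/*
 * Map a bio's starting sector to the virtual thin block it falls in.
 */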
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (block_size_is_power_of_two(pool))
		block_nr >>= pool->sectors_per_block_shift;
	else
		(void) sector_div(block_nr, pool->sectors_per_block);

	return block_nr;
}

/*
 * Returns the _complete_ blocks that this bio covers.
 */
static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
				dm_block_t *begin, dm_block_t *end)
{
	struct pool *pool = tc->pool;
	sector_t b = bio->bi_iter.bi_sector;
	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);

	b += pool->sectors_per_block - 1ull; /* so we round up */

	if (block_size_is_power_of_two(pool)) {
		b >>= pool->sectors_per_block_shift;
		e >>= pool->sectors_per_block_shift;
	} else {
		(void) sector_div(b, pool->sectors_per_block);
		(void) sector_div(e, pool->sectors_per_block);
	}

	if (e < b)
		/* Can happen if the bio is within a single block. */
		e = b;

	*begin = b;
	*end = e;
}

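/*
 * Redirect a bio to the pool device, mapping its sector into the given
 * data block while preserving the offset within the block.
 */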
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_iter.bi_sector;

	bio_set_dev(bio, tc->pool_dev->bdev);
	if (block_size_is_power_of_two(pool))
		bio->bi_iter.bi_sector =
			(block << pool->sectors_per_block_shift) |
			(bi_sector & (pool->sectors_per_block - 1));
	else
		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio_set_dev(bio, tc->origin_dev->bdev);
}

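/*
 * A flush/FUA bio only needs to trigger a metadata commit if the
 * current transaction has uncommitted changes.
 */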
static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return op_is_flush(bio->bi_opf) &&
		dm_thin_changed_this_transaction(tc->td);
}

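/*
 * Register a non-discard bio with the all_io deferred set so that
 * other operations can quiesce against it.
 */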
static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio_op(bio) == REQ_OP_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;

	if (!bio_triggers_commit(tc, bio)) {
		dm_submit_bio_remap(bio, NULL);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed, e.g. due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irq(&pool->lock);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irq(&pool->lock);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	bool pass_discard:1;
	bool maybe_shared:1;

	/*
	 * Track quiescing, copying and zeroing preparation actions. When this
	 * counter hits zero the block is prepared and can be inserted into the
	 * btree.
	 */
	atomic_t prepare_actions;

	blk_status_t status;
	struct thin_c *tc;
	dm_block_t virt_begin, virt_end;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (atomic_dec_and_test(&m->prepare_actions)) {
		list_add_tail(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	unsigned long flags;
	struct pool *pool = m->tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	__complete_mapping_preparation(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	struct dm_thin_new_mapping *m = context;

	m->status = read_err || write_err ? BLK_STS_IOERR : 0;
	complete_mapping_preparation(m);
}

static void overwrite_endio(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct dm_thin_new_mapping *m = h->overwrite_mapping;

	bio->bi_end_io = m->saved_bi_end_io;

	m->status = bio->bi_status;
	complete_mapping_preparation(m);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell, except the original holder, back
 * to the deferred_bios list.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release_no_holder(pool, cell, &bios);

	if (!bio_list_empty(&bios)) {
		spin_lock_irqsave(&tc->lock, flags);
		bio_list_merge(&tc->deferred_bio_list, &bios);
		spin_unlock_irqrestore(&tc->lock, flags);
		wake_worker(pool);
	}
}

static void thin_defer_bio(struct thin_c *tc, struct bio *bio);

struct remap_info {
	struct thin_c *tc;
	struct bio_list defer_bios;
	struct bio_list issue_bios;
};

static void __inc_remap_and_issue_cell(void *context,
				       struct dm_bio_prison_cell *cell)
{
	struct remap_info *info = context;
	struct bio *bio;

	while ((bio = bio_list_pop(&cell->bios))) {
		if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
			bio_list_add(&info->defer_bios, bio);
		else {
			inc_all_io_entry(info->tc->pool, bio);

			/*
			 * We can't issue the bios with the bio prison lock
			 * held, so we add them to a list to issue on
			 * return from this function.
			 */
			bio_list_add(&info->issue_bios, bio);
		}
	}
}

static void inc_remap_and_issue_cell(struct thin_c *tc,
				     struct dm_bio_prison_cell *cell,
				     dm_block_t block)
{
	struct bio *bio;
	struct remap_info info;

	info.tc = tc;
	bio_list_init(&info.defer_bios);
	bio_list_init(&info.issue_bios);

	/*
	 * We have to be careful to inc any bios we're about to issue
	 * before the cell is released, and avoid a race with new bios
	 * being added to the cell.
	 */
	cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
			   &info, cell);

	while ((bio = bio_list_pop(&info.defer_bios)))
		thin_defer_bio(tc, bio);

	while ((bio = bio_list_pop(&info.issue_bios)))
		remap_and_issue(info.tc, bio, block);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	cell_error(m->tc->pool, m->cell);
	list_del(&m->list);
	mempool_free(m, &m->tc->pool->mapping_pool);
}

static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;

	/*
	 * If the bio has the REQ_FUA flag set we must commit the metadata
	 * before signaling its completion.
	 */
	if (!bio_triggers_commit(tc, bio)) {
		bio_endio(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to the
	 * metadata that can't be committed, e.g. due to I/O errors on the
	 * metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irq(&pool->lock);
	bio_list_add(&pool->deferred_flush_completions, bio);
	spin_unlock_irq(&pool->lock);
}

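/*
 * A prepared mapping has been copied/zeroed; insert it into the btree
 * and release the bios that were held in the cell while the block was
 * being prepared.
 */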
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *bio = m->bio;
	int r;

	if (m->status) {
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
	if (r) {
		metadata_operation_failed(pool, "dm_thin_insert_block", r);
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
		complete_overwrite_bio(tc, bio);
	} else {
		inc_all_io_entry(tc->pool, m->cell->holder);
		remap_and_issue(tc, m->cell->holder, m->data_block);
		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
	}

out:
	list_del(&m->list);
	mempool_free(m, &pool->mapping_pool);
}

/*----------------------------------------------------------------*/

static void free_discard_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	if (m->cell)
		cell_defer_no_holder(tc, m->cell);
	mempool_free(m, &tc->pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	bio_io_error(m->bio);
	free_discard_mapping(m);
}

static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
{
	bio_endio(m->bio);
	free_discard_mapping(m);
}

static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
	if (r) {
		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
		bio_io_error(m->bio);
	} else
		bio_endio(m->bio);

	cell_defer_no_holder(tc, m->cell);
	mempool_free(m, &tc->pool->mapping_pool);
}

/*----------------------------------------------------------------*/

static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
						   struct bio *discard_parent)
{
	/*
	 * We've already unmapped this range of blocks, but before we
	 * passdown we have to check that these blocks are now unused.
	 */
	int r = 0;
	bool shared = true;
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
	struct discard_op op;

	begin_discard(&op, tc, discard_parent);
	while (b != end) {
		/* find start of unmapped run */
		for (; b < end; b++) {
			r = dm_pool_block_is_shared(pool->pmd, b, &shared);
			if (r)
				goto out;

			if (!shared)
				break;
		}

		if (b == end)
			break;

		/* find end of run */
		for (e = b + 1; e != end; e++) {
			r = dm_pool_block_is_shared(pool->pmd, e, &shared);
			if (r)
				goto out;

			if (shared)
				break;
		}

		r = issue_discard(&op, b, e);
		if (r)
			goto out;

		b = e;
	}
out:
	end_discard(&op, r);
}

static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
{
	unsigned long flags;
	struct pool *pool = m->tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	list_add_tail(&m->list, &pool->prepared_discards_pt2);
	spin_unlock_irqrestore(&pool->lock, flags);
	wake_worker(pool);
}

static void passdown_endio(struct bio *bio)
{
	/*
	 * It doesn't matter if the passdown discard failed, we still want
	 * to unmap (we ignore err).
	 */
	queue_passdown_pt2(bio->bi_private);
	bio_put(bio);
}

static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *discard_parent;
	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);

	/*
	 * Only this thread allocates blocks, so we can be sure that the
	 * newly unmapped blocks will not be allocated before the end of
	 * the function.
	 */
	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
	if (r) {
		metadata_operation_failed(pool, "dm_thin_remove_range", r);
		bio_io_error(m->bio);
		cell_defer_no_holder(tc, m->cell);
		mempool_free(m, &pool->mapping_pool);
		return;
	}

	/*
	 * Increment the unmapped blocks. This prevents a race between the
	 * passdown io and reallocation of freed blocks.
	 */
	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
		bio_io_error(m->bio);
		cell_defer_no_holder(tc, m->cell);
		mempool_free(m, &pool->mapping_pool);
		return;
	}

	discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO);
	discard_parent->bi_end_io = passdown_endio;
	discard_parent->bi_private = m;
	if (m->maybe_shared)
		passdown_double_checking_shared_status(m, discard_parent);
	else {
		struct discard_op op;

		begin_discard(&op, tc, discard_parent);
		r = issue_discard(&op, m->data_block, data_end);
		end_discard(&op, r);
	}
}

static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;

	/*
	 * The passdown has completed, so now we can decrement all those
	 * unmapped blocks.
	 */
	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
				   m->data_block + (m->virt_end - m->virt_begin));
	if (r) {
		metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
		bio_io_error(m->bio);
	} else
		bio_endio(m->bio);

	cell_defer_no_holder(tc, m->cell);
	mempool_free(m, &pool->mapping_pool);
}

static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irq(&pool->lock);
	list_splice_init(head, &maps);
	spin_unlock_irq(&pool->lock);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_iter.bi_size ==
		(pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

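/*
 * Hand out the mapping preallocated by ensure_next_mapping(); callers
 * must have called ensure_next_mapping() successfully beforehand.
 */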
static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *m = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	memset(m, 0, sizeof(struct dm_thin_new_mapping));
	INIT_LIST_HEAD(&m->list);
	m->bio = NULL;

	pool->next_mapping = NULL;

	return m;
}

static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
		    sector_t begin, sector_t end)
{
	struct dm_io_region to;

	to.bdev = tc->pool_dev->bdev;
	to.sector = begin;
	to.count = end - begin;

	dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
}

static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
				      dm_block_t data_begin,
				      struct dm_thin_new_mapping *m)
{
	struct pool *pool = tc->pool;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

	h->overwrite_mapping = m;
	m->bio = bio;
	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
	inc_all_io_entry(pool, bio);
	remap_and_issue(tc, bio, data_begin);
}

/*
 * A partial copy also needs to zero the uncopied region.
 */
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio,
			  sector_t len)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	m->tc = tc;
	m->virt_begin = virt_block;
	m->virt_end = virt_block + 1u;
	m->data_block = data_dest;
	m->cell = cell;

	/*
	 * quiesce action + copy action + an extra reference held for the
	 * duration of this function (we may need to inc later for a
	 * partial zero).
	 */
	atomic_set(&m->prepare_actions, 3);

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		complete_mapping_preparation(m); /* already quiesced */

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio))
		remap_and_issue_overwrite(tc, bio, data_dest, m);
	else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = len;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = len;

		dm_kcopyd_copy(pool->copier, &from, 1, &to,
			       0, copy_complete, m);

		/*
		 * Do we need to zero a tail region?
		 */
		if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
			atomic_inc(&m->prepare_actions);
			ll_zero(tc, m,
				data_dest * pool->sectors_per_block + len,
				(data_dest + 1) * pool->sectors_per_block);
		}
	}

	complete_mapping_preparation(m); /* drop our ref */
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio,
		      tc->pool->sectors_per_block);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
	m->tc = tc;
	m->virt_begin = virt_block;
	m->virt_end = virt_block + 1u;
	m->data_block = data_block;
	m->cell = cell;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (pool->pf.zero_new_blocks) {
		if (io_overwrites_block(pool, bio))
			remap_and_issue_overwrite(tc, bio, data_block, m);
		else
			ll_zero(tc, m, data_block * pool->sectors_per_block,
				(data_block + 1) * pool->sectors_per_block);
	} else
		process_prepared_mapping(m);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
	sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;

	if (virt_block_end <= tc->origin_size)
		schedule_copy(tc, virt_block, tc->origin_dev,
			      virt_block, data_dest, cell, bio,
			      pool->sectors_per_block);

	else if (virt_block_begin < tc->origin_size)
		schedule_copy(tc, virt_block, tc->origin_dev,
			      virt_block, data_dest, cell, bio,
			      tc->origin_size - virt_block_begin);

	else
		schedule_zero(tc, virt_block, data_dest, cell, bio);
}

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);

static void requeue_bios(struct pool *pool);

static bool is_read_only_pool_mode(enum pool_mode mode)
{
	return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
}

static bool is_read_only(struct pool *pool)
{
	return is_read_only_pool_mode(get_pool_mode(pool));
}

static void check_for_metadata_space(struct pool *pool)
{
	int r;
	const char *ooms_reason = NULL;
	dm_block_t nr_free;

	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
	if (r)
		ooms_reason = "Could not get free metadata blocks";
	else if (!nr_free)
		ooms_reason = "No free metadata blocks";

	if (ooms_reason && !is_read_only(pool)) {
		DMERR("%s", ooms_reason);
		set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
	}
}

static void check_for_data_space(struct pool *pool)
{
	int r;
	dm_block_t nr_free;

	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
		return;

	r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
	if (r)
		return;

	if (nr_free) {
		set_pool_mode(pool, PM_WRITE);
		requeue_bios(pool);
	}
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
		return -EINVAL;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
	else {
		check_for_metadata_space(pool);
		check_for_data_space(pool);
	}

	return r;
}

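/*
 * Warn and raise a dm event (once) when the number of free data blocks
 * drops to or below the pool's low water mark.
 */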
static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark for data device: sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irq(&pool->lock);
		pool->low_water_triggered = true;
		spin_unlock_irq(&pool->lock);
		dm_table_event(pool->ti->table);
	}
}

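/*
 * Allocate a new data block for a thin device.  If the pool looks full,
 * commit outstanding metadata first (which may release blocks) and
 * retry; switch to out-of-data-space mode if it really is full.
 */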
static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	struct pool *pool = tc->pool;

	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
		return -EINVAL;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
		return r;
	}

	check_low_water_mark(pool, free_blocks);

	if (!free_blocks) {
		/*
		 * Try to commit to see if that will free up some
		 * more space.
		 */
		r = commit(pool);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
		if (r) {
			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
			return r;
		}

		if (!free_blocks) {
			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
			return -ENOSPC;
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r) {
		if (r == -ENOSPC)
			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
		else
			metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
		return r;
	}

	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
		return r;
	}

	if (!free_blocks) {
		/* Let's commit before we use up the metadata reserve. */
		r = commit(pool);
		if (r)
			return r;
	}

	return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct thin_c *tc = h->tc;

	spin_lock_irq(&tc->lock);
	bio_list_add(&tc->retry_on_resume_list, bio);
	spin_unlock_irq(&tc->lock);
}

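/*
 * Decide how a bio that cannot currently be serviced should be treated:
 * a non-zero return is the error status to complete it with, zero means
 * queue it for retry on resume.
 */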
static blk_status_t should_error_unserviceable_bio(struct pool *pool)
{
	enum pool_mode m = get_pool_mode(pool);

	switch (m) {
	case PM_WRITE:
		/* Shouldn't get here */
		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
		return BLK_STS_IOERR;

	case PM_OUT_OF_DATA_SPACE:
		return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;

	case PM_OUT_OF_METADATA_SPACE:
	case PM_READ_ONLY:
	case PM_FAIL:
		return BLK_STS_IOERR;
	default:
		/* Shouldn't get here */
		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
		return BLK_STS_IOERR;
	}
}

static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
	blk_status_t error = should_error_unserviceable_bio(pool);

	if (error) {
		bio->bi_status = error;
		bio_endio(bio);
	} else
		retry_on_resume(bio);
}

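/*
 * Release every bio held in the cell and either error it or queue it
 * for retry on resume, depending on the pool mode.
 */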
static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct bio_list bios;
	blk_status_t error;

	error = should_error_unserviceable_bio(pool);
	if (error) {
		cell_error_with_code(pool, cell, error);
		return;
	}

	bio_list_init(&bios);
	cell_release(pool, cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		retry_on_resume(bio);
}

static void process_discard_cell_no_passdown(struct thin_c *tc,
					     struct dm_bio_prison_cell *virt_cell)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	/*
	 * We don't need to lock the data blocks, since there's no
	 * passdown. We only lock data blocks for allocation and breaking sharing.
	 */
	m->tc = tc;
	m->virt_begin = virt_cell->key.block_begin;
	m->virt_end = virt_cell->key.block_end;
	m->cell = virt_cell;
	m->bio = virt_cell->holder;

	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
		pool->process_prepared_discard(m);
}

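/*
 * Walk the mapped runs covered by [begin, end), splitting each run at
 * bio prison range boundaries, and queue a discard mapping for every
 * piece once any in-flight IO to it has quiesced.
 */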
1669static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1670 struct bio *bio)
1671{
1672 struct pool *pool = tc->pool;
1673
1674 int r;
1675 bool maybe_shared;
1676 struct dm_cell_key data_key;
1677 struct dm_bio_prison_cell *data_cell;
1678 struct dm_thin_new_mapping *m;
e2dd8aca
JT
1679 dm_block_t virt_begin, virt_end, data_begin, data_end;
1680 dm_block_t len, next_boundary;
34fbcf62
JT
1681
1682 while (begin != end) {
34fbcf62
JT
1683 r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
1684 &data_begin, &maybe_shared);
e2dd8aca 1685 if (r) {
104655fd 1686 /*
34fbcf62
JT
1687 * Silently fail, letting any mappings we've
1688 * created complete.
104655fd 1689 */
34fbcf62 1690 break;
104655fd 1691 }
104655fd 1692
e2dd8aca 1693 data_end = data_begin + (virt_end - virt_begin);
104655fd 1694
34fbcf62 1695 /*
e2dd8aca 1696 * Make sure the data region obeys the bio prison restrictions.
34fbcf62 1697 */
e2dd8aca
JT
1698 while (data_begin < data_end) {
1699 r = ensure_next_mapping(pool);
1700 if (r)
1701 return; /* we did our best */
1702
1703 next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
1704 << BIO_PRISON_MAX_RANGE_SHIFT;
1705 len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
1706
3f8d3f54
MS
1707 /* This key is certainly within range given the above splitting */
1708 (void) build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key);
e2dd8aca
JT
1709 if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1710 /* contention, we'll give up with this range */
1711 data_begin += len;
1712 continue;
1713 }
1714
1715 /*
1716 * IO may still be going to the destination block. We must
1717 * quiesce before we can do the removal.
1718 */
1719 m = get_next_mapping(pool);
1720 m->tc = tc;
1721 m->maybe_shared = maybe_shared;
1722 m->virt_begin = virt_begin;
1723 m->virt_end = virt_begin + len;
1724 m->data_block = data_begin;
1725 m->cell = data_cell;
1726 m->bio = bio;
1727
1728 /*
1729 * The parent bio must not complete before sub discard bios are
1730 * chained to it (see end_discard's bio_chain)!
1731 *
1732 * This per-mapping bi_remaining increment is paired with
1733 * the implicit decrement that occurs via bio_endio() in
1734 * end_discard().
1735 */
1736 bio_inc_remaining(bio);
1737 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1738 pool->process_prepared_discard(m);
1739
1740 virt_begin += len;
1741 data_begin += len;
1742 }
34fbcf62
JT
1743
1744 begin = virt_end;
104655fd
JT
1745 }
1746}
1747
34fbcf62
JT
1748static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
1749{
1750 struct bio *bio = virt_cell->holder;
1751 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1752
1753 /*
1754 * The virt_cell will only get freed once the origin bio completes.
1755 * This means it will remain locked while all the individual
1756 * passdown bios are in flight.
1757 */
1758 h->cell = virt_cell;
1759 break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
1760
1761 /*
1762 * We complete the bio now, knowing that the bi_remaining field
1763 * will prevent completion until the sub range discards have
1764 * completed.
1765 */
4246a0b6 1766 bio_endio(bio);
34fbcf62
JT
1767}
1768
a374bb21
JT
1769static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1770{
34fbcf62
JT
1771 dm_block_t begin, end;
1772 struct dm_cell_key virt_key;
1773 struct dm_bio_prison_cell *virt_cell;
a374bb21 1774
34fbcf62
JT
1775 get_bio_block_range(tc, bio, &begin, &end);
1776 if (begin == end) {
1777 /*
1778 * The discard covers less than a block.
1779 */
4246a0b6 1780 bio_endio(bio);
a374bb21 1781 return;
34fbcf62 1782 }
a374bb21 1783
3f8d3f54
MS
1784 if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) {
1785 DMERR_LIMIT("Discard doesn't respect bio prison limits");
1786 bio_endio(bio);
1787 return;
1788 }
1789
1790 if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) {
34fbcf62
JT
1791 /*
1792 * Potential starvation issue: We're relying on the
1793 * fs/application being well behaved, and not trying to
1794 * send IO to a region at the same time as discarding it.
1795 * If they do this persistently then it's possible this
1796 * cell will never be granted.
1797 */
1798 return;
3f8d3f54 1799 }
34fbcf62
JT
1800
1801 tc->pool->process_discard_cell(tc, virt_cell);
a374bb21
JT
1802}
1803
991d9fa0 1804static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
44feb387 1805 struct dm_cell_key *key,
991d9fa0 1806 struct dm_thin_lookup_result *lookup_result,
a24c2569 1807 struct dm_bio_prison_cell *cell)
991d9fa0
JT
1808{
1809 int r;
1810 dm_block_t data_block;
d6fc2042 1811 struct pool *pool = tc->pool;
991d9fa0
JT
1812
1813 r = alloc_data_block(tc, &data_block);
1814 switch (r) {
1815 case 0:
2dd9c257
JT
1816 schedule_internal_copy(tc, block, lookup_result->block,
1817 data_block, cell, bio);
991d9fa0
JT
1818 break;
1819
1820 case -ENOSPC:
399caddf 1821 retry_bios_on_resume(pool, cell);
991d9fa0
JT
1822 break;
1823
1824 default:
c397741c
MS
1825 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1826 __func__, r);
d6fc2042 1827 cell_error(pool, cell);
991d9fa0
JT
1828 break;
1829 }
1830}
1831
23ca2bb6
JT
1832static void __remap_and_issue_shared_cell(void *context,
1833 struct dm_bio_prison_cell *cell)
1834{
1835 struct remap_info *info = context;
1836 struct bio *bio;
1837
1838 while ((bio = bio_list_pop(&cell->bios))) {
f73f44eb
CH
1839 if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
1840 bio_op(bio) == REQ_OP_DISCARD)
23ca2bb6
JT
1841 bio_list_add(&info->defer_bios, bio);
1842 else {
bd6d1e0a 1843 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
23ca2bb6
JT
1844
1845 h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1846 inc_all_io_entry(info->tc->pool, bio);
1847 bio_list_add(&info->issue_bios, bio);
1848 }
1849 }
1850}
1851
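/*
 * Release a cell whose holder maps to a shared data block.  Reads are
 * remapped straight to the shared block (after taking a shared-read
 * deferred entry); writes, flushes and discards are deferred back to the
 * worker because they may need to break the sharing.
 */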
1852static void remap_and_issue_shared_cell(struct thin_c *tc,
1853 struct dm_bio_prison_cell *cell,
1854 dm_block_t block)
1855{
1856 struct bio *bio;
1857 struct remap_info info;
1858
1859 info.tc = tc;
1860 bio_list_init(&info.defer_bios);
1861 bio_list_init(&info.issue_bios);
1862
1863 cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1864 &info, cell);
1865
1866 while ((bio = bio_list_pop(&info.defer_bios)))
1867 thin_defer_bio(tc, bio);
1868
1869 while ((bio = bio_list_pop(&info.issue_bios)))
1870 remap_and_issue(tc, bio, block);
1871}
1872
991d9fa0
JT
1873static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1874 dm_block_t block,
23ca2bb6
JT
1875 struct dm_thin_lookup_result *lookup_result,
1876 struct dm_bio_prison_cell *virt_cell)
991d9fa0 1877{
23ca2bb6 1878 struct dm_bio_prison_cell *data_cell;
991d9fa0 1879 struct pool *pool = tc->pool;
44feb387 1880 struct dm_cell_key key;
991d9fa0
JT
1881
1882 /*
1883 * If cell is already occupied, then sharing is already in the process
1884 * of being broken so we have nothing further to do here.
1885 */
1886 build_data_key(tc->td, lookup_result->block, &key);
23ca2bb6
JT
1887 if (bio_detain(pool, &key, bio, &data_cell)) {
1888 cell_defer_no_holder(tc, virt_cell);
991d9fa0 1889 return;
23ca2bb6 1890 }
991d9fa0 1891
23ca2bb6
JT
1892 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1893 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1894 cell_defer_no_holder(tc, virt_cell);
1895 } else {
59c3d2c6 1896 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
991d9fa0 1897
44feb387 1898 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
e8088073 1899 inc_all_io_entry(pool, bio);
991d9fa0 1900 remap_and_issue(tc, bio, lookup_result->block);
23ca2bb6
JT
1901
1902 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1903 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
991d9fa0
JT
1904 }
1905}
1906
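/*
 * Handle a bio that hit an unprovisioned block: flushes are remapped
 * without allocating, reads are zero-filled and completed immediately,
 * and writes allocate a new data block that is either copied from the
 * external origin or zeroed before the bio is issued.
 */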
1907static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
a24c2569 1908 struct dm_bio_prison_cell *cell)
991d9fa0
JT
1909{
1910 int r;
1911 dm_block_t data_block;
6beca5eb 1912 struct pool *pool = tc->pool;
991d9fa0
JT
1913
1914 /*
1915 * Remap empty bios (flushes) immediately, without provisioning.
1916 */
4f024f37 1917 if (!bio->bi_iter.bi_size) {
6beca5eb 1918 inc_all_io_entry(pool, bio);
f286ba0e 1919 cell_defer_no_holder(tc, cell);
e8088073 1920
991d9fa0
JT
1921 remap_and_issue(tc, bio, 0);
1922 return;
1923 }
1924
1925 /*
1926 * Fill read bios with zeroes and complete them immediately.
1927 */
1928 if (bio_data_dir(bio) == READ) {
1929 zero_fill_bio(bio);
f286ba0e 1930 cell_defer_no_holder(tc, cell);
4246a0b6 1931 bio_endio(bio);
991d9fa0
JT
1932 return;
1933 }
1934
1935 r = alloc_data_block(tc, &data_block);
1936 switch (r) {
1937 case 0:
2dd9c257
JT
1938 if (tc->origin_dev)
1939 schedule_external_copy(tc, block, data_block, cell, bio);
1940 else
1941 schedule_zero(tc, block, data_block, cell, bio);
991d9fa0
JT
1942 break;
1943
1944 case -ENOSPC:
399caddf 1945 retry_bios_on_resume(pool, cell);
991d9fa0
JT
1946 break;
1947
1948 default:
c397741c
MS
1949 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1950 __func__, r);
6beca5eb 1951 cell_error(pool, cell);
991d9fa0
JT
1952 break;
1953 }
1954}
1955
a374bb21 1956static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
991d9fa0
JT
1957{
1958 int r;
6beca5eb 1959 struct pool *pool = tc->pool;
a374bb21 1960 struct bio *bio = cell->holder;
991d9fa0 1961 dm_block_t block = get_bio_block(tc, bio);
991d9fa0
JT
1962 struct dm_thin_lookup_result lookup_result;
1963
a374bb21
JT
1964 if (tc->requeue_mode) {
1965 cell_requeue(pool, cell);
991d9fa0 1966 return;
a374bb21 1967 }
991d9fa0
JT
1968
1969 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1970 switch (r) {
1971 case 0:
23ca2bb6
JT
1972 if (lookup_result.shared)
1973 process_shared_bio(tc, bio, block, &lookup_result, cell);
1974 else {
6beca5eb 1975 inc_all_io_entry(pool, bio);
991d9fa0 1976 remap_and_issue(tc, bio, lookup_result.block);
a374bb21 1977 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
e8088073 1978 }
991d9fa0
JT
1979 break;
1980
1981 case -ENODATA:
2dd9c257 1982 if (bio_data_dir(bio) == READ && tc->origin_dev) {
6beca5eb 1983 inc_all_io_entry(pool, bio);
f286ba0e 1984 cell_defer_no_holder(tc, cell);
e8088073 1985
e5aea7b4
JT
1986 if (bio_end_sector(bio) <= tc->origin_size)
1987 remap_to_origin_and_issue(tc, bio);
1988
1989 else if (bio->bi_iter.bi_sector < tc->origin_size) {
1990 zero_fill_bio(bio);
1991 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1992 remap_to_origin_and_issue(tc, bio);
1993
1994 } else {
1995 zero_fill_bio(bio);
4246a0b6 1996 bio_endio(bio);
e5aea7b4 1997 }
2dd9c257
JT
1998 } else
1999 provision_block(tc, bio, block, cell);
991d9fa0
JT
2000 break;
2001
2002 default:
c397741c
MS
2003 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2004 __func__, r);
f286ba0e 2005 cell_defer_no_holder(tc, cell);
991d9fa0
JT
2006 bio_io_error(bio);
2007 break;
2008 }
2009}
2010
a374bb21
JT
2011static void process_bio(struct thin_c *tc, struct bio *bio)
2012{
2013 struct pool *pool = tc->pool;
2014 dm_block_t block = get_bio_block(tc, bio);
2015 struct dm_bio_prison_cell *cell;
2016 struct dm_cell_key key;
2017
2018 /*
2019 * If cell is already occupied, then the block is already
2020 * being provisioned so we have nothing further to do here.
2021 */
2022 build_virtual_key(tc->td, block, &key);
2023 if (bio_detain(pool, &key, bio, &cell))
2024 return;
2025
2026 process_cell(tc, cell);
2027}
2028
2029static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
2030 struct dm_bio_prison_cell *cell)
e49e5829
JT
2031{
2032 int r;
2033 int rw = bio_data_dir(bio);
2034 dm_block_t block = get_bio_block(tc, bio);
2035 struct dm_thin_lookup_result lookup_result;
2036
2037 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
2038 switch (r) {
2039 case 0:
a374bb21 2040 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
8c0f0e8c 2041 handle_unserviceable_bio(tc->pool, bio);
a374bb21
JT
2042 if (cell)
2043 cell_defer_no_holder(tc, cell);
2044 } else {
e8088073 2045 inc_all_io_entry(tc->pool, bio);
e49e5829 2046 remap_and_issue(tc, bio, lookup_result.block);
a374bb21
JT
2047 if (cell)
2048 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
e8088073 2049 }
e49e5829
JT
2050 break;
2051
2052 case -ENODATA:
a374bb21
JT
2053 if (cell)
2054 cell_defer_no_holder(tc, cell);
e49e5829 2055 if (rw != READ) {
8c0f0e8c 2056 handle_unserviceable_bio(tc->pool, bio);
e49e5829
JT
2057 break;
2058 }
2059
2060 if (tc->origin_dev) {
e8088073 2061 inc_all_io_entry(tc->pool, bio);
e49e5829
JT
2062 remap_to_origin_and_issue(tc, bio);
2063 break;
2064 }
2065
2066 zero_fill_bio(bio);
4246a0b6 2067 bio_endio(bio);
e49e5829
JT
2068 break;
2069
2070 default:
c397741c
MS
2071 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2072 __func__, r);
a374bb21
JT
2073 if (cell)
2074 cell_defer_no_holder(tc, cell);
e49e5829
JT
2075 bio_io_error(bio);
2076 break;
2077 }
2078}
2079
a374bb21
JT
2080static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
2081{
2082 __process_bio_read_only(tc, bio, NULL);
2083}
2084
2085static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2086{
2087 __process_bio_read_only(tc, cell->holder, cell);
2088}
2089
3e1a0699
JT
2090static void process_bio_success(struct thin_c *tc, struct bio *bio)
2091{
4246a0b6 2092 bio_endio(bio);
3e1a0699
JT
2093}
2094
e49e5829
JT
2095static void process_bio_fail(struct thin_c *tc, struct bio *bio)
2096{
2097 bio_io_error(bio);
2098}
2099
a374bb21
JT
2100static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2101{
2102 cell_success(tc->pool, cell);
2103}
2104
2105static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2106{
2107 cell_error(tc->pool, cell);
2108}
2109
ac8c3f3d
JT
2110/*
2111 * FIXME: should we also commit due to size of transaction, measured in
2112 * metadata blocks?
2113 */
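/*
 * Returns non-zero once more than COMMIT_PERIOD jiffies have elapsed
 * since the last commit; time_in_range() copes with jiffies wrap-around.
 */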
905e51b3
JT
2114static int need_commit_due_to_time(struct pool *pool)
2115{
0f30af98
MS
2116 return !time_in_range(jiffies, pool->last_commit_jiffies,
2117 pool->last_commit_jiffies + COMMIT_PERIOD);
905e51b3
JT
2118}
2119
67324ea1
MS
2120#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
2121#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
2122
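/*
 * Deferred bios are sorted into an rb-tree keyed on bi_sector before
 * submission so the data device sees them in roughly ascending order.
 * The tree nodes live in each bio's dm_thin_endio_hook (per-bio data).
 */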
2123static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
2124{
2125 struct rb_node **rbp, *parent;
2126 struct dm_thin_endio_hook *pbd;
2127 sector_t bi_sector = bio->bi_iter.bi_sector;
2128
2129 rbp = &tc->sort_bio_list.rb_node;
2130 parent = NULL;
2131 while (*rbp) {
2132 parent = *rbp;
2133 pbd = thin_pbd(parent);
2134
2135 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
2136 rbp = &(*rbp)->rb_left;
2137 else
2138 rbp = &(*rbp)->rb_right;
2139 }
2140
2141 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2142 rb_link_node(&pbd->rb_node, parent, rbp);
2143 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
2144}
2145
2146static void __extract_sorted_bios(struct thin_c *tc)
2147{
2148 struct rb_node *node;
2149 struct dm_thin_endio_hook *pbd;
2150 struct bio *bio;
2151
2152 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
2153 pbd = thin_pbd(node);
2154 bio = thin_bio(pbd);
2155
2156 bio_list_add(&tc->deferred_bio_list, bio);
2157 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
2158 }
2159
2160 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
2161}
2162
2163static void __sort_thin_deferred_bios(struct thin_c *tc)
2164{
2165 struct bio *bio;
2166 struct bio_list bios;
2167
2168 bio_list_init(&bios);
2169 bio_list_merge(&bios, &tc->deferred_bio_list);
2170 bio_list_init(&tc->deferred_bio_list);
2171
2172 /* Sort deferred_bio_list using rb-tree */
2173 while ((bio = bio_list_pop(&bios)))
2174 __thin_bio_rb_add(tc, bio);
2175
2176 /*
2177 * Transfer the sorted bios in sort_bio_list back to
2178 * deferred_bio_list to allow lockless submission of
2179 * all bios.
2180 */
2181 __extract_sorted_bios(tc);
2182}
2183
c140e1c4 2184static void process_thin_deferred_bios(struct thin_c *tc)
991d9fa0 2185{
c140e1c4 2186 struct pool *pool = tc->pool;
991d9fa0
JT
2187 struct bio *bio;
2188 struct bio_list bios;
67324ea1 2189 struct blk_plug plug;
86a3238c 2190 unsigned int count = 0;
991d9fa0 2191
c140e1c4 2192 if (tc->requeue_mode) {
4e4cbee9
CH
2193 error_thin_bio_list(tc, &tc->deferred_bio_list,
2194 BLK_STS_DM_REQUEUE);
c140e1c4
MS
2195 return;
2196 }
2197
991d9fa0
JT
2198 bio_list_init(&bios);
2199
8e0c9dac 2200 spin_lock_irq(&tc->lock);
67324ea1
MS
2201
2202 if (bio_list_empty(&tc->deferred_bio_list)) {
8e0c9dac 2203 spin_unlock_irq(&tc->lock);
67324ea1
MS
2204 return;
2205 }
2206
2207 __sort_thin_deferred_bios(tc);
2208
c140e1c4
MS
2209 bio_list_merge(&bios, &tc->deferred_bio_list);
2210 bio_list_init(&tc->deferred_bio_list);
67324ea1 2211
8e0c9dac 2212 spin_unlock_irq(&tc->lock);
991d9fa0 2213
67324ea1 2214 blk_start_plug(&plug);
991d9fa0 2215 while ((bio = bio_list_pop(&bios))) {
991d9fa0
JT
2216 /*
2217 * If we've got no free new_mapping structs, and processing
2218 * this bio might require one, we pause until there are some
2219 * prepared mappings to process.
2220 */
2221 if (ensure_next_mapping(pool)) {
8e0c9dac 2222 spin_lock_irq(&tc->lock);
c140e1c4
MS
2223 bio_list_add(&tc->deferred_bio_list, bio);
2224 bio_list_merge(&tc->deferred_bio_list, &bios);
8e0c9dac 2225 spin_unlock_irq(&tc->lock);
991d9fa0
JT
2226 break;
2227 }
104655fd 2228
e6047149 2229 if (bio_op(bio) == REQ_OP_DISCARD)
e49e5829 2230 pool->process_discard(tc, bio);
104655fd 2231 else
e49e5829 2232 pool->process_bio(tc, bio);
8a01a6af
JT
2233
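 /*
  * Every 128 bios (including the very first, since count starts
  * at 0), refresh the throttle and issue metadata prefetches.
  */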
2234 if ((count++ & 127) == 0) {
7d327fe0 2235 throttle_work_update(&pool->throttle);
8a01a6af
JT
2236 dm_pool_issue_prefetches(pool->pmd);
2237 }
e4f80303 2238 cond_resched();
991d9fa0 2239 }
67324ea1 2240 blk_finish_plug(&plug);
c140e1c4
MS
2241}
2242
ac4c3f34
JT
2243static int cmp_cells(const void *lhs, const void *rhs)
2244{
2245 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
2246 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
2247
2248 BUG_ON(!lhs_cell->holder);
2249 BUG_ON(!rhs_cell->holder);
2250
2251 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
2252 return -1;
2253
2254 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
2255 return 1;
2256
2257 return 0;
2258}
2259
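/*
 * Move up to CELL_SORT_ARRAY_SIZE cells from the list into
 * pool->cell_sort_array and sort them by the start sector of their
 * holder bios, so cells are processed in ascending sector order.
 */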
86a3238c 2260static unsigned int sort_cells(struct pool *pool, struct list_head *cells)
ac4c3f34 2261{
86a3238c 2262 unsigned int count = 0;
ac4c3f34
JT
2263 struct dm_bio_prison_cell *cell, *tmp;
2264
2265 list_for_each_entry_safe(cell, tmp, cells, user_list) {
2266 if (count >= CELL_SORT_ARRAY_SIZE)
2267 break;
2268
2269 pool->cell_sort_array[count++] = cell;
2270 list_del(&cell->user_list);
2271 }
2272
2273 sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
2274
2275 return count;
2276}
2277
a374bb21
JT
2278static void process_thin_deferred_cells(struct thin_c *tc)
2279{
2280 struct pool *pool = tc->pool;
a374bb21 2281 struct list_head cells;
ac4c3f34 2282 struct dm_bio_prison_cell *cell;
86a3238c 2283 unsigned int i, j, count;
a374bb21
JT
2284
2285 INIT_LIST_HEAD(&cells);
2286
8e0c9dac 2287 spin_lock_irq(&tc->lock);
a374bb21 2288 list_splice_init(&tc->deferred_cells, &cells);
8e0c9dac 2289 spin_unlock_irq(&tc->lock);
a374bb21
JT
2290
2291 if (list_empty(&cells))
2292 return;
2293
ac4c3f34
JT
2294 do {
2295 count = sort_cells(tc->pool, &cells);
a374bb21 2296
ac4c3f34
JT
2297 for (i = 0; i < count; i++) {
2298 cell = pool->cell_sort_array[i];
2299 BUG_ON(!cell->holder);
a374bb21 2300
ac4c3f34
JT
2301 /*
2302 * If we've got no free new_mapping structs, and processing
2303 * this bio might require one, we pause until there are some
2304 * prepared mappings to process.
2305 */
2306 if (ensure_next_mapping(pool)) {
2307 for (j = i; j < count; j++)
2308 list_add(&pool->cell_sort_array[j]->user_list, &cells);
2309
8e0c9dac 2310 spin_lock_irq(&tc->lock);
ac4c3f34 2311 list_splice(&cells, &tc->deferred_cells);
8e0c9dac 2312 spin_unlock_irq(&tc->lock);
ac4c3f34
JT
2313 return;
2314 }
2315
e6047149 2316 if (bio_op(cell->holder) == REQ_OP_DISCARD)
ac4c3f34
JT
2317 pool->process_discard_cell(tc, cell);
2318 else
2319 pool->process_cell(tc, cell);
2320 }
e4f80303 2321 cond_resched();
ac4c3f34 2322 } while (!list_empty(&cells));
a374bb21
JT
2323}
2324
b10ebd34
JT
2325static void thin_get(struct thin_c *tc);
2326static void thin_put(struct thin_c *tc);
2327
2328/*
2329 * We can't hold rcu_read_lock() around code that can block. So we
2330 * find a thin with the rcu lock held; bump a refcount; then drop
2331 * the lock.
2332 */
2333static struct thin_c *get_first_thin(struct pool *pool)
2334{
2335 struct thin_c *tc = NULL;
2336
2337 rcu_read_lock();
2338 if (!list_empty(&pool->active_thins)) {
2339 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
2340 thin_get(tc);
2341 }
2342 rcu_read_unlock();
2343
2344 return tc;
2345}
2346
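/*
 * Like get_first_thin(), but for the next thin on the pool's active
 * list: takes a reference on the successor (if any) and drops the
 * reference held on the thin that was passed in.
 */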
2347static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
2348{
2349 struct thin_c *old_tc = tc;
2350
2351 rcu_read_lock();
2352 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
2353 thin_get(tc);
2354 thin_put(old_tc);
2355 rcu_read_unlock();
2356 return tc;
2357 }
2358 thin_put(old_tc);
2359 rcu_read_unlock();
2360
2361 return NULL;
2362}
2363
c140e1c4
MS
2364static void process_deferred_bios(struct pool *pool)
2365{
c140e1c4 2366 struct bio *bio;
4ae280b4 2367 struct bio_list bios, bio_completions;
c140e1c4
MS
2368 struct thin_c *tc;
2369
b10ebd34
JT
2370 tc = get_first_thin(pool);
2371 while (tc) {
a374bb21 2372 process_thin_deferred_cells(tc);
c140e1c4 2373 process_thin_deferred_bios(tc);
b10ebd34
JT
2374 tc = get_next_thin(pool, tc);
2375 }
991d9fa0
JT
2376
2377 /*
4ae280b4
NT
2378 * If there are any deferred flush bios, we must commit the metadata
2379 * before issuing them or signaling their completion.
991d9fa0
JT
2380 */
2381 bio_list_init(&bios);
4ae280b4
NT
2382 bio_list_init(&bio_completions);
2383
8e0c9dac 2384 spin_lock_irq(&pool->lock);
991d9fa0
JT
2385 bio_list_merge(&bios, &pool->deferred_flush_bios);
2386 bio_list_init(&pool->deferred_flush_bios);
4ae280b4
NT
2387
2388 bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
2389 bio_list_init(&pool->deferred_flush_completions);
8e0c9dac 2390 spin_unlock_irq(&pool->lock);
991d9fa0 2391
4ae280b4 2392 if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
4d1662a3 2393 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
991d9fa0
JT
2394 return;
2395
020cc3b5 2396 if (commit(pool)) {
4ae280b4
NT
2397 bio_list_merge(&bios, &bio_completions);
2398
991d9fa0
JT
2399 while ((bio = bio_list_pop(&bios)))
2400 bio_io_error(bio);
2401 return;
2402 }
905e51b3 2403 pool->last_commit_jiffies = jiffies;
991d9fa0 2404
4ae280b4
NT
2405 while ((bio = bio_list_pop(&bio_completions)))
2406 bio_endio(bio);
2407
694cfe7f
NT
2408 while ((bio = bio_list_pop(&bios))) {
2409 /*
2410 * The data device was flushed as part of metadata commit,
2411 * so complete redundant flushes immediately.
2412 */
2413 if (bio->bi_opf & REQ_PREFLUSH)
2414 bio_endio(bio);
2415 else
b7f8dff0 2416 dm_submit_bio_remap(bio, NULL);
694cfe7f 2417 }
991d9fa0
JT
2418}
2419
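/*
 * The pool worker: issue metadata prefetches, then process prepared
 * mappings, prepared discards (both stages) and finally the deferred
 * bios, refreshing the throttle between stages.
 */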
2420static void do_worker(struct work_struct *ws)
2421{
2422 struct pool *pool = container_of(ws, struct pool, worker);
2423
7d327fe0 2424 throttle_work_start(&pool->throttle);
8a01a6af 2425 dm_pool_issue_prefetches(pool->pmd);
7d327fe0 2426 throttle_work_update(&pool->throttle);
e49e5829 2427 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
7d327fe0 2428 throttle_work_update(&pool->throttle);
e49e5829 2429 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
7d327fe0 2430 throttle_work_update(&pool->throttle);
2a0fbffb
JT
2431 process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
2432 throttle_work_update(&pool->throttle);
991d9fa0 2433 process_deferred_bios(pool);
7d327fe0 2434 throttle_work_complete(&pool->throttle);
991d9fa0
JT
2435}
2436
905e51b3
JT
2437/*
2438 * We want to commit periodically so that not too much
2439 * unwritten data builds up.
2440 */
2441static void do_waker(struct work_struct *ws)
2442{
2443 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
0ef0b471 2444
905e51b3
JT
2445 wake_worker(pool);
2446 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
2447}
2448
85ad643b
JT
2449/*
2450 * We're holding onto IO to allow userland time to react. After the
2451 * timeout, either the pool will have been resized (and thus be back in
bcc696fa 2452 * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE with error_if_no_space set.
85ad643b
JT
2453 */
2454static void do_no_space_timeout(struct work_struct *ws)
2455{
2456 struct pool *pool = container_of(to_delayed_work(ws), struct pool,
2457 no_space_timeout);
2458
bcc696fa
MS
2459 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
2460 pool->pf.error_if_no_space = true;
f6c36758 2461 notify_of_pool_mode_change(pool);
4e4cbee9 2462 error_retry_list_with_code(pool, BLK_STS_NOSPC);
bcc696fa 2463 }
85ad643b
JT
2464}
2465
991d9fa0
JT
2466/*----------------------------------------------------------------*/
2467
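/*
 * pool_work runs a function on the pool's workqueue and blocks the
 * caller until it completes.  noflush_work builds on it to flip a thin
 * device's requeue_mode flag from within the worker context.
 */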
e7a3e871 2468struct pool_work {
738211f7 2469 struct work_struct worker;
e7a3e871
JT
2470 struct completion complete;
2471};
2472
2473static struct pool_work *to_pool_work(struct work_struct *ws)
2474{
2475 return container_of(ws, struct pool_work, worker);
2476}
2477
2478static void pool_work_complete(struct pool_work *pw)
2479{
2480 complete(&pw->complete);
2481}
738211f7 2482
e7a3e871
JT
2483static void pool_work_wait(struct pool_work *pw, struct pool *pool,
2484 void (*fn)(struct work_struct *))
2485{
2486 INIT_WORK_ONSTACK(&pw->worker, fn);
2487 init_completion(&pw->complete);
2488 queue_work(pool->wq, &pw->worker);
2489 wait_for_completion(&pw->complete);
2490}
2491
2492/*----------------------------------------------------------------*/
2493
2494struct noflush_work {
2495 struct pool_work pw;
2496 struct thin_c *tc;
738211f7
JT
2497};
2498
e7a3e871 2499static struct noflush_work *to_noflush(struct work_struct *ws)
738211f7 2500{
e7a3e871 2501 return container_of(to_pool_work(ws), struct noflush_work, pw);
738211f7
JT
2502}
2503
2504static void do_noflush_start(struct work_struct *ws)
2505{
e7a3e871 2506 struct noflush_work *w = to_noflush(ws);
0ef0b471 2507
738211f7
JT
2508 w->tc->requeue_mode = true;
2509 requeue_io(w->tc);
e7a3e871 2510 pool_work_complete(&w->pw);
738211f7
JT
2511}
2512
2513static void do_noflush_stop(struct work_struct *ws)
2514{
e7a3e871 2515 struct noflush_work *w = to_noflush(ws);
0ef0b471 2516
738211f7 2517 w->tc->requeue_mode = false;
e7a3e871 2518 pool_work_complete(&w->pw);
738211f7
JT
2519}
2520
2521static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
2522{
2523 struct noflush_work w;
2524
738211f7 2525 w.tc = tc;
e7a3e871 2526 pool_work_wait(&w.pw, tc->pool, fn);
738211f7
JT
2527}
2528
2529/*----------------------------------------------------------------*/
2530
34fbcf62
JT
2531static bool passdown_enabled(struct pool_c *pt)
2532{
2533 return pt->adjusted_pf.discard_passdown;
2534}
2535
2536static void set_discard_callbacks(struct pool *pool)
2537{
2538 struct pool_c *pt = pool->ti->private;
2539
2540 if (passdown_enabled(pt)) {
2541 pool->process_discard_cell = process_discard_cell_passdown;
2a0fbffb
JT
2542 pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2543 pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
34fbcf62
JT
2544 } else {
2545 pool->process_discard_cell = process_discard_cell_no_passdown;
2546 pool->process_prepared_discard = process_prepared_discard_no_passdown;
2547 }
2548}
2549
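/*
 * Central switch for the pool's behaviour: installs the process_*
 * callbacks matching the new mode, arms the no-space timeout when
 * entering PM_OUT_OF_DATA_SPACE (unless error_if_no_space is set) and
 * notifies userspace if the mode actually changed.
 */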
8b64e881 2550static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
e49e5829 2551{
cdc2b415 2552 struct pool_c *pt = pool->ti->private;
07f2b6e0
MS
2553 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
2554 enum pool_mode old_mode = get_pool_mode(pool);
6aa7de05 2555 unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
07f2b6e0
MS
2556
2557 /*
2558 * Never allow the pool to transition to PM_WRITE mode if user
2559 * intervention is required to verify metadata and data consistency.
2560 */
2561 if (new_mode == PM_WRITE && needs_check) {
2562 DMERR("%s: unable to switch pool to write mode until repaired.",
2563 dm_device_name(pool->pool_md));
2564 if (old_mode != new_mode)
2565 new_mode = old_mode;
2566 else
2567 new_mode = PM_READ_ONLY;
2568 }
2569 /*
2570 * If we were in PM_FAIL mode, rollback of metadata failed. We're
2571 * not going to recover without a thin_repair. So we never let the
2572 * pool move out of the old mode.
2573 */
2574 if (old_mode == PM_FAIL)
2575 new_mode = old_mode;
e49e5829 2576
8b64e881 2577 switch (new_mode) {
e49e5829 2578 case PM_FAIL:
5383ef3a 2579 dm_pool_metadata_read_only(pool->pmd);
e49e5829
JT
2580 pool->process_bio = process_bio_fail;
2581 pool->process_discard = process_bio_fail;
a374bb21
JT
2582 pool->process_cell = process_cell_fail;
2583 pool->process_discard_cell = process_cell_fail;
e49e5829
JT
2584 pool->process_prepared_mapping = process_prepared_mapping_fail;
2585 pool->process_prepared_discard = process_prepared_discard_fail;
3e1a0699
JT
2586
2587 error_retry_list(pool);
e49e5829
JT
2588 break;
2589
3ab91828 2590 case PM_OUT_OF_METADATA_SPACE:
e49e5829 2591 case PM_READ_ONLY:
3e1a0699
JT
2592 dm_pool_metadata_read_only(pool->pmd);
2593 pool->process_bio = process_bio_read_only;
2594 pool->process_discard = process_bio_success;
a374bb21
JT
2595 pool->process_cell = process_cell_read_only;
2596 pool->process_discard_cell = process_cell_success;
3e1a0699 2597 pool->process_prepared_mapping = process_prepared_mapping_fail;
34fbcf62 2598 pool->process_prepared_discard = process_prepared_discard_success;
3e1a0699
JT
2599
2600 error_retry_list(pool);
2601 break;
2602
2603 case PM_OUT_OF_DATA_SPACE:
2604 /*
2605 * Ideally we'd never hit this state; the low water mark
2606 * would trigger userland to extend the pool before we
2607 * completely run out of data space. However, many small
2608 * IOs to unprovisioned space can consume data space at an
2609 * alarming rate. Adjust your low water mark if you're
2610 * frequently seeing this mode.
2611 */
c3667cc6 2612 pool->out_of_data_space = true;
3e1a0699 2613 pool->process_bio = process_bio_read_only;
a374bb21
JT
2614 pool->process_discard = process_discard_bio;
2615 pool->process_cell = process_cell_read_only;
3e1a0699 2616 pool->process_prepared_mapping = process_prepared_mapping;
34fbcf62 2617 set_discard_callbacks(pool);
85ad643b 2618
80c57893
MS
2619 if (!pool->pf.error_if_no_space && no_space_timeout)
2620 queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
e49e5829
JT
2621 break;
2622
2623 case PM_WRITE:
75294442
HT
2624 if (old_mode == PM_OUT_OF_DATA_SPACE)
2625 cancel_delayed_work_sync(&pool->no_space_timeout);
c3667cc6 2626 pool->out_of_data_space = false;
172c2386 2627 pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
9b7aaa64 2628 dm_pool_metadata_read_write(pool->pmd);
e49e5829 2629 pool->process_bio = process_bio;
a374bb21
JT
2630 pool->process_discard = process_discard_bio;
2631 pool->process_cell = process_cell;
e49e5829 2632 pool->process_prepared_mapping = process_prepared_mapping;
34fbcf62 2633 set_discard_callbacks(pool);
e49e5829
JT
2634 break;
2635 }
8b64e881
MS
2636
2637 pool->pf.mode = new_mode;
cdc2b415
MS
2638 /*
2639 * The pool mode may have changed, sync it so bind_control_target()
2640 * doesn't cause an unexpected mode transition on resume.
2641 */
2642 pt->adjusted_pf.mode = new_mode;
f6c36758
MS
2643
2644 if (old_mode != new_mode)
2645 notify_of_pool_mode_change(pool);
e49e5829
JT
2646}
2647
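/*
 * abort_transaction() rolls back the current metadata transaction and
 * sets the 'needs_check' flag, falling back to PM_FAIL if either step
 * fails; metadata_operation_failed() wraps it and drops the pool to
 * PM_READ_ONLY.
 */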
07f2b6e0 2648static void abort_transaction(struct pool *pool)
b5330655 2649{
07f2b6e0
MS
2650 const char *dev_name = dm_device_name(pool->pool_md);
2651
2652 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2653 if (dm_pool_abort_metadata(pool->pmd)) {
2654 DMERR("%s: failed to abort metadata transaction", dev_name);
2655 set_pool_mode(pool, PM_FAIL);
2656 }
2657
2658 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
2659 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2660 set_pool_mode(pool, PM_FAIL);
2661 }
2662}
399caddf 2663
07f2b6e0
MS
2664static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2665{
b5330655
JT
2666 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2667 dm_device_name(pool->pool_md), op, r);
2668
07f2b6e0 2669 abort_transaction(pool);
b5330655
JT
2670 set_pool_mode(pool, PM_READ_ONLY);
2671}
2672
e49e5829
JT
2673/*----------------------------------------------------------------*/
2674
991d9fa0
JT
2675/*
2676 * Mapping functions.
2677 */
2678
2679/*
2680 * Called only while mapping a thin bio to hand it over to the workqueue.
2681 */
2682static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2683{
991d9fa0
JT
2684 struct pool *pool = tc->pool;
2685
8e0c9dac 2686 spin_lock_irq(&tc->lock);
c140e1c4 2687 bio_list_add(&tc->deferred_bio_list, bio);
8e0c9dac 2688 spin_unlock_irq(&tc->lock);
991d9fa0
JT
2689
2690 wake_worker(pool);
2691}
2692
7d327fe0
JT
2693static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2694{
2695 struct pool *pool = tc->pool;
2696
2697 throttle_lock(&pool->throttle);
2698 thin_defer_bio(tc, bio);
2699 throttle_unlock(&pool->throttle);
2700}
2701
a374bb21
JT
2702static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2703{
a374bb21
JT
2704 struct pool *pool = tc->pool;
2705
2706 throttle_lock(&pool->throttle);
8e0c9dac 2707 spin_lock_irq(&tc->lock);
a374bb21 2708 list_add_tail(&cell->user_list, &tc->deferred_cells);
8e0c9dac 2709 spin_unlock_irq(&tc->lock);
a374bb21
JT
2710 throttle_unlock(&pool->throttle);
2711
2712 wake_worker(pool);
2713}
2714
59c3d2c6 2715static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
eb2aa48d 2716{
59c3d2c6 2717 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d
JT
2718
2719 h->tc = tc;
2720 h->shared_read_entry = NULL;
e8088073 2721 h->all_io_entry = NULL;
eb2aa48d 2722 h->overwrite_mapping = NULL;
34fbcf62 2723 h->cell = NULL;
eb2aa48d
JT
2724}
2725
991d9fa0
JT
2726/*
2727 * Non-blocking function called from the thin target's map function.
2728 */
7de3ee57 2729static int thin_bio_map(struct dm_target *ti, struct bio *bio)
991d9fa0
JT
2730{
2731 int r;
2732 struct thin_c *tc = ti->private;
2733 dm_block_t block = get_bio_block(tc, bio);
2734 struct dm_thin_device *td = tc->td;
2735 struct dm_thin_lookup_result result;
a374bb21 2736 struct dm_bio_prison_cell *virt_cell, *data_cell;
e8088073 2737 struct dm_cell_key key;
991d9fa0 2738
59c3d2c6 2739 thin_hook_bio(tc, bio);
e49e5829 2740
738211f7 2741 if (tc->requeue_mode) {
4e4cbee9 2742 bio->bi_status = BLK_STS_DM_REQUEUE;
4246a0b6 2743 bio_endio(bio);
738211f7
JT
2744 return DM_MAPIO_SUBMITTED;
2745 }
2746
e49e5829
JT
2747 if (get_pool_mode(tc->pool) == PM_FAIL) {
2748 bio_io_error(bio);
2749 return DM_MAPIO_SUBMITTED;
2750 }
2751
f73f44eb 2752 if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
7d327fe0 2753 thin_defer_bio_with_throttle(tc, bio);
991d9fa0
JT
2754 return DM_MAPIO_SUBMITTED;
2755 }
2756
c822ed96
JT
2757 /*
2758 * We must hold the virtual cell before doing the lookup, otherwise
2759 * there's a race with discard.
2760 */
2761 build_virtual_key(tc->td, block, &key);
a374bb21 2762 if (bio_detain(tc->pool, &key, bio, &virt_cell))
c822ed96
JT
2763 return DM_MAPIO_SUBMITTED;
2764
991d9fa0
JT
2765 r = dm_thin_find_block(td, block, 0, &result);
2766
2767 /*
2768 * Note that we defer readahead too.
2769 */
2770 switch (r) {
2771 case 0:
2772 if (unlikely(result.shared)) {
2773 /*
2774 * We have a race condition here between the
2775 * result.shared value returned by the lookup and
2776 * snapshot creation, which may cause new
2777 * sharing.
2778 *
2779 * To avoid this, always quiesce the origin before
2780 * taking the snap. You want to do this anyway to
2781 * ensure a consistent application view
2782 * (i.e. lockfs).
2783 *
2784 * More distant ancestors are irrelevant. The
2785 * shared flag will be set in their case.
2786 */
a374bb21 2787 thin_defer_cell(tc, virt_cell);
e8088073 2788 return DM_MAPIO_SUBMITTED;
991d9fa0 2789 }
e8088073 2790
e8088073 2791 build_data_key(tc->td, result.block, &key);
a374bb21
JT
2792 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
2793 cell_defer_no_holder(tc, virt_cell);
e8088073
JT
2794 return DM_MAPIO_SUBMITTED;
2795 }
2796
2797 inc_all_io_entry(tc->pool, bio);
a374bb21
JT
2798 cell_defer_no_holder(tc, data_cell);
2799 cell_defer_no_holder(tc, virt_cell);
e8088073
JT
2800
2801 remap(tc, bio, result.block);
2802 return DM_MAPIO_REMAPPED;
991d9fa0
JT
2803
2804 case -ENODATA:
e49e5829 2805 case -EWOULDBLOCK:
a374bb21 2806 thin_defer_cell(tc, virt_cell);
2aab3850 2807 return DM_MAPIO_SUBMITTED;
e49e5829
JT
2808
2809 default:
2810 /*
2811 * Must always call bio_io_error on failure.
2812 * dm_thin_find_block can fail with -EINVAL if the
2813 * pool is switched to fail-io mode.
2814 */
2815 bio_io_error(bio);
a374bb21 2816 cell_defer_no_holder(tc, virt_cell);
2aab3850 2817 return DM_MAPIO_SUBMITTED;
991d9fa0 2818 }
991d9fa0
JT
2819}
2820
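/*
 * Splice every active thin's retry_on_resume_list back onto its
 * deferred_bio_list so those bios get another pass through the worker.
 */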
c140e1c4 2821static void requeue_bios(struct pool *pool)
991d9fa0 2822{
c140e1c4
MS
2823 struct thin_c *tc;
2824
2825 rcu_read_lock();
2826 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
8e0c9dac 2827 spin_lock_irq(&tc->lock);
c140e1c4
MS
2828 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2829 bio_list_init(&tc->retry_on_resume_list);
8e0c9dac 2830 spin_unlock_irq(&tc->lock);
c140e1c4
MS
2831 }
2832 rcu_read_unlock();
991d9fa0
JT
2833}
2834
a4a82ce3
HM
2835/*
2836 *--------------------------------------------------------------
991d9fa0 2837 * Binding of control targets to a pool object
a4a82ce3
HM
2838 *--------------------------------------------------------------
2839 */
58051b94
JT
2840static bool is_factor(sector_t block_size, uint32_t n)
2841{
2842 return !sector_div(block_size, n);
2843}
2844
9bc142dd
MS
2845/*
2846 * If discard_passdown was enabled, verify that the data device
0424caa1 2847 * supports discards. Disable discard_passdown if not.
9bc142dd 2848 */
0424caa1 2849static void disable_passdown_if_not_supported(struct pool_c *pt)
9bc142dd 2850{
0424caa1
MS
2851 struct pool *pool = pt->pool;
2852 struct block_device *data_bdev = pt->data_dev->bdev;
2853 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
0424caa1 2854 const char *reason = NULL;
9bc142dd 2855
0424caa1 2856 if (!pt->adjusted_pf.discard_passdown)
9bc142dd
MS
2857 return;
2858
70200574 2859 if (!bdev_max_discard_sectors(pt->data_dev->bdev))
0424caa1
MS
2860 reason = "discard unsupported";
2861
2862 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2863 reason = "max discard sectors smaller than a block";
9bc142dd 2864
0424caa1 2865 if (reason) {
385411ff 2866 DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason);
0424caa1
MS
2867 pt->adjusted_pf.discard_passdown = false;
2868 }
9bc142dd
MS
2869}
2870
991d9fa0
JT
2871static int bind_control_target(struct pool *pool, struct dm_target *ti)
2872{
2873 struct pool_c *pt = ti->private;
2874
e49e5829 2875 /*
9b7aaa64 2876 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
e49e5829 2877 */
07f2b6e0 2878 enum pool_mode old_mode = get_pool_mode(pool);
0424caa1 2879 enum pool_mode new_mode = pt->adjusted_pf.mode;
e49e5829 2880
8b64e881
MS
2881 /*
2882 * Don't change the pool's mode until set_pool_mode() below.
2883 * Otherwise the pool's process_* function pointers may
2884 * not match the desired pool mode.
2885 */
2886 pt->adjusted_pf.mode = old_mode;
2887
2888 pool->ti = ti;
2889 pool->pf = pt->adjusted_pf;
2890 pool->low_water_blocks = pt->low_water_blocks;
2891
9bc142dd 2892 set_pool_mode(pool, new_mode);
f402693d 2893
991d9fa0
JT
2894 return 0;
2895}
2896
2897static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2898{
2899 if (pool->ti == ti)
2900 pool->ti = NULL;
2901}
2902
a4a82ce3
HM
2903/*
2904 *--------------------------------------------------------------
991d9fa0 2905 * Pool creation
a4a82ce3
HM
2906 *--------------------------------------------------------------
2907 */
67e2e2b2
JT
2908/* Initialize pool features. */
2909static void pool_features_init(struct pool_features *pf)
2910{
e49e5829 2911 pf->mode = PM_WRITE;
9bc142dd
MS
2912 pf->zero_new_blocks = true;
2913 pf->discard_enabled = true;
2914 pf->discard_passdown = true;
787a996c 2915 pf->error_if_no_space = false;
67e2e2b2
JT
2916}
2917
991d9fa0
JT
2918static void __pool_destroy(struct pool *pool)
2919{
2920 __pool_table_remove(pool);
2921
a822c83e 2922 vfree(pool->cell_sort_array);
991d9fa0
JT
2923 if (dm_pool_metadata_close(pool->pmd) < 0)
2924 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2925
44feb387 2926 dm_bio_prison_destroy(pool->prison);
991d9fa0
JT
2927 dm_kcopyd_client_destroy(pool->copier);
2928
88430ebc
LM
2929 cancel_delayed_work_sync(&pool->waker);
2930 cancel_delayed_work_sync(&pool->no_space_timeout);
991d9fa0
JT
2931 if (pool->wq)
2932 destroy_workqueue(pool->wq);
2933
2934 if (pool->next_mapping)
6f1c819c
KO
2935 mempool_free(pool->next_mapping, &pool->mapping_pool);
2936 mempool_exit(&pool->mapping_pool);
44feb387
MS
2937 dm_deferred_set_destroy(pool->shared_read_ds);
2938 dm_deferred_set_destroy(pool->all_io_ds);
991d9fa0
JT
2939 kfree(pool);
2940}
2941
a24c2569 2942static struct kmem_cache *_new_mapping_cache;
a24c2569 2943
991d9fa0
JT
2944static struct pool *pool_create(struct mapped_device *pool_md,
2945 struct block_device *metadata_dev,
873937e7 2946 struct block_device *data_dev,
e49e5829
JT
2947 unsigned long block_size,
2948 int read_only, char **error)
991d9fa0
JT
2949{
2950 int r;
2951 void *err_p;
2952 struct pool *pool;
2953 struct dm_pool_metadata *pmd;
e49e5829 2954 bool format_device = read_only ? false : true;
991d9fa0 2955
e49e5829 2956 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
991d9fa0
JT
2957 if (IS_ERR(pmd)) {
2958 *error = "Error creating metadata object";
2959 return (struct pool *)pmd;
2960 }
2961
d3775354 2962 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
991d9fa0
JT
2963 if (!pool) {
2964 *error = "Error allocating memory for pool";
2965 err_p = ERR_PTR(-ENOMEM);
2966 goto bad_pool;
2967 }
2968
2969 pool->pmd = pmd;
2970 pool->sectors_per_block = block_size;
f9a8e0cd
MP
2971 if (block_size & (block_size - 1))
2972 pool->sectors_per_block_shift = -1;
2973 else
2974 pool->sectors_per_block_shift = __ffs(block_size);
991d9fa0 2975 pool->low_water_blocks = 0;
67e2e2b2 2976 pool_features_init(&pool->pf);
a195db2d 2977 pool->prison = dm_bio_prison_create();
991d9fa0
JT
2978 if (!pool->prison) {
2979 *error = "Error creating pool's bio prison";
2980 err_p = ERR_PTR(-ENOMEM);
2981 goto bad_prison;
2982 }
2983
df5d2e90 2984 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
991d9fa0
JT
2985 if (IS_ERR(pool->copier)) {
2986 r = PTR_ERR(pool->copier);
2987 *error = "Error creating pool's kcopyd client";
2988 err_p = ERR_PTR(r);
2989 goto bad_kcopyd_client;
2990 }
2991
2992 /*
2993 * Create a single-threaded workqueue that will service all devices
2994 * that use this metadata.
2995 */
2996 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2997 if (!pool->wq) {
2998 *error = "Error creating pool's workqueue";
2999 err_p = ERR_PTR(-ENOMEM);
3000 goto bad_wq;
3001 }
3002
7d327fe0 3003 throttle_init(&pool->throttle);
991d9fa0 3004 INIT_WORK(&pool->worker, do_worker);
905e51b3 3005 INIT_DELAYED_WORK(&pool->waker, do_waker);
85ad643b 3006 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
991d9fa0 3007 spin_lock_init(&pool->lock);
991d9fa0 3008 bio_list_init(&pool->deferred_flush_bios);
4ae280b4 3009 bio_list_init(&pool->deferred_flush_completions);
991d9fa0 3010 INIT_LIST_HEAD(&pool->prepared_mappings);
104655fd 3011 INIT_LIST_HEAD(&pool->prepared_discards);
2a0fbffb 3012 INIT_LIST_HEAD(&pool->prepared_discards_pt2);
c140e1c4 3013 INIT_LIST_HEAD(&pool->active_thins);
88a6621b 3014 pool->low_water_triggered = false;
80e96c54 3015 pool->suspended = true;
c3667cc6 3016 pool->out_of_data_space = false;
44feb387
MS
3017
3018 pool->shared_read_ds = dm_deferred_set_create();
3019 if (!pool->shared_read_ds) {
3020 *error = "Error creating pool's shared read deferred set";
3021 err_p = ERR_PTR(-ENOMEM);
3022 goto bad_shared_read_ds;
3023 }
3024
3025 pool->all_io_ds = dm_deferred_set_create();
3026 if (!pool->all_io_ds) {
3027 *error = "Error creating pool's all io deferred set";
3028 err_p = ERR_PTR(-ENOMEM);
3029 goto bad_all_io_ds;
3030 }
991d9fa0
JT
3031
3032 pool->next_mapping = NULL;
6f1c819c
KO
3033 r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
3034 _new_mapping_cache);
3035 if (r) {
991d9fa0 3036 *error = "Error creating pool's mapping mempool";
6f1c819c 3037 err_p = ERR_PTR(r);
991d9fa0
JT
3038 goto bad_mapping_pool;
3039 }
3040
42bc47b3
KC
3041 pool->cell_sort_array =
3042 vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
3043 sizeof(*pool->cell_sort_array)));
a822c83e
JT
3044 if (!pool->cell_sort_array) {
3045 *error = "Error allocating cell sort array";
3046 err_p = ERR_PTR(-ENOMEM);
3047 goto bad_sort_array;
3048 }
3049
991d9fa0 3050 pool->ref_count = 1;
905e51b3 3051 pool->last_commit_jiffies = jiffies;
991d9fa0
JT
3052 pool->pool_md = pool_md;
3053 pool->md_dev = metadata_dev;
873937e7 3054 pool->data_dev = data_dev;
991d9fa0
JT
3055 __pool_table_insert(pool);
3056
3057 return pool;
3058
a822c83e 3059bad_sort_array:
6f1c819c 3060 mempool_exit(&pool->mapping_pool);
991d9fa0 3061bad_mapping_pool:
44feb387
MS
3062 dm_deferred_set_destroy(pool->all_io_ds);
3063bad_all_io_ds:
3064 dm_deferred_set_destroy(pool->shared_read_ds);
3065bad_shared_read_ds:
991d9fa0
JT
3066 destroy_workqueue(pool->wq);
3067bad_wq:
3068 dm_kcopyd_client_destroy(pool->copier);
3069bad_kcopyd_client:
44feb387 3070 dm_bio_prison_destroy(pool->prison);
991d9fa0
JT
3071bad_prison:
3072 kfree(pool);
3073bad_pool:
3074 if (dm_pool_metadata_close(pmd))
3075 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
3076
3077 return err_p;
3078}
3079
3080static void __pool_inc(struct pool *pool)
3081{
3082 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3083 pool->ref_count++;
3084}
3085
3086static void __pool_dec(struct pool *pool)
3087{
3088 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3089 BUG_ON(!pool->ref_count);
3090 if (!--pool->ref_count)
3091 __pool_destroy(pool);
3092}
3093
3094static struct pool *__pool_find(struct mapped_device *pool_md,
3095 struct block_device *metadata_dev,
873937e7 3096 struct block_device *data_dev,
e49e5829
JT
3097 unsigned long block_size, int read_only,
3098 char **error, int *created)
991d9fa0
JT
3099{
3100 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
3101
3102 if (pool) {
f09996c9
MS
3103 if (pool->pool_md != pool_md) {
3104 *error = "metadata device already in use by a pool";
991d9fa0 3105 return ERR_PTR(-EBUSY);
f09996c9 3106 }
873937e7
MP
3107 if (pool->data_dev != data_dev) {
3108 *error = "data device already in use by a pool";
3109 return ERR_PTR(-EBUSY);
3110 }
991d9fa0
JT
3111 __pool_inc(pool);
3112
3113 } else {
3114 pool = __pool_table_lookup(pool_md);
3115 if (pool) {
873937e7 3116 if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
f09996c9 3117 *error = "different pool cannot replace a pool";
991d9fa0 3118 return ERR_PTR(-EINVAL);
f09996c9 3119 }
991d9fa0
JT
3120 __pool_inc(pool);
3121
67e2e2b2 3122 } else {
873937e7 3123 pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
67e2e2b2
JT
3124 *created = 1;
3125 }
991d9fa0
JT
3126 }
3127
3128 return pool;
3129}
3130
a4a82ce3
HM
3131/*
3132 *--------------------------------------------------------------
991d9fa0 3133 * Pool target methods
a4a82ce3
HM
3134 *--------------------------------------------------------------
3135 */
991d9fa0
JT
3136static void pool_dtr(struct dm_target *ti)
3137{
3138 struct pool_c *pt = ti->private;
3139
3140 mutex_lock(&dm_thin_pool_table.mutex);
3141
3142 unbind_control_target(pt->pool, ti);
3143 __pool_dec(pt->pool);
3144 dm_put_device(ti, pt->metadata_dev);
3145 dm_put_device(ti, pt->data_dev);
3146 kfree(pt);
3147
3148 mutex_unlock(&dm_thin_pool_table.mutex);
3149}
3150
991d9fa0
JT
3151static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
3152 struct dm_target *ti)
3153{
3154 int r;
86a3238c 3155 unsigned int argc;
991d9fa0
JT
3156 const char *arg_name;
3157
5916a22b 3158 static const struct dm_arg _args[] = {
74aa45c3 3159 {0, 4, "Invalid number of pool feature arguments"},
991d9fa0
JT
3160 };
3161
3162 /*
3163 * No feature arguments supplied.
3164 */
3165 if (!as->argc)
3166 return 0;
3167
3168 r = dm_read_arg_group(_args, as, &argc, &ti->error);
3169 if (r)
3170 return -EINVAL;
3171
3172 while (argc && !r) {
3173 arg_name = dm_shift_arg(as);
3174 argc--;
3175
e49e5829 3176 if (!strcasecmp(arg_name, "skip_block_zeroing"))
9bc142dd 3177 pf->zero_new_blocks = false;
e49e5829
JT
3178
3179 else if (!strcasecmp(arg_name, "ignore_discard"))
9bc142dd 3180 pf->discard_enabled = false;
e49e5829
JT
3181
3182 else if (!strcasecmp(arg_name, "no_discard_passdown"))
9bc142dd 3183 pf->discard_passdown = false;
991d9fa0 3184
e49e5829
JT
3185 else if (!strcasecmp(arg_name, "read_only"))
3186 pf->mode = PM_READ_ONLY;
3187
787a996c
MS
3188 else if (!strcasecmp(arg_name, "error_if_no_space"))
3189 pf->error_if_no_space = true;
3190
e49e5829
JT
3191 else {
3192 ti->error = "Unrecognised pool feature requested";
3193 r = -EINVAL;
3194 break;
3195 }
991d9fa0
JT
3196 }
3197
3198 return r;
3199}
3200
ac8c3f3d
JT
3201static void metadata_low_callback(void *context)
3202{
3203 struct pool *pool = context;
3204
3205 DMWARN("%s: reached low water mark for metadata device: sending event.",
3206 dm_device_name(pool->pool_md));
3207
3208 dm_table_event(pool->ti->table);
3209}
3210
694cfe7f
NT
3211/*
3212 * We need to flush the data device **before** committing the metadata.
3213 *
3214 * This ensures that the data blocks of any newly inserted mappings are
3215 * properly written to non-volatile storage and won't be lost in case of a
3216 * crash.
3217 *
3218 * Failure to do so can result in data corruption in the case of internal or
3219 * external snapshots and in the case of newly provisioned blocks, when block
3220 * zeroing is enabled.
3221 */
3222static int metadata_pre_commit_callback(void *context)
3223{
f06c03d1 3224 struct pool *pool = context;
694cfe7f 3225
28d7d128 3226 return blkdev_issue_flush(pool->data_dev);
694cfe7f
NT
3227}
3228
7d48935e
MS
3229static sector_t get_dev_size(struct block_device *bdev)
3230{
6dcbb52c 3231 return bdev_nr_sectors(bdev);
7d48935e
MS
3232}
3233
3234static void warn_if_metadata_device_too_big(struct block_device *bdev)
b17446df 3235{
7d48935e 3236 sector_t metadata_dev_size = get_dev_size(bdev);
b17446df 3237
7d48935e 3238 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
385411ff
CH
3239 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
3240 bdev, THIN_METADATA_MAX_SECTORS);
7d48935e
MS
3241}
3242
3243static sector_t get_metadata_dev_size(struct block_device *bdev)
3244{
3245 sector_t metadata_dev_size = get_dev_size(bdev);
3246
3247 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
3248 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
b17446df
JT
3249
3250 return metadata_dev_size;
3251}
3252
24347e95
JT
3253static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
3254{
3255 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
3256
7d48935e 3257 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
24347e95
JT
3258
3259 return metadata_dev_size;
3260}
3261
ac8c3f3d
JT
3262/*
3263 * When a metadata threshold is crossed, a dm event is triggered, and
3264 * userland should respond by growing the metadata device. We could let
3265 * userland set the threshold, like we do with the data threshold, but I'm
3266 * not sure they know enough to do this well.
3267 */
3268static dm_block_t calc_metadata_threshold(struct pool_c *pt)
3269{
3270 /*
3271 * 4M is ample for all ops with the possible exception of thin
3272 * device deletion which is harmless if it fails (just retry the
3273 * delete after you've grown the device).
3274 */
3275 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
0ef0b471 3276
ac8c3f3d
JT
3277 return min((dm_block_t)1024ULL /* 4M */, quarter);
3278}
3279
991d9fa0
JT
3280/*
3281 * thin-pool <metadata dev> <data dev>
3282 * <data block size (sectors)>
3283 * <low water mark (blocks)>
3284 * [<#feature args> [<arg>]*]
3285 *
3286 * Optional feature arguments are:
3287 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
67e2e2b2
JT
3288 * ignore_discard: disable discard
3289 * no_discard_passdown: don't pass discards down to the data device
787a996c
MS
3290 * read_only: Don't allow any changes to be made to the pool metadata.
3291 * error_if_no_space: error IOs, instead of queueing, if no space.
991d9fa0 3292 */
86a3238c 3293static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
991d9fa0 3294{
67e2e2b2 3295 int r, pool_created = 0;
991d9fa0
JT
3296 struct pool_c *pt;
3297 struct pool *pool;
3298 struct pool_features pf;
3299 struct dm_arg_set as;
3300 struct dm_dev *data_dev;
3301 unsigned long block_size;
3302 dm_block_t low_water_blocks;
3303 struct dm_dev *metadata_dev;
05bdb996 3304 blk_mode_t metadata_mode;
991d9fa0
JT
3305
3306 /*
3307 * FIXME Remove validation from scope of lock.
3308 */
3309 mutex_lock(&dm_thin_pool_table.mutex);
3310
3311 if (argc < 4) {
3312 ti->error = "Invalid argument count";
3313 r = -EINVAL;
3314 goto out_unlock;
3315 }
5d0db96d 3316
991d9fa0
JT
3317 as.argc = argc;
3318 as.argv = argv;
3319
70de2cbd
JCXF
3320 /* make sure metadata and data are different devices */
3321 if (!strcmp(argv[0], argv[1])) {
3322 ti->error = "Error setting metadata or data device";
3323 r = -EINVAL;
3324 goto out_unlock;
3325 }
3326
5d0db96d
JT
3327 /*
3328 * Set default pool features.
3329 */
3330 pool_features_init(&pf);
3331
3332 dm_consume_args(&as, 4);
3333 r = parse_pool_features(&as, &pf, ti);
3334 if (r)
3335 goto out_unlock;
3336
05bdb996
CH
3337 metadata_mode = BLK_OPEN_READ |
3338 ((pf.mode == PM_READ_ONLY) ? 0 : BLK_OPEN_WRITE);
5d0db96d 3339 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
991d9fa0
JT
3340 if (r) {
3341 ti->error = "Error opening metadata block device";
3342 goto out_unlock;
3343 }
7d48935e 3344 warn_if_metadata_device_too_big(metadata_dev->bdev);
991d9fa0 3345
05bdb996 3346 r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, &data_dev);
991d9fa0
JT
3347 if (r) {
3348 ti->error = "Error getting data device";
3349 goto out_metadata;
3350 }
3351
3352 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
3353 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
3354 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
55f2b8bd 3355 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
991d9fa0
JT
3356 ti->error = "Invalid block size";
3357 r = -EINVAL;
3358 goto out;
3359 }
3360
3361 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
3362 ti->error = "Invalid low water mark";
3363 r = -EINVAL;
3364 goto out;
3365 }
3366
991d9fa0
JT
3367 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
3368 if (!pt) {
3369 r = -ENOMEM;
3370 goto out;
3371 }
3372
873937e7 3373 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
e49e5829 3374 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
991d9fa0
JT
3375 if (IS_ERR(pool)) {
3376 r = PTR_ERR(pool);
3377 goto out_free_pt;
3378 }
3379
67e2e2b2
JT
3380 /*
3381 * 'pool_created' reflects whether this is the first table load.
3382 * Top level discard support is not allowed to be changed after
3383 * initial load. This would require a pool reload to trigger thin
3384 * device changes.
3385 */
3386 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
3387 ti->error = "Discard support cannot be disabled once enabled";
3388 r = -EINVAL;
3389 goto out_flags_changed;
3390 }
3391
991d9fa0
JT
3392 pt->pool = pool;
3393 pt->ti = ti;
3394 pt->metadata_dev = metadata_dev;
3395 pt->data_dev = data_dev;
3396 pt->low_water_blocks = low_water_blocks;
0424caa1 3397 pt->adjusted_pf = pt->requested_pf = pf;
55a62eef 3398 ti->num_flush_bios = 1;
9bbf5fee 3399 ti->limit_swap_bios = true;
9bc142dd 3400
67e2e2b2
JT
3401 /*
3402 * Only need to enable discards if the pool should pass
3403 * them down to the data device. The thin device's discard
3404 * processing will cause mappings to be removed from the btree.
3405 */
3406 if (pf.discard_enabled && pf.discard_passdown) {
55a62eef 3407 ti->num_discard_bios = 1;
67e2e2b2
JT
3408 /*
3409 * Setting 'discards_supported' circumvents the normal
3410 * stacking of discard limits (this keeps the pool and
3411 * thin devices' discard limits consistent).
3412 */
0ac55489 3413 ti->discards_supported = true;
e2dd8aca 3414 ti->max_discard_granularity = true;
67e2e2b2 3415 }
991d9fa0
JT
3416 ti->private = pt;
3417
ac8c3f3d
JT
3418 r = dm_pool_register_metadata_threshold(pt->pool->pmd,
3419 calc_metadata_threshold(pt),
3420 metadata_low_callback,
3421 pool);
3534e5a5
LM
3422 if (r) {
3423 ti->error = "Error registering metadata threshold";
ba30670f 3424 goto out_flags_changed;
3534e5a5 3425 }
ac8c3f3d 3426
f06c03d1
MP
3427 dm_pool_register_pre_commit_callback(pool->pmd,
3428 metadata_pre_commit_callback, pool);
3429
991d9fa0
JT
3430 mutex_unlock(&dm_thin_pool_table.mutex);
3431
3432 return 0;
3433
67e2e2b2
JT
3434out_flags_changed:
3435 __pool_dec(pool);
991d9fa0
JT
3436out_free_pt:
3437 kfree(pt);
3438out:
3439 dm_put_device(ti, data_dev);
3440out_metadata:
3441 dm_put_device(ti, metadata_dev);
3442out_unlock:
3443 mutex_unlock(&dm_thin_pool_table.mutex);
3444
3445 return r;
3446}
3447
7de3ee57 3448static int pool_map(struct dm_target *ti, struct bio *bio)
991d9fa0
JT
3449{
3450 int r;
3451 struct pool_c *pt = ti->private;
3452 struct pool *pool = pt->pool;
991d9fa0
JT
3453
3454 /*
3455 * As this is a singleton target, ti->begin is always zero.
3456 */
8e0c9dac 3457 spin_lock_irq(&pool->lock);
74d46992 3458 bio_set_dev(bio, pt->data_dev->bdev);
991d9fa0 3459 r = DM_MAPIO_REMAPPED;
8e0c9dac 3460 spin_unlock_irq(&pool->lock);
991d9fa0
JT
3461
3462 return r;
3463}
3464
b17446df 3465static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
991d9fa0
JT
3466{
3467 int r;
3468 struct pool_c *pt = ti->private;
3469 struct pool *pool = pt->pool;
55f2b8bd
MS
3470 sector_t data_size = ti->len;
3471 dm_block_t sb_data_size;
991d9fa0 3472
b17446df 3473 *need_commit = false;
991d9fa0 3474
55f2b8bd
MS
3475 (void) sector_div(data_size, pool->sectors_per_block);
3476
991d9fa0
JT
3477 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
3478 if (r) {
4fa5971a
MS
3479 DMERR("%s: failed to retrieve data device size",
3480 dm_device_name(pool->pool_md));
991d9fa0
JT
3481 return r;
3482 }
3483
3484 if (data_size < sb_data_size) {
4fa5971a
MS
3485 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3486 dm_device_name(pool->pool_md),
55f2b8bd 3487 (unsigned long long)data_size, sb_data_size);
991d9fa0
JT
3488 return -EINVAL;
3489
3490 } else if (data_size > sb_data_size) {
07f2b6e0
MS
3491 if (dm_pool_metadata_needs_check(pool->pmd)) {
3492 DMERR("%s: unable to grow the data device until repaired.",
3493 dm_device_name(pool->pool_md));
3494 return 0;
3495 }
3496
6f7f51d4
MS
3497 if (sb_data_size)
3498 DMINFO("%s: growing the data device from %llu to %llu blocks",
3499 dm_device_name(pool->pool_md),
3500 sb_data_size, (unsigned long long)data_size);
991d9fa0
JT
3501 r = dm_pool_resize_data_dev(pool->pmd, data_size);
3502 if (r) {
b5330655 3503 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
991d9fa0
JT
3504 return r;
3505 }
3506
b17446df 3507 *need_commit = true;
991d9fa0
JT
3508 }
3509
3510 return 0;
3511}
3512
24347e95
JT
3513static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3514{
3515 int r;
3516 struct pool_c *pt = ti->private;
3517 struct pool *pool = pt->pool;
3518 dm_block_t metadata_dev_size, sb_metadata_dev_size;
3519
3520 *need_commit = false;
3521
610bba8b 3522 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
24347e95
JT
3523
3524 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
3525 if (r) {
4fa5971a
MS
3526 DMERR("%s: failed to retrieve metadata device size",
3527 dm_device_name(pool->pool_md));
24347e95
JT
3528 return r;
3529 }
3530
3531 if (metadata_dev_size < sb_metadata_dev_size) {
4fa5971a
MS
3532 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3533 dm_device_name(pool->pool_md),
24347e95
JT
3534 metadata_dev_size, sb_metadata_dev_size);
3535 return -EINVAL;
3536
3537 } else if (metadata_dev_size > sb_metadata_dev_size) {
07f2b6e0
MS
3538 if (dm_pool_metadata_needs_check(pool->pmd)) {
3539 DMERR("%s: unable to grow the metadata device until repaired.",
3540 dm_device_name(pool->pool_md));
3541 return 0;
3542 }
3543
7d48935e 3544 warn_if_metadata_device_too_big(pool->md_dev);
6f7f51d4
MS
3545 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3546 dm_device_name(pool->pool_md),
3547 sb_metadata_dev_size, metadata_dev_size);
3ab91828
JT
3548
3549 if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3550 set_pool_mode(pool, PM_WRITE);
3551
24347e95
JT
3552 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
3553 if (r) {
b5330655 3554 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
24347e95
JT
3555 return r;
3556 }
3557
3558 *need_commit = true;
3559 }
3560
3561 return 0;
3562}
3563
/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the pool's view of the data device if it has grown.
 *
 * This copes both with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
3575static int pool_preresume(struct dm_target *ti)
3576{
3577 int r;
24347e95 3578 bool need_commit1, need_commit2;
3579 struct pool_c *pt = ti->private;
3580 struct pool *pool = pt->pool;
3581
3582 /*
3583 * Take control of the pool object.
3584 */
3585 r = bind_control_target(pool, ti);
3586 if (r)
19eb1650 3587 goto out;
3588
3589 r = maybe_resize_data_dev(ti, &need_commit1);
3590 if (r)
19eb1650 3591 goto out;
b17446df 3592
3593 r = maybe_resize_metadata_dev(ti, &need_commit2);
3594 if (r)
19eb1650 3595 goto out;
3596
3597 if (need_commit1 || need_commit2)
020cc3b5 3598 (void) commit(pool);
3599out:
	/*
	 * When a thin-pool is PM_FAIL, it cannot be rebuilt while bios
	 * sit on the deferred list.  Return 0 here so that pool_resume()
	 * is still called and can flush that IO.
	 */
3605 if (r && get_pool_mode(pool) == PM_FAIL)
3606 r = 0;
b17446df 3607
19eb1650 3608 return r;
3609}
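/*
 * Illustrative resume flow (device name is arbitrary): once the
 * underlying data device has been grown, a plain suspend/resume cycle
 * of the pool is enough for the extra space to be picked up:
 *
 *     dmsetup suspend pool
 *     dmsetup resume pool
 *
 * pool_preresume() then sees data_size > sb_data_size and calls
 * dm_pool_resize_data_dev() before IO is allowed to continue.
 */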
3610
583024d2
MS
3611static void pool_suspend_active_thins(struct pool *pool)
3612{
3613 struct thin_c *tc;
3614
3615 /* Suspend all active thin devices */
3616 tc = get_first_thin(pool);
3617 while (tc) {
3618 dm_internal_suspend_noflush(tc->thin_md);
3619 tc = get_next_thin(pool, tc);
3620 }
3621}
3622
3623static void pool_resume_active_thins(struct pool *pool)
3624{
3625 struct thin_c *tc;
3626
3627 /* Resume all active thin devices */
3628 tc = get_first_thin(pool);
3629 while (tc) {
3630 dm_internal_resume(tc->thin_md);
3631 tc = get_next_thin(pool, tc);
3632 }
3633}
3634
991d9fa0
JT
3635static void pool_resume(struct dm_target *ti)
3636{
3637 struct pool_c *pt = ti->private;
3638 struct pool *pool = pt->pool;
991d9fa0 3639
583024d2
MS
3640 /*
3641 * Must requeue active_thins' bios and then resume
3642 * active_thins _before_ clearing 'suspend' flag.
3643 */
3644 requeue_bios(pool);
3645 pool_resume_active_thins(pool);
3646
8e0c9dac 3647 spin_lock_irq(&pool->lock);
88a6621b 3648 pool->low_water_triggered = false;
80e96c54 3649 pool->suspended = false;
8e0c9dac 3650 spin_unlock_irq(&pool->lock);
80e96c54 3651
905e51b3 3652 do_waker(&pool->waker.work);
991d9fa0
JT
3653}
3654
80e96c54
MS
3655static void pool_presuspend(struct dm_target *ti)
3656{
3657 struct pool_c *pt = ti->private;
3658 struct pool *pool = pt->pool;
80e96c54 3659
8e0c9dac 3660 spin_lock_irq(&pool->lock);
80e96c54 3661 pool->suspended = true;
8e0c9dac 3662 spin_unlock_irq(&pool->lock);
583024d2
MS
3663
3664 pool_suspend_active_thins(pool);
80e96c54
MS
3665}
3666
3667static void pool_presuspend_undo(struct dm_target *ti)
3668{
3669 struct pool_c *pt = ti->private;
3670 struct pool *pool = pt->pool;
80e96c54 3671
583024d2
MS
3672 pool_resume_active_thins(pool);
3673
8e0c9dac 3674 spin_lock_irq(&pool->lock);
80e96c54 3675 pool->suspended = false;
8e0c9dac 3676 spin_unlock_irq(&pool->lock);
80e96c54
MS
3677}
3678
991d9fa0
JT
3679static void pool_postsuspend(struct dm_target *ti)
3680{
991d9fa0
JT
3681 struct pool_c *pt = ti->private;
3682 struct pool *pool = pt->pool;
3683
18d03e8c
NB
3684 cancel_delayed_work_sync(&pool->waker);
3685 cancel_delayed_work_sync(&pool->no_space_timeout);
991d9fa0 3686 flush_workqueue(pool->wq);
020cc3b5 3687 (void) commit(pool);
991d9fa0
JT
3688}
3689
86a3238c 3690static int check_arg_count(unsigned int argc, unsigned int args_required)
991d9fa0
JT
3691{
3692 if (argc != args_required) {
3693 DMWARN("Message received with %u arguments instead of %u.",
3694 argc, args_required);
3695 return -EINVAL;
3696 }
3697
3698 return 0;
3699}
3700
3701static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3702{
3703 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
3704 *dev_id <= MAX_DEV_ID)
3705 return 0;
3706
3707 if (warning)
3708 DMWARN("Message received with invalid device id: %s", arg);
3709
3710 return -EINVAL;
3711}
3712
86a3238c 3713static int process_create_thin_mesg(unsigned int argc, char **argv, struct pool *pool)
991d9fa0
JT
3714{
3715 dm_thin_id dev_id;
3716 int r;
3717
3718 r = check_arg_count(argc, 2);
3719 if (r)
3720 return r;
3721
3722 r = read_dev_id(argv[1], &dev_id, 1);
3723 if (r)
3724 return r;
3725
3726 r = dm_pool_create_thin(pool->pmd, dev_id);
3727 if (r) {
3728 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3729 argv[1]);
3730 return r;
3731 }
3732
3733 return 0;
3734}
3735
86a3238c 3736static int process_create_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
991d9fa0
JT
3737{
3738 dm_thin_id dev_id;
3739 dm_thin_id origin_dev_id;
3740 int r;
3741
3742 r = check_arg_count(argc, 3);
3743 if (r)
3744 return r;
3745
3746 r = read_dev_id(argv[1], &dev_id, 1);
3747 if (r)
3748 return r;
3749
3750 r = read_dev_id(argv[2], &origin_dev_id, 1);
3751 if (r)
3752 return r;
3753
3754 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
3755 if (r) {
3756 DMWARN("Creation of new snapshot %s of device %s failed.",
3757 argv[1], argv[2]);
3758 return r;
3759 }
3760
3761 return 0;
3762}
3763
86a3238c 3764static int process_delete_mesg(unsigned int argc, char **argv, struct pool *pool)
991d9fa0
JT
3765{
3766 dm_thin_id dev_id;
3767 int r;
3768
3769 r = check_arg_count(argc, 2);
3770 if (r)
3771 return r;
3772
3773 r = read_dev_id(argv[1], &dev_id, 1);
3774 if (r)
3775 return r;
3776
3777 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
3778 if (r)
3779 DMWARN("Deletion of thin device %s failed.", argv[1]);
3780
3781 return r;
3782}
3783
86a3238c 3784static int process_set_transaction_id_mesg(unsigned int argc, char **argv, struct pool *pool)
991d9fa0
JT
3785{
3786 dm_thin_id old_id, new_id;
3787 int r;
3788
3789 r = check_arg_count(argc, 3);
3790 if (r)
3791 return r;
3792
3793 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
3794 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3795 return -EINVAL;
3796 }
3797
3798 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
3799 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3800 return -EINVAL;
3801 }
3802
3803 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
3804 if (r) {
3805 DMWARN("Failed to change transaction id from %s to %s.",
3806 argv[1], argv[2]);
3807 return r;
3808 }
3809
3810 return 0;
3811}
3812
86a3238c 3813static int process_reserve_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
cc8394d8
JT
3814{
3815 int r;
3816
3817 r = check_arg_count(argc, 1);
3818 if (r)
3819 return r;
3820
020cc3b5 3821 (void) commit(pool);
0d200aef 3822
cc8394d8
JT
3823 r = dm_pool_reserve_metadata_snap(pool->pmd);
3824 if (r)
3825 DMWARN("reserve_metadata_snap message failed.");
3826
3827 return r;
3828}
3829
86a3238c 3830static int process_release_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
cc8394d8
JT
3831{
3832 int r;
3833
3834 r = check_arg_count(argc, 1);
3835 if (r)
3836 return r;
3837
3838 r = dm_pool_release_metadata_snap(pool->pmd);
3839 if (r)
3840 DMWARN("release_metadata_snap message failed.");
3841
3842 return r;
3843}
3844
/*
 * Messages supported:
 *   create_thin <dev_id>
 *   create_snap <dev_id> <origin_id>
 *   delete <dev_id>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 *   reserve_metadata_snap
 *   release_metadata_snap
 */
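/*
 * Illustrative usage (assuming the pool is active as /dev/mapper/pool;
 * the sector argument to "dmsetup message" is always 0 for this
 * singleton target):
 *
 *     dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *     dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *     dmsetup message /dev/mapper/pool 0 "delete 1"
 */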
3854static int pool_message(struct dm_target *ti, unsigned int argc, char **argv,
3855 char *result, unsigned int maxlen)
3856{
3857 int r = -EINVAL;
3858 struct pool_c *pt = ti->private;
3859 struct pool *pool = pt->pool;
3860
3ab91828 3861 if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
3862 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3863 dm_device_name(pool->pool_md));
fd467696 3864 return -EOPNOTSUPP;
3865 }
3866
3867 if (!strcasecmp(argv[0], "create_thin"))
3868 r = process_create_thin_mesg(argc, argv, pool);
3869
3870 else if (!strcasecmp(argv[0], "create_snap"))
3871 r = process_create_snap_mesg(argc, argv, pool);
3872
3873 else if (!strcasecmp(argv[0], "delete"))
3874 r = process_delete_mesg(argc, argv, pool);
3875
3876 else if (!strcasecmp(argv[0], "set_transaction_id"))
3877 r = process_set_transaction_id_mesg(argc, argv, pool);
3878
3879 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
3880 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3881
3882 else if (!strcasecmp(argv[0], "release_metadata_snap"))
3883 r = process_release_metadata_snap_mesg(argc, argv, pool);
3884
3885 else
3886 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
3887
e49e5829 3888 if (!r)
020cc3b5 3889 (void) commit(pool);
3890
3891 return r;
3892}
3893
e49e5829 3894static void emit_flags(struct pool_features *pf, char *result,
86a3238c 3895 unsigned int sz, unsigned int maxlen)
e49e5829 3896{
86a3238c 3897 unsigned int count = !pf->zero_new_blocks + !pf->discard_enabled +
3898 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3899 pf->error_if_no_space;
3900 DMEMIT("%u ", count);
3901
3902 if (!pf->zero_new_blocks)
3903 DMEMIT("skip_block_zeroing ");
3904
3905 if (!pf->discard_enabled)
3906 DMEMIT("ignore_discard ");
3907
3908 if (!pf->discard_passdown)
3909 DMEMIT("no_discard_passdown ");
3910
3911 if (pf->mode == PM_READ_ONLY)
3912 DMEMIT("read_only ");
3913
3914 if (pf->error_if_no_space)
3915 DMEMIT("error_if_no_space ");
3916}
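/*
 * The flags emitted above form the tail of the pool's table line.  An
 * illustrative "dmsetup table" line (device numbers, length and low
 * water mark are arbitrary):
 *
 *     0 20971520 thin-pool 253:0 253:1 128 32768 1 skip_block_zeroing
 *
 * i.e. metadata on 253:0, data on 253:1, 128-sector (64KiB) blocks, a
 * low water mark of 32768 blocks and one feature argument.
 */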
3917
/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 *    <pool mode> <discard config> <no space config> <needs_check>
 *    <metadata low watermark>
 */
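/*
 * Example STATUSTYPE_INFO output (illustrative values):
 *
 *     0 441/77593 6229/2097152 - rw discard_passdown queue_if_no_space - 1024
 *
 * i.e. transaction id 0, 441 of 77593 metadata blocks used, 6229 of
 * 2097152 data blocks used, no held metadata root, read-write mode,
 * discards passed down, IO queued when data space runs out, no
 * needs_check flag and a metadata low watermark of 1024 blocks.
 */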
fd7c092e 3924static void pool_status(struct dm_target *ti, status_type_t type,
86a3238c 3925 unsigned int status_flags, char *result, unsigned int maxlen)
991d9fa0 3926{
e49e5829 3927 int r;
86a3238c 3928 unsigned int sz = 0;
3929 uint64_t transaction_id;
3930 dm_block_t nr_free_blocks_data;
3931 dm_block_t nr_free_blocks_metadata;
3932 dm_block_t nr_blocks_data;
3933 dm_block_t nr_blocks_metadata;
3934 dm_block_t held_root;
3ab91828 3935 enum pool_mode mode;
991d9fa0
JT
3936 char buf[BDEVNAME_SIZE];
3937 char buf2[BDEVNAME_SIZE];
3938 struct pool_c *pt = ti->private;
3939 struct pool *pool = pt->pool;
3940
3941 switch (type) {
3942 case STATUSTYPE_INFO:
e49e5829
JT
3943 if (get_pool_mode(pool) == PM_FAIL) {
3944 DMEMIT("Fail");
3945 break;
3946 }
3947
1f4e0ff0
AK
3948 /* Commit to ensure statistics aren't out-of-date */
3949 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
020cc3b5 3950 (void) commit(pool);
1f4e0ff0 3951
fd7c092e
MP
3952 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3953 if (r) {
4fa5971a
MS
3954 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3955 dm_device_name(pool->pool_md), r);
fd7c092e
MP
3956 goto err;
3957 }
991d9fa0 3958
fd7c092e
MP
3959 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3960 if (r) {
4fa5971a
MS
3961 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3962 dm_device_name(pool->pool_md), r);
fd7c092e
MP
3963 goto err;
3964 }
991d9fa0
JT
3965
3966 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
fd7c092e 3967 if (r) {
4fa5971a
MS
3968 DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3969 dm_device_name(pool->pool_md), r);
fd7c092e
MP
3970 goto err;
3971 }
991d9fa0 3972
fd7c092e
MP
3973 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3974 if (r) {
4fa5971a
MS
3975 DMERR("%s: dm_pool_get_free_block_count returned %d",
3976 dm_device_name(pool->pool_md), r);
fd7c092e
MP
3977 goto err;
3978 }
991d9fa0
JT
3979
3980 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
fd7c092e 3981 if (r) {
4fa5971a
MS
3982 DMERR("%s: dm_pool_get_data_dev_size returned %d",
3983 dm_device_name(pool->pool_md), r);
fd7c092e
MP
3984 goto err;
3985 }
991d9fa0 3986
cc8394d8 3987 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
fd7c092e 3988 if (r) {
4fa5971a
MS
3989 DMERR("%s: dm_pool_get_metadata_snap returned %d",
3990 dm_device_name(pool->pool_md), r);
fd7c092e
MP
3991 goto err;
3992 }
991d9fa0
JT
3993
3994 DMEMIT("%llu %llu/%llu %llu/%llu ",
3995 (unsigned long long)transaction_id,
3996 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3997 (unsigned long long)nr_blocks_metadata,
3998 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3999 (unsigned long long)nr_blocks_data);
4000
4001 if (held_root)
e49e5829
JT
4002 DMEMIT("%llu ", held_root);
4003 else
4004 DMEMIT("- ");
4005
3ab91828
JT
4006 mode = get_pool_mode(pool);
4007 if (mode == PM_OUT_OF_DATA_SPACE)
3e1a0699 4008 DMEMIT("out_of_data_space ");
3ab91828 4009 else if (is_read_only_pool_mode(mode))
e49e5829 4010 DMEMIT("ro ");
991d9fa0 4011 else
e49e5829
JT
4012 DMEMIT("rw ");
4013
018debea 4014 if (!pool->pf.discard_enabled)
787a996c 4015 DMEMIT("ignore_discard ");
018debea 4016 else if (pool->pf.discard_passdown)
787a996c
MS
4017 DMEMIT("discard_passdown ");
4018 else
4019 DMEMIT("no_discard_passdown ");
4020
4021 if (pool->pf.error_if_no_space)
4022 DMEMIT("error_if_no_space ");
e49e5829 4023 else
787a996c 4024 DMEMIT("queue_if_no_space ");
991d9fa0 4025
e4c78e21
MS
4026 if (dm_pool_metadata_needs_check(pool->pmd))
4027 DMEMIT("needs_check ");
4028 else
4029 DMEMIT("- ");
4030
63c8ecb6
AG
4031 DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
4032
991d9fa0
JT
4033 break;
4034
4035 case STATUSTYPE_TABLE:
4036 DMEMIT("%s %s %lu %llu ",
4037 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
4038 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
4039 (unsigned long)pool->sectors_per_block,
4040 (unsigned long long)pt->low_water_blocks);
0424caa1 4041 emit_flags(&pt->requested_pf, result, sz, maxlen);
991d9fa0 4042 break;
8ec45662
TS
4043
4044 case STATUSTYPE_IMA:
4045 *result = '\0';
4046 break;
991d9fa0 4047 }
fd7c092e 4048 return;
991d9fa0 4049
fd7c092e
MP
4050err:
4051 DMEMIT("Error");
991d9fa0
JT
4052}
4053
4054static int pool_iterate_devices(struct dm_target *ti,
4055 iterate_devices_callout_fn fn, void *data)
4056{
4057 struct pool_c *pt = ti->private;
4058
4059 return fn(ti, pt->data_dev, 0, ti->len, data);
4060}
4061
4062static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
4063{
4064 struct pool_c *pt = ti->private;
4065 struct pool *pool = pt->pool;
	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If max_sectors is smaller than pool->sectors_per_block adjust it
	 * to the highest possible power-of-2 factor of pool->sectors_per_block.
	 * This is especially beneficial when the pool's data device is a RAID
	 * device with a full stripe width that matches pool->sectors_per_block:
	 * even though partial RAID stripe-sized IOs will be issued to a single
	 * RAID stripe, when aggregated they will end on a full RAID stripe
	 * boundary, which avoids additional partial RAID stripe writes cascading.
	 */
4077 if (limits->max_sectors < pool->sectors_per_block) {
4078 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
4079 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
4080 limits->max_sectors--;
4081 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
4082 }
604ea906 4083 }
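	/*
	 * Worked example (illustrative numbers): with sectors_per_block of
	 * 3072 (a 1.5MiB thin-pool block) and a stacked max_sectors of 2048,
	 * 2048 does not divide 3072, so the loop above falls back to the
	 * next lower power of two, 1024, which does divide 3072 and becomes
	 * the new max_sectors.
	 */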
991d9fa0 4084
4085 /*
4086 * If the system-determined stacked limits are compatible with the
4087 * pool's blocksize (io_opt is a factor) do not override them.
4088 */
4089 if (io_opt_sectors < pool->sectors_per_block ||
604ea906
MS
4090 !is_factor(io_opt_sectors, pool->sectors_per_block)) {
4091 if (is_factor(pool->sectors_per_block, limits->max_sectors))
4092 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
4093 else
4094 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
0cc67cd9
MS
4095 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
4096 }
0424caa1
MS
4097
4098 /*
4099 * pt->adjusted_pf is a staging area for the actual features to use.
4100 * They get transferred to the live pool in bind_control_target()
4101 * called from pool_preresume().
4102 */
b60ab990
MS
4103 if (!pt->adjusted_pf.discard_enabled) {
4104 /*
4105 * Must explicitly disallow stacking discard limits otherwise the
4106 * block layer will stack them if pool's data device has support.
4107 */
4108 limits->discard_granularity = 0;
0424caa1 4109 return;
b60ab990 4110 }
0424caa1
MS
4111
4112 disable_passdown_if_not_supported(pt);
4113
34fbcf62
JT
4114 /*
4115 * The pool uses the same discard limits as the underlying data
4116 * device. DM core has already set this up.
4117 */
991d9fa0
JT
4118}
4119
4120static struct target_type pool_target = {
4121 .name = "thin-pool",
4122 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
4123 DM_TARGET_IMMUTABLE,
e2dd8aca 4124 .version = {1, 23, 0},
991d9fa0
JT
4125 .module = THIS_MODULE,
4126 .ctr = pool_ctr,
4127 .dtr = pool_dtr,
4128 .map = pool_map,
80e96c54
MS
4129 .presuspend = pool_presuspend,
4130 .presuspend_undo = pool_presuspend_undo,
991d9fa0
JT
4131 .postsuspend = pool_postsuspend,
4132 .preresume = pool_preresume,
4133 .resume = pool_resume,
4134 .message = pool_message,
4135 .status = pool_status,
991d9fa0
JT
4136 .iterate_devices = pool_iterate_devices,
4137 .io_hints = pool_io_hints,
4138};
4139
a4a82ce3
HM
4140/*
4141 *--------------------------------------------------------------
991d9fa0 4142 * Thin target methods
a4a82ce3
HM
4143 *--------------------------------------------------------------
4144 */
b10ebd34
JT
4145static void thin_get(struct thin_c *tc)
4146{
22d4c291 4147 refcount_inc(&tc->refcount);
b10ebd34
JT
4148}
4149
4150static void thin_put(struct thin_c *tc)
4151{
22d4c291 4152 if (refcount_dec_and_test(&tc->refcount))
b10ebd34
JT
4153 complete(&tc->can_destroy);
4154}
4155
991d9fa0
JT
4156static void thin_dtr(struct dm_target *ti)
4157{
4158 struct thin_c *tc = ti->private;
c140e1c4 4159
8e0c9dac 4160 spin_lock_irq(&tc->pool->lock);
c140e1c4 4161 list_del_rcu(&tc->list);
8e0c9dac 4162 spin_unlock_irq(&tc->pool->lock);
c140e1c4 4163 synchronize_rcu();
991d9fa0 4164
17181fb7
MP
4165 thin_put(tc);
4166 wait_for_completion(&tc->can_destroy);
4167
991d9fa0
JT
4168 mutex_lock(&dm_thin_pool_table.mutex);
4169
4170 __pool_dec(tc->pool);
4171 dm_pool_close_thin_device(tc->td);
4172 dm_put_device(ti, tc->pool_dev);
2dd9c257
JT
4173 if (tc->origin_dev)
4174 dm_put_device(ti, tc->origin_dev);
991d9fa0
JT
4175 kfree(tc);
4176
4177 mutex_unlock(&dm_thin_pool_table.mutex);
4178}
4179
/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
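/*
 * Illustrative activation (device names and sizes are arbitrary).  The
 * thin device must already exist in the pool, created via a
 * "create_thin" or "create_snap" pool message; the table line only
 * references its dev_id:
 *
 *     dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *     dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1 /dev/image"
 *
 * The second line activates dev_id 1 with /dev/image as an external
 * read-only origin.
 */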
86a3238c 4192static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4193{
4194 int r;
4195 struct thin_c *tc;
2dd9c257 4196 struct dm_dev *pool_dev, *origin_dev;
4197 struct mapped_device *pool_md;
4198
4199 mutex_lock(&dm_thin_pool_table.mutex);
4200
2dd9c257 4201 if (argc != 2 && argc != 3) {
4202 ti->error = "Invalid argument count";
4203 r = -EINVAL;
4204 goto out_unlock;
4205 }
4206
4207 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
4208 if (!tc) {
4209 ti->error = "Out of memory";
4210 r = -ENOMEM;
4211 goto out_unlock;
4212 }
583024d2 4213 tc->thin_md = dm_table_get_md(ti->table);
c140e1c4 4214 spin_lock_init(&tc->lock);
a374bb21 4215 INIT_LIST_HEAD(&tc->deferred_cells);
c140e1c4
MS
4216 bio_list_init(&tc->deferred_bio_list);
4217 bio_list_init(&tc->retry_on_resume_list);
67324ea1 4218 tc->sort_bio_list = RB_ROOT;
991d9fa0 4219
2dd9c257 4220 if (argc == 3) {
70de2cbd
JCXF
4221 if (!strcmp(argv[0], argv[2])) {
4222 ti->error = "Error setting origin device";
4223 r = -EINVAL;
4224 goto bad_origin_dev;
4225 }
4226
05bdb996 4227 r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &origin_dev);
2dd9c257
JT
4228 if (r) {
4229 ti->error = "Error opening origin device";
4230 goto bad_origin_dev;
4231 }
4232 tc->origin_dev = origin_dev;
4233 }
4234
991d9fa0
JT
4235 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
4236 if (r) {
4237 ti->error = "Error opening pool device";
4238 goto bad_pool_dev;
4239 }
4240 tc->pool_dev = pool_dev;
4241
4242 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
4243 ti->error = "Invalid device id";
4244 r = -EINVAL;
4245 goto bad_common;
4246 }
4247
4248 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
4249 if (!pool_md) {
4250 ti->error = "Couldn't get pool mapped device";
4251 r = -EINVAL;
4252 goto bad_common;
4253 }
4254
4255 tc->pool = __pool_table_lookup(pool_md);
4256 if (!tc->pool) {
4257 ti->error = "Couldn't find pool object";
4258 r = -EINVAL;
4259 goto bad_pool_lookup;
4260 }
4261 __pool_inc(tc->pool);
4262
e49e5829
JT
4263 if (get_pool_mode(tc->pool) == PM_FAIL) {
4264 ti->error = "Couldn't open thin device, Pool is in fail mode";
1acacc07 4265 r = -EINVAL;
80e96c54 4266 goto bad_pool;
e49e5829
JT
4267 }
4268
991d9fa0
JT
4269 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
4270 if (r) {
4271 ti->error = "Couldn't open thin internal device";
80e96c54 4272 goto bad_pool;
991d9fa0
JT
4273 }
4274
542f9038
MS
4275 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
4276 if (r)
80e96c54 4277 goto bad;
542f9038 4278
55a62eef 4279 ti->num_flush_bios = 1;
9bbf5fee 4280 ti->limit_swap_bios = true;
16ad3d10 4281 ti->flush_supported = true;
a9251281 4282 ti->accounts_remapped_io = true;
30187e1d 4283 ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
67e2e2b2
JT
4284
4285 /* In case the pool supports discards, pass them on. */
4286 if (tc->pool->pf.discard_enabled) {
0ac55489 4287 ti->discards_supported = true;
55a62eef 4288 ti->num_discard_bios = 1;
e2dd8aca 4289 ti->max_discard_granularity = true;
67e2e2b2 4290 }
991d9fa0 4291
991d9fa0
JT
4292 mutex_unlock(&dm_thin_pool_table.mutex);
4293
8e0c9dac 4294 spin_lock_irq(&tc->pool->lock);
80e96c54 4295 if (tc->pool->suspended) {
8e0c9dac 4296 spin_unlock_irq(&tc->pool->lock);
80e96c54
MS
4297 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
4298 ti->error = "Unable to activate thin device while pool is suspended";
4299 r = -EINVAL;
4300 goto bad;
4301 }
22d4c291 4302 refcount_set(&tc->refcount, 1);
2b94e896 4303 init_completion(&tc->can_destroy);
c140e1c4 4304 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
8e0c9dac 4305 spin_unlock_irq(&tc->pool->lock);
	/*
	 * This synchronize_rcu() call is needed here: without it we risk a
	 * wake_worker() call finding no bios to process (because the newly
	 * added tc isn't yet visible), which would leave us dependent on
	 * the periodic commit to wake_worker(), increasing latency.
	 */
	synchronize_rcu();
4313
80e96c54
MS
4314 dm_put(pool_md);
4315
991d9fa0
JT
4316 return 0;
4317
80e96c54 4318bad:
1acacc07 4319 dm_pool_close_thin_device(tc->td);
80e96c54 4320bad_pool:
991d9fa0
JT
4321 __pool_dec(tc->pool);
4322bad_pool_lookup:
4323 dm_put(pool_md);
4324bad_common:
4325 dm_put_device(ti, tc->pool_dev);
4326bad_pool_dev:
2dd9c257
JT
4327 if (tc->origin_dev)
4328 dm_put_device(ti, tc->origin_dev);
4329bad_origin_dev:
991d9fa0
JT
4330 kfree(tc);
4331out_unlock:
4332 mutex_unlock(&dm_thin_pool_table.mutex);
4333
4334 return r;
4335}
4336
7de3ee57 4337static int thin_map(struct dm_target *ti, struct bio *bio)
991d9fa0 4338{
4f024f37 4339 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
991d9fa0 4340
7de3ee57 4341 return thin_bio_map(ti, bio);
991d9fa0
JT
4342}
4343
4e4cbee9
CH
4344static int thin_endio(struct dm_target *ti, struct bio *bio,
4345 blk_status_t *err)
eb2aa48d
JT
4346{
4347 unsigned long flags;
59c3d2c6 4348 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d 4349 struct list_head work;
a24c2569 4350 struct dm_thin_new_mapping *m, *tmp;
eb2aa48d
JT
4351 struct pool *pool = h->tc->pool;
4352
4353 if (h->shared_read_entry) {
4354 INIT_LIST_HEAD(&work);
44feb387 4355 dm_deferred_entry_dec(h->shared_read_entry, &work);
eb2aa48d
JT
4356
4357 spin_lock_irqsave(&pool->lock, flags);
4358 list_for_each_entry_safe(m, tmp, &work, list) {
4359 list_del(&m->list);
50f3c3ef 4360 __complete_mapping_preparation(m);
eb2aa48d
JT
4361 }
4362 spin_unlock_irqrestore(&pool->lock, flags);
4363 }
4364
104655fd
JT
4365 if (h->all_io_entry) {
4366 INIT_LIST_HEAD(&work);
44feb387 4367 dm_deferred_entry_dec(h->all_io_entry, &work);
563af186
JT
4368 if (!list_empty(&work)) {
4369 spin_lock_irqsave(&pool->lock, flags);
4370 list_for_each_entry_safe(m, tmp, &work, list)
daec338b 4371 list_add_tail(&m->list, &pool->prepared_discards);
563af186
JT
4372 spin_unlock_irqrestore(&pool->lock, flags);
4373 wake_worker(pool);
4374 }
104655fd
JT
4375 }
4376
34fbcf62
JT
4377 if (h->cell)
4378 cell_defer_no_holder(h->tc, h->cell);
4379
1be56909 4380 return DM_ENDIO_DONE;
eb2aa48d
JT
4381}
4382
738211f7 4383static void thin_presuspend(struct dm_target *ti)
991d9fa0 4384{
738211f7
JT
4385 struct thin_c *tc = ti->private;
4386
991d9fa0 4387 if (dm_noflush_suspending(ti))
738211f7
JT
4388 noflush_work(tc, do_noflush_start);
4389}
4390
4391static void thin_postsuspend(struct dm_target *ti)
4392{
4393 struct thin_c *tc = ti->private;
4394
4395 /*
4396 * The dm_noflush_suspending flag has been cleared by now, so
4397 * unfortunately we must always run this.
4398 */
4399 noflush_work(tc, do_noflush_stop);
991d9fa0
JT
4400}
4401
e5aea7b4
JT
4402static int thin_preresume(struct dm_target *ti)
4403{
4404 struct thin_c *tc = ti->private;
4405
4406 if (tc->origin_dev)
4407 tc->origin_size = get_dev_size(tc->origin_dev->bdev);
4408
4409 return 0;
4410}
4411
/*
 * <nr mapped sectors> <highest mapped sector>
 */
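/*
 * Example STATUSTYPE_INFO output (illustrative, 64KiB block size):
 *
 *     2048 1535
 *
 * i.e. 16 blocks (2048 sectors) are mapped and the highest mapped block
 * ends at sector 1535.  A freshly created, never written thin device
 * reports "0 -".
 */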
fd7c092e 4415static void thin_status(struct dm_target *ti, status_type_t type,
86a3238c 4416 unsigned int status_flags, char *result, unsigned int maxlen)
4417{
4418 int r;
4419 ssize_t sz = 0;
4420 dm_block_t mapped, highest;
4421 char buf[BDEVNAME_SIZE];
4422 struct thin_c *tc = ti->private;
4423
e49e5829
JT
4424 if (get_pool_mode(tc->pool) == PM_FAIL) {
4425 DMEMIT("Fail");
fd7c092e 4426 return;
e49e5829
JT
4427 }
4428
991d9fa0
JT
4429 if (!tc->td)
4430 DMEMIT("-");
4431 else {
4432 switch (type) {
4433 case STATUSTYPE_INFO:
4434 r = dm_thin_get_mapped_count(tc->td, &mapped);
fd7c092e
MP
4435 if (r) {
4436 DMERR("dm_thin_get_mapped_count returned %d", r);
4437 goto err;
4438 }
991d9fa0
JT
4439
4440 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
fd7c092e
MP
4441 if (r < 0) {
4442 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
4443 goto err;
4444 }
991d9fa0
JT
4445
4446 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
4447 if (r)
4448 DMEMIT("%llu", ((highest + 1) *
4449 tc->pool->sectors_per_block) - 1);
4450 else
4451 DMEMIT("-");
4452 break;
4453
4454 case STATUSTYPE_TABLE:
4455 DMEMIT("%s %lu",
4456 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
4457 (unsigned long) tc->dev_id);
2dd9c257
JT
4458 if (tc->origin_dev)
4459 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
991d9fa0 4460 break;
8ec45662
TS
4461
4462 case STATUSTYPE_IMA:
4463 *result = '\0';
4464 break;
991d9fa0
JT
4465 }
4466 }
4467
fd7c092e
MP
4468 return;
4469
4470err:
4471 DMEMIT("Error");
991d9fa0
JT
4472}
4473
4474static int thin_iterate_devices(struct dm_target *ti,
4475 iterate_devices_callout_fn fn, void *data)
4476{
55f2b8bd 4477 sector_t blocks;
991d9fa0 4478 struct thin_c *tc = ti->private;
55f2b8bd 4479 struct pool *pool = tc->pool;
991d9fa0
JT
4480
4481 /*
4482 * We can't call dm_pool_get_data_dev_size() since that blocks. So
4483 * we follow a more convoluted path through to the pool's target.
4484 */
55f2b8bd 4485 if (!pool->ti)
991d9fa0
JT
4486 return 0; /* nothing is bound */
4487
55f2b8bd
MS
4488 blocks = pool->ti->len;
4489 (void) sector_div(blocks, pool->sectors_per_block);
991d9fa0 4490 if (blocks)
55f2b8bd 4491 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
991d9fa0
JT
4492
4493 return 0;
4494}
4495
34fbcf62
JT
4496static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
4497{
4498 struct thin_c *tc = ti->private;
4499 struct pool *pool = tc->pool;
21607670 4500
0fcb04d5
MS
4501 if (!pool->pf.discard_enabled)
4502 return;
34fbcf62
JT
4503
4504 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
e2dd8aca 4505 limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
34fbcf62
JT
4506}
4507
991d9fa0
JT
4508static struct target_type thin_target = {
4509 .name = "thin",
e2dd8aca 4510 .version = {1, 23, 0},
991d9fa0
JT
4511 .module = THIS_MODULE,
4512 .ctr = thin_ctr,
4513 .dtr = thin_dtr,
4514 .map = thin_map,
eb2aa48d 4515 .end_io = thin_endio,
e5aea7b4 4516 .preresume = thin_preresume,
738211f7 4517 .presuspend = thin_presuspend,
991d9fa0
JT
4518 .postsuspend = thin_postsuspend,
4519 .status = thin_status,
4520 .iterate_devices = thin_iterate_devices,
34fbcf62 4521 .io_hints = thin_io_hints,
991d9fa0
JT
4522};
4523
4524/*----------------------------------------------------------------*/
4525
4526static int __init dm_thin_init(void)
4527{
7e6358d2 4528 int r = -ENOMEM;
991d9fa0
JT
4529
4530 pool_table_init();
4531
7e6358d2 4532 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
4533 if (!_new_mapping_cache)
4534 return r;
4535
991d9fa0
JT
4536 r = dm_register_target(&thin_target);
4537 if (r)
7e6358d2 4538 goto bad_new_mapping_cache;
991d9fa0
JT
4539
4540 r = dm_register_target(&pool_target);
4541 if (r)
7e6358d2 4542 goto bad_thin_target;
a24c2569 4543
a24c2569
MS
4544 return 0;
4545
7e6358d2 4546bad_thin_target:
a24c2569 4547 dm_unregister_target(&thin_target);
7e6358d2 4548bad_new_mapping_cache:
4549 kmem_cache_destroy(_new_mapping_cache);
991d9fa0
JT
4550
4551 return r;
4552}
4553
4554static void dm_thin_exit(void)
4555{
4556 dm_unregister_target(&thin_target);
4557 dm_unregister_target(&pool_target);
a24c2569 4558
a24c2569 4559 kmem_cache_destroy(_new_mapping_cache);
4560
4561 pool_table_exit();
4562}
4563
4564module_init(dm_thin_init);
4565module_exit(dm_thin_exit);
4566
6a808034 4567module_param_named(no_space_timeout, no_space_timeout_secs, uint, 0644);
4568MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
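/*
 * Example (illustrative): assuming this code is built as the
 * dm-thin-pool module, the timeout can be changed at runtime with
 *
 *     echo 120 > /sys/module/dm_thin_pool/parameters/no_space_timeout
 */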
4569
7cab8bf1 4570MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
4571MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
4572MODULE_LICENSE("GPL");